1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
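/*
 * The threshold is consulted in vm_fault_zero_page() below: zero-filled
 * pages in objects larger than this are tagged m->zero_fill and counted
 * in vm_zf_count.
 */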
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152
153 unsigned long vm_cs_validates = 0;
154 unsigned long vm_cs_revalidates = 0;
155 unsigned long vm_cs_query_modified = 0;
156 unsigned long vm_cs_validated_dirtied = 0;
157
158 #if CONFIG_ENFORCE_SIGNED_CODE
159 #if SECURE_KERNEL
160 const int cs_enforcement_disable=0;
161 #else
162 int cs_enforcement_disable=1;
163 #endif
164 #endif
165
166 /*
167 * Routine: vm_fault_init
168 * Purpose:
169 * Initialize our private data structures.
170 */
171 void
172 vm_fault_init(void)
173 {
174 #if !SECURE_KERNEL
175 #if CONFIG_ENFORCE_SIGNED_CODE
176 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, sizeof (cs_enforcement_disable));
177 #endif
178 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
179 #endif
180 }
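/*
 * Note: on non-SECURE_KERNEL builds, the PE_parse_boot_argn() calls above
 * let the "cs_enforcement_disable" and "cs_debug" boot-args override the
 * compiled-in defaults when the kernel starts up.
 */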
181
182 /*
183 * Routine: vm_fault_cleanup
184 * Purpose:
185 * Clean up the result of vm_fault_page.
186 * Results:
187 * The paging reference for "object" is released.
188 * "object" is unlocked.
189 * If "top_page" is not null, "top_page" is
190 * freed and the paging reference for the object
191 * containing it is released.
192 *
193 * In/out conditions:
194 * "object" must be locked.
195 */
196 void
197 vm_fault_cleanup(
198 register vm_object_t object,
199 register vm_page_t top_page)
200 {
201 vm_object_paging_end(object);
202 vm_object_unlock(object);
203
204 if (top_page != VM_PAGE_NULL) {
205 object = top_page->object;
206
207 vm_object_lock(object);
208 VM_PAGE_FREE(top_page);
209 vm_object_paging_end(object);
210 vm_object_unlock(object);
211 }
212 }
213
214 #if MACH_CLUSTER_STATS
215 #define MAXCLUSTERPAGES 16
216 struct {
217 unsigned long pages_in_cluster;
218 unsigned long pages_at_higher_offsets;
219 unsigned long pages_at_lower_offsets;
220 } cluster_stats_in[MAXCLUSTERPAGES];
221 #define CLUSTER_STAT(clause) clause
222 #define CLUSTER_STAT_HIGHER(x) \
223 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
224 #define CLUSTER_STAT_LOWER(x) \
225 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
226 #define CLUSTER_STAT_CLUSTER(x) \
227 ((cluster_stats_in[(x)].pages_in_cluster)++)
228 #else /* MACH_CLUSTER_STATS */
229 #define CLUSTER_STAT(clause)
230 #endif /* MACH_CLUSTER_STATS */
231
232 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
233
234
235 boolean_t vm_page_deactivate_behind = TRUE;
236 /*
237 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
238 */
239 int vm_default_ahead = 0;
240 int vm_default_behind = MAX_UPL_TRANSFER;
241
242 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
243
244 /*
245  * vm_fault_is_sequential
246 *
247 * Determine if sequential access is in progress
248 * in accordance with the behavior specified.
249 * Update state to indicate current access pattern.
250 *
251 * object must have at least the shared lock held
252 */
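/*
 * Rough sketch of the state kept here: object->sequential is a signed byte
 * count of the current run.  Depending on the declared behavior, a fault
 * adjacent to the previous one (object->last_alloc) grows the run by
 * PAGE_SIZE (forward) or shrinks it by PAGE_SIZE (reverse), clamped to
 * +/-MAX_SEQUENTIAL_RUN; a non-adjacent fault resets the run to zero.
 */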
253 static
254 void
255 vm_fault_is_sequential(
256 vm_object_t object,
257 vm_object_offset_t offset,
258 vm_behavior_t behavior)
259 {
260 vm_object_offset_t last_alloc;
261 int sequential;
262 int orig_sequential;
263
264 last_alloc = object->last_alloc;
265 sequential = object->sequential;
266 orig_sequential = sequential;
267
268 switch (behavior) {
269 case VM_BEHAVIOR_RANDOM:
270 /*
271 * reset indicator of sequential behavior
272 */
273 sequential = 0;
274 break;
275
276 case VM_BEHAVIOR_SEQUENTIAL:
277 if (offset && last_alloc == offset - PAGE_SIZE_64) {
278 /*
279 * advance indicator of sequential behavior
280 */
281 if (sequential < MAX_SEQUENTIAL_RUN)
282 sequential += PAGE_SIZE;
283 } else {
284 /*
285 * reset indicator of sequential behavior
286 */
287 sequential = 0;
288 }
289 break;
290
291 case VM_BEHAVIOR_RSEQNTL:
292 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
293 /*
294 * advance indicator of sequential behavior
295 */
296 if (sequential > -MAX_SEQUENTIAL_RUN)
297 sequential -= PAGE_SIZE;
298 } else {
299 /*
300 * reset indicator of sequential behavior
301 */
302 sequential = 0;
303 }
304 break;
305
306 case VM_BEHAVIOR_DEFAULT:
307 default:
308 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
309 /*
310 * advance indicator of sequential behavior
311 */
312 if (sequential < 0)
313 sequential = 0;
314 if (sequential < MAX_SEQUENTIAL_RUN)
315 sequential += PAGE_SIZE;
316
317 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
318 /*
319 * advance indicator of sequential behavior
320 */
321 if (sequential > 0)
322 sequential = 0;
323 if (sequential > -MAX_SEQUENTIAL_RUN)
324 sequential -= PAGE_SIZE;
325 } else {
326 /*
327 * reset indicator of sequential behavior
328 */
329 sequential = 0;
330 }
331 break;
332 }
333 if (sequential != orig_sequential) {
334 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
335 /*
336 * if someone else has already updated object->sequential
337 * don't bother trying to update it or object->last_alloc
338 */
339 return;
340 }
341 }
342 /*
343 * I'd like to do this with a OSCompareAndSwap64, but that
344 * doesn't exist for PPC... however, it shouldn't matter
345 * that much... last_alloc is maintained so that we can determine
346 * if a sequential access pattern is taking place... if only
347 * one thread is banging on this object, no problem with the unprotected
348 * update... if 2 or more threads are banging away, we run the risk of
349 * someone seeing a mangled update... however, in the face of multiple
350 * accesses, no sequential access pattern can develop anyway, so we
351 * haven't lost any real info.
352 */
353 object->last_alloc = offset;
354 }
355
356
357 /*
358  * vm_fault_deactivate_behind
359 *
360 * Determine if sequential access is in progress
361 * in accordance with the behavior specified. If
362 * so, compute a potential page to deactivate and
363 * deactivate it.
364 *
365 * object must be locked.
366 *
367 * return TRUE if we actually deactivate a page
368 */
369 static
370 boolean_t
371 vm_fault_deactivate_behind(
372 vm_object_t object,
373 vm_object_offset_t offset,
374 vm_behavior_t behavior)
375 {
376 vm_page_t m = NULL;
377 int sequential_run;
378 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
379
380 #if TRACEFAULTPAGE
381 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
382 #endif
383
384 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
385 /*
386 * Do not deactivate pages from the kernel object: they
387 * are not intended to become pageable.
388 * or we've disabled the deactivate behind mechanism
389 */
390 return FALSE;
391 }
392 if ((sequential_run = object->sequential)) {
393 if (sequential_run < 0) {
394 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
395 sequential_run = 0 - sequential_run;
396 } else {
397 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
398 }
399 }
400 switch (behavior) {
401 case VM_BEHAVIOR_RANDOM:
402 break;
403 case VM_BEHAVIOR_SEQUENTIAL:
404 if (sequential_run >= (int)PAGE_SIZE)
405 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
406 break;
407 case VM_BEHAVIOR_RSEQNTL:
408 if (sequential_run >= (int)PAGE_SIZE)
409 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
410 break;
411 case VM_BEHAVIOR_DEFAULT:
412 default:
413 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
414
415 /*
416  		  * determine if the run of sequential access has been
417 * long enough on an object with default access behavior
418 * to consider it for deactivation
419 */
420 if ((uint64_t)sequential_run >= behind) {
421 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
422 if (offset >= behind)
423 m = vm_page_lookup(object, offset - behind);
424 } else {
425 if (offset < -behind)
426 m = vm_page_lookup(object, offset + behind);
427 }
428 }
429 break;
430 }
431 }
432 if (m) {
433 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
434 pmap_clear_reference(m->phys_page);
435 m->deactivated = TRUE;
436 #if TRACEFAULTPAGE
437 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
438 #endif
439 return TRUE;
440 }
441 }
442 return FALSE;
443 }
444
445
446 /*
447 * check for various conditions that would
448 * prevent us from creating a ZF page...
449 * cleanup is based on being called from vm_fault_page
450 *
451 * object must be locked
452 * object == m->object
453 */
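/*
 * Typical caller pattern (as used from vm_fault_page() below):
 *
 *	error = vm_fault_check(object, m, first_m, interruptible_state);
 *	if (error != VM_FAULT_SUCCESS)
 *		return (error);
 *
 * i.e. on anything other than VM_FAULT_SUCCESS the cleanup has already been
 * done and the caller simply propagates the return value.
 */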
454 static vm_fault_return_t
455 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
456 {
457 if (object->shadow_severed) {
458 /*
459 * the shadow chain was severed
460 * just have to return an error at this point
461 */
462 if (m != VM_PAGE_NULL)
463 VM_PAGE_FREE(m);
464 vm_fault_cleanup(object, first_m);
465
466 thread_interrupt_level(interruptible_state);
467
468 return (VM_FAULT_MEMORY_ERROR);
469 }
470 if (vm_backing_store_low) {
471 /*
472 		 * are we protecting the system from
473 		 * backing store exhaustion? If so,
474 		 * sleep unless we are privileged.
475 */
476 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
477
478 if (m != VM_PAGE_NULL)
479 VM_PAGE_FREE(m);
480 vm_fault_cleanup(object, first_m);
481
482 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
483
484 thread_block(THREAD_CONTINUE_NULL);
485 thread_interrupt_level(interruptible_state);
486
487 return (VM_FAULT_RETRY);
488 }
489 }
490 if (VM_PAGE_ZFILL_THROTTLED()) {
491 /*
492 * we're throttling zero-fills...
493 * treat this as if we couldn't grab a page
494 */
495 if (m != VM_PAGE_NULL)
496 VM_PAGE_FREE(m);
497 vm_fault_cleanup(object, first_m);
498
499 thread_interrupt_level(interruptible_state);
500
501 return (VM_FAULT_MEMORY_SHORTAGE);
502 }
503 return (VM_FAULT_SUCCESS);
504 }
505
506
507 /*
508 * do the work to zero fill a page and
509 * inject it into the correct paging queue
510 *
511 * m->object must be locked
512 * page queue lock must NOT be held
513 */
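/*
 * Returns the fault type to report in the trace point: DBG_ZERO_FILL_FAULT
 * normally, or DBG_NZF_PAGE_FAULT when "no_zero_fill" suppressed the actual
 * zeroing of the page.
 */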
514 static int
515 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
516 {
517 int my_fault = DBG_ZERO_FILL_FAULT;
518
519 /*
520 	 * This is a zero-fill page fault...
521 *
522 * Checking the page lock is a waste of
523 * time; this page was absent, so
524 * it can't be page locked by a pager.
525 *
526 * we also consider it undefined
527 * with respect to instruction
528 * execution. i.e. it is the responsibility
529 * of higher layers to call for an instruction
530 * sync after changing the contents and before
531 * sending a program into this area. We
532 * choose this approach for performance
533 */
534 m->pmapped = TRUE;
535
536 m->cs_validated = FALSE;
537 m->cs_tainted = FALSE;
538
539 if (no_zero_fill == TRUE)
540 my_fault = DBG_NZF_PAGE_FAULT;
541 else {
542 vm_page_zero_fill(m);
543
544 VM_STAT_INCR(zero_fill_count);
545 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
546 }
547 assert(!m->laundry);
548 assert(m->object != kernel_object);
549 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
550
551 if (!IP_VALID(memory_manager_default) &&
552 (m->object->purgable == VM_PURGABLE_DENY ||
553 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
554 m->object->purgable == VM_PURGABLE_VOLATILE )) {
555 vm_page_lock_queues();
556
557 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
558 m->throttled = TRUE;
559 vm_page_throttled_count++;
560
561 vm_page_unlock_queues();
562 } else {
563 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
564 m->zero_fill = TRUE;
565 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
566 }
567 }
568 return (my_fault);
569 }
570
571
572 /*
573 * Routine: vm_fault_page
574 * Purpose:
575 * Find the resident page for the virtual memory
576 * specified by the given virtual memory object
577 * and offset.
578 * Additional arguments:
579  *		The required permissions for the page are given
580 * in "fault_type". Desired permissions are included
581 * in "protection".
582 * fault_info is passed along to determine pagein cluster
583 * limits... it contains the expected reference pattern,
584 * cluster size if available, etc...
585 *
586 * If the desired page is known to be resident (for
587 * example, because it was previously wired down), asserting
588 * the "unwiring" parameter will speed the search.
589 *
590 * If the operation can be interrupted (by thread_abort
591 * or thread_terminate), then the "interruptible"
592 * parameter should be asserted.
593 *
594 * Results:
595 * The page containing the proper data is returned
596 * in "result_page".
597 *
598 * In/out conditions:
599 * The source object must be locked and referenced,
600 * and must donate one paging reference. The reference
601 * is not affected. The paging reference and lock are
602 * consumed.
603 *
604 * If the call succeeds, the object in which "result_page"
605 * resides is left locked and holding a paging reference.
606 * If this is not the original object, a busy page in the
607 * original object is returned in "top_page", to prevent other
608 * callers from pursuing this same data, along with a paging
609 * reference for the original object. The "top_page" should
610 * be destroyed when this guarantee is no longer required.
611 * The "result_page" is also left busy. It is not removed
612 * from the pageout queues.
613 */
614
615 vm_fault_return_t
616 vm_fault_page(
617 /* Arguments: */
618 vm_object_t first_object, /* Object to begin search */
619 vm_object_offset_t first_offset, /* Offset into object */
620 vm_prot_t fault_type, /* What access is requested */
621 boolean_t must_be_resident,/* Must page be resident? */
622 /* Modifies in place: */
623 vm_prot_t *protection, /* Protection for mapping */
624 /* Returns: */
625 vm_page_t *result_page, /* Page found, if successful */
626 vm_page_t *top_page, /* Page in top object, if
627 * not result_page. */
628 int *type_of_fault, /* if non-null, fill in with type of fault
629 * COW, zero-fill, etc... returned in trace point */
630 /* More arguments: */
631 kern_return_t *error_code, /* code if page is in error */
632 boolean_t no_zero_fill, /* don't zero fill absent pages */
633 #if MACH_PAGEMAP
634 boolean_t data_supply, /* treat as data_supply if
635 * it is a write fault and a full
636 * page is provided */
637 #else
638 __unused boolean_t data_supply,
639 #endif
640 vm_object_fault_info_t fault_info)
641 {
642 vm_page_t m;
643 vm_object_t object;
644 vm_object_offset_t offset;
645 vm_page_t first_m;
646 vm_object_t next_object;
647 vm_object_t copy_object;
648 boolean_t look_for_page;
649 vm_prot_t access_required = fault_type;
650 vm_prot_t wants_copy_flag;
651 CLUSTER_STAT(int pages_at_higher_offsets;)
652 CLUSTER_STAT(int pages_at_lower_offsets;)
653 kern_return_t wait_result;
654 boolean_t interruptible_state;
655 vm_fault_return_t error;
656 int my_fault;
657 uint32_t try_failed_count;
658 int interruptible; /* how may fault be interrupted? */
659 memory_object_t pager;
660
661 /*
662 * MACH page map - an optional optimization where a bit map is maintained
663 * by the VM subsystem for internal objects to indicate which pages of
664 * the object currently reside on backing store. This existence map
665 * duplicates information maintained by the vnode pager. It is
666 * created at the time of the first pageout against the object, i.e.
667 * at the same time pager for the object is created. The optimization
668 * is designed to eliminate pager interaction overhead, if it is
669 * 'known' that the page does not exist on backing store.
670 *
671 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
672 * either marked as paged out in the existence map for the object or no
673 * existence map exists for the object. MUST_ASK_PAGER() is one of the
674 * criteria in the decision to invoke the pager. It is also used as one
675 * of the criteria to terminate the scan for adjacent pages in a clustered
676 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
677 * permanent objects. Note also that if the pager for an internal object
678 * has not been created, the pager is not invoked regardless of the value
679 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
680 * for which a pager has been created.
681 *
682 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
683  * is marked as paged out in the existence map for the object.
684 * PAGED_OUT() is used to determine if a page has already been pushed
685 * into a copy object in order to avoid a redundant page out operation.
686 */
687 #if MACH_PAGEMAP
688 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
689 != VM_EXTERNAL_STATE_ABSENT)
690 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
691 == VM_EXTERNAL_STATE_EXISTS)
692 #else
693 #define MUST_ASK_PAGER(o, f) (TRUE)
694 #define PAGED_OUT(o, f) (FALSE)
695 #endif
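/*
 * Both macros degenerate when MACH_PAGEMAP is off: MUST_ASK_PAGER() then
 * always evaluates to TRUE and PAGED_OUT() always to FALSE, so the pager is
 * consulted for every miss and no push into a copy object is ever skipped
 * on existence-map grounds (see "look_for_page" and the copy-object loop
 * below).
 */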
696
697 /*
698 * Recovery actions
699 */
700 #define PREPARE_RELEASE_PAGE(m) \
701 MACRO_BEGIN \
702 vm_page_lock_queues(); \
703 MACRO_END
704
705 #define DO_RELEASE_PAGE(m) \
706 MACRO_BEGIN \
707 PAGE_WAKEUP_DONE(m); \
708 if (!m->active && !m->inactive && !m->throttled)\
709 vm_page_activate(m); \
710 vm_page_unlock_queues(); \
711 MACRO_END
712
713 #define RELEASE_PAGE(m) \
714 MACRO_BEGIN \
715 PREPARE_RELEASE_PAGE(m); \
716 DO_RELEASE_PAGE(m); \
717 MACRO_END
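/*
 * RELEASE_PAGE() is used on the error/backoff paths below: it clears the
 * page's busy bit and wakes any waiters (PAGE_WAKEUP_DONE) and, if the page
 * is not already on one of the paging queues, puts it back on the active
 * queue before the queues lock is dropped.
 */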
718
719 #if TRACEFAULTPAGE
720 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
721 #endif
722
723
724 #if MACH_KDB
725 /*
726 * If there are watchpoints set, then
727 * we don't want to give away write permission
728 	 * on a read fault.  Make the task take a write fault,
729 * so that the watchpoint code notices the access.
730 */
731 if (db_watchpoint_list) {
732 /*
733 * If we aren't asking for write permission,
734 * then don't give it away. We're using write
735 * faults to set the dirty bit.
736 */
737 if (!(fault_type & VM_PROT_WRITE))
738 *protection &= ~VM_PROT_WRITE;
739 }
740 #endif /* MACH_KDB */
741
742 interruptible = fault_info->interruptible;
743 interruptible_state = thread_interrupt_level(interruptible);
744
745 /*
746 * INVARIANTS (through entire routine):
747 *
748 * 1) At all times, we must either have the object
749 * lock or a busy page in some object to prevent
750 * some other thread from trying to bring in
751 * the same page.
752 *
753 * Note that we cannot hold any locks during the
754 * pager access or when waiting for memory, so
755 * we use a busy page then.
756 *
757 * 2) To prevent another thread from racing us down the
758 * shadow chain and entering a new page in the top
759 * object before we do, we must keep a busy page in
760 * the top object while following the shadow chain.
761 *
762 * 3) We must increment paging_in_progress on any object
763 * for which we have a busy page before dropping
764 * the object lock
765 *
766 * 4) We leave busy pages on the pageout queues.
767 * If the pageout daemon comes across a busy page,
768 * it will remove the page from the pageout queues.
769 */
770
771 object = first_object;
772 offset = first_offset;
773 first_m = VM_PAGE_NULL;
774 access_required = fault_type;
775
776
777 XPR(XPR_VM_FAULT,
778 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
779 (integer_t)object, offset, fault_type, *protection, 0);
780
781 /*
782 * default type of fault
783 */
784 my_fault = DBG_CACHE_HIT_FAULT;
785
786 while (TRUE) {
787 #if TRACEFAULTPAGE
788 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
789 #endif
790 if (!object->alive) {
791 /*
792 * object is no longer valid
793 * clean up and return error
794 */
795 vm_fault_cleanup(object, first_m);
796 thread_interrupt_level(interruptible_state);
797
798 return (VM_FAULT_MEMORY_ERROR);
799 }
800
801 /*
802 * See whether the page at 'offset' is resident
803 */
804 m = vm_page_lookup(object, offset);
805 #if TRACEFAULTPAGE
806 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
807 #endif
808 if (m != VM_PAGE_NULL) {
809
810 if (m->busy) {
811 /*
812 * The page is being brought in,
813 * wait for it and then retry.
814 *
815 * A possible optimization: if the page
816 * is known to be resident, we can ignore
817 * pages that are absent (regardless of
818 * whether they're busy).
819 */
820 #if TRACEFAULTPAGE
821 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
822 #endif
823 wait_result = PAGE_SLEEP(object, m, interruptible);
824 XPR(XPR_VM_FAULT,
825 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
826 (integer_t)object, offset,
827 (integer_t)m, 0, 0);
828 counter(c_vm_fault_page_block_busy_kernel++);
829
830 if (wait_result != THREAD_AWAKENED) {
831 vm_fault_cleanup(object, first_m);
832 thread_interrupt_level(interruptible_state);
833
834 if (wait_result == THREAD_RESTART)
835 return (VM_FAULT_RETRY);
836 else
837 return (VM_FAULT_INTERRUPTED);
838 }
839 continue;
840 }
841
842 if (m->phys_page == vm_page_guard_addr) {
843 /*
844 * Guard page: off limits !
845 */
846 if (fault_type == VM_PROT_NONE) {
847 /*
848 * The fault is not requesting any
849 * access to the guard page, so it must
850 * be just to wire or unwire it.
851 * Let's pretend it succeeded...
852 */
853 m->busy = TRUE;
854 *result_page = m;
855 assert(first_m == VM_PAGE_NULL);
856 *top_page = first_m;
857 if (type_of_fault)
858 *type_of_fault = DBG_GUARD_FAULT;
859 return VM_FAULT_SUCCESS;
860 } else {
861 /*
862 * The fault requests access to the
863 * guard page: let's deny that !
864 */
865 vm_fault_cleanup(object, first_m);
866 thread_interrupt_level(interruptible_state);
867 return VM_FAULT_MEMORY_ERROR;
868 }
869 }
870
871 if (m->error) {
872 /*
873 * The page is in error, give up now.
874 */
875 #if TRACEFAULTPAGE
876 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
877 #endif
878 if (error_code)
879 *error_code = KERN_MEMORY_ERROR;
880 VM_PAGE_FREE(m);
881
882 vm_fault_cleanup(object, first_m);
883 thread_interrupt_level(interruptible_state);
884
885 return (VM_FAULT_MEMORY_ERROR);
886 }
887 if (m->restart) {
888 /*
889 * The pager wants us to restart
890 * at the top of the chain,
891 * typically because it has moved the
892 * page to another pager, then do so.
893 */
894 #if TRACEFAULTPAGE
895 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
896 #endif
897 VM_PAGE_FREE(m);
898
899 vm_fault_cleanup(object, first_m);
900 thread_interrupt_level(interruptible_state);
901
902 return (VM_FAULT_RETRY);
903 }
904 if (m->absent) {
905 /*
906 * The page isn't busy, but is absent,
907 * therefore it's deemed "unavailable".
908 *
909 * Remove the non-existent page (unless it's
910 * in the top object) and move on down to the
911 * next object (if there is one).
912 */
913 #if TRACEFAULTPAGE
914 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
915 #endif
916 next_object = object->shadow;
917
918 if (next_object == VM_OBJECT_NULL) {
919 /*
920 * Absent page at bottom of shadow
921 * chain; zero fill the page we left
922 * busy in the first object, and free
923 * the absent page.
924 */
925 assert(!must_be_resident);
926
927 /*
928 * check for any conditions that prevent
929 * us from creating a new zero-fill page
930 * vm_fault_check will do all of the
931 * fault cleanup in the case of an error condition
932 * including resetting the thread_interrupt_level
933 */
934 error = vm_fault_check(object, m, first_m, interruptible_state);
935
936 if (error != VM_FAULT_SUCCESS)
937 return (error);
938
939 XPR(XPR_VM_FAULT,
940 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
941 (integer_t)object, offset,
942 (integer_t)m,
943 (integer_t)first_object, 0);
944
945 if (object != first_object) {
946 /*
947 * free the absent page we just found
948 */
949 VM_PAGE_FREE(m);
950
951 /*
952 * drop reference and lock on current object
953 */
954 vm_object_paging_end(object);
955 vm_object_unlock(object);
956
957 /*
958 * grab the original page we
959 * 'soldered' in place and
960 * retake lock on 'first_object'
961 */
962 m = first_m;
963 first_m = VM_PAGE_NULL;
964
965 object = first_object;
966 offset = first_offset;
967
968 vm_object_lock(object);
969 } else {
970 /*
971 * we're going to use the absent page we just found
972 * so convert it to a 'busy' page
973 */
974 m->absent = FALSE;
975 m->busy = TRUE;
976 }
977 /*
978 * zero-fill the page and put it on
979 * the correct paging queue
980 */
981 my_fault = vm_fault_zero_page(m, no_zero_fill);
982
983 break;
984 } else {
985 if (must_be_resident)
986 vm_object_paging_end(object);
987 else if (object != first_object) {
988 vm_object_paging_end(object);
989 VM_PAGE_FREE(m);
990 } else {
991 first_m = m;
992 m->absent = FALSE;
993 m->busy = TRUE;
994
995 vm_page_lockspin_queues();
996 VM_PAGE_QUEUES_REMOVE(m);
997 vm_page_unlock_queues();
998 }
999 XPR(XPR_VM_FAULT,
1000 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1001 (integer_t)object, offset,
1002 (integer_t)next_object,
1003 offset+object->shadow_offset,0);
1004
1005 offset += object->shadow_offset;
1006 fault_info->lo_offset += object->shadow_offset;
1007 fault_info->hi_offset += object->shadow_offset;
1008 access_required = VM_PROT_READ;
1009
1010 vm_object_lock(next_object);
1011 vm_object_unlock(object);
1012 object = next_object;
1013 vm_object_paging_begin(object);
1014
1015 /*
1016 * reset to default type of fault
1017 */
1018 my_fault = DBG_CACHE_HIT_FAULT;
1019
1020 continue;
1021 }
1022 }
1023 if ((m->cleaning)
1024 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1025 && (fault_type & VM_PROT_WRITE)) {
1026 /*
1027 * This is a copy-on-write fault that will
1028 * cause us to revoke access to this page, but
1029 * this page is in the process of being cleaned
1030 * in a clustered pageout. We must wait until
1031 * the cleaning operation completes before
1032 * revoking access to the original page,
1033 * otherwise we might attempt to remove a
1034 * wired mapping.
1035 */
1036 #if TRACEFAULTPAGE
1037 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1038 #endif
1039 XPR(XPR_VM_FAULT,
1040 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1041 (integer_t)object, offset,
1042 (integer_t)m, 0, 0);
1043 /*
1044 * take an extra ref so that object won't die
1045 */
1046 vm_object_reference_locked(object);
1047
1048 vm_fault_cleanup(object, first_m);
1049
1050 counter(c_vm_fault_page_block_backoff_kernel++);
1051 vm_object_lock(object);
1052 assert(object->ref_count > 0);
1053
1054 m = vm_page_lookup(object, offset);
1055
1056 if (m != VM_PAGE_NULL && m->cleaning) {
1057 PAGE_ASSERT_WAIT(m, interruptible);
1058
1059 vm_object_unlock(object);
1060 wait_result = thread_block(THREAD_CONTINUE_NULL);
1061 vm_object_deallocate(object);
1062
1063 goto backoff;
1064 } else {
1065 vm_object_unlock(object);
1066
1067 vm_object_deallocate(object);
1068 thread_interrupt_level(interruptible_state);
1069
1070 return (VM_FAULT_RETRY);
1071 }
1072 }
1073 if (type_of_fault == NULL && m->speculative) {
1074 /*
1075 * If we were passed a non-NULL pointer for
1076 * "type_of_fault", than we came from
1077 * vm_fault... we'll let it deal with
1078 * this condition, since it
1079 * needs to see m->speculative to correctly
1080 * account the pageins, otherwise...
1081 * take it off the speculative queue, we'll
1082 * let the caller of vm_fault_page deal
1083 * with getting it onto the correct queue
1084 */
1085 vm_page_lockspin_queues();
1086 VM_PAGE_QUEUES_REMOVE(m);
1087 vm_page_unlock_queues();
1088 }
1089
1090 if (m->encrypted) {
1091 /*
1092 * ENCRYPTED SWAP:
1093 * the user needs access to a page that we
1094 * encrypted before paging it out.
1095 * Decrypt the page now.
1096 * Keep it busy to prevent anyone from
1097 * accessing it during the decryption.
1098 */
1099 m->busy = TRUE;
1100 vm_page_decrypt(m, 0);
1101 assert(object == m->object);
1102 assert(m->busy);
1103 PAGE_WAKEUP_DONE(m);
1104
1105 /*
1106 * Retry from the top, in case
1107 * something changed while we were
1108 * decrypting.
1109 */
1110 continue;
1111 }
1112 ASSERT_PAGE_DECRYPTED(m);
1113
1114 if (m->object->code_signed) {
1115 /*
1116 * CODE SIGNING:
1117 * We just paged in a page from a signed
1118 * memory object but we don't need to
1119 				 * validate it now.  We'll validate it
1120 * when it gets mapped into a user address
1121 * space for the first time or when the page
1122 * gets copied to another object as a result
1123 * of a copy-on-write.
1124 */
1125 }
1126
1127 /*
1128 * We mark the page busy and leave it on
1129 * the pageout queues. If the pageout
1130 			 * daemon comes across it, then it will
1131 * remove the page from the queue, but not the object
1132 */
1133 #if TRACEFAULTPAGE
1134 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1135 #endif
1136 XPR(XPR_VM_FAULT,
1137 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1138 (integer_t)object, offset, (integer_t)m, 0, 0);
1139 assert(!m->busy);
1140 assert(!m->absent);
1141
1142 m->busy = TRUE;
1143 break;
1144 }
1145
1146
1147 /*
1148 * we get here when there is no page present in the object at
1149 * the offset we're interested in... we'll allocate a page
1150 * at this point if the pager associated with
1151 * this object can provide the data or we're the top object...
1152 * object is locked; m == NULL
1153 */
1154 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1155
1156 #if TRACEFAULTPAGE
1157 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1158 #endif
1159 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1160 /*
1161 * Allocate a new page for this object/offset pair
1162 */
1163 m = vm_page_grab();
1164 #if TRACEFAULTPAGE
1165 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1166 #endif
1167 if (m == VM_PAGE_NULL) {
1168
1169 vm_fault_cleanup(object, first_m);
1170 thread_interrupt_level(interruptible_state);
1171
1172 return (VM_FAULT_MEMORY_SHORTAGE);
1173 }
1174 vm_page_insert(m, object, offset);
1175 }
1176 if (look_for_page && !must_be_resident) {
1177 kern_return_t rc;
1178
1179 /*
1180 * If the memory manager is not ready, we
1181 * cannot make requests.
1182 */
1183 if (!object->pager_ready) {
1184 #if TRACEFAULTPAGE
1185 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1186 #endif
1187 if (m != VM_PAGE_NULL)
1188 VM_PAGE_FREE(m);
1189
1190 XPR(XPR_VM_FAULT,
1191 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1192 (integer_t)object, offset, 0, 0, 0);
1193
1194 /*
1195 * take an extra ref so object won't die
1196 */
1197 vm_object_reference_locked(object);
1198 vm_fault_cleanup(object, first_m);
1199 counter(c_vm_fault_page_block_backoff_kernel++);
1200
1201 vm_object_lock(object);
1202 assert(object->ref_count > 0);
1203
1204 if (!object->pager_ready) {
1205 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1206
1207 vm_object_unlock(object);
1208 if (wait_result == THREAD_WAITING)
1209 wait_result = thread_block(THREAD_CONTINUE_NULL);
1210 vm_object_deallocate(object);
1211
1212 goto backoff;
1213 } else {
1214 vm_object_unlock(object);
1215 vm_object_deallocate(object);
1216 thread_interrupt_level(interruptible_state);
1217
1218 return (VM_FAULT_RETRY);
1219 }
1220 }
1221 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1222 /*
1223 * If there are too many outstanding page
1224 * requests pending on this external object, we
1225 * wait for them to be resolved now.
1226 */
1227 #if TRACEFAULTPAGE
1228 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1229 #endif
1230 if (m != VM_PAGE_NULL)
1231 VM_PAGE_FREE(m);
1232 /*
1233 * take an extra ref so object won't die
1234 */
1235 vm_object_reference_locked(object);
1236
1237 vm_fault_cleanup(object, first_m);
1238
1239 counter(c_vm_fault_page_block_backoff_kernel++);
1240
1241 vm_object_lock(object);
1242 assert(object->ref_count > 0);
1243
1244 if (object->paging_in_progress > vm_object_pagein_throttle) {
1245 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1246
1247 vm_object_unlock(object);
1248 wait_result = thread_block(THREAD_CONTINUE_NULL);
1249 vm_object_deallocate(object);
1250
1251 goto backoff;
1252 } else {
1253 vm_object_unlock(object);
1254 vm_object_deallocate(object);
1255 thread_interrupt_level(interruptible_state);
1256
1257 return (VM_FAULT_RETRY);
1258 }
1259 }
1260 if (m != VM_PAGE_NULL) {
1261 /*
1262 * Indicate that the page is waiting for data
1263 * from the memory manager.
1264 */
1265 m->list_req_pending = TRUE;
1266 m->absent = TRUE;
1267 }
1268
1269 #if TRACEFAULTPAGE
1270 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1271 #endif
1272
1273 /*
1274 * It's possible someone called vm_object_destroy while we weren't
1275 * holding the object lock. If that has happened, then bail out
1276 * here.
1277 */
1278
1279 pager = object->pager;
1280
1281 if (pager == MEMORY_OBJECT_NULL) {
1282 vm_fault_cleanup(object, first_m);
1283 thread_interrupt_level(interruptible_state);
1284 return VM_FAULT_MEMORY_ERROR;
1285 }
1286
1287 /*
1288 * We have an absent page in place for the faulting offset,
1289 * so we can release the object lock.
1290 */
1291
1292 vm_object_unlock(object);
1293
1294 /*
1295 * If this object uses a copy_call strategy,
1296 * and we are interested in a copy of this object
1297 * (having gotten here only by following a
1298 * shadow chain), then tell the memory manager
1299 * via a flag added to the desired_access
1300 * parameter, so that it can detect a race
1301 * between our walking down the shadow chain
1302 * and its pushing pages up into a copy of
1303 * the object that it manages.
1304 */
1305 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1306 wants_copy_flag = VM_PROT_WANTS_COPY;
1307 else
1308 wants_copy_flag = VM_PROT_NONE;
1309
1310 XPR(XPR_VM_FAULT,
1311 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1312 (integer_t)object, offset, (integer_t)m,
1313 access_required | wants_copy_flag, 0);
1314
1315 /*
1316 * Call the memory manager to retrieve the data.
1317 */
1318 rc = memory_object_data_request(
1319 pager,
1320 offset + object->paging_offset,
1321 PAGE_SIZE,
1322 access_required | wants_copy_flag,
1323 (memory_object_fault_info_t)fault_info);
1324
1325 #if TRACEFAULTPAGE
1326 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1327 #endif
1328 vm_object_lock(object);
1329
1330 if (rc != KERN_SUCCESS) {
1331
1332 vm_fault_cleanup(object, first_m);
1333 thread_interrupt_level(interruptible_state);
1334
1335 return ((rc == MACH_SEND_INTERRUPTED) ?
1336 VM_FAULT_INTERRUPTED :
1337 VM_FAULT_MEMORY_ERROR);
1338 }
1339 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1340
1341 vm_fault_cleanup(object, first_m);
1342 thread_interrupt_level(interruptible_state);
1343
1344 return (VM_FAULT_INTERRUPTED);
1345 }
1346 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1347 /*
1348 * No page here means that the object we
1349 * initially looked up was "physically
1350 * contiguous" (i.e. device memory). However,
1351 * with Virtual VRAM, the object might not
1352 * be backed by that device memory anymore,
1353 * so we're done here only if the object is
1354 * still "phys_contiguous".
1355 * Otherwise, if the object is no longer
1356 * "phys_contiguous", we need to retry the
1357 * page fault against the object's new backing
1358 * store (different memory object).
1359 */
1360 break;
1361 }
1362 /*
1363 * potentially a pagein fault
1364 * if we make it through the state checks
1365 			 * above, then we'll count it as such
1366 */
1367 my_fault = DBG_PAGEIN_FAULT;
1368
1369 /*
1370 * Retry with same object/offset, since new data may
1371 * be in a different page (i.e., m is meaningless at
1372 * this point).
1373 */
1374 continue;
1375 }
1376
1377 /*
1378 * We get here if the object has no pager, or an existence map
1379 * exists and indicates the page isn't present on the pager
1380 * or we're unwiring a page. If a pager exists, but there
1381 * is no existence map, then the m->absent case above handles
1382 * the ZF case when the pager can't provide the page
1383 */
1384 #if TRACEFAULTPAGE
1385 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1386 #endif
1387 if (object == first_object)
1388 first_m = m;
1389 else
1390 assert(m == VM_PAGE_NULL);
1391
1392 XPR(XPR_VM_FAULT,
1393 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1394 (integer_t)object, offset, (integer_t)m,
1395 (integer_t)object->shadow, 0);
1396
1397 next_object = object->shadow;
1398
1399 if (next_object == VM_OBJECT_NULL) {
1400 /*
1401 			 * we've hit the bottom of the shadow chain,
1402 * fill the page in the top object with zeros.
1403 */
1404 assert(!must_be_resident);
1405
1406 if (object != first_object) {
1407 vm_object_paging_end(object);
1408 vm_object_unlock(object);
1409
1410 object = first_object;
1411 offset = first_offset;
1412 vm_object_lock(object);
1413 }
1414 m = first_m;
1415 assert(m->object == object);
1416 first_m = VM_PAGE_NULL;
1417
1418 /*
1419 * check for any conditions that prevent
1420 * us from creating a new zero-fill page
1421 * vm_fault_check will do all of the
1422 * fault cleanup in the case of an error condition
1423 * including resetting the thread_interrupt_level
1424 */
1425 error = vm_fault_check(object, m, first_m, interruptible_state);
1426
1427 if (error != VM_FAULT_SUCCESS)
1428 return (error);
1429
1430 if (m == VM_PAGE_NULL) {
1431 m = vm_page_grab();
1432
1433 if (m == VM_PAGE_NULL) {
1434 vm_fault_cleanup(object, VM_PAGE_NULL);
1435 thread_interrupt_level(interruptible_state);
1436
1437 return (VM_FAULT_MEMORY_SHORTAGE);
1438 }
1439 vm_page_insert(m, object, offset);
1440 }
1441 my_fault = vm_fault_zero_page(m, no_zero_fill);
1442
1443 break;
1444
1445 } else {
1446 /*
1447 * Move on to the next object. Lock the next
1448 * object before unlocking the current one.
1449 */
1450 if ((object != first_object) || must_be_resident)
1451 vm_object_paging_end(object);
1452
1453 offset += object->shadow_offset;
1454 fault_info->lo_offset += object->shadow_offset;
1455 fault_info->hi_offset += object->shadow_offset;
1456 access_required = VM_PROT_READ;
1457
1458 vm_object_lock(next_object);
1459 vm_object_unlock(object);
1460
1461 object = next_object;
1462 vm_object_paging_begin(object);
1463 }
1464 }
1465
1466 /*
1467 * PAGE HAS BEEN FOUND.
1468 *
1469 * This page (m) is:
1470 * busy, so that we can play with it;
1471 * not absent, so that nobody else will fill it;
1472 * possibly eligible for pageout;
1473 *
1474 * The top-level page (first_m) is:
1475 * VM_PAGE_NULL if the page was found in the
1476 * top-level object;
1477 * busy, not absent, and ineligible for pageout.
1478 *
1479 * The current object (object) is locked. A paging
1480 * reference is held for the current and top-level
1481 * objects.
1482 */
1483
1484 #if TRACEFAULTPAGE
1485 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1486 #endif
1487 #if EXTRA_ASSERTIONS
1488 if (m != VM_PAGE_NULL) {
1489 assert(m->busy && !m->absent);
1490 assert((first_m == VM_PAGE_NULL) ||
1491 (first_m->busy && !first_m->absent &&
1492 !first_m->active && !first_m->inactive));
1493 }
1494 #endif /* EXTRA_ASSERTIONS */
1495
1496 /*
1497 * ENCRYPTED SWAP:
1498 * If we found a page, we must have decrypted it before we
1499 * get here...
1500 */
1501 if (m != VM_PAGE_NULL) {
1502 ASSERT_PAGE_DECRYPTED(m);
1503 }
1504
1505 XPR(XPR_VM_FAULT,
1506 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1507 (integer_t)object, offset, (integer_t)m,
1508 (integer_t)first_object, (integer_t)first_m);
1509
1510 /*
1511 * If the page is being written, but isn't
1512 * already owned by the top-level object,
1513 * we have to copy it into a new page owned
1514 * by the top-level object.
1515 */
1516 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1517
1518 #if TRACEFAULTPAGE
1519 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1520 #endif
1521 if (fault_type & VM_PROT_WRITE) {
1522 vm_page_t copy_m;
1523
1524 /*
1525 * We only really need to copy if we
1526 * want to write it.
1527 */
1528 assert(!must_be_resident);
1529
1530 /*
1531 			 * are we protecting the system from
1532 			 * backing store exhaustion? If so,
1533 			 * sleep unless we are privileged.
1534 */
1535 if (vm_backing_store_low) {
1536 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1537
1538 RELEASE_PAGE(m);
1539 vm_fault_cleanup(object, first_m);
1540
1541 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1542
1543 thread_block(THREAD_CONTINUE_NULL);
1544 thread_interrupt_level(interruptible_state);
1545
1546 return (VM_FAULT_RETRY);
1547 }
1548 }
1549 /*
1550 * If we try to collapse first_object at this
1551 * point, we may deadlock when we try to get
1552 * the lock on an intermediate object (since we
1553 * have the bottom object locked). We can't
1554 * unlock the bottom object, because the page
1555 * we found may move (by collapse) if we do.
1556 *
1557 * Instead, we first copy the page. Then, when
1558 * we have no more use for the bottom object,
1559 * we unlock it and try to collapse.
1560 *
1561 * Note that we copy the page even if we didn't
1562 * need to... that's the breaks.
1563 */
1564
1565 /*
1566 * Allocate a page for the copy
1567 */
1568 copy_m = vm_page_grab();
1569
1570 if (copy_m == VM_PAGE_NULL) {
1571 RELEASE_PAGE(m);
1572
1573 vm_fault_cleanup(object, first_m);
1574 thread_interrupt_level(interruptible_state);
1575
1576 return (VM_FAULT_MEMORY_SHORTAGE);
1577 }
1578 XPR(XPR_VM_FAULT,
1579 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1580 (integer_t)object, offset,
1581 (integer_t)m, (integer_t)copy_m, 0);
1582
1583 vm_page_copy(m, copy_m);
1584
1585 /*
1586 * If another map is truly sharing this
1587 * page with us, we have to flush all
1588 * uses of the original page, since we
1589 * can't distinguish those which want the
1590 * original from those which need the
1591 * new copy.
1592 *
1593 * XXXO If we know that only one map has
1594 * access to this page, then we could
1595 * avoid the pmap_disconnect() call.
1596 */
1597 if (m->pmapped)
1598 pmap_disconnect(m->phys_page);
1599
1600 assert(!m->cleaning);
1601
1602 /*
1603 * We no longer need the old page or object.
1604 */
1605 PAGE_WAKEUP_DONE(m);
1606 vm_object_paging_end(object);
1607 vm_object_unlock(object);
1608
1609 my_fault = DBG_COW_FAULT;
1610 VM_STAT_INCR(cow_faults);
1611 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1612 current_task()->cow_faults++;
1613
1614 object = first_object;
1615 offset = first_offset;
1616
1617 vm_object_lock(object);
1618 /*
1619 * get rid of the place holder
1620 * page that we soldered in earlier
1621 */
1622 VM_PAGE_FREE(first_m);
1623 first_m = VM_PAGE_NULL;
1624
1625 /*
1626 * and replace it with the
1627 * page we just copied into
1628 */
1629 assert(copy_m->busy);
1630 vm_page_insert(copy_m, object, offset);
1631 copy_m->dirty = TRUE;
1632
1633 m = copy_m;
1634 /*
1635 * Now that we've gotten the copy out of the
1636 * way, let's try to collapse the top object.
1637 * But we have to play ugly games with
1638 * paging_in_progress to do that...
1639 */
1640 vm_object_paging_end(object);
1641 vm_object_collapse(object, offset, TRUE);
1642 vm_object_paging_begin(object);
1643
1644 } else
1645 *protection &= (~VM_PROT_WRITE);
1646 }
1647 /*
1648 * Now check whether the page needs to be pushed into the
1649 * copy object. The use of asymmetric copy on write for
1650 * shared temporary objects means that we may do two copies to
1651 * satisfy the fault; one above to get the page from a
1652 * shadowed object, and one here to push it into the copy.
1653 */
1654 try_failed_count = 0;
1655
1656 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1657 vm_object_offset_t copy_offset;
1658 vm_page_t copy_m;
1659
1660 #if TRACEFAULTPAGE
1661 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1662 #endif
1663 /*
1664 * If the page is being written, but hasn't been
1665 * copied to the copy-object, we have to copy it there.
1666 */
1667 if ((fault_type & VM_PROT_WRITE) == 0) {
1668 *protection &= ~VM_PROT_WRITE;
1669 break;
1670 }
1671
1672 /*
1673 * If the page was guaranteed to be resident,
1674 * we must have already performed the copy.
1675 */
1676 if (must_be_resident)
1677 break;
1678
1679 /*
1680 * Try to get the lock on the copy_object.
1681 */
1682 if (!vm_object_lock_try(copy_object)) {
1683
1684 vm_object_unlock(object);
1685 try_failed_count++;
1686
1687 mutex_pause(try_failed_count); /* wait a bit */
1688 vm_object_lock(object);
1689
1690 continue;
1691 }
1692 try_failed_count = 0;
1693
1694 /*
1695 * Make another reference to the copy-object,
1696 * to keep it from disappearing during the
1697 * copy.
1698 */
1699 vm_object_reference_locked(copy_object);
1700
1701 /*
1702 * Does the page exist in the copy?
1703 */
1704 copy_offset = first_offset - copy_object->shadow_offset;
1705
1706 if (copy_object->size <= copy_offset)
1707 /*
1708 * Copy object doesn't cover this page -- do nothing.
1709 */
1710 ;
1711 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1712 /*
1713 * Page currently exists in the copy object
1714 */
1715 if (copy_m->busy) {
1716 /*
1717 * If the page is being brought
1718 * in, wait for it and then retry.
1719 */
1720 RELEASE_PAGE(m);
1721
1722 /*
1723 * take an extra ref so object won't die
1724 */
1725 vm_object_reference_locked(copy_object);
1726 vm_object_unlock(copy_object);
1727 vm_fault_cleanup(object, first_m);
1728 counter(c_vm_fault_page_block_backoff_kernel++);
1729
1730 vm_object_lock(copy_object);
1731 assert(copy_object->ref_count > 0);
1732 VM_OBJ_RES_DECR(copy_object);
1733 vm_object_lock_assert_exclusive(copy_object);
1734 copy_object->ref_count--;
1735 assert(copy_object->ref_count > 0);
1736 copy_m = vm_page_lookup(copy_object, copy_offset);
1737 /*
1738 * ENCRYPTED SWAP:
1739 * it's OK if the "copy_m" page is encrypted,
1740 * because we're not moving it nor handling its
1741 * contents.
1742 */
1743 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1744 PAGE_ASSERT_WAIT(copy_m, interruptible);
1745
1746 vm_object_unlock(copy_object);
1747 wait_result = thread_block(THREAD_CONTINUE_NULL);
1748 vm_object_deallocate(copy_object);
1749
1750 goto backoff;
1751 } else {
1752 vm_object_unlock(copy_object);
1753 vm_object_deallocate(copy_object);
1754 thread_interrupt_level(interruptible_state);
1755
1756 return (VM_FAULT_RETRY);
1757 }
1758 }
1759 }
1760 else if (!PAGED_OUT(copy_object, copy_offset)) {
1761 /*
1762 * If PAGED_OUT is TRUE, then the page used to exist
1763 * in the copy-object, and has already been paged out.
1764 * We don't need to repeat this. If PAGED_OUT is
1765 * FALSE, then either we don't know (!pager_created,
1766 * for example) or it hasn't been paged out.
1767 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1768 * We must copy the page to the copy object.
1769 */
1770
1771 if (vm_backing_store_low) {
1772 /*
1773 				 * we are protecting the system from
1774 				 * backing store exhaustion, so
1775 				 * sleep unless we are privileged.
1776 */
1777 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1778 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1779
1780 RELEASE_PAGE(m);
1781 VM_OBJ_RES_DECR(copy_object);
1782 vm_object_lock_assert_exclusive(copy_object);
1783 copy_object->ref_count--;
1784 assert(copy_object->ref_count > 0);
1785
1786 vm_object_unlock(copy_object);
1787 vm_fault_cleanup(object, first_m);
1788 thread_block(THREAD_CONTINUE_NULL);
1789 thread_interrupt_level(interruptible_state);
1790
1791 return (VM_FAULT_RETRY);
1792 }
1793 }
1794 /*
1795 * Allocate a page for the copy
1796 */
1797 copy_m = vm_page_alloc(copy_object, copy_offset);
1798
1799 if (copy_m == VM_PAGE_NULL) {
1800 RELEASE_PAGE(m);
1801
1802 VM_OBJ_RES_DECR(copy_object);
1803 vm_object_lock_assert_exclusive(copy_object);
1804 copy_object->ref_count--;
1805 assert(copy_object->ref_count > 0);
1806
1807 vm_object_unlock(copy_object);
1808 vm_fault_cleanup(object, first_m);
1809 thread_interrupt_level(interruptible_state);
1810
1811 return (VM_FAULT_MEMORY_SHORTAGE);
1812 }
1813 /*
1814 * Must copy page into copy-object.
1815 */
1816 vm_page_copy(m, copy_m);
1817
1818 /*
1819 * If the old page was in use by any users
1820 * of the copy-object, it must be removed
1821 * from all pmaps. (We can't know which
1822 * pmaps use it.)
1823 */
1824 if (m->pmapped)
1825 pmap_disconnect(m->phys_page);
1826
1827 /*
1828 * If there's a pager, then immediately
1829 * page out this page, using the "initialize"
1830 * option. Else, we use the copy.
1831 */
1832 if ((!copy_object->pager_created)
1833 #if MACH_PAGEMAP
1834 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1835 #endif
1836 ) {
1837
1838 vm_page_lockspin_queues();
1839 assert(!m->cleaning);
1840 vm_page_activate(copy_m);
1841 vm_page_unlock_queues();
1842
1843 copy_m->dirty = TRUE;
1844 PAGE_WAKEUP_DONE(copy_m);
1845 }
1846 else {
1847 assert(copy_m->busy == TRUE);
1848 assert(!m->cleaning);
1849
1850 /*
1851 * dirty is protected by the object lock
1852 */
1853 copy_m->dirty = TRUE;
1854
1855 /*
1856 * The page is already ready for pageout:
1857 * not on pageout queues and busy.
1858 * Unlock everything except the
1859 * copy_object itself.
1860 */
1861 vm_object_unlock(object);
1862
1863 /*
1864 * Write the page to the copy-object,
1865 * flushing it from the kernel.
1866 */
1867 vm_pageout_initialize_page(copy_m);
1868
1869 /*
1870 * Since the pageout may have
1871 * temporarily dropped the
1872 * copy_object's lock, we
1873 * check whether we'll have
1874 * to deallocate the hard way.
1875 */
1876 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1877 vm_object_unlock(copy_object);
1878 vm_object_deallocate(copy_object);
1879 vm_object_lock(object);
1880
1881 continue;
1882 }
1883 /*
1884 * Pick back up the old object's
1885 * lock. [It is safe to do so,
1886 * since it must be deeper in the
1887 * object tree.]
1888 */
1889 vm_object_lock(object);
1890 }
1891 /*
1892 * Because we're pushing a page upward
1893 * in the object tree, we must restart
1894 * any faults that are waiting here.
1895 * [Note that this is an expansion of
1896 * PAGE_WAKEUP that uses the THREAD_RESTART
1897 * wait result]. Can't turn off the page's
1898 * busy bit because we're not done with it.
1899 */
1900 if (m->wanted) {
1901 m->wanted = FALSE;
1902 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1903 }
1904 }
1905 /*
1906 * The reference count on copy_object must be
1907 * at least 2: one for our extra reference,
1908 * and at least one from the outside world
1909 * (we checked that when we last locked
1910 * copy_object).
1911 */
1912 vm_object_lock_assert_exclusive(copy_object);
1913 copy_object->ref_count--;
1914 assert(copy_object->ref_count > 0);
1915
1916 VM_OBJ_RES_DECR(copy_object);
1917 vm_object_unlock(copy_object);
1918
1919 break;
1920 }
1921 *result_page = m;
1922 *top_page = first_m;
1923
1924 XPR(XPR_VM_FAULT,
1925 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1926 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1927
1928 if (m != VM_PAGE_NULL) {
1929 if (my_fault == DBG_PAGEIN_FAULT) {
1930
1931 VM_STAT_INCR(pageins);
1932 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1933 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1934 current_task()->pageins++;
1935
1936 if (m->object->internal) {
1937 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1938 } else {
1939 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1940 }
1941
1942 /*
1943 * evaluate access pattern and update state
1944 * vm_fault_deactivate_behind depends on the
1945 * state being up to date
1946 */
1947 vm_fault_is_sequential(object, offset, fault_info->behavior);
1948
1949 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1950 }
1951 if (type_of_fault)
1952 *type_of_fault = my_fault;
1953 } else
1954 vm_object_unlock(object);
1955
1956 thread_interrupt_level(interruptible_state);
1957
1958 #if TRACEFAULTPAGE
1959 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1960 #endif
1961 return (VM_FAULT_SUCCESS);
1962
1963 backoff:
1964 thread_interrupt_level(interruptible_state);
1965
1966 if (wait_result == THREAD_INTERRUPTED)
1967 return (VM_FAULT_INTERRUPTED);
1968 return (VM_FAULT_RETRY);
1969
1970 #undef RELEASE_PAGE
1971 }
1972
1973
1974
1975 /*
1976 * CODE SIGNING:
1977 * When soft faulting a page, we have to validate the page if:
1978 * 1. the page is being mapped in user space
1979 * 2. the page hasn't already been found to be "tainted"
1980 * 3. the page belongs to a code-signed object
1981 * 4. the page has not been validated yet or has been mapped for write.
1982 */
1983 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
1984 ((pmap) != kernel_pmap /*1*/ && \
1985 !(page)->cs_tainted /*2*/ && \
1986 (page)->object->code_signed /*3*/ && \
1987 (!(page)->cs_validated || (page)->wpmapped /*4*/))
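
/*
 * Editorial note (not in the original source): a concrete case that trips
 * condition 4 above is a page from a code-signed object that was validated
 * once and has since been mapped writable somewhere (wpmapped); the next
 * fault that maps it must revalidate it.  The check is used below in
 * vm_fault_enter(), roughly:
 *
 *	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
 *		vm_object_lock_assert_exclusive(m->object);
 *		vm_page_validate_cs(m);
 *	}
 */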
1988
1989
1990 /*
1991 * page queue lock must NOT be held
1992 * m->object must be locked
1993 *
1994 * NOTE: m->object could be locked "shared" only if we are called
1995 * from vm_fault() as part of a soft fault. If so, we must be
1996 * careful not to modify the VM object in any way that is not
1997 * legal under a shared lock...
1998 */
1999 unsigned long cs_enter_tainted_rejected = 0;
2000 unsigned long cs_enter_tainted_accepted = 0;
2001 kern_return_t
2002 vm_fault_enter(vm_page_t m,
2003 pmap_t pmap,
2004 vm_map_offset_t vaddr,
2005 vm_prot_t prot,
2006 boolean_t wired,
2007 boolean_t change_wiring,
2008 boolean_t no_cache,
2009 int *type_of_fault)
2010 {
2011 unsigned int cache_attr;
2012 kern_return_t kr;
2013 boolean_t previously_pmapped = m->pmapped;
2014
2015 vm_object_lock_assert_held(m->object);
2016 #if DEBUG
2017 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
2018 #endif /* DEBUG */
2019
2020 if (m->phys_page == vm_page_guard_addr) {
2021 assert(m->fictitious);
2022 return KERN_SUCCESS;
2023 }
2024
2025 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2026
2027 if (m->pmapped == FALSE) {
2028 /*
2029 * This is the first time this page is being
2030 * mapped in an address space (pmapped == FALSE).
2031 *
2032 * Part of that page may still be in the data cache
2033 * and not flushed to memory. In case we end up
2034 * accessing that page via the instruction cache,
2035 * we need to ensure that the 2 caches are in sync.
2036 */
2037 pmap_sync_page_data_phys(m->phys_page);
2038
2039 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2040 /*
2041 * found it in the cache, but this
2042 * is the first fault-in of the page (m->pmapped == FALSE)
2043 * so it must have come in as part of
2044 * a cluster... account 1 pagein against it
2045 */
2046 VM_STAT_INCR(pageins);
2047 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2048
2049 if (m->object->internal) {
2050 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2051 } else {
2052 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2053 }
2054
2055 current_task()->pageins++;
2056
2057 *type_of_fault = DBG_PAGEIN_FAULT;
2058 }
2059 VM_PAGE_CONSUME_CLUSTERED(m);
2060
2061 } else if (cache_attr != VM_WIMG_DEFAULT)
2062 pmap_sync_page_attributes_phys(m->phys_page);
2063
2064 if (*type_of_fault != DBG_COW_FAULT) {
2065 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2066
2067 if (pmap == kernel_pmap) {
2068 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2069 }
2070 }
2071
2072 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2073 vm_object_lock_assert_exclusive(m->object);
2074
2075 if (m->cs_validated) {
2076 vm_cs_revalidates++;
2077 }
2078
2079 /* VM map is locked, so 1 ref will remain on VM object */
2080 vm_page_validate_cs(m);
2081 }
2082
2083 if (m->cs_tainted /* always invalidate a tainted page */
2084 #if CONFIG_ENFORCE_SIGNED_CODE
2085 /*
2086 * Code Signing enforcement invalidates an executable page that
2087 * has no code directory, and thus could not be validated.
2088 */
2089 || ((prot & VM_PROT_EXECUTE) && !m->cs_validated )
2090 #endif
2091 ) {
2092 /*
2093 * CODE SIGNING:
2094 * This page has been tainted and can not be trusted.
2095 * Let's notify the current process and let it take any
2096 * necessary precautions before we enter the tainted page
2097 * into its address space.
2098 */
2099 kr = KERN_SUCCESS;
2100 #if CONFIG_ENFORCE_SIGNED_CODE
2101 if (!cs_enforcement_disable) {
2102 #endif
2103 if (cs_invalid_page((addr64_t) vaddr)) {
2104 /* reject the tainted page: abort the page fault */
2105 kr = KERN_MEMORY_ERROR;
2106 cs_enter_tainted_rejected++;
2107 } else {
2108 /* proceed with the tainted page */
2109 kr = KERN_SUCCESS;
2110 cs_enter_tainted_accepted++;
2111 }
2112 #if CONFIG_ENFORCE_SIGNED_CODE
2113 }
2114 #endif
2115 if (cs_debug || kr != KERN_SUCCESS) {
2116 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2117 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2118 (long long)vaddr, m, m->object, m->offset);
2119 }
2120 } else {
2121 /* proceed with the valid page */
2122 kr = KERN_SUCCESS;
2123 }
2124
2125 if (kr == KERN_SUCCESS) {
2126 /*
2127 * NOTE: we may only hold the vm_object lock SHARED
2128 * at this point, but the update of pmapped is ok
2129 * since this is the ONLY bit updated behind the SHARED
2130 * lock... however, we need to figure out how to do an atomic
2131 * update on a bit field to make this less fragile... right
2132 * now I don't know how to coerce 'C' to give me the offset info
2133 * that's needed for an AtomicCompareAndSwap
2134 */
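		/*
		 * Editorial sketch only (not part of the build): if these bit
		 * fields were ever gathered into an explicit 32-bit flags word,
		 * the update could be made atomic with OSCompareAndSwap() from
		 * <libkern/OSAtomic.h>, along these lines (the field and flag
		 * names are hypothetical):
		 *
		 *	uint32_t oflags, nflags;
		 *	do {
		 *		oflags = m->vmp_flags;
		 *		nflags = oflags | VMP_PMAPPED;
		 *	} while (!OSCompareAndSwap(oflags, nflags, &m->vmp_flags));
		 */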
2135 m->pmapped = TRUE;
2136 if (prot & VM_PROT_WRITE) {
2137 vm_object_lock_assert_exclusive(m->object);
2138 m->wpmapped = TRUE;
2139 }
2140
2141 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2142 }
2143
2144 /*
2145 * Hold queues lock to manipulate
2146 * the page queues. Change wiring
2147 * case is obvious.
2148 */
2149 if (change_wiring) {
2150 vm_page_lockspin_queues();
2151
2152 if (wired) {
2153 if (kr == KERN_SUCCESS) {
2154 vm_page_wire(m);
2155 }
2156 } else {
2157 vm_page_unwire(m);
2158 }
2159 vm_page_unlock_queues();
2160
2161 } else {
2162 if (kr != KERN_SUCCESS) {
2163 vm_page_lock_queues();
2164 vm_page_deactivate(m);
2165 vm_page_unlock_queues();
2166 } else {
2167 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2168 vm_page_lockspin_queues();
2169 /*
2170 * test again now that we hold the page queue lock
2171 */
2172 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2173
2174 /*
2175 * If this is a no_cache mapping and the page has never been
2176 * mapped before or was previously a no_cache page, then we
2177 * want to leave pages in the speculative state so that they
2178 * can be readily recycled if free memory runs low. Otherwise
2179 * the page is activated as normal.
2180 */
2181
2182 if (no_cache && (!previously_pmapped || m->no_cache)) {
2183 m->no_cache = TRUE;
2184
2185 if (m->active || m->inactive)
2186 VM_PAGE_QUEUES_REMOVE(m);
2187
2188 if (!m->speculative)
2189 vm_page_speculate(m, TRUE);
2190
2191 } else if (!m->active && !m->inactive)
2192 vm_page_activate(m);
2193
2194 }
2195
2196 vm_page_unlock_queues();
2197 }
2198 }
2199 }
2200 return kr;
2201 }
2202
2203
2204 /*
2205 * Routine: vm_fault
2206 * Purpose:
2207 * Handle page faults, including pseudo-faults
2208 * used to change the wiring status of pages.
2209 * Returns:
2210 * Explicit continuations have been removed.
2211 * Implementation:
2212 * vm_fault and vm_fault_page save mucho state
2213 * in the moral equivalent of a closure. The state
2214 * structure is allocated when first entering vm_fault
2215 * and deallocated when leaving vm_fault.
2216 */
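
/*
 * Editorial sketch (not in the original file): for an ordinary user fault
 * the machine-dependent trap handler drives this routine along the lines of
 *
 *	kr = vm_fault(map,
 *		      vm_map_trunc_page(fault_addr),
 *		      fault_type,		-- VM_PROT_READ and/or VM_PROT_WRITE
 *		      FALSE,			-- not a wiring change
 *		      THREAD_ABORTSAFE,
 *		      NULL, 0);			-- no caller-supplied pmap
 *
 * while vm_fault_wire() below passes change_wiring == TRUE together with an
 * explicit pmap and pmap_addr.
 */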
2217
2218 extern int _map_enter_debug;
2219
2220 unsigned long vm_fault_collapse_total = 0;
2221 unsigned long vm_fault_collapse_skipped = 0;
2222
2223 kern_return_t
2224 vm_fault(
2225 vm_map_t map,
2226 vm_map_offset_t vaddr,
2227 vm_prot_t fault_type,
2228 boolean_t change_wiring,
2229 int interruptible,
2230 pmap_t caller_pmap,
2231 vm_map_offset_t caller_pmap_addr)
2232 {
2233 vm_map_version_t version; /* Map version for verification */
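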
2234 boolean_t wired; /* Should mapping be wired down? */
2235 vm_object_t object; /* Top-level object */
2236 vm_object_offset_t offset; /* Top-level offset */
2237 vm_prot_t prot; /* Protection for mapping */
2238 vm_object_t old_copy_object; /* Saved copy object */
2239 vm_page_t result_page; /* Result of vm_fault_page */
2240 vm_page_t top_page; /* Placeholder page */
2241 kern_return_t kr;
2242
2243 vm_page_t m; /* Fast access to result_page */
2244 kern_return_t error_code;
2245 vm_object_t cur_object;
2246 vm_object_offset_t cur_offset;
2247 vm_page_t cur_m;
2248 vm_object_t new_object;
2249 int type_of_fault;
2250 pmap_t pmap;
2251 boolean_t interruptible_state;
2252 vm_map_t real_map = map;
2253 vm_map_t original_map = map;
2254 vm_prot_t original_fault_type;
2255 struct vm_object_fault_info fault_info;
2256 boolean_t need_collapse = FALSE;
2257 int object_lock_type = 0;
2258 int cur_object_lock_type;
2259
2260
2261 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2262 (int)((uint64_t)vaddr >> 32),
2263 (int)vaddr,
2264 0,
2265 0,
2266 0);
2267
2268 if (get_preemption_level() != 0) {
2269 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2270 (int)((uint64_t)vaddr >> 32),
2271 (int)vaddr,
2272 KERN_FAILURE,
2273 0,
2274 0);
2275
2276 return (KERN_FAILURE);
2277 }
2278 interruptible_state = thread_interrupt_level(interruptible);
2279
2280 VM_STAT_INCR(faults);
2281 current_task()->faults++;
2282 original_fault_type = fault_type;
2283
2284 if (fault_type & VM_PROT_WRITE)
2285 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2286 else
2287 object_lock_type = OBJECT_LOCK_SHARED;
2288
2289 cur_object_lock_type = OBJECT_LOCK_SHARED;
2290
2291 RetryFault:
2292 /*
2293 * assume we will hit a page in the cache
2294 * otherwise, explicitly override with
2295 * the real fault type once we determine it
2296 */
2297 type_of_fault = DBG_CACHE_HIT_FAULT;
2298
2299 /*
2300 * Find the backing store object and offset into
2301 * it to begin the search.
2302 */
2303 fault_type = original_fault_type;
2304 map = original_map;
2305 vm_map_lock_read(map);
2306
2307 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2308 object_lock_type, &version,
2309 &object, &offset, &prot, &wired,
2310 &fault_info,
2311 &real_map);
2312
2313 if (kr != KERN_SUCCESS) {
2314 vm_map_unlock_read(map);
2315 goto done;
2316 }
2317 pmap = real_map->pmap;
2318 fault_info.interruptible = interruptible;
2319
2320 /*
2321 * If the page is wired, we must fault for the current protection
2322 * value, to avoid further faults.
2323 */
2324 if (wired) {
2325 fault_type = prot | VM_PROT_WRITE;
2326 /*
2327 * since we're treating this fault as a 'write'
2328 * we must hold the top object lock exclusively
2329 */
2330 if (object_lock_type == OBJECT_LOCK_SHARED) {
2331
2332 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2333
2334 if (vm_object_lock_upgrade(object) == FALSE) {
2335 /*
2336 * couldn't upgrade, so explicitly
2337 * take the lock exclusively
2338 */
2339 vm_object_lock(object);
2340 }
2341 }
2342 }
2343
2344 #if VM_FAULT_CLASSIFY
2345 /*
2346 * Temporary data gathering code
2347 */
2348 vm_fault_classify(object, offset, fault_type);
2349 #endif
2350 /*
2351 * Fast fault code. The basic idea is to do as much as
2352 * possible while holding the map lock and object locks.
2353 * Busy pages are not used until the object lock has to
2354 * be dropped to do something (copy, zero fill, pmap enter).
2355 * Similarly, paging references aren't acquired until that
2356 * point, and object references aren't used.
2357 *
2358 * If we can figure out what to do
2359 * (zero fill, copy on write, pmap enter) while holding
2360 * the locks, then it gets done. Otherwise, we give up,
2361 * and use the original fault path (which doesn't hold
2362 * the map lock, and relies on busy pages).
2363 * The give up cases include:
2364 * - Have to talk to pager.
2365 * - Page is busy, absent or in error.
2366 * - Pager has locked out desired access.
2367 * - Fault needs to be restarted.
2368 * - Have to push page into copy object.
2369 *
2370 * The code is an infinite loop that moves one level down
2371 * the shadow chain each time. cur_object and cur_offset
2372 * refer to the current object being examined. object and offset
2373 * are the original object from the map. The loop is at the
2374 * top level if and only if object and cur_object are the same.
2375 *
2376 * Invariants: Map lock is held throughout. Lock is held on
2377 * original object and cur_object (if different) when
2378 * continuing or exiting loop.
2379 *
2380 */
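	/*
	 * Editorial summary of the loop below (descriptive only, no new
	 * behavior):
	 *
	 *	cur_object = object; cur_offset = offset;
	 *	for (;;) {
	 *	    m = vm_page_lookup(cur_object, cur_offset);
	 *	    m found:
	 *		busy ........................... wait, then retry the fault
	 *		guard/absent/error/encrypted ... slow path (or decrypt + retry)
	 *		read fault, or top w/o copy .... FastPmapEnter
	 *		write, cur_object != object .... copy the page up, FastPmapEnter
	 *		write, copy object at top ...... break to the slow path
	 *	    m not found:
	 *		pager may have it .............. break to the slow path
	 *		no shadow ...................... zero fill, FastPmapEnter
	 *		otherwise ...................... descend to cur_object->shadow
	 *	}
	 */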
2381
2382
2383 /*
2384 * If this page is to be inserted in a copy delay object
2385 * for writing, and if the object has a copy, then the
2386 * copy delay strategy is implemented in the slow fault page.
2387 */
2388 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2389 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2390 goto handle_copy_delay;
2391
2392 cur_object = object;
2393 cur_offset = offset;
2394
2395 while (TRUE) {
2396 m = vm_page_lookup(cur_object, cur_offset);
2397
2398 if (m != VM_PAGE_NULL) {
2399 if (m->busy) {
2400 wait_result_t result;
2401
2402 /*
2403 * in order to do the PAGE_ASSERT_WAIT, we must
2404 * have object that 'm' belongs to locked exclusively
2405 */
2406 if (object != cur_object) {
2407 vm_object_unlock(object);
2408
2409 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2410
2411 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2412
2413 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2414 /*
2415 * couldn't upgrade so go do a full retry
2416 * immediately since we've already dropped
2417 * the top object lock associated with this page
2418 * and the current one got dropped due to the
2419 * failed upgrade... the state is no longer valid
2420 */
2421 vm_map_unlock_read(map);
2422 if (real_map != map)
2423 vm_map_unlock(real_map);
2424
2425 goto RetryFault;
2426 }
2427 }
2428 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2429
2430 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2431
2432 if (vm_object_lock_upgrade(object) == FALSE) {
2433 /*
2434 * couldn't upgrade, so explicitly take the lock
2435 * exclusively and go relookup the page since we
2436 * will have dropped the object lock and
2437 * a different thread could have inserted
2438 * a page at this offset
2439 * no need for a full retry since we're
2440 * at the top level of the object chain
2441 */
2442 vm_object_lock(object);
2443
2444 continue;
2445 }
2446 }
2447 vm_map_unlock_read(map);
2448 if (real_map != map)
2449 vm_map_unlock(real_map);
2450
2451 result = PAGE_ASSERT_WAIT(m, interruptible);
2452
2453 vm_object_unlock(cur_object);
2454
2455 if (result == THREAD_WAITING) {
2456 result = thread_block(THREAD_CONTINUE_NULL);
2457
2458 counter(c_vm_fault_page_block_busy_kernel++);
2459 }
2460 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2461 goto RetryFault;
2462
2463 kr = KERN_ABORTED;
2464 goto done;
2465 }
2466 if (m->phys_page == vm_page_guard_addr) {
2467 /*
2468 * Guard page: let the slow path deal with it
2469 */
2470 break;
2471 }
2472 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2473 /*
2474 * Unusual case... let the slow path deal with it
2475 */
2476 break;
2477 }
2478 if (m->encrypted) {
2479 /*
2480 * ENCRYPTED SWAP:
2481 * We've soft-faulted (because it's not in the page
2482 * table) on an encrypted page.
2483 * Keep the page "busy" so that no one messes with
2484 * it during the decryption.
2485 * Release the extra locks we're holding, keep only
2486 * the page's VM object lock.
2487 *
2488 * in order to set 'busy' on 'm', we must
2489 * have object that 'm' belongs to locked exclusively
2490 */
2491 if (object != cur_object) {
2492 vm_object_unlock(object);
2493
2494 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2495
2496 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2497
2498 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2499 /*
2500 * couldn't upgrade so go do a full retry
2501 * immediately since we've already dropped
2502 * the top object lock associated with this page
2503 * and the current one got dropped due to the
2504 * failed upgrade... the state is no longer valid
2505 */
2506 vm_map_unlock_read(map);
2507 if (real_map != map)
2508 vm_map_unlock(real_map);
2509
2510 goto RetryFault;
2511 }
2512 }
2513 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2514
2515 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2516
2517 if (vm_object_lock_upgrade(object) == FALSE) {
2518 /*
2519 * couldn't upgrade, so explicitly take the lock
2520 * exclusively and go relookup the page since we
2521 * will have dropped the object lock and
2522 * a different thread could have inserted
2523 * a page at this offset
2524 * no need for a full retry since we're
2525 * at the top level of the object chain
2526 */
2527 vm_object_lock(object);
2528
2529 continue;
2530 }
2531 }
2532 m->busy = TRUE;
2533
2534 vm_map_unlock_read(map);
2535 if (real_map != map)
2536 vm_map_unlock(real_map);
2537
2538 vm_page_decrypt(m, 0);
2539
2540 assert(m->busy);
2541 PAGE_WAKEUP_DONE(m);
2542
2543 vm_object_unlock(cur_object);
2544 /*
2545 * Retry from the top, in case anything
2546 * changed while we were decrypting...
2547 */
2548 goto RetryFault;
2549 }
2550 ASSERT_PAGE_DECRYPTED(m);
2551
2552 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
2553 /*
2554 * We might need to validate this page
2555 * against its code signature, so we
2556 * want to hold the VM object exclusively.
2557 */
2558 if (object != cur_object) {
2559 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2560 vm_object_unlock(object);
2561 vm_object_unlock(cur_object);
2562
2563 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2564
2565 vm_map_unlock_read(map);
2566 if (real_map != map)
2567 vm_map_unlock(real_map);
2568
2569 goto RetryFault;
2570 }
2571
2572 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2573
2574 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2575
2576 if (vm_object_lock_upgrade(object) == FALSE) {
2577 /*
2578 * couldn't upgrade, so explicitly take the lock
2579 * exclusively and go relookup the page since we
2580 * will have dropped the object lock and
2581 * a different thread could have inserted
2582 * a page at this offset
2583 * no need for a full retry since we're
2584 * at the top level of the object chain
2585 */
2586 vm_object_lock(object);
2587
2588 continue;
2589 }
2590 }
2591 }
2592 /*
2593 * Two cases of map in faults:
2594 * - At top level w/o copy object.
2595 * - Read fault anywhere.
2596 * --> must disallow write.
2597 */
2598
2599 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2600 if ((fault_type & VM_PROT_WRITE) == 0) {
2601 /*
2602 * This is not a "write" fault, so we
2603 * might not have taken the object lock
2604 * exclusively and we might not be able
2605 * to update the "wpmapped" bit in
2606 * vm_fault_enter().
2607 * Let's just grant read access to
2608 * the page for now and we'll
2609 * soft-fault again if we need write
2610 * access later...
2611 */
2612 prot &= ~VM_PROT_WRITE;
2613 }
2614 goto FastPmapEnter;
2615 }
2616
2617 if ((fault_type & VM_PROT_WRITE) == 0) {
2618
2619 prot &= ~VM_PROT_WRITE;
2620
2621 /*
2622 * Set up to map the page...
2623 * mark the page busy, drop
2624 * unneeded object lock
2625 */
2626 if (object != cur_object) {
2627 /*
2628 * don't need the original object anymore
2629 */
2630 vm_object_unlock(object);
2631
2632 /*
2633 * switch to the object that has the new page
2634 */
2635 object = cur_object;
2636 object_lock_type = cur_object_lock_type;
2637 }
2638 FastPmapEnter:
2639 /*
2640 * prepare for the pmap_enter...
2641 * object and map are both locked
2642 * m contains valid data
2643 * object == m->object
2644 * cur_object == NULL or it's been unlocked
2645 * no paging references on either object or cur_object
2646 */
2647 #if MACH_KDB
2648 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2649 prot &= ~VM_PROT_WRITE;
2650 #endif
2651 if (caller_pmap) {
2652 kr = vm_fault_enter(m,
2653 caller_pmap,
2654 caller_pmap_addr,
2655 prot,
2656 wired,
2657 change_wiring,
2658 fault_info.no_cache,
2659 &type_of_fault);
2660 } else {
2661 kr = vm_fault_enter(m,
2662 pmap,
2663 vaddr,
2664 prot,
2665 wired,
2666 change_wiring,
2667 fault_info.no_cache,
2668 &type_of_fault);
2669 }
2670
2671 if (need_collapse == TRUE)
2672 vm_object_collapse(object, offset, TRUE);
2673
2674 if (type_of_fault == DBG_PAGEIN_FAULT) {
2675 /*
2676 * evaluate access pattern and update state
2677 * vm_fault_deactivate_behind depends on the
2678 * state being up to date
2679 */
2680 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2681
2682 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2683 }
2684 /*
2685 * That's it, clean up and return.
2686 */
2687 if (m->busy)
2688 PAGE_WAKEUP_DONE(m);
2689
2690 vm_object_unlock(object);
2691
2692 vm_map_unlock_read(map);
2693 if (real_map != map)
2694 vm_map_unlock(real_map);
2695
2696 goto done;
2697 }
2698 /*
2699 * COPY ON WRITE FAULT
2700 *
2701 * If objects match, then
2702 * object->copy must not be NULL (else control
2703 * would be in previous code block), and we
2704 * have a potential push into the copy object
2705 * which we can't cope with here.
2706 */
2707 if (cur_object == object) {
2708 /*
2709 * must take the slow path to
2710 * deal with the copy push
2711 */
2712 break;
2713 }
2714 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2715
2716 /*
2717 * This is now a shadow based copy on write
2718 * fault -- it requires a copy up the shadow
2719 * chain.
2720 *
2721 * Allocate a page in the original top level
2722 * object. Give up if allocate fails. Also
2723 * need to remember current page, as it's the
2724 * source of the copy.
2725 *
2726 * at this point we hold locks on both
2727 * object and cur_object... no need to take
2728 * paging refs or mark pages BUSY since
2729 * we don't drop either object lock until
2730 * the page has been copied and inserted
2731 */
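			/*
			 * Editorial example (descriptive only): a typical way to
			 * get here is the first store to a page of a file mapped
			 * private -- the data still lives in the vnode-backed
			 * object at the bottom of the chain (cur_object) and a
			 * private copy must be pushed into the task's top-level
			 * shadow object (object) before write access is granted.
			 */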
2732 cur_m = m;
2733 m = vm_page_grab();
2734
2735 if (m == VM_PAGE_NULL) {
2736 /*
2737 * no free page currently available...
2738 * must take the slow path
2739 */
2740 break;
2741 }
2742 /*
2743 * Now do the copy. Mark the source page busy...
2744 *
2745 * NOTE: This code holds the map lock across
2746 * the page copy.
2747 */
2748 vm_page_copy(cur_m, m);
2749 vm_page_insert(m, object, offset);
2750 m->dirty = TRUE;
2751
2752 /*
2753 * Now cope with the source page and object
2754 */
2755 if (object->ref_count > 1 && cur_m->pmapped)
2756 pmap_disconnect(cur_m->phys_page);
2757
2758 need_collapse = TRUE;
2759
2760 if (!cur_object->internal &&
2761 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2762 /*
2763 * The object from which we've just
2764 * copied a page is most probably backed
2765 * by a vnode. We don't want to waste too
2766 * much time trying to collapse the VM objects
2767 * and create a bottleneck when several tasks
2768 * map the same file.
2769 */
2770 if (cur_object->copy == object) {
2771 /*
2772 * Shared mapping or no COW yet.
2773 * We can never collapse a copy
2774 * object into its backing object.
2775 */
2776 need_collapse = FALSE;
2777 } else if (cur_object->copy == object->shadow &&
2778 object->shadow->resident_page_count == 0) {
2779 /*
2780 * Shared mapping after a COW occurred.
2781 */
2782 need_collapse = FALSE;
2783 }
2784 }
2785 vm_object_unlock(cur_object);
2786
2787 if (need_collapse == FALSE)
2788 vm_fault_collapse_skipped++;
2789 vm_fault_collapse_total++;
2790
2791 type_of_fault = DBG_COW_FAULT;
2792 VM_STAT_INCR(cow_faults);
2793 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2794 current_task()->cow_faults++;
2795
2796 goto FastPmapEnter;
2797
2798 } else {
2799 /*
2800 * No page at cur_object, cur_offset... m == NULL
2801 */
2802 if (cur_object->pager_created) {
2803 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2804 /*
2805 * May have to talk to a pager...
2806 * take the slow path.
2807 */
2808 break;
2809 }
2810 /*
2811 * existence map present and indicates
2812 * that the pager doesn't have this page
2813 */
2814 }
2815 if (cur_object->shadow == VM_OBJECT_NULL) {
2816 /*
2817 * Zero fill fault. Page gets
2818 * inserted into the original object.
2819 */
2820 if (cur_object->shadow_severed) {
2821
2822 if (object != cur_object)
2823 vm_object_unlock(cur_object);
2824 vm_object_unlock(object);
2825
2826 vm_map_unlock_read(map);
2827 if (real_map != map)
2828 vm_map_unlock(real_map);
2829
2830 kr = KERN_MEMORY_ERROR;
2831 goto done;
2832 }
2833 if (VM_PAGE_ZFILL_THROTTLED()) {
2834 /*
2835 * drop all of our locks...
2836 * wait until the free queue is
2837 * pumped back up and then
2838 * redrive the fault
2839 */
2840 if (object != cur_object)
2841 vm_object_unlock(cur_object);
2842 vm_object_unlock(object);
2843 vm_map_unlock_read(map);
2844 if (real_map != map)
2845 vm_map_unlock(real_map);
2846
2847 if (vm_page_wait((change_wiring) ?
2848 THREAD_UNINT :
2849 THREAD_ABORTSAFE))
2850 goto RetryFault;
2851
2852 kr = KERN_ABORTED;
2853 goto done;
2854 }
2855 if (vm_backing_store_low) {
2856 /*
2857 * we are protecting the system from
2858 * backing store exhaustion...
2859 * must take the slow path if we're
2860 * not privileged
2861 */
2862 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2863 break;
2864 }
2865 if (cur_object != object) {
2866 vm_object_unlock(cur_object);
2867
2868 cur_object = object;
2869 }
2870 if (object_lock_type == OBJECT_LOCK_SHARED) {
2871
2872 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2873
2874 if (vm_object_lock_upgrade(object) == FALSE) {
2875 /*
2876 * couldn't upgrade so do a full retry on the fault
2877 * since we dropped the object lock which
2878 * could allow another thread to insert
2879 * a page at this offset
2880 */
2881 vm_map_unlock_read(map);
2882 if (real_map != map)
2883 vm_map_unlock(real_map);
2884
2885 goto RetryFault;
2886 }
2887 }
2888 m = vm_page_alloc(object, offset);
2889
2890 if (m == VM_PAGE_NULL) {
2891 /*
2892 * no free page currently available...
2893 * must take the slow path
2894 */
2895 break;
2896 }
2897
2898 /*
2899 * Now zero fill page...
2900 * the page is probably going to
2901 * be written soon, so don't bother
2902 * to clear the modified bit
2903 *
2904 * NOTE: This code holds the map
2905 * lock across the zero fill.
2906 */
2907 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2908
2909 goto FastPmapEnter;
2910 }
2911 /*
2912 * On to the next level in the shadow chain
2913 */
2914 cur_offset += cur_object->shadow_offset;
2915 new_object = cur_object->shadow;
2916
2917 /*
2918 * take the new_object's lock with the indicated state
2919 */
2920 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2921 vm_object_lock_shared(new_object);
2922 else
2923 vm_object_lock(new_object);
2924
2925 if (cur_object != object)
2926 vm_object_unlock(cur_object);
2927
2928 cur_object = new_object;
2929
2930 continue;
2931 }
2932 }
2933 /*
2934 * Cleanup from fast fault failure. Drop any object
2935 * lock other than original and drop map lock.
2936 */
2937 if (object != cur_object)
2938 vm_object_unlock(cur_object);
2939
2940 /*
2941 * must own the object lock exclusively at this point
2942 */
2943 if (object_lock_type == OBJECT_LOCK_SHARED) {
2944 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2945
2946 if (vm_object_lock_upgrade(object) == FALSE) {
2947 /*
2948 * couldn't upgrade, so explicitly
2949 * take the lock exclusively
2950 * no need to retry the fault at this
2951 * point since "vm_fault_page" will
2952 * completely re-evaluate the state
2953 */
2954 vm_object_lock(object);
2955 }
2956 }
2957
2958 handle_copy_delay:
2959 vm_map_unlock_read(map);
2960 if (real_map != map)
2961 vm_map_unlock(real_map);
2962
2963 /*
2964 * Make a reference to this object to
2965 * prevent its disposal while we are messing with
2966 * it. Once we have the reference, the map is free
2967 * to be diddled. Since objects reference their
2968 * shadows (and copies), they will stay around as well.
2969 */
2970 vm_object_reference_locked(object);
2971 vm_object_paging_begin(object);
2972
2973 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2974
2975 error_code = 0;
2976
2977 kr = vm_fault_page(object, offset, fault_type,
2978 (change_wiring && !wired),
2979 &prot, &result_page, &top_page,
2980 &type_of_fault,
2981 &error_code, map->no_zero_fill,
2982 FALSE, &fault_info);
2983
2984 /*
2985 * if kr != VM_FAULT_SUCCESS, then the paging reference
2986 * has been dropped and the object unlocked... the ref_count
2987 * is still held
2988 *
2989 * if kr == VM_FAULT_SUCCESS, then the paging reference
2990 * is still held along with the ref_count on the original object
2991 *
2992 * if m != NULL, then the object it belongs to
2993 * is returned locked with a paging reference
2994 *
2995 * if top_page != NULL, then it's BUSY and the
2996 * object it belongs to has a paging reference
2997 * but is returned unlocked
2998 */
2999 if (kr != VM_FAULT_SUCCESS) {
3000 /*
3001 * we didn't succeed, lose the object reference immediately.
3002 */
3003 vm_object_deallocate(object);
3004
3005 /*
3006 * See why we failed, and take corrective action.
3007 */
3008 switch (kr) {
3009 case VM_FAULT_MEMORY_SHORTAGE:
3010 if (vm_page_wait((change_wiring) ?
3011 THREAD_UNINT :
3012 THREAD_ABORTSAFE))
3013 goto RetryFault;
3014 /*
3015 * fall thru
3016 */
3017 case VM_FAULT_INTERRUPTED:
3018 kr = KERN_ABORTED;
3019 goto done;
3020 case VM_FAULT_RETRY:
3021 goto RetryFault;
3022 case VM_FAULT_MEMORY_ERROR:
3023 if (error_code)
3024 kr = error_code;
3025 else
3026 kr = KERN_MEMORY_ERROR;
3027 goto done;
3028 }
3029 }
3030 m = result_page;
3031
3032 if (m != VM_PAGE_NULL) {
3033 assert((change_wiring && !wired) ?
3034 (top_page == VM_PAGE_NULL) :
3035 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3036 }
3037
3038 /*
3039 * What to do with the resulting page from vm_fault_page
3040 * if it doesn't get entered into the physical map:
3041 */
3042 #define RELEASE_PAGE(m) \
3043 MACRO_BEGIN \
3044 PAGE_WAKEUP_DONE(m); \
3045 vm_page_lockspin_queues(); \
3046 if (!m->active && !m->inactive && !m->throttled)\
3047 vm_page_activate(m); \
3048 vm_page_unlock_queues(); \
3049 MACRO_END
3050
3051 /*
3052 * We must verify that the maps have not changed
3053 * since our last lookup.
3054 */
3055 if (m != VM_PAGE_NULL) {
3056 old_copy_object = m->object->copy;
3057 vm_object_unlock(m->object);
3058 } else
3059 old_copy_object = VM_OBJECT_NULL;
3060
3061 /*
3062 * no object locks are held at this point
3063 */
3064 if ((map != original_map) || !vm_map_verify(map, &version)) {
3065 vm_object_t retry_object;
3066 vm_object_offset_t retry_offset;
3067 vm_prot_t retry_prot;
3068
3069 /*
3070 * To avoid trying to write_lock the map while another
3071 * thread has it read_locked (in vm_map_pageable), we
3072 * do not try for write permission. If the page is
3073 * still writable, we will get write permission. If it
3074 * is not, or has been marked needs_copy, we enter the
3075 * mapping without write permission, and will merely
3076 * take another fault.
3077 */
3078 map = original_map;
3079 vm_map_lock_read(map);
3080
3081 kr = vm_map_lookup_locked(&map, vaddr,
3082 fault_type & ~VM_PROT_WRITE,
3083 OBJECT_LOCK_EXCLUSIVE, &version,
3084 &retry_object, &retry_offset, &retry_prot,
3085 &wired,
3086 &fault_info,
3087 &real_map);
3088 pmap = real_map->pmap;
3089
3090 if (kr != KERN_SUCCESS) {
3091 vm_map_unlock_read(map);
3092
3093 if (m != VM_PAGE_NULL) {
3094 /*
3095 * retake the lock so that
3096 * we can drop the paging reference
3097 * in vm_fault_cleanup and do the
3098 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3099 */
3100 vm_object_lock(m->object);
3101
3102 RELEASE_PAGE(m);
3103
3104 vm_fault_cleanup(m->object, top_page);
3105 } else {
3106 /*
3107 * retake the lock so that
3108 * we can drop the paging reference
3109 * in vm_fault_cleanup
3110 */
3111 vm_object_lock(object);
3112
3113 vm_fault_cleanup(object, top_page);
3114 }
3115 vm_object_deallocate(object);
3116
3117 goto done;
3118 }
3119 vm_object_unlock(retry_object);
3120
3121 if ((retry_object != object) || (retry_offset != offset)) {
3122
3123 vm_map_unlock_read(map);
3124 if (real_map != map)
3125 vm_map_unlock(real_map);
3126
3127 if (m != VM_PAGE_NULL) {
3128 /*
3129 * retake the lock so that
3130 * we can drop the paging reference
3131 * in vm_fault_cleanup and do the
3132 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3133 */
3134 vm_object_lock(m->object);
3135
3136 RELEASE_PAGE(m);
3137
3138 vm_fault_cleanup(m->object, top_page);
3139 } else {
3140 /*
3141 * retake the lock so that
3142 * we can drop the paging reference
3143 * in vm_fault_cleanup
3144 */
3145 vm_object_lock(object);
3146
3147 vm_fault_cleanup(object, top_page);
3148 }
3149 vm_object_deallocate(object);
3150
3151 goto RetryFault;
3152 }
3153 /*
3154 * Check whether the protection has changed or the object
3155 * has been copied while we left the map unlocked.
3156 */
3157 prot &= retry_prot;
3158 }
3159 if (m != VM_PAGE_NULL) {
3160 vm_object_lock(m->object);
3161
3162 if (m->object->copy != old_copy_object) {
3163 /*
3164 * The copy object changed while the top-level object
3165 * was unlocked, so take away write permission.
3166 */
3167 prot &= ~VM_PROT_WRITE;
3168 }
3169 } else
3170 vm_object_lock(object);
3171
3172 /*
3173 * If we want to wire down this page, but no longer have
3174 * adequate permissions, we must start all over.
3175 */
3176 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3177
3178 vm_map_verify_done(map, &version);
3179 if (real_map != map)
3180 vm_map_unlock(real_map);
3181
3182 if (m != VM_PAGE_NULL) {
3183 RELEASE_PAGE(m);
3184
3185 vm_fault_cleanup(m->object, top_page);
3186 } else
3187 vm_fault_cleanup(object, top_page);
3188
3189 vm_object_deallocate(object);
3190
3191 goto RetryFault;
3192 }
3193 if (m != VM_PAGE_NULL) {
3194 /*
3195 * Put this page into the physical map.
3196 * We had to do the unlock above because pmap_enter
3197 * may cause other faults. The page may be on
3198 * the pageout queues. If the pageout daemon comes
3199 * across the page, it will remove it from the queues.
3200 */
3201 if (caller_pmap) {
3202 kr = vm_fault_enter(m,
3203 caller_pmap,
3204 caller_pmap_addr,
3205 prot,
3206 wired,
3207 change_wiring,
3208 fault_info.no_cache,
3209 &type_of_fault);
3210 } else {
3211 kr = vm_fault_enter(m,
3212 pmap,
3213 vaddr,
3214 prot,
3215 wired,
3216 change_wiring,
3217 fault_info.no_cache,
3218 &type_of_fault);
3219 }
3220 if (kr != KERN_SUCCESS) {
3221 /* abort this page fault */
3222 vm_map_verify_done(map, &version);
3223 if (real_map != map)
3224 vm_map_unlock(real_map);
3225 PAGE_WAKEUP_DONE(m);
3226 vm_fault_cleanup(m->object, top_page);
3227 vm_object_deallocate(object);
3228 goto done;
3229 }
3230 } else {
3231
3232 vm_map_entry_t entry;
3233 vm_map_offset_t laddr;
3234 vm_map_offset_t ldelta, hdelta;
3235
3236 /*
3237 * do a pmap block mapping from the physical address
3238 * in the object
3239 */
3240
3241 #ifdef ppc
3242 /* While we do not worry about execution protection in */
3243 /* general, certian pages may have instruction execution */
3244 /* disallowed. We will check here, and if not allowed */
3245 /* to execute, we return with a protection failure. */
3246
3247 if ((fault_type & VM_PROT_EXECUTE) &&
3248 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3249
3250 vm_map_verify_done(map, &version);
3251
3252 if (real_map != map)
3253 vm_map_unlock(real_map);
3254
3255 vm_fault_cleanup(object, top_page);
3256 vm_object_deallocate(object);
3257
3258 kr = KERN_PROTECTION_FAILURE;
3259 goto done;
3260 }
3261 #endif /* ppc */
3262
3263 if (real_map != map)
3264 vm_map_unlock(real_map);
3265
3266 if (original_map != map) {
3267 vm_map_unlock_read(map);
3268 vm_map_lock_read(original_map);
3269 map = original_map;
3270 }
3271 real_map = map;
3272
3273 laddr = vaddr;
3274 hdelta = 0xFFFFF000;
3275 ldelta = 0xFFFFF000;
3276
3277 while (vm_map_lookup_entry(map, laddr, &entry)) {
3278 if (ldelta > (laddr - entry->vme_start))
3279 ldelta = laddr - entry->vme_start;
3280 if (hdelta > (entry->vme_end - laddr))
3281 hdelta = entry->vme_end - laddr;
3282 if (entry->is_sub_map) {
3283
3284 laddr = (laddr - entry->vme_start)
3285 + entry->offset;
3286 vm_map_lock_read(entry->object.sub_map);
3287
3288 if (map != real_map)
3289 vm_map_unlock_read(map);
3290 if (entry->use_pmap) {
3291 vm_map_unlock_read(real_map);
3292 real_map = entry->object.sub_map;
3293 }
3294 map = entry->object.sub_map;
3295
3296 } else {
3297 break;
3298 }
3299 }
3300
3301 if (vm_map_lookup_entry(map, laddr, &entry) &&
3302 (entry->object.vm_object != NULL) &&
3303 (entry->object.vm_object == object)) {
3304
3305 if (caller_pmap) {
3306 /*
3307 * Set up a block mapped area
3308 */
3309 pmap_map_block(caller_pmap,
3310 (addr64_t)(caller_pmap_addr - ldelta),
3311 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3312 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3313 ((ldelta + hdelta) >> 12), prot,
3314 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3315 } else {
3316 /*
3317 * Set up a block mapped area
3318 */
3319 pmap_map_block(real_map->pmap,
3320 (addr64_t)(vaddr - ldelta),
3321 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3322 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3323 ((ldelta + hdelta) >> 12), prot,
3324 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3325 }
3326 }
3327 }
3328
3329 /*
3330 * Unlock everything, and return
3331 */
3332 vm_map_verify_done(map, &version);
3333 if (real_map != map)
3334 vm_map_unlock(real_map);
3335
3336 if (m != VM_PAGE_NULL) {
3337 PAGE_WAKEUP_DONE(m);
3338
3339 vm_fault_cleanup(m->object, top_page);
3340 } else
3341 vm_fault_cleanup(object, top_page);
3342
3343 vm_object_deallocate(object);
3344
3345 #undef RELEASE_PAGE
3346
3347 kr = KERN_SUCCESS;
3348 done:
3349 thread_interrupt_level(interruptible_state);
3350
3351 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3352 (int)((uint64_t)vaddr >> 32),
3353 (int)vaddr,
3354 kr,
3355 type_of_fault,
3356 0);
3357
3358 return (kr);
3359 }
3360
3361 /*
3362 * vm_fault_wire:
3363 *
3364 * Wire down a range of virtual addresses in a map.
3365 */
3366 kern_return_t
3367 vm_fault_wire(
3368 vm_map_t map,
3369 vm_map_entry_t entry,
3370 pmap_t pmap,
3371 vm_map_offset_t pmap_addr)
3372 {
3373
3374 register vm_map_offset_t va;
3375 register vm_map_offset_t end_addr = entry->vme_end;
3376 register kern_return_t rc;
3377
3378 assert(entry->in_transition);
3379
3380 if ((entry->object.vm_object != NULL) &&
3381 !entry->is_sub_map &&
3382 entry->object.vm_object->phys_contiguous) {
3383 return KERN_SUCCESS;
3384 }
3385
3386 /*
3387 * Inform the physical mapping system that the
3388 * range of addresses may not fault, so that
3389 * page tables and such can be locked down as well.
3390 */
3391
3392 pmap_pageable(pmap, pmap_addr,
3393 pmap_addr + (end_addr - entry->vme_start), FALSE);
3394
3395 /*
3396 * We simulate a fault to get the page and enter it
3397 * in the physical map.
3398 */
3399
3400 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3401 if ((rc = vm_fault_wire_fast(
3402 map, va, entry, pmap,
3403 pmap_addr + (va - entry->vme_start)
3404 )) != KERN_SUCCESS) {
3405 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3406 (pmap == kernel_pmap) ?
3407 THREAD_UNINT : THREAD_ABORTSAFE,
3408 pmap, pmap_addr + (va - entry->vme_start));
3409 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3410 }
3411
3412 if (rc != KERN_SUCCESS) {
3413 struct vm_map_entry tmp_entry = *entry;
3414
3415 /* unwire wired pages */
3416 tmp_entry.vme_end = va;
3417 vm_fault_unwire(map,
3418 &tmp_entry, FALSE, pmap, pmap_addr);
3419
3420 return rc;
3421 }
3422 }
3423 return KERN_SUCCESS;
3424 }
3425
3426 /*
3427 * vm_fault_unwire:
3428 *
3429 * Unwire a range of virtual addresses in a map.
3430 */
3431 void
3432 vm_fault_unwire(
3433 vm_map_t map,
3434 vm_map_entry_t entry,
3435 boolean_t deallocate,
3436 pmap_t pmap,
3437 vm_map_offset_t pmap_addr)
3438 {
3439 register vm_map_offset_t va;
3440 register vm_map_offset_t end_addr = entry->vme_end;
3441 vm_object_t object;
3442 struct vm_object_fault_info fault_info;
3443
3444 object = (entry->is_sub_map)
3445 ? VM_OBJECT_NULL : entry->object.vm_object;
3446
3447 /*
3448 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3449 * do anything since such memory is wired by default. So we don't have
3450 * anything to undo here.
3451 */
3452
3453 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3454 return;
3455
3456 fault_info.interruptible = THREAD_UNINT;
3457 fault_info.behavior = entry->behavior;
3458 fault_info.user_tag = entry->alias;
3459 fault_info.lo_offset = entry->offset;
3460 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3461 fault_info.no_cache = entry->no_cache;
3462
3463 /*
3464 * Since the pages are wired down, we must be able to
3465 * get their mappings from the physical map system.
3466 */
3467
3468 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3469
3470 if (object == VM_OBJECT_NULL) {
3471 if (pmap) {
3472 pmap_change_wiring(pmap,
3473 pmap_addr + (va - entry->vme_start), FALSE);
3474 }
3475 (void) vm_fault(map, va, VM_PROT_NONE,
3476 TRUE, THREAD_UNINT, pmap, pmap_addr);
3477 } else {
3478 vm_prot_t prot;
3479 vm_page_t result_page;
3480 vm_page_t top_page;
3481 vm_object_t result_object;
3482 vm_fault_return_t result;
3483
3484 fault_info.cluster_size = end_addr - va;
3485
3486 do {
3487 prot = VM_PROT_NONE;
3488
3489 vm_object_lock(object);
3490 vm_object_paging_begin(object);
3491 XPR(XPR_VM_FAULT,
3492 "vm_fault_unwire -> vm_fault_page\n",
3493 0,0,0,0,0);
3494 result = vm_fault_page(
3495 object,
3496 entry->offset + (va - entry->vme_start),
3497 VM_PROT_NONE, TRUE,
3498 &prot, &result_page, &top_page,
3499 (int *)0,
3500 NULL, map->no_zero_fill,
3501 FALSE, &fault_info);
3502 } while (result == VM_FAULT_RETRY);
3503
3504 /*
3505 * If this was a mapping to a file on a device that has been forcibly
3506 * unmounted, then we won't get a page back from vm_fault_page(). Just
3507 * move on to the next one in case the remaining pages are mapped from
3508 * different objects. During a forced unmount, the object is terminated
3509 * so the alive flag will be false if this happens. A forced unmount
3510 * will occur when an external disk is unplugged before the user does an
3511 * eject, so we don't want to panic in that situation.
3512 */
3513
3514 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3515 continue;
3516
3517 if (result != VM_FAULT_SUCCESS)
3518 panic("vm_fault_unwire: failure");
3519
3520 result_object = result_page->object;
3521
3522 if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) {
3523 pmap_change_wiring(pmap,
3524 pmap_addr + (va - entry->vme_start), FALSE);
3525 }
3526 if (deallocate) {
3527 assert(result_page->phys_page !=
3528 vm_page_fictitious_addr);
3529 pmap_disconnect(result_page->phys_page);
3530 VM_PAGE_FREE(result_page);
3531 } else {
3532 vm_page_lockspin_queues();
3533 vm_page_unwire(result_page);
3534 vm_page_unlock_queues();
3535 PAGE_WAKEUP_DONE(result_page);
3536 }
3537 vm_fault_cleanup(result_object, top_page);
3538 }
3539 }
3540
3541 /*
3542 * Inform the physical mapping system that the range
3543 * of addresses may fault, so that page tables and
3544 * such may be unwired themselves.
3545 */
3546
3547 pmap_pageable(pmap, pmap_addr,
3548 pmap_addr + (end_addr - entry->vme_start), TRUE);
3549
3550 }
3551
3552 /*
3553 * vm_fault_wire_fast:
3554 *
3555 * Handle common case of a wire down page fault at the given address.
3556 * If successful, the page is inserted into the associated physical map.
3557 * The map entry is passed in to avoid the overhead of a map lookup.
3558 *
3559 * NOTE: the given address should be truncated to the
3560 * proper page address.
3561 *
3562 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3563 * a standard error specifying why the fault is fatal is returned.
3564 *
3565 * The map in question must be referenced, and remains so.
3566 * Caller has a read lock on the map.
3567 *
3568 * This is a stripped version of vm_fault() for wiring pages. Anything
3569 * other than the common case will return KERN_FAILURE, and the caller
3570 * is expected to call vm_fault().
3571 */
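
/*
 * Editorial note (not in the original source): the fallback the comment
 * above describes can be seen in vm_fault_wire(), roughly:
 *
 *	if (vm_fault_wire_fast(map, va, entry, pmap, pmap_addr) != KERN_SUCCESS)
 *		rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
 *			      THREAD_UNINT or THREAD_ABORTSAFE,
 *			      pmap, pmap_addr);
 */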
3572 kern_return_t
3573 vm_fault_wire_fast(
3574 __unused vm_map_t map,
3575 vm_map_offset_t va,
3576 vm_map_entry_t entry,
3577 pmap_t pmap,
3578 vm_map_offset_t pmap_addr)
3579 {
3580 vm_object_t object;
3581 vm_object_offset_t offset;
3582 register vm_page_t m;
3583 vm_prot_t prot;
3584 thread_t thread = current_thread();
3585 int type_of_fault;
3586 kern_return_t kr;
3587
3588 VM_STAT_INCR(faults);
3589
3590 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3591 thread->task->faults++;
3592
3593 /*
3594 * Recovery actions
3595 */
3596
3597 #undef RELEASE_PAGE
3598 #define RELEASE_PAGE(m) { \
3599 PAGE_WAKEUP_DONE(m); \
3600 vm_page_lockspin_queues(); \
3601 vm_page_unwire(m); \
3602 vm_page_unlock_queues(); \
3603 }
3604
3605
3606 #undef UNLOCK_THINGS
3607 #define UNLOCK_THINGS { \
3608 vm_object_paging_end(object); \
3609 vm_object_unlock(object); \
3610 }
3611
3612 #undef UNLOCK_AND_DEALLOCATE
3613 #define UNLOCK_AND_DEALLOCATE { \
3614 UNLOCK_THINGS; \
3615 vm_object_deallocate(object); \
3616 }
3617 /*
3618 * Give up and have caller do things the hard way.
3619 */
3620
3621 #define GIVE_UP { \
3622 UNLOCK_AND_DEALLOCATE; \
3623 return(KERN_FAILURE); \
3624 }
3625
3626
3627 /*
3628 * If this entry is not directly to a vm_object, bail out.
3629 */
3630 if (entry->is_sub_map)
3631 return(KERN_FAILURE);
3632
3633 /*
3634 * Find the backing store object and offset into it.
3635 */
3636
3637 object = entry->object.vm_object;
3638 offset = (va - entry->vme_start) + entry->offset;
3639 prot = entry->protection;
3640
3641 /*
3642 * Make a reference to this object to prevent its
3643 * disposal while we are messing with it.
3644 */
3645
3646 vm_object_lock(object);
3647 vm_object_reference_locked(object);
3648 vm_object_paging_begin(object);
3649
3650 /*
3651 * INVARIANTS (through entire routine):
3652 *
3653 * 1) At all times, we must either have the object
3654 * lock or a busy page in some object to prevent
3655 * some other thread from trying to bring in
3656 * the same page.
3657 *
3658 * 2) Once we have a busy page, we must remove it from
3659 * the pageout queues, so that the pageout daemon
3660 * will not grab it away.
3661 *
3662 */
3663
3664 /*
3665 * Look for page in top-level object. If it's not there or
3666 * there's something going on, give up.
3667 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3668 * decrypt the page before wiring it down.
3669 */
3670 m = vm_page_lookup(object, offset);
3671 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3672 (m->unusual && ( m->error || m->restart || m->absent))) {
3673
3674 GIVE_UP;
3675 }
3676 ASSERT_PAGE_DECRYPTED(m);
3677
3678 if (m->fictitious &&
3679 m->phys_page == vm_page_guard_addr) {
3680 /*
3681 * Guard pages are fictitious pages and are never
3682 * entered into a pmap, so let's say it's been wired...
3683 */
3684 kr = KERN_SUCCESS;
3685 goto done;
3686 }
3687
3688 /*
3689 * Wire the page down now. All bail outs beyond this
3690 * point must unwire the page.
3691 */
3692
3693 vm_page_lockspin_queues();
3694 vm_page_wire(m);
3695 vm_page_unlock_queues();
3696
3697 /*
3698 * Mark page busy for other threads.
3699 */
3700 assert(!m->busy);
3701 m->busy = TRUE;
3702 assert(!m->absent);
3703
3704 /*
3705 * Give up if the page is being written and there's a copy object
3706 */
3707 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3708 RELEASE_PAGE(m);
3709 GIVE_UP;
3710 }
3711
3712 /*
3713 * Put this page into the physical map.
3714 */
3715 type_of_fault = DBG_CACHE_HIT_FAULT;
3716 kr = vm_fault_enter(m,
3717 pmap,
3718 pmap_addr,
3719 prot,
3720 TRUE,
3721 FALSE,
3722 FALSE,
3723 &type_of_fault);
3724
3725 done:
3726 /*
3727 * Unlock everything, and return
3728 */
3729
3730 PAGE_WAKEUP_DONE(m);
3731 UNLOCK_AND_DEALLOCATE;
3732
3733 return kr;
3734
3735 }
3736
3737 /*
3738 * Routine: vm_fault_copy_cleanup
3739 * Purpose:
3740 * Release a page used by vm_fault_copy.
3741 */
3742
3743 void
3744 vm_fault_copy_cleanup(
3745 vm_page_t page,
3746 vm_page_t top_page)
3747 {
3748 vm_object_t object = page->object;
3749
3750 vm_object_lock(object);
3751 PAGE_WAKEUP_DONE(page);
3752 vm_page_lockspin_queues();
3753 if (!page->active && !page->inactive && !page->throttled)
3754 vm_page_activate(page);
3755 vm_page_unlock_queues();
3756 vm_fault_cleanup(object, top_page);
3757 }
3758
3759 void
3760 vm_fault_copy_dst_cleanup(
3761 vm_page_t page)
3762 {
3763 vm_object_t object;
3764
3765 if (page != VM_PAGE_NULL) {
3766 object = page->object;
3767 vm_object_lock(object);
3768 vm_page_lockspin_queues();
3769 vm_page_unwire(page);
3770 vm_page_unlock_queues();
3771 vm_object_paging_end(object);
3772 vm_object_unlock(object);
3773 }
3774 }
3775
3776 /*
3777 * Routine: vm_fault_copy
3778 *
3779 * Purpose:
3780 * Copy pages from one virtual memory object to another --
3781 * neither the source nor destination pages need be resident.
3782 *
3783 * Before actually copying a page, the version associated with
3784 * the destination address map will be verified.
3785 *
3786 * In/out conditions:
3787 * The caller must hold a reference, but not a lock, to
3788 * each of the source and destination objects and to the
3789 * destination map.
3790 *
3791 * Results:
3792 * Returns KERN_SUCCESS if no errors were encountered in
3793 * reading or writing the data. Returns KERN_INTERRUPTED if
3794 * the operation was interrupted (only possible if the
3795 * "interruptible" argument is asserted). Other return values
3796 * indicate a permanent error in copying the data.
3797 *
3798 * The actual amount of data copied will be returned in the
3799 * "copy_size" argument. In the event that the destination map
3800 * verification failed, this amount may be less than the amount
3801 * requested.
3802 */
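
/*
 * Editorial sketch of a typical invocation (illustrative only; the real
 * callers live in the vm_map copy/overwrite paths):
 *
 *	vm_map_size_t size = len;
 *
 *	kr = vm_fault_copy(src_object, src_offset,
 *			   &size,		-- in: bytes requested, out: bytes copied
 *			   dst_object, dst_offset,
 *			   dst_map, &dst_version,
 *			   THREAD_UNINT);
 */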
3803 kern_return_t
3804 vm_fault_copy(
3805 vm_object_t src_object,
3806 vm_object_offset_t src_offset,
3807 vm_map_size_t *copy_size, /* INOUT */
3808 vm_object_t dst_object,
3809 vm_object_offset_t dst_offset,
3810 vm_map_t dst_map,
3811 vm_map_version_t *dst_version,
3812 int interruptible)
3813 {
3814 vm_page_t result_page;
3815
3816 vm_page_t src_page;
3817 vm_page_t src_top_page;
3818 vm_prot_t src_prot;
3819
3820 vm_page_t dst_page;
3821 vm_page_t dst_top_page;
3822 vm_prot_t dst_prot;
3823
3824 vm_map_size_t amount_left;
3825 vm_object_t old_copy_object;
3826 kern_return_t error = 0;
3827
3828 vm_map_size_t part_size;
3829 struct vm_object_fault_info fault_info_src;
3830 struct vm_object_fault_info fault_info_dst;
3831
3832 /*
3833 * In order not to confuse the clustered pageins, align
3834 * the different offsets on a page boundary.
3835 */
3836
3837 #define RETURN(x) \
3838 MACRO_BEGIN \
3839 *copy_size -= amount_left; \
3840 MACRO_RETURN(x); \
3841 MACRO_END
3842
3843 amount_left = *copy_size;
3844
3845 fault_info_src.interruptible = interruptible;
3846 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3847 fault_info_src.user_tag = 0;
3848 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3849 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3850 fault_info_src.no_cache = FALSE;
3851
3852 fault_info_dst.interruptible = interruptible;
3853 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3854 fault_info_dst.user_tag = 0;
3855 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3856 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3857 fault_info_dst.no_cache = FALSE;
3858
3859 do { /* while (amount_left > 0) */
3860 /*
3861 * There may be a deadlock if both source and destination
3862 * pages are the same. To avoid this deadlock, the copy must
3863 * start by getting the destination page in order to apply
3864 * COW semantics if any.
3865 */
3866
3867 RetryDestinationFault: ;
3868
3869 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3870
3871 vm_object_lock(dst_object);
3872 vm_object_paging_begin(dst_object);
3873
3874 fault_info_dst.cluster_size = amount_left;
3875
3876 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3877 switch (vm_fault_page(dst_object,
3878 vm_object_trunc_page(dst_offset),
3879 VM_PROT_WRITE|VM_PROT_READ,
3880 FALSE,
3881 &dst_prot, &dst_page, &dst_top_page,
3882 (int *)0,
3883 &error,
3884 dst_map->no_zero_fill,
3885 FALSE, &fault_info_dst)) {
3886 case VM_FAULT_SUCCESS:
3887 break;
3888 case VM_FAULT_RETRY:
3889 goto RetryDestinationFault;
3890 case VM_FAULT_MEMORY_SHORTAGE:
3891 if (vm_page_wait(interruptible))
3892 goto RetryDestinationFault;
3893 /* fall thru */
3894 case VM_FAULT_INTERRUPTED:
3895 RETURN(MACH_SEND_INTERRUPTED);
3896 case VM_FAULT_MEMORY_ERROR:
3897 if (error)
3898 return (error);
3899 else
3900 return(KERN_MEMORY_ERROR);
3901 }
3902 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3903
3904 old_copy_object = dst_page->object->copy;
3905
3906 /*
3907 * There exists the possibility that the source and
3908 * destination page are the same. But we can't
3909 * easily determine that now. If they are the
3910 * same, the call to vm_fault_page() for the
3911 * source page will deadlock. To prevent this we
3912 * wire the page so we can drop busy without having
3913 * the page daemon steal the page. We clean up the
3914 * top page but keep the paging reference on the object
3915 * holding the dest page so it doesn't go away.
3916 */
3917
3918 vm_page_lockspin_queues();
3919 vm_page_wire(dst_page);
3920 vm_page_unlock_queues();
3921 PAGE_WAKEUP_DONE(dst_page);
3922 vm_object_unlock(dst_page->object);
3923
3924 if (dst_top_page != VM_PAGE_NULL) {
3925 vm_object_lock(dst_object);
3926 VM_PAGE_FREE(dst_top_page);
3927 vm_object_paging_end(dst_object);
3928 vm_object_unlock(dst_object);
3929 }
3930
3931 RetrySourceFault: ;
3932
3933 if (src_object == VM_OBJECT_NULL) {
3934 /*
3935 * No source object. We will just
3936 * zero-fill the page in dst_object.
3937 */
3938 src_page = VM_PAGE_NULL;
3939 result_page = VM_PAGE_NULL;
3940 } else {
3941 vm_object_lock(src_object);
3942 src_page = vm_page_lookup(src_object,
3943 vm_object_trunc_page(src_offset));
3944 if (src_page == dst_page) {
3945 src_prot = dst_prot;
3946 result_page = VM_PAGE_NULL;
3947 } else {
3948 src_prot = VM_PROT_READ;
3949 vm_object_paging_begin(src_object);
3950
3951 fault_info_src.cluster_size = amount_left;
3952
3953 XPR(XPR_VM_FAULT,
3954 "vm_fault_copy(2) -> vm_fault_page\n",
3955 0,0,0,0,0);
3956 switch (vm_fault_page(
3957 src_object,
3958 vm_object_trunc_page(src_offset),
3959 VM_PROT_READ, FALSE,
3960 &src_prot,
3961 &result_page, &src_top_page,
3962 (int *)0, &error, FALSE,
3963 FALSE, &fault_info_src)) {
3964
3965 case VM_FAULT_SUCCESS:
3966 break;
3967 case VM_FAULT_RETRY:
3968 goto RetrySourceFault;
3969 case VM_FAULT_MEMORY_SHORTAGE:
3970 if (vm_page_wait(interruptible))
3971 goto RetrySourceFault;
3972 /* fall thru */
3973 case VM_FAULT_INTERRUPTED:
3974 vm_fault_copy_dst_cleanup(dst_page);
3975 RETURN(MACH_SEND_INTERRUPTED);
3976 case VM_FAULT_MEMORY_ERROR:
3977 vm_fault_copy_dst_cleanup(dst_page);
3978 if (error)
3979 return (error);
3980 else
3981 return(KERN_MEMORY_ERROR);
3982 }
3983
3984
3985 assert((src_top_page == VM_PAGE_NULL) ==
3986 (result_page->object == src_object));
3987 }
3988 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3989 vm_object_unlock(result_page->object);
3990 }
3991
3992 if (!vm_map_verify(dst_map, dst_version)) {
3993 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3994 vm_fault_copy_cleanup(result_page, src_top_page);
3995 vm_fault_copy_dst_cleanup(dst_page);
3996 break;
3997 }
3998
3999 vm_object_lock(dst_page->object);
4000
4001 if (dst_page->object->copy != old_copy_object) {
4002 vm_object_unlock(dst_page->object);
4003 vm_map_verify_done(dst_map, dst_version);
4004 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4005 vm_fault_copy_cleanup(result_page, src_top_page);
4006 vm_fault_copy_dst_cleanup(dst_page);
4007 break;
4008 }
4009 vm_object_unlock(dst_page->object);
4010
4011 /*
4012 * Copy the page, and note that it is dirty
4013 * immediately.
4014 */
4015
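/*
 * Unaligned offsets or a short remaining length mean only part of
 * the page overlaps; copy (or zero-fill) just that part, bounded
 * by the amount left.  Otherwise operate on the whole page.
 */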
4016 if (!page_aligned(src_offset) ||
4017 !page_aligned(dst_offset) ||
4018 !page_aligned(amount_left)) {
4019
4020 vm_object_offset_t src_po,
4021 dst_po;
4022
4023 src_po = src_offset - vm_object_trunc_page(src_offset);
4024 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4025
4026 if (dst_po > src_po) {
4027 part_size = PAGE_SIZE - dst_po;
4028 } else {
4029 part_size = PAGE_SIZE - src_po;
4030 }
4031 if (part_size > amount_left) {
4032 part_size = amount_left;
4033 }
4034
4035 if (result_page == VM_PAGE_NULL) {
4036 vm_page_part_zero_fill(dst_page,
4037 dst_po, part_size);
4038 } else {
4039 vm_page_part_copy(result_page, src_po,
4040 dst_page, dst_po, part_size);
4041 if (!dst_page->dirty) {
4042 vm_object_lock(dst_object);
4043 dst_page->dirty = TRUE;
4044 vm_object_unlock(dst_page->object);
4045 }
4046
4047 }
4048 } else {
4049 part_size = PAGE_SIZE;
4050
4051 if (result_page == VM_PAGE_NULL)
4052 vm_page_zero_fill(dst_page);
4053 else {
4054 vm_page_copy(result_page, dst_page);
4055 if (!dst_page->dirty) {
4056 vm_object_lock(dst_object);
4057 dst_page->dirty = TRUE;
4058 vm_object_unlock(dst_page->object);
4059 }
4060 }
4061
4062 }
4063
4064 /*
4065 * Unlock everything, and return
4066 */
4067
4068 vm_map_verify_done(dst_map, dst_version);
4069
4070 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4071 vm_fault_copy_cleanup(result_page, src_top_page);
4072 vm_fault_copy_dst_cleanup(dst_page);
4073
4074 amount_left -= part_size;
4075 src_offset += part_size;
4076 dst_offset += part_size;
4077 } while (amount_left > 0);
4078
4079 RETURN(KERN_SUCCESS);
4080 #undef RETURN
4081
4082 /*NOTREACHED*/
4083 }
4084
4085 #if VM_FAULT_CLASSIFY
4086 /*
4087 * Temporary statistics gathering support.
4088 */
4089
4090 /*
4091 * Statistics arrays:
4092 */
4093 #define VM_FAULT_TYPES_MAX 5
4094 #define VM_FAULT_LEVEL_MAX 8
4095
4096 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4097
4098 #define VM_FAULT_TYPE_ZERO_FILL 0
4099 #define VM_FAULT_TYPE_MAP_IN 1
4100 #define VM_FAULT_TYPE_PAGER 2
4101 #define VM_FAULT_TYPE_COPY 3
4102 #define VM_FAULT_TYPE_OTHER 4
4103
4104
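/*
 * vm_fault_classify:
 *
 *	Walk the shadow chain for the given object/offset and record in
 *	vm_fault_stats how a fault of the given type would be resolved
 *	(zero-fill, map-in, pager, copy or other) and at what shadow depth.
 */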
4105 void
4106 vm_fault_classify(vm_object_t object,
4107 vm_object_offset_t offset,
4108 vm_prot_t fault_type)
4109 {
4110 int type, level = 0;
4111 vm_page_t m;
4112
4113 while (TRUE) {
4114 m = vm_page_lookup(object, offset);
4115 if (m != VM_PAGE_NULL) {
4116 if (m->busy || m->error || m->restart || m->absent) {
4117 type = VM_FAULT_TYPE_OTHER;
4118 break;
4119 }
4120 if (((fault_type & VM_PROT_WRITE) == 0) ||
4121 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4122 type = VM_FAULT_TYPE_MAP_IN;
4123 break;
4124 }
4125 type = VM_FAULT_TYPE_COPY;
4126 break;
4127 }
4128 else {
4129 if (object->pager_created) {
4130 type = VM_FAULT_TYPE_PAGER;
4131 break;
4132 }
4133 if (object->shadow == VM_OBJECT_NULL) {
4134 type = VM_FAULT_TYPE_ZERO_FILL;
4135 break;
4136 }
4137
4138 offset += object->shadow_offset;
4139 object = object->shadow;
4140 level++;
4141 continue;
4142 }
4143 }
4144
4145 if (level > VM_FAULT_LEVEL_MAX)
4146 level = VM_FAULT_LEVEL_MAX;
4147
4148 vm_fault_stats[type][level] += 1;
4149
4150 return;
4151 }
4152
4153 /* routine to reset the statistics; call from the debugger */
4154
4155 void
4156 vm_fault_classify_init(void)
4157 {
4158 int type, level;
4159
4160 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4161 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4162 vm_fault_stats[type][level] = 0;
4163 }
4164 }
4165
4166 return;
4167 }
4168 #endif /* VM_FAULT_CLASSIFY */
4169
4170
4171 extern int cs_validation;
4172
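/*
 * vm_page_validate_cs_mapped:
 *
 *	Validate the code signature of a page that is already mapped at
 *	"kaddr" in the kernel address space.  The caller must keep the
 *	page busy and hold its object lock exclusively.  A page that was
 *	ever mapped for write access is conservatively marked tainted
 *	instead of being re-validated.
 */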
4173 void
4174 vm_page_validate_cs_mapped(
4175 vm_page_t page,
4176 const void *kaddr)
4177 {
4178 vm_object_t object;
4179 vm_object_offset_t offset;
4180 kern_return_t kr;
4181 memory_object_t pager;
4182 void *blobs;
4183 boolean_t validated, tainted;
4184
4185 assert(page->busy);
4186 vm_object_lock_assert_exclusive(page->object);
4187
4188 if (!cs_validation) {
4189 return;
4190 }
4191
4192 if (page->wpmapped && !page->cs_tainted) {
4193 /*
4194 * This page was mapped for "write" access sometime in the
4195 * past and could still be modifiable in the future.
4196 * Consider it tainted.
4197 * [ If the page was already found to be "tainted", no
4198 * need to re-validate. ]
4199 */
4200 page->cs_validated = TRUE;
4201 page->cs_tainted = TRUE;
4202 if (cs_debug) {
4203 printf("CODESIGNING: vm_page_validate_cs: "
4204 "page %p obj %p off 0x%llx "
4205 "was modified\n",
4206 page, page->object, page->offset);
4207 }
4208 vm_cs_validated_dirtied++;
4209 }
4210
4211 if (page->cs_validated) {
4212 return;
4213 }
4214
4215 vm_cs_validates++;
4216
4217 object = page->object;
4218 assert(object->code_signed);
4219 offset = page->offset;
4220
4221 if (!object->alive || object->terminating || object->pager == NULL) {
4222 /*
4223 * The object is dead or terminating, or we don't have its pager,
4224 * so we can't validate the data...
4225 */
4226 return;
4227 }
4228 /*
4229 * Since we get here to validate a page that was brought in by
4230 * the pager, we know that this pager is all set up and ready
4231 * by now.
4232 */
4233 assert(!object->internal);
4234 assert(object->pager != NULL);
4235 assert(object->pager_ready);
4236
4237 pager = object->pager;
4238
4239 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4240 if (kr != KERN_SUCCESS) {
4241 blobs = NULL;
4242 }
4243
4244 /* verify the SHA1 hash for this page */
4245 validated = cs_validate_page(blobs,
4246 offset + object->paging_offset,
4247 (const void *)kaddr,
4248 &tainted);
4249
4250 page->cs_validated = validated;
4251 if (validated) {
4252 page->cs_tainted = tainted;
4253 }
4254 }
4255
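/*
 * vm_page_validate_cs:
 *
 *	Same as above, but for a page with no kernel mapping yet: the page
 *	is made busy (if it is not already), temporarily mapped read-only
 *	into the kernel address space with vm_paging_map_object(),
 *	validated, then unmapped again.
 */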
4256 void
4257 vm_page_validate_cs(
4258 vm_page_t page)
4259 {
4260 vm_object_t object;
4261 vm_object_offset_t offset;
4262 vm_map_offset_t koffset;
4263 vm_map_size_t ksize;
4264 vm_offset_t kaddr;
4265 kern_return_t kr;
4266 boolean_t busy_page;
4267
4268 vm_object_lock_assert_held(page->object);
4269
4270 if (!cs_validation) {
4271 return;
4272 }
4273
4274 if (page->wpmapped && !page->cs_tainted) {
4275 vm_object_lock_assert_exclusive(page->object);
4276
4277 /*
4278 * This page was mapped for "write" access sometime in the
4279 * past and could still be modifiable in the future.
4280 * Consider it tainted.
4281 * [ If the page was already found to be "tainted", no
4282 * need to re-validate. ]
4283 */
4284 page->cs_validated = TRUE;
4285 page->cs_tainted = TRUE;
4286 if (cs_debug) {
4287 printf("CODESIGNING: vm_page_validate_cs: "
4288 "page %p obj %p off 0x%llx "
4289 "was modified\n",
4290 page, page->object, page->offset);
4291 }
4292 vm_cs_validated_dirtied++;
4293 }
4294
4295 if (page->cs_validated) {
4296 return;
4297 }
4298
4299 vm_object_lock_assert_exclusive(page->object);
4300
4301 object = page->object;
4302 assert(object->code_signed);
4303 offset = page->offset;
4304
4305 busy_page = page->busy;
4306 if (!busy_page) {
4307 /* keep page busy while we map (and unlock) the VM object */
4308 page->busy = TRUE;
4309 }
4310
4311 /*
4312 * Take a paging reference on the VM object
4313 * to protect it from collapse or bypass,
4314 * and keep it from disappearing too.
4315 */
4316 vm_object_paging_begin(object);
4317
4318 /* map the page in the kernel address space */
4319 koffset = 0;
4320 ksize = PAGE_SIZE_64;
4321 kr = vm_paging_map_object(&koffset,
4322 page,
4323 object,
4324 offset,
4325 &ksize,
4326 VM_PROT_READ,
4327 FALSE); /* can't unlock object ! */
4328 if (kr != KERN_SUCCESS) {
4329 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4330 }
4331 kaddr = CAST_DOWN(vm_offset_t, koffset);
4332
4333 /* validate the mapped page */
4334 vm_page_validate_cs_mapped(page, (const void *) kaddr);
4335
4336 assert(page->busy);
4337 assert(object == page->object);
4338 vm_object_lock_assert_exclusive(object);
4339
4340 if (!busy_page) {
4341 PAGE_WAKEUP_DONE(page);
4342 }
4343 if (koffset != 0) {
4344 /* unmap the page from the kernel address space */
4345 vm_paging_unmap_object(object, koffset, koffset + ksize);
4346 koffset = 0;
4347 ksize = 0;
4348 kaddr = 0;
4349 }
4350 vm_object_paging_end(object);
4351 }