[apple/xnu.git] / osfmk / vm / vm_fault.c (xnu-1228.15.4)
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152
153 unsigned long vm_cs_validates = 0;
154 unsigned long vm_cs_revalidates = 0;
155 unsigned long vm_cs_query_modified = 0;
156 unsigned long vm_cs_validated_dirtied = 0;
157
158 #if CONFIG_ENFORCE_SIGNED_CODE
159 #if SECURE_KERNEL
160 const int cs_enforcement_disable=0;
161 #else
162 int cs_enforcement_disable=1;
163 #endif
164 #endif
165
166 /*
167 * Routine: vm_fault_init
168 * Purpose:
169 * Initialize our private data structures.
170 */
171 void
172 vm_fault_init(void)
173 {
174 #if !SECURE_KERNEL
175 #if CONFIG_ENFORCE_SIGNED_CODE
176 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, sizeof (cs_enforcement_disable));
177 #endif
178 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
179 #endif
180 }
181
182 /*
183 * Routine: vm_fault_cleanup
184 * Purpose:
185 * Clean up the result of vm_fault_page.
186 * Results:
187 * The paging reference for "object" is released.
188 * "object" is unlocked.
189 * If "top_page" is not null, "top_page" is
190 * freed and the paging reference for the object
191 * containing it is released.
192 *
193 * In/out conditions:
194 * "object" must be locked.
195 */
196 void
197 vm_fault_cleanup(
198 register vm_object_t object,
199 register vm_page_t top_page)
200 {
201 vm_object_paging_end(object);
202 vm_object_unlock(object);
203
204 if (top_page != VM_PAGE_NULL) {
205 object = top_page->object;
206
207 vm_object_lock(object);
208 VM_PAGE_FREE(top_page);
209 vm_object_paging_end(object);
210 vm_object_unlock(object);
211 }
212 }
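/*
 * Illustrative sketch (not part of the original source): how a typical
 * error path in a vm_fault_page()-style caller is expected to use
 * vm_fault_cleanup() per the contract above.  The "data cannot be
 * obtained" condition is hypothetical; only the locking and
 * paging-reference pattern is the point here.
 */
#if 0
	vm_object_lock(object);			/* caller owns the object lock... */
	vm_object_paging_begin(object);		/* ...and donates one paging reference */

	m = vm_page_lookup(object, offset);
	if (m == VM_PAGE_NULL /* and the data cannot be obtained */) {
		/*
		 * drops the paging reference, unlocks "object" and,
		 * if first_m is non-NULL, frees the busy placeholder
		 * page in the top object as well
		 */
		vm_fault_cleanup(object, first_m);
		return (VM_FAULT_MEMORY_ERROR);
	}
#endif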
213
214 #if MACH_CLUSTER_STATS
215 #define MAXCLUSTERPAGES 16
216 struct {
217 unsigned long pages_in_cluster;
218 unsigned long pages_at_higher_offsets;
219 unsigned long pages_at_lower_offsets;
220 } cluster_stats_in[MAXCLUSTERPAGES];
221 #define CLUSTER_STAT(clause) clause
222 #define CLUSTER_STAT_HIGHER(x) \
223 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
224 #define CLUSTER_STAT_LOWER(x) \
225 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
226 #define CLUSTER_STAT_CLUSTER(x) \
227 ((cluster_stats_in[(x)].pages_in_cluster)++)
228 #else /* MACH_CLUSTER_STATS */
229 #define CLUSTER_STAT(clause)
230 #endif /* MACH_CLUSTER_STATS */
231
232 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
233
234
235 boolean_t vm_page_deactivate_behind = TRUE;
236 /*
237 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
238 */
239 int vm_default_ahead = 0;
240 int vm_default_behind = MAX_UPL_TRANSFER;
241
242 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
243
244 /*
245  * vm_fault_is_sequential
246 *
247 * Determine if sequential access is in progress
248 * in accordance with the behavior specified.
249 * Update state to indicate current access pattern.
250 *
251 * object must have at least the shared lock held
252 */
253 static
254 void
255 vm_fault_is_sequential(
256 vm_object_t object,
257 vm_object_offset_t offset,
258 vm_behavior_t behavior)
259 {
260 vm_object_offset_t last_alloc;
261 int sequential;
262 int orig_sequential;
263
264 last_alloc = object->last_alloc;
265 sequential = object->sequential;
266 orig_sequential = sequential;
267
268 switch (behavior) {
269 case VM_BEHAVIOR_RANDOM:
270 /*
271 * reset indicator of sequential behavior
272 */
273 sequential = 0;
274 break;
275
276 case VM_BEHAVIOR_SEQUENTIAL:
277 if (offset && last_alloc == offset - PAGE_SIZE_64) {
278 /*
279 * advance indicator of sequential behavior
280 */
281 if (sequential < MAX_SEQUENTIAL_RUN)
282 sequential += PAGE_SIZE;
283 } else {
284 /*
285 * reset indicator of sequential behavior
286 */
287 sequential = 0;
288 }
289 break;
290
291 case VM_BEHAVIOR_RSEQNTL:
292 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
293 /*
294 * advance indicator of sequential behavior
295 */
296 if (sequential > -MAX_SEQUENTIAL_RUN)
297 sequential -= PAGE_SIZE;
298 } else {
299 /*
300 * reset indicator of sequential behavior
301 */
302 sequential = 0;
303 }
304 break;
305
306 case VM_BEHAVIOR_DEFAULT:
307 default:
308 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
309 /*
310 * advance indicator of sequential behavior
311 */
312 if (sequential < 0)
313 sequential = 0;
314 if (sequential < MAX_SEQUENTIAL_RUN)
315 sequential += PAGE_SIZE;
316
317 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
318 /*
319 * advance indicator of sequential behavior
320 */
321 if (sequential > 0)
322 sequential = 0;
323 if (sequential > -MAX_SEQUENTIAL_RUN)
324 sequential -= PAGE_SIZE;
325 } else {
326 /*
327 * reset indicator of sequential behavior
328 */
329 sequential = 0;
330 }
331 break;
332 }
333 if (sequential != orig_sequential) {
334 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
335 /*
336 * if someone else has already updated object->sequential
337 * don't bother trying to update it or object->last_alloc
338 */
339 return;
340 }
341 }
342 /*
343  * I'd like to do this with an OSCompareAndSwap64, but that
344 * doesn't exist for PPC... however, it shouldn't matter
345 * that much... last_alloc is maintained so that we can determine
346 * if a sequential access pattern is taking place... if only
347 * one thread is banging on this object, no problem with the unprotected
348 * update... if 2 or more threads are banging away, we run the risk of
349 * someone seeing a mangled update... however, in the face of multiple
350 * accesses, no sequential access pattern can develop anyway, so we
351 * haven't lost any real info.
352 */
353 object->last_alloc = offset;
354 }
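/*
 * Illustrative sketch (not part of the original source): how
 * object->sequential evolves for a simple forward scan under
 * VM_BEHAVIOR_DEFAULT.  It assumes the object's shared lock is held by
 * the caller and that no concurrent faulters race the OSCompareAndSwap
 * above; the values noted are approximate.
 */
#if 0
	vm_object_offset_t off;

	for (off = 0; off < (vm_object_offset_t)(8 * PAGE_SIZE_64); off += PAGE_SIZE_64)
		vm_fault_is_sequential(object, off, VM_BEHAVIOR_DEFAULT);
	/*
	 * each adjacent forward fault adds PAGE_SIZE to object->sequential
	 * (capped at MAX_SEQUENTIAL_RUN) and records the offset in
	 * object->last_alloc; a VM_BEHAVIOR_RANDOM fault, or any
	 * non-adjacent offset, resets object->sequential to 0.
	 */
#endif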
355
356
357 /*
358  * vm_fault_deactivate_behind
359 *
360 * Determine if sequential access is in progress
361 * in accordance with the behavior specified. If
362 * so, compute a potential page to deactivate and
363 * deactivate it.
364 *
365 * object must be locked.
366 *
367 * return TRUE if we actually deactivate a page
368 */
369 static
370 boolean_t
371 vm_fault_deactivate_behind(
372 vm_object_t object,
373 vm_object_offset_t offset,
374 vm_behavior_t behavior)
375 {
376 vm_page_t m = NULL;
377 int sequential_run;
378 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
379
380 #if TRACEFAULTPAGE
381 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
382 #endif
383
384 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
385 /*
386 * Do not deactivate pages from the kernel object: they
387                  * are not intended to become pageable...
388                  * or we've disabled the deactivate-behind mechanism
389 */
390 return FALSE;
391 }
392 if ((sequential_run = object->sequential)) {
393 if (sequential_run < 0) {
394 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
395 sequential_run = 0 - sequential_run;
396 } else {
397 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
398 }
399 }
400 switch (behavior) {
401 case VM_BEHAVIOR_RANDOM:
402 break;
403 case VM_BEHAVIOR_SEQUENTIAL:
404 if (sequential_run >= (int)PAGE_SIZE)
405 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
406 break;
407 case VM_BEHAVIOR_RSEQNTL:
408 if (sequential_run >= (int)PAGE_SIZE)
409 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
410 break;
411 case VM_BEHAVIOR_DEFAULT:
412 default:
413 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
414
415 /*
416                 * determine if the run of sequential accesses has been
417 * long enough on an object with default access behavior
418 * to consider it for deactivation
419 */
420 if ((uint64_t)sequential_run >= behind) {
421 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
422 if (offset >= behind)
423 m = vm_page_lookup(object, offset - behind);
424 } else {
425 if (offset < -behind)
426 m = vm_page_lookup(object, offset + behind);
427 }
428 }
429 break;
430 }
431 }
432 if (m) {
433 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
434 pmap_clear_reference(m->phys_page);
435 m->deactivated = TRUE;
436 #if TRACEFAULTPAGE
437 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
438 #endif
439 return TRUE;
440 }
441 }
442 return FALSE;
443 }
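/*
 * Illustrative sketch (not part of the original source): for a forward
 * sequential run under VM_BEHAVIOR_DEFAULT, the candidate page is
 * vm_default_behind pages behind the faulting offset; reverse runs look
 * the same distance ahead.  Hedged example, object lock assumed held.
 */
#if 0
	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

	if (object->sequential >= (int)behind && offset >= behind) {
		/* same lookup the VM_BEHAVIOR_DEFAULT case above performs */
		vm_page_t trailing = vm_page_lookup(object, offset - behind);

		if (trailing != VM_PAGE_NULL)
			(void) vm_fault_deactivate_behind(object, offset, VM_BEHAVIOR_DEFAULT);
	}
#endif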
444
445
446 /*
447 * check for various conditions that would
448 * prevent us from creating a ZF page...
449 * cleanup is based on being called from vm_fault_page
450 *
451 * object must be locked
452 * object == m->object
453 */
454 static vm_fault_return_t
455 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
456 {
457 if (object->shadow_severed) {
458 /*
459 * the shadow chain was severed
460 * just have to return an error at this point
461 */
462 if (m != VM_PAGE_NULL)
463 VM_PAGE_FREE(m);
464 vm_fault_cleanup(object, first_m);
465
466 thread_interrupt_level(interruptible_state);
467
468 return (VM_FAULT_MEMORY_ERROR);
469 }
470 if (vm_backing_store_low) {
471 /*
472 * are we protecting the system from
473                  * backing store exhaustion? If so,
474 * sleep unless we are privileged.
475 */
476 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
477
478 if (m != VM_PAGE_NULL)
479 VM_PAGE_FREE(m);
480 vm_fault_cleanup(object, first_m);
481
482 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
483
484 thread_block(THREAD_CONTINUE_NULL);
485 thread_interrupt_level(interruptible_state);
486
487 return (VM_FAULT_RETRY);
488 }
489 }
490 if (VM_PAGE_ZFILL_THROTTLED()) {
491 /*
492 * we're throttling zero-fills...
493 * treat this as if we couldn't grab a page
494 */
495 if (m != VM_PAGE_NULL)
496 VM_PAGE_FREE(m);
497 vm_fault_cleanup(object, first_m);
498
499 thread_interrupt_level(interruptible_state);
500
501 return (VM_FAULT_MEMORY_SHORTAGE);
502 }
503 return (VM_FAULT_SUCCESS);
504 }
505
506
507 /*
508 * do the work to zero fill a page and
509 * inject it into the correct paging queue
510 *
511 * m->object must be locked
512 * page queue lock must NOT be held
513 */
514 static int
515 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
516 {
517 int my_fault = DBG_ZERO_FILL_FAULT;
518
519 /*
520          * This is a zero-fill page fault...
521 *
522 * Checking the page lock is a waste of
523 * time; this page was absent, so
524 * it can't be page locked by a pager.
525 *
526 * we also consider it undefined
527 * with respect to instruction
528 * execution. i.e. it is the responsibility
529 * of higher layers to call for an instruction
530 * sync after changing the contents and before
531 * sending a program into this area. We
532 * choose this approach for performance
533 */
534 m->pmapped = TRUE;
535
536 m->cs_validated = FALSE;
537 m->cs_tainted = FALSE;
538
539 if (no_zero_fill == TRUE)
540 my_fault = DBG_NZF_PAGE_FAULT;
541 else {
542 vm_page_zero_fill(m);
543
544 VM_STAT_INCR(zero_fill_count);
545 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
546 }
547 assert(!m->laundry);
548 assert(m->object != kernel_object);
549 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
550
551 if (!IP_VALID(memory_manager_default) &&
552 (m->object->purgable == VM_PURGABLE_DENY ||
553 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
554 m->object->purgable == VM_PURGABLE_VOLATILE )) {
555 vm_page_lock_queues();
556
557 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
558 m->throttled = TRUE;
559 vm_page_throttled_count++;
560
561 vm_page_unlock_queues();
562 } else {
563 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
564 m->zero_fill = TRUE;
565 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
566 }
567 }
568 return (my_fault);
569 }
570
571
572 /*
573 * Routine: vm_fault_page
574 * Purpose:
575 * Find the resident page for the virtual memory
576 * specified by the given virtual memory object
577 * and offset.
578 * Additional arguments:
579  *              The required permissions for the page are given
580 * in "fault_type". Desired permissions are included
581 * in "protection".
582 * fault_info is passed along to determine pagein cluster
583 * limits... it contains the expected reference pattern,
584 * cluster size if available, etc...
585 *
586 * If the desired page is known to be resident (for
587 * example, because it was previously wired down), asserting
588 * the "unwiring" parameter will speed the search.
589 *
590 * If the operation can be interrupted (by thread_abort
591 * or thread_terminate), then the "interruptible"
592 * parameter should be asserted.
593 *
594 * Results:
595 * The page containing the proper data is returned
596 * in "result_page".
597 *
598 * In/out conditions:
599 * The source object must be locked and referenced,
600 * and must donate one paging reference. The reference
601 * is not affected. The paging reference and lock are
602 * consumed.
603 *
604 * If the call succeeds, the object in which "result_page"
605 * resides is left locked and holding a paging reference.
606 * If this is not the original object, a busy page in the
607 * original object is returned in "top_page", to prevent other
608 * callers from pursuing this same data, along with a paging
609 * reference for the original object. The "top_page" should
610 * be destroyed when this guarantee is no longer required.
611 * The "result_page" is also left busy. It is not removed
612 * from the pageout queues.
613 */
614
615 vm_fault_return_t
616 vm_fault_page(
617 /* Arguments: */
618 vm_object_t first_object, /* Object to begin search */
619 vm_object_offset_t first_offset, /* Offset into object */
620 vm_prot_t fault_type, /* What access is requested */
621 boolean_t must_be_resident,/* Must page be resident? */
622 /* Modifies in place: */
623 vm_prot_t *protection, /* Protection for mapping */
624 /* Returns: */
625 vm_page_t *result_page, /* Page found, if successful */
626 vm_page_t *top_page, /* Page in top object, if
627 * not result_page. */
628 int *type_of_fault, /* if non-null, fill in with type of fault
629 * COW, zero-fill, etc... returned in trace point */
630 /* More arguments: */
631 kern_return_t *error_code, /* code if page is in error */
632 boolean_t no_zero_fill, /* don't zero fill absent pages */
633 #if MACH_PAGEMAP
634 boolean_t data_supply, /* treat as data_supply if
635 * it is a write fault and a full
636 * page is provided */
637 #else
638 __unused boolean_t data_supply,
639 #endif
640 vm_object_fault_info_t fault_info)
641 {
642 vm_page_t m;
643 vm_object_t object;
644 vm_object_offset_t offset;
645 vm_page_t first_m;
646 vm_object_t next_object;
647 vm_object_t copy_object;
648 boolean_t look_for_page;
649 vm_prot_t access_required = fault_type;
650 vm_prot_t wants_copy_flag;
651 CLUSTER_STAT(int pages_at_higher_offsets;)
652 CLUSTER_STAT(int pages_at_lower_offsets;)
653 kern_return_t wait_result;
654 boolean_t interruptible_state;
655 vm_fault_return_t error;
656 int my_fault;
657 uint32_t try_failed_count;
658         int                     interruptible; /* how may the fault be interrupted? */
659 memory_object_t pager;
660
661 /*
662 * MACH page map - an optional optimization where a bit map is maintained
663 * by the VM subsystem for internal objects to indicate which pages of
664 * the object currently reside on backing store. This existence map
665 * duplicates information maintained by the vnode pager. It is
666 * created at the time of the first pageout against the object, i.e.
667  * at the same time the pager for the object is created. The optimization
668 * is designed to eliminate pager interaction overhead, if it is
669 * 'known' that the page does not exist on backing store.
670 *
671 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
672 * either marked as paged out in the existence map for the object or no
673 * existence map exists for the object. MUST_ASK_PAGER() is one of the
674 * criteria in the decision to invoke the pager. It is also used as one
675 * of the criteria to terminate the scan for adjacent pages in a clustered
676 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
677 * permanent objects. Note also that if the pager for an internal object
678 * has not been created, the pager is not invoked regardless of the value
679 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
680 * for which a pager has been created.
681 *
682 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
683  * is marked as paged out in the existence map for the object.
684  * PAGED_OUT() is used to determine if a page has already been pushed
685 * into a copy object in order to avoid a redundant page out operation.
686 */
687 #if MACH_PAGEMAP
688 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
689 != VM_EXTERNAL_STATE_ABSENT)
690 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
691 == VM_EXTERNAL_STATE_EXISTS)
692 #else
693 #define MUST_ASK_PAGER(o, f) (TRUE)
694 #define PAGED_OUT(o, f) (FALSE)
695 #endif
696
697 /*
698 * Recovery actions
699 */
700 #define PREPARE_RELEASE_PAGE(m) \
701 MACRO_BEGIN \
702 vm_page_lock_queues(); \
703 MACRO_END
704
705 #define DO_RELEASE_PAGE(m) \
706 MACRO_BEGIN \
707 PAGE_WAKEUP_DONE(m); \
708 if (!m->active && !m->inactive && !m->throttled)\
709 vm_page_activate(m); \
710 vm_page_unlock_queues(); \
711 MACRO_END
712
713 #define RELEASE_PAGE(m) \
714 MACRO_BEGIN \
715 PREPARE_RELEASE_PAGE(m); \
716 DO_RELEASE_PAGE(m); \
717 MACRO_END
718
719 #if TRACEFAULTPAGE
720 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
721 #endif
722
723
724 #if MACH_KDB
725 /*
726 * If there are watchpoints set, then
727 * we don't want to give away write permission
728 * on a read fault. Make the task write fault,
729 * so that the watchpoint code notices the access.
730 */
731 if (db_watchpoint_list) {
732 /*
733 * If we aren't asking for write permission,
734 * then don't give it away. We're using write
735 * faults to set the dirty bit.
736 */
737 if (!(fault_type & VM_PROT_WRITE))
738 *protection &= ~VM_PROT_WRITE;
739 }
740 #endif /* MACH_KDB */
741
742 interruptible = fault_info->interruptible;
743 interruptible_state = thread_interrupt_level(interruptible);
744
745 /*
746 * INVARIANTS (through entire routine):
747 *
748 * 1) At all times, we must either have the object
749 * lock or a busy page in some object to prevent
750 * some other thread from trying to bring in
751 * the same page.
752 *
753 * Note that we cannot hold any locks during the
754 * pager access or when waiting for memory, so
755 * we use a busy page then.
756 *
757 * 2) To prevent another thread from racing us down the
758 * shadow chain and entering a new page in the top
759 * object before we do, we must keep a busy page in
760 * the top object while following the shadow chain.
761 *
762 * 3) We must increment paging_in_progress on any object
763 * for which we have a busy page before dropping
764 * the object lock
765 *
766 * 4) We leave busy pages on the pageout queues.
767 * If the pageout daemon comes across a busy page,
768 * it will remove the page from the pageout queues.
769 */
770
771 object = first_object;
772 offset = first_offset;
773 first_m = VM_PAGE_NULL;
774 access_required = fault_type;
775
776
777 XPR(XPR_VM_FAULT,
778 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
779 (integer_t)object, offset, fault_type, *protection, 0);
780
781 /*
782 * default type of fault
783 */
784 my_fault = DBG_CACHE_HIT_FAULT;
785
786 while (TRUE) {
787 #if TRACEFAULTPAGE
788 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
789 #endif
790 if (!object->alive) {
791 /*
792 * object is no longer valid
793 * clean up and return error
794 */
795 vm_fault_cleanup(object, first_m);
796 thread_interrupt_level(interruptible_state);
797
798 return (VM_FAULT_MEMORY_ERROR);
799 }
800
801 /*
802 * See whether the page at 'offset' is resident
803 */
804 m = vm_page_lookup(object, offset);
805 #if TRACEFAULTPAGE
806 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
807 #endif
808 if (m != VM_PAGE_NULL) {
809
810 if (m->busy) {
811 /*
812 * The page is being brought in,
813 * wait for it and then retry.
814 *
815 * A possible optimization: if the page
816 * is known to be resident, we can ignore
817 * pages that are absent (regardless of
818 * whether they're busy).
819 */
820 #if TRACEFAULTPAGE
821 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
822 #endif
823 wait_result = PAGE_SLEEP(object, m, interruptible);
824 XPR(XPR_VM_FAULT,
825 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
826 (integer_t)object, offset,
827 (integer_t)m, 0, 0);
828 counter(c_vm_fault_page_block_busy_kernel++);
829
830 if (wait_result != THREAD_AWAKENED) {
831 vm_fault_cleanup(object, first_m);
832 thread_interrupt_level(interruptible_state);
833
834 if (wait_result == THREAD_RESTART)
835 return (VM_FAULT_RETRY);
836 else
837 return (VM_FAULT_INTERRUPTED);
838 }
839 continue;
840 }
841
842 if (m->phys_page == vm_page_guard_addr) {
843 /*
844 * Guard page: off limits !
845 */
846 if (fault_type == VM_PROT_NONE) {
847 /*
848 * The fault is not requesting any
849 * access to the guard page, so it must
850 * be just to wire or unwire it.
851 * Let's pretend it succeeded...
852 */
853 m->busy = TRUE;
854 *result_page = m;
855 assert(first_m == VM_PAGE_NULL);
856 *top_page = first_m;
857 if (type_of_fault)
858 *type_of_fault = DBG_GUARD_FAULT;
859 return VM_FAULT_SUCCESS;
860 } else {
861 /*
862 * The fault requests access to the
863 * guard page: let's deny that !
864 */
865 vm_fault_cleanup(object, first_m);
866 thread_interrupt_level(interruptible_state);
867 return VM_FAULT_MEMORY_ERROR;
868 }
869 }
870
871 if (m->error) {
872 /*
873 * The page is in error, give up now.
874 */
875 #if TRACEFAULTPAGE
876 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
877 #endif
878 if (error_code)
879 *error_code = KERN_MEMORY_ERROR;
880 VM_PAGE_FREE(m);
881
882 vm_fault_cleanup(object, first_m);
883 thread_interrupt_level(interruptible_state);
884
885 return (VM_FAULT_MEMORY_ERROR);
886 }
887 if (m->restart) {
888 /*
889 * The pager wants us to restart
890 * at the top of the chain,
891 * typically because it has moved the
892 * page to another pager, then do so.
893 */
894 #if TRACEFAULTPAGE
895 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
896 #endif
897 VM_PAGE_FREE(m);
898
899 vm_fault_cleanup(object, first_m);
900 thread_interrupt_level(interruptible_state);
901
902 return (VM_FAULT_RETRY);
903 }
904 if (m->absent) {
905 /*
906 * The page isn't busy, but is absent,
907 * therefore it's deemed "unavailable".
908 *
909 * Remove the non-existent page (unless it's
910 * in the top object) and move on down to the
911 * next object (if there is one).
912 */
913 #if TRACEFAULTPAGE
914 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
915 #endif
916 next_object = object->shadow;
917
918 if (next_object == VM_OBJECT_NULL) {
919 /*
920 * Absent page at bottom of shadow
921 * chain; zero fill the page we left
922 * busy in the first object, and free
923 * the absent page.
924 */
925 assert(!must_be_resident);
926
927 /*
928 * check for any conditions that prevent
929 * us from creating a new zero-fill page
930 * vm_fault_check will do all of the
931 * fault cleanup in the case of an error condition
932 * including resetting the thread_interrupt_level
933 */
934 error = vm_fault_check(object, m, first_m, interruptible_state);
935
936 if (error != VM_FAULT_SUCCESS)
937 return (error);
938
939 XPR(XPR_VM_FAULT,
940 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
941 (integer_t)object, offset,
942 (integer_t)m,
943 (integer_t)first_object, 0);
944
945 if (object != first_object) {
946 /*
947 * free the absent page we just found
948 */
949 VM_PAGE_FREE(m);
950
951 /*
952 * drop reference and lock on current object
953 */
954 vm_object_paging_end(object);
955 vm_object_unlock(object);
956
957 /*
958 * grab the original page we
959 * 'soldered' in place and
960 * retake lock on 'first_object'
961 */
962 m = first_m;
963 first_m = VM_PAGE_NULL;
964
965 object = first_object;
966 offset = first_offset;
967
968 vm_object_lock(object);
969 } else {
970 /*
971 * we're going to use the absent page we just found
972 * so convert it to a 'busy' page
973 */
974 m->absent = FALSE;
975 m->busy = TRUE;
976 }
977 /*
978 * zero-fill the page and put it on
979 * the correct paging queue
980 */
981 my_fault = vm_fault_zero_page(m, no_zero_fill);
982
983 break;
984 } else {
985 if (must_be_resident)
986 vm_object_paging_end(object);
987 else if (object != first_object) {
988 vm_object_paging_end(object);
989 VM_PAGE_FREE(m);
990 } else {
991 first_m = m;
992 m->absent = FALSE;
993 m->busy = TRUE;
994
995 vm_page_lockspin_queues();
996 VM_PAGE_QUEUES_REMOVE(m);
997 vm_page_unlock_queues();
998 }
999 XPR(XPR_VM_FAULT,
1000 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1001 (integer_t)object, offset,
1002 (integer_t)next_object,
1003 offset+object->shadow_offset,0);
1004
1005 offset += object->shadow_offset;
1006 fault_info->lo_offset += object->shadow_offset;
1007 fault_info->hi_offset += object->shadow_offset;
1008 access_required = VM_PROT_READ;
1009
1010 vm_object_lock(next_object);
1011 vm_object_unlock(object);
1012 object = next_object;
1013 vm_object_paging_begin(object);
1014
1015 /*
1016 * reset to default type of fault
1017 */
1018 my_fault = DBG_CACHE_HIT_FAULT;
1019
1020 continue;
1021 }
1022 }
1023 if ((m->cleaning)
1024 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1025 && (fault_type & VM_PROT_WRITE)) {
1026 /*
1027 * This is a copy-on-write fault that will
1028 * cause us to revoke access to this page, but
1029 * this page is in the process of being cleaned
1030 * in a clustered pageout. We must wait until
1031 * the cleaning operation completes before
1032 * revoking access to the original page,
1033 * otherwise we might attempt to remove a
1034 * wired mapping.
1035 */
1036 #if TRACEFAULTPAGE
1037 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1038 #endif
1039 XPR(XPR_VM_FAULT,
1040 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1041 (integer_t)object, offset,
1042 (integer_t)m, 0, 0);
1043 /*
1044 * take an extra ref so that object won't die
1045 */
1046 vm_object_reference_locked(object);
1047
1048 vm_fault_cleanup(object, first_m);
1049
1050 counter(c_vm_fault_page_block_backoff_kernel++);
1051 vm_object_lock(object);
1052 assert(object->ref_count > 0);
1053
1054 m = vm_page_lookup(object, offset);
1055
1056 if (m != VM_PAGE_NULL && m->cleaning) {
1057 PAGE_ASSERT_WAIT(m, interruptible);
1058
1059 vm_object_unlock(object);
1060 wait_result = thread_block(THREAD_CONTINUE_NULL);
1061 vm_object_deallocate(object);
1062
1063 goto backoff;
1064 } else {
1065 vm_object_unlock(object);
1066
1067 vm_object_deallocate(object);
1068 thread_interrupt_level(interruptible_state);
1069
1070 return (VM_FAULT_RETRY);
1071 }
1072 }
1073 if (type_of_fault == NULL && m->speculative) {
1074 /*
1075 * If we were passed a non-NULL pointer for
1076 * "type_of_fault", than we came from
1077 * vm_fault... we'll let it deal with
1078 * this condition, since it
1079 * needs to see m->speculative to correctly
1080 * account the pageins, otherwise...
1081 * take it off the speculative queue, we'll
1082 * let the caller of vm_fault_page deal
1083 * with getting it onto the correct queue
1084 */
1085 vm_page_lockspin_queues();
1086 VM_PAGE_QUEUES_REMOVE(m);
1087 vm_page_unlock_queues();
1088 }
1089
1090 if (m->encrypted) {
1091 /*
1092 * ENCRYPTED SWAP:
1093 * the user needs access to a page that we
1094 * encrypted before paging it out.
1095 * Decrypt the page now.
1096 * Keep it busy to prevent anyone from
1097 * accessing it during the decryption.
1098 */
1099 m->busy = TRUE;
1100 vm_page_decrypt(m, 0);
1101 assert(object == m->object);
1102 assert(m->busy);
1103 PAGE_WAKEUP_DONE(m);
1104
1105 /*
1106 * Retry from the top, in case
1107 * something changed while we were
1108 * decrypting.
1109 */
1110 continue;
1111 }
1112 ASSERT_PAGE_DECRYPTED(m);
1113
1114 if (m->object->code_signed) {
1115 /*
1116 * CODE SIGNING:
1117 * We just paged in a page from a signed
1118 * memory object but we don't need to
1119                          * validate it now. We'll validate it if and
1120                          * when it gets mapped into a user address
1121 * space for the first time or when the page
1122 * gets copied to another object as a result
1123 * of a copy-on-write.
1124 */
1125 }
1126
1127 /*
1128 * We mark the page busy and leave it on
1129 * the pageout queues. If the pageout
1130                  * daemon comes across it, then it will
1131 * remove the page from the queue, but not the object
1132 */
1133 #if TRACEFAULTPAGE
1134 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1135 #endif
1136 XPR(XPR_VM_FAULT,
1137 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1138 (integer_t)object, offset, (integer_t)m, 0, 0);
1139 assert(!m->busy);
1140 assert(!m->absent);
1141
1142 m->busy = TRUE;
1143 break;
1144 }
1145
1146
1147 /*
1148 * we get here when there is no page present in the object at
1149 * the offset we're interested in... we'll allocate a page
1150 * at this point if the pager associated with
1151 * this object can provide the data or we're the top object...
1152 * object is locked; m == NULL
1153 */
1154 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1155
1156 #if TRACEFAULTPAGE
1157 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1158 #endif
1159 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1160 /*
1161 * Allocate a new page for this object/offset pair
1162 */
1163 m = vm_page_grab();
1164 #if TRACEFAULTPAGE
1165 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1166 #endif
1167 if (m == VM_PAGE_NULL) {
1168
1169 vm_fault_cleanup(object, first_m);
1170 thread_interrupt_level(interruptible_state);
1171
1172 return (VM_FAULT_MEMORY_SHORTAGE);
1173 }
1174 vm_page_insert(m, object, offset);
1175 }
1176 if (look_for_page && !must_be_resident) {
1177 kern_return_t rc;
1178
1179 /*
1180 * If the memory manager is not ready, we
1181 * cannot make requests.
1182 */
1183 if (!object->pager_ready) {
1184 #if TRACEFAULTPAGE
1185 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1186 #endif
1187 if (m != VM_PAGE_NULL)
1188 VM_PAGE_FREE(m);
1189
1190 XPR(XPR_VM_FAULT,
1191 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1192 (integer_t)object, offset, 0, 0, 0);
1193
1194 /*
1195 * take an extra ref so object won't die
1196 */
1197 vm_object_reference_locked(object);
1198 vm_fault_cleanup(object, first_m);
1199 counter(c_vm_fault_page_block_backoff_kernel++);
1200
1201 vm_object_lock(object);
1202 assert(object->ref_count > 0);
1203
1204 if (!object->pager_ready) {
1205 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1206
1207 vm_object_unlock(object);
1208 if (wait_result == THREAD_WAITING)
1209 wait_result = thread_block(THREAD_CONTINUE_NULL);
1210 vm_object_deallocate(object);
1211
1212 goto backoff;
1213 } else {
1214 vm_object_unlock(object);
1215 vm_object_deallocate(object);
1216 thread_interrupt_level(interruptible_state);
1217
1218 return (VM_FAULT_RETRY);
1219 }
1220 }
1221 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1222 /*
1223 * If there are too many outstanding page
1224 * requests pending on this external object, we
1225 * wait for them to be resolved now.
1226 */
1227 #if TRACEFAULTPAGE
1228 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1229 #endif
1230 if (m != VM_PAGE_NULL)
1231 VM_PAGE_FREE(m);
1232 /*
1233 * take an extra ref so object won't die
1234 */
1235 vm_object_reference_locked(object);
1236
1237 vm_fault_cleanup(object, first_m);
1238
1239 counter(c_vm_fault_page_block_backoff_kernel++);
1240
1241 vm_object_lock(object);
1242 assert(object->ref_count > 0);
1243
1244 if (object->paging_in_progress > vm_object_pagein_throttle) {
1245 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1246
1247 vm_object_unlock(object);
1248 wait_result = thread_block(THREAD_CONTINUE_NULL);
1249 vm_object_deallocate(object);
1250
1251 goto backoff;
1252 } else {
1253 vm_object_unlock(object);
1254 vm_object_deallocate(object);
1255 thread_interrupt_level(interruptible_state);
1256
1257 return (VM_FAULT_RETRY);
1258 }
1259 }
1260 if (m != VM_PAGE_NULL) {
1261 /*
1262 * Indicate that the page is waiting for data
1263 * from the memory manager.
1264 */
1265 m->list_req_pending = TRUE;
1266 m->absent = TRUE;
1267 }
1268
1269 #if TRACEFAULTPAGE
1270 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1271 #endif
1272
1273 /*
1274 * It's possible someone called vm_object_destroy while we weren't
1275 * holding the object lock. If that has happened, then bail out
1276 * here.
1277 */
1278
1279 pager = object->pager;
1280
1281 if (pager == MEMORY_OBJECT_NULL) {
1282 vm_fault_cleanup(object, first_m);
1283 thread_interrupt_level(interruptible_state);
1284 return VM_FAULT_MEMORY_ERROR;
1285 }
1286
1287 /*
1288 * We have an absent page in place for the faulting offset,
1289 * so we can release the object lock.
1290 */
1291
1292 vm_object_unlock(object);
1293
1294 /*
1295 * If this object uses a copy_call strategy,
1296 * and we are interested in a copy of this object
1297 * (having gotten here only by following a
1298 * shadow chain), then tell the memory manager
1299 * via a flag added to the desired_access
1300 * parameter, so that it can detect a race
1301 * between our walking down the shadow chain
1302 * and its pushing pages up into a copy of
1303 * the object that it manages.
1304 */
1305 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1306 wants_copy_flag = VM_PROT_WANTS_COPY;
1307 else
1308 wants_copy_flag = VM_PROT_NONE;
1309
1310 XPR(XPR_VM_FAULT,
1311 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1312 (integer_t)object, offset, (integer_t)m,
1313 access_required | wants_copy_flag, 0);
1314
1315 /*
1316 * Call the memory manager to retrieve the data.
1317 */
1318 rc = memory_object_data_request(
1319 pager,
1320 offset + object->paging_offset,
1321 PAGE_SIZE,
1322 access_required | wants_copy_flag,
1323 (memory_object_fault_info_t)fault_info);
1324
1325 #if TRACEFAULTPAGE
1326 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1327 #endif
1328 vm_object_lock(object);
1329
1330 if (rc != KERN_SUCCESS) {
1331
1332 vm_fault_cleanup(object, first_m);
1333 thread_interrupt_level(interruptible_state);
1334
1335 return ((rc == MACH_SEND_INTERRUPTED) ?
1336 VM_FAULT_INTERRUPTED :
1337 VM_FAULT_MEMORY_ERROR);
1338 }
1339 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1340
1341 vm_fault_cleanup(object, first_m);
1342 thread_interrupt_level(interruptible_state);
1343
1344 return (VM_FAULT_INTERRUPTED);
1345 }
1346 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1347 /*
1348 * No page here means that the object we
1349 * initially looked up was "physically
1350 * contiguous" (i.e. device memory). However,
1351 * with Virtual VRAM, the object might not
1352 * be backed by that device memory anymore,
1353 * so we're done here only if the object is
1354 * still "phys_contiguous".
1355 * Otherwise, if the object is no longer
1356 * "phys_contiguous", we need to retry the
1357 * page fault against the object's new backing
1358 * store (different memory object).
1359 */
1360 break;
1361 }
1362 /*
1363 * potentially a pagein fault
1364 * if we make it through the state checks
1365                          * above, then we'll count it as such
1366 */
1367 my_fault = DBG_PAGEIN_FAULT;
1368
1369 /*
1370 * Retry with same object/offset, since new data may
1371 * be in a different page (i.e., m is meaningless at
1372 * this point).
1373 */
1374 continue;
1375 }
1376
1377 /*
1378 * We get here if the object has no pager, or an existence map
1379 * exists and indicates the page isn't present on the pager
1380 * or we're unwiring a page. If a pager exists, but there
1381 * is no existence map, then the m->absent case above handles
1382 * the ZF case when the pager can't provide the page
1383 */
1384 #if TRACEFAULTPAGE
1385 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1386 #endif
1387 if (object == first_object)
1388 first_m = m;
1389 else
1390 assert(m == VM_PAGE_NULL);
1391
1392 XPR(XPR_VM_FAULT,
1393 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1394 (integer_t)object, offset, (integer_t)m,
1395 (integer_t)object->shadow, 0);
1396
1397 next_object = object->shadow;
1398
1399 if (next_object == VM_OBJECT_NULL) {
1400 /*
1401                          * we've hit the bottom of the shadow chain,
1402 * fill the page in the top object with zeros.
1403 */
1404 assert(!must_be_resident);
1405
1406 if (object != first_object) {
1407 vm_object_paging_end(object);
1408 vm_object_unlock(object);
1409
1410 object = first_object;
1411 offset = first_offset;
1412 vm_object_lock(object);
1413 }
1414 m = first_m;
1415 assert(m->object == object);
1416 first_m = VM_PAGE_NULL;
1417
1418 /*
1419 * check for any conditions that prevent
1420 * us from creating a new zero-fill page
1421 * vm_fault_check will do all of the
1422 * fault cleanup in the case of an error condition
1423 * including resetting the thread_interrupt_level
1424 */
1425 error = vm_fault_check(object, m, first_m, interruptible_state);
1426
1427 if (error != VM_FAULT_SUCCESS)
1428 return (error);
1429
1430 if (m == VM_PAGE_NULL) {
1431 m = vm_page_grab();
1432
1433 if (m == VM_PAGE_NULL) {
1434 vm_fault_cleanup(object, VM_PAGE_NULL);
1435 thread_interrupt_level(interruptible_state);
1436
1437 return (VM_FAULT_MEMORY_SHORTAGE);
1438 }
1439 vm_page_insert(m, object, offset);
1440 }
1441 my_fault = vm_fault_zero_page(m, no_zero_fill);
1442
1443 break;
1444
1445 } else {
1446 /*
1447 * Move on to the next object. Lock the next
1448 * object before unlocking the current one.
1449 */
1450 if ((object != first_object) || must_be_resident)
1451 vm_object_paging_end(object);
1452
1453 offset += object->shadow_offset;
1454 fault_info->lo_offset += object->shadow_offset;
1455 fault_info->hi_offset += object->shadow_offset;
1456 access_required = VM_PROT_READ;
1457
1458 vm_object_lock(next_object);
1459 vm_object_unlock(object);
1460
1461 object = next_object;
1462 vm_object_paging_begin(object);
1463 }
1464 }
1465
1466 /*
1467 * PAGE HAS BEEN FOUND.
1468 *
1469 * This page (m) is:
1470 * busy, so that we can play with it;
1471 * not absent, so that nobody else will fill it;
1472 * possibly eligible for pageout;
1473 *
1474 * The top-level page (first_m) is:
1475 * VM_PAGE_NULL if the page was found in the
1476 * top-level object;
1477 * busy, not absent, and ineligible for pageout.
1478 *
1479 * The current object (object) is locked. A paging
1480 * reference is held for the current and top-level
1481 * objects.
1482 */
1483
1484 #if TRACEFAULTPAGE
1485 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1486 #endif
1487 #if EXTRA_ASSERTIONS
1488 if (m != VM_PAGE_NULL) {
1489 assert(m->busy && !m->absent);
1490 assert((first_m == VM_PAGE_NULL) ||
1491 (first_m->busy && !first_m->absent &&
1492 !first_m->active && !first_m->inactive));
1493 }
1494 #endif /* EXTRA_ASSERTIONS */
1495
1496 /*
1497 * ENCRYPTED SWAP:
1498 * If we found a page, we must have decrypted it before we
1499 * get here...
1500 */
1501 if (m != VM_PAGE_NULL) {
1502 ASSERT_PAGE_DECRYPTED(m);
1503 }
1504
1505 XPR(XPR_VM_FAULT,
1506 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1507 (integer_t)object, offset, (integer_t)m,
1508 (integer_t)first_object, (integer_t)first_m);
1509
1510 /*
1511 * If the page is being written, but isn't
1512 * already owned by the top-level object,
1513 * we have to copy it into a new page owned
1514 * by the top-level object.
1515 */
1516 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1517
1518 #if TRACEFAULTPAGE
1519 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1520 #endif
1521 if (fault_type & VM_PROT_WRITE) {
1522 vm_page_t copy_m;
1523
1524 /*
1525 * We only really need to copy if we
1526 * want to write it.
1527 */
1528 assert(!must_be_resident);
1529
1530 /*
1531 * are we protecting the system from
1532                          * backing store exhaustion? If so,
1533 * sleep unless we are privileged.
1534 */
1535 if (vm_backing_store_low) {
1536 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1537
1538 RELEASE_PAGE(m);
1539 vm_fault_cleanup(object, first_m);
1540
1541 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1542
1543 thread_block(THREAD_CONTINUE_NULL);
1544 thread_interrupt_level(interruptible_state);
1545
1546 return (VM_FAULT_RETRY);
1547 }
1548 }
1549 /*
1550 * If we try to collapse first_object at this
1551 * point, we may deadlock when we try to get
1552 * the lock on an intermediate object (since we
1553 * have the bottom object locked). We can't
1554 * unlock the bottom object, because the page
1555 * we found may move (by collapse) if we do.
1556 *
1557 * Instead, we first copy the page. Then, when
1558 * we have no more use for the bottom object,
1559 * we unlock it and try to collapse.
1560 *
1561 * Note that we copy the page even if we didn't
1562 * need to... that's the breaks.
1563 */
1564
1565 /*
1566 * Allocate a page for the copy
1567 */
1568 copy_m = vm_page_grab();
1569
1570 if (copy_m == VM_PAGE_NULL) {
1571 RELEASE_PAGE(m);
1572
1573 vm_fault_cleanup(object, first_m);
1574 thread_interrupt_level(interruptible_state);
1575
1576 return (VM_FAULT_MEMORY_SHORTAGE);
1577 }
1578 XPR(XPR_VM_FAULT,
1579 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1580 (integer_t)object, offset,
1581 (integer_t)m, (integer_t)copy_m, 0);
1582
1583 vm_page_copy(m, copy_m);
1584
1585 /*
1586 * If another map is truly sharing this
1587 * page with us, we have to flush all
1588 * uses of the original page, since we
1589 * can't distinguish those which want the
1590 * original from those which need the
1591 * new copy.
1592 *
1593 * XXXO If we know that only one map has
1594 * access to this page, then we could
1595 * avoid the pmap_disconnect() call.
1596 */
1597 if (m->pmapped)
1598 pmap_disconnect(m->phys_page);
1599
1600 assert(!m->cleaning);
1601
1602 /*
1603 * We no longer need the old page or object.
1604 */
1605 PAGE_WAKEUP_DONE(m);
1606 vm_object_paging_end(object);
1607 vm_object_unlock(object);
1608
1609 my_fault = DBG_COW_FAULT;
1610 VM_STAT_INCR(cow_faults);
1611 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1612 current_task()->cow_faults++;
1613
1614 object = first_object;
1615 offset = first_offset;
1616
1617 vm_object_lock(object);
1618 /*
1619 * get rid of the place holder
1620 * page that we soldered in earlier
1621 */
1622 VM_PAGE_FREE(first_m);
1623 first_m = VM_PAGE_NULL;
1624
1625 /*
1626 * and replace it with the
1627 * page we just copied into
1628 */
1629 assert(copy_m->busy);
1630 vm_page_insert(copy_m, object, offset);
1631 copy_m->dirty = TRUE;
1632
1633 m = copy_m;
1634 /*
1635 * Now that we've gotten the copy out of the
1636 * way, let's try to collapse the top object.
1637 * But we have to play ugly games with
1638 * paging_in_progress to do that...
1639 */
1640 vm_object_paging_end(object);
1641 vm_object_collapse(object, offset, TRUE);
1642 vm_object_paging_begin(object);
1643
1644 } else
1645 *protection &= (~VM_PROT_WRITE);
1646 }
1647 /*
1648 * Now check whether the page needs to be pushed into the
1649 * copy object. The use of asymmetric copy on write for
1650 * shared temporary objects means that we may do two copies to
1651 * satisfy the fault; one above to get the page from a
1652 * shadowed object, and one here to push it into the copy.
1653 */
1654 try_failed_count = 0;
1655
1656 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1657 vm_object_offset_t copy_offset;
1658 vm_page_t copy_m;
1659
1660 #if TRACEFAULTPAGE
1661 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1662 #endif
1663 /*
1664 * If the page is being written, but hasn't been
1665 * copied to the copy-object, we have to copy it there.
1666 */
1667 if ((fault_type & VM_PROT_WRITE) == 0) {
1668 *protection &= ~VM_PROT_WRITE;
1669 break;
1670 }
1671
1672 /*
1673 * If the page was guaranteed to be resident,
1674 * we must have already performed the copy.
1675 */
1676 if (must_be_resident)
1677 break;
1678
1679 /*
1680 * Try to get the lock on the copy_object.
1681 */
1682 if (!vm_object_lock_try(copy_object)) {
1683
1684 vm_object_unlock(object);
1685 try_failed_count++;
1686
1687 mutex_pause(try_failed_count); /* wait a bit */
1688 vm_object_lock(object);
1689
1690 continue;
1691 }
1692 try_failed_count = 0;
1693
1694 /*
1695 * Make another reference to the copy-object,
1696 * to keep it from disappearing during the
1697 * copy.
1698 */
1699 vm_object_reference_locked(copy_object);
1700
1701 /*
1702 * Does the page exist in the copy?
1703 */
1704 copy_offset = first_offset - copy_object->shadow_offset;
1705
1706 if (copy_object->size <= copy_offset)
1707 /*
1708 * Copy object doesn't cover this page -- do nothing.
1709 */
1710 ;
1711 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1712 /*
1713 * Page currently exists in the copy object
1714 */
1715 if (copy_m->busy) {
1716 /*
1717 * If the page is being brought
1718 * in, wait for it and then retry.
1719 */
1720 RELEASE_PAGE(m);
1721
1722 /*
1723 * take an extra ref so object won't die
1724 */
1725 vm_object_reference_locked(copy_object);
1726 vm_object_unlock(copy_object);
1727 vm_fault_cleanup(object, first_m);
1728 counter(c_vm_fault_page_block_backoff_kernel++);
1729
1730 vm_object_lock(copy_object);
1731 assert(copy_object->ref_count > 0);
1732 VM_OBJ_RES_DECR(copy_object);
1733 vm_object_lock_assert_exclusive(copy_object);
1734 copy_object->ref_count--;
1735 assert(copy_object->ref_count > 0);
1736 copy_m = vm_page_lookup(copy_object, copy_offset);
1737 /*
1738 * ENCRYPTED SWAP:
1739 * it's OK if the "copy_m" page is encrypted,
1740 * because we're not moving it nor handling its
1741 * contents.
1742 */
1743 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1744 PAGE_ASSERT_WAIT(copy_m, interruptible);
1745
1746 vm_object_unlock(copy_object);
1747 wait_result = thread_block(THREAD_CONTINUE_NULL);
1748 vm_object_deallocate(copy_object);
1749
1750 goto backoff;
1751 } else {
1752 vm_object_unlock(copy_object);
1753 vm_object_deallocate(copy_object);
1754 thread_interrupt_level(interruptible_state);
1755
1756 return (VM_FAULT_RETRY);
1757 }
1758 }
1759 }
1760 else if (!PAGED_OUT(copy_object, copy_offset)) {
1761 /*
1762 * If PAGED_OUT is TRUE, then the page used to exist
1763 * in the copy-object, and has already been paged out.
1764 * We don't need to repeat this. If PAGED_OUT is
1765 * FALSE, then either we don't know (!pager_created,
1766 * for example) or it hasn't been paged out.
1767 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1768 * We must copy the page to the copy object.
1769 */
1770
1771 if (vm_backing_store_low) {
1772 /*
1773 * we are protecting the system from
1774                          * backing store exhaustion, so
1775 * sleep unless we are privileged.
1776 */
1777 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1778 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1779
1780 RELEASE_PAGE(m);
1781 VM_OBJ_RES_DECR(copy_object);
1782 vm_object_lock_assert_exclusive(copy_object);
1783 copy_object->ref_count--;
1784 assert(copy_object->ref_count > 0);
1785
1786 vm_object_unlock(copy_object);
1787 vm_fault_cleanup(object, first_m);
1788 thread_block(THREAD_CONTINUE_NULL);
1789 thread_interrupt_level(interruptible_state);
1790
1791 return (VM_FAULT_RETRY);
1792 }
1793 }
1794 /*
1795 * Allocate a page for the copy
1796 */
1797 copy_m = vm_page_alloc(copy_object, copy_offset);
1798
1799 if (copy_m == VM_PAGE_NULL) {
1800 RELEASE_PAGE(m);
1801
1802 VM_OBJ_RES_DECR(copy_object);
1803 vm_object_lock_assert_exclusive(copy_object);
1804 copy_object->ref_count--;
1805 assert(copy_object->ref_count > 0);
1806
1807 vm_object_unlock(copy_object);
1808 vm_fault_cleanup(object, first_m);
1809 thread_interrupt_level(interruptible_state);
1810
1811 return (VM_FAULT_MEMORY_SHORTAGE);
1812 }
1813 /*
1814 * Must copy page into copy-object.
1815 */
1816 vm_page_copy(m, copy_m);
1817
1818 /*
1819 * If the old page was in use by any users
1820 * of the copy-object, it must be removed
1821 * from all pmaps. (We can't know which
1822 * pmaps use it.)
1823 */
1824 if (m->pmapped)
1825 pmap_disconnect(m->phys_page);
1826
1827 /*
1828 * If there's a pager, then immediately
1829 * page out this page, using the "initialize"
1830 * option. Else, we use the copy.
1831 */
1832 if ((!copy_object->pager_created)
1833 #if MACH_PAGEMAP
1834 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1835 #endif
1836 ) {
1837
1838 vm_page_lockspin_queues();
1839 assert(!m->cleaning);
1840 vm_page_activate(copy_m);
1841 vm_page_unlock_queues();
1842
1843 copy_m->dirty = TRUE;
1844 PAGE_WAKEUP_DONE(copy_m);
1845 }
1846 else {
1847 assert(copy_m->busy == TRUE);
1848 assert(!m->cleaning);
1849
1850 /*
1851 * dirty is protected by the object lock
1852 */
1853 copy_m->dirty = TRUE;
1854
1855 /*
1856 * The page is already ready for pageout:
1857 * not on pageout queues and busy.
1858 * Unlock everything except the
1859 * copy_object itself.
1860 */
1861 vm_object_unlock(object);
1862
1863 /*
1864 * Write the page to the copy-object,
1865 * flushing it from the kernel.
1866 */
1867 vm_pageout_initialize_page(copy_m);
1868
1869 /*
1870 * Since the pageout may have
1871 * temporarily dropped the
1872 * copy_object's lock, we
1873 * check whether we'll have
1874 * to deallocate the hard way.
1875 */
1876 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1877 vm_object_unlock(copy_object);
1878 vm_object_deallocate(copy_object);
1879 vm_object_lock(object);
1880
1881 continue;
1882 }
1883 /*
1884 * Pick back up the old object's
1885 * lock. [It is safe to do so,
1886 * since it must be deeper in the
1887 * object tree.]
1888 */
1889 vm_object_lock(object);
1890 }
1891 /*
1892 * Because we're pushing a page upward
1893 * in the object tree, we must restart
1894 * any faults that are waiting here.
1895 * [Note that this is an expansion of
1896 * PAGE_WAKEUP that uses the THREAD_RESTART
1897 * wait result]. Can't turn off the page's
1898 * busy bit because we're not done with it.
1899 */
1900 if (m->wanted) {
1901 m->wanted = FALSE;
1902 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1903 }
1904 }
1905 /*
1906 * The reference count on copy_object must be
1907 * at least 2: one for our extra reference,
1908 * and at least one from the outside world
1909 * (we checked that when we last locked
1910 * copy_object).
1911 */
1912 vm_object_lock_assert_exclusive(copy_object);
1913 copy_object->ref_count--;
1914 assert(copy_object->ref_count > 0);
1915
1916 VM_OBJ_RES_DECR(copy_object);
1917 vm_object_unlock(copy_object);
1918
1919 break;
1920 }
1921 *result_page = m;
1922 *top_page = first_m;
1923
1924 XPR(XPR_VM_FAULT,
1925 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1926 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1927
1928 if (m != VM_PAGE_NULL) {
1929 if (my_fault == DBG_PAGEIN_FAULT) {
1930
1931 VM_STAT_INCR(pageins);
1932 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1933 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1934 current_task()->pageins++;
1935
1936 if (m->object->internal) {
1937 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1938 } else {
1939 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1940 }
1941
1942 /*
1943 * evaluate access pattern and update state;
1944 * vm_fault_deactivate_behind depends on the
1945 * state being up to date
1946 */
1947 vm_fault_is_sequential(object, offset, fault_info->behavior);
1948
1949 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1950 }
1951 if (type_of_fault)
1952 *type_of_fault = my_fault;
1953 } else
1954 vm_object_unlock(object);
1955
1956 thread_interrupt_level(interruptible_state);
1957
1958 #if TRACEFAULTPAGE
1959 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1960 #endif
1961 return (VM_FAULT_SUCCESS);
1962
1963 backoff:
1964 thread_interrupt_level(interruptible_state);
1965
1966 if (wait_result == THREAD_INTERRUPTED)
1967 return (VM_FAULT_INTERRUPTED);
1968 return (VM_FAULT_RETRY);
1969
1970 #undef RELEASE_PAGE
1971 }
1972
1973
1974
1975 /*
1976 * CODE SIGNING:
1977 * When soft faulting a page, we have to validate the page if:
1978 * 1. the page is being mapped in user space
1979 * 2. the page hasn't already been found to be "tainted"
1980 * 3. the page belongs to a code-signed object
1981 * 4. the page has not been validated yet or has been mapped for write.
1982 */
1983 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
1984 ((pmap) != kernel_pmap /*1*/ && \
1985 !(page)->cs_tainted /*2*/ && \
1986 (page)->object->code_signed /*3*/ && \
1987 (!(page)->cs_validated || (page)->wpmapped /*4*/))
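/*
 * Illustrative use of the macro above (a sketch of the pattern followed
 * by vm_fault_enter() and the soft-fault path below, not a new call site):
 *
 *	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
 *		vm_object_lock_assert_exclusive(m->object);
 *		vm_page_validate_cs(m);
 *	}
 */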
1988
1989
1990 /*
1991 * page queue lock must NOT be held
1992 * m->object must be locked
1993 *
1994 * NOTE: m->object could be locked "shared" only if we are called
1995 * from vm_fault() as part of a soft fault. If so, we must be
1996 * careful not to modify the VM object in any way that is not
1997 * legal under a shared lock...
1998 */
1999 unsigned long cs_enter_tainted_rejected = 0;
2000 unsigned long cs_enter_tainted_accepted = 0;
2001 kern_return_t
2002 vm_fault_enter(vm_page_t m,
2003 pmap_t pmap,
2004 vm_map_offset_t vaddr,
2005 vm_prot_t prot,
2006 boolean_t wired,
2007 boolean_t change_wiring,
2008 boolean_t no_cache,
2009 int *type_of_fault)
2010 {
2011 unsigned int cache_attr;
2012 kern_return_t kr;
2013 boolean_t previously_pmapped = m->pmapped;
2014
2015 vm_object_lock_assert_held(m->object);
2016 #if DEBUG
2017 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
2018 #endif /* DEBUG */
2019
2020 if (m->phys_page == vm_page_guard_addr) {
2021 assert(m->fictitious);
2022 return KERN_SUCCESS;
2023 }
2024
2025 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2026
2027 if (m->pmapped == FALSE) {
2028 /*
2029 * This is the first time this page is being
2030 * mapped in an address space (pmapped == FALSE).
2031 *
2032 * Part of that page may still be in the data cache
2033 * and not flushed to memory. In case we end up
2034 * accessing that page via the instruction cache,
2035 * we need to ensure that the 2 caches are in sync.
2036 */
2037 pmap_sync_page_data_phys(m->phys_page);
2038
2039 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2040 /*
2041 * found it in the cache, but this
2042 * is the first fault-in of the page (m->pmapped == FALSE)
2043 * so it must have come in as part of
2044 * a cluster... account 1 pagein against it
2045 */
2046 VM_STAT_INCR(pageins);
2047 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2048
2049 if (m->object->internal) {
2050 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2051 } else {
2052 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2053 }
2054
2055 current_task()->pageins++;
2056
2057 *type_of_fault = DBG_PAGEIN_FAULT;
2058 }
2059 VM_PAGE_CONSUME_CLUSTERED(m);
2060
2061 } else if (cache_attr != VM_WIMG_DEFAULT)
2062 pmap_sync_page_attributes_phys(m->phys_page);
2063
2064 if (*type_of_fault != DBG_COW_FAULT) {
2065 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2066
2067 if (pmap == kernel_pmap) {
2068 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2069 }
2070 }
2071
2072 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2073 vm_object_lock_assert_exclusive(m->object);
2074
2075 if (m->cs_validated) {
2076 vm_cs_revalidates++;
2077 }
2078
2079 /* VM map is locked, so 1 ref will remain on VM object */
2080 vm_page_validate_cs(m);
2081 }
2082
2083 if (m->cs_tainted /* always invalidate a tainted page */
2084 #if CONFIG_ENFORCE_SIGNED_CODE
2085 /*
2086 * Code Signing enforcement invalidates an executable page that
2087 * has no code directory, and thus could not be validated.
2088 */
2089 || ((prot & VM_PROT_EXECUTE) && !m->cs_validated )
2090 #endif
2091 ) {
2092 /*
2093 * CODE SIGNING:
2094 * This page has been tainted and cannot be trusted.
2095 * Let's notify the current process and let it take any
2096 * necessary precautions before we enter the tainted page
2097 * into its address space.
2098 */
2099 kr = KERN_SUCCESS;
2100 #if CONFIG_ENFORCE_SIGNED_CODE
2101 if (!cs_enforcement_disable) {
2102 #endif
2103 if (cs_invalid_page((addr64_t) vaddr)) {
2104 /* reject the tainted page: abort the page fault */
2105 kr = KERN_MEMORY_ERROR;
2106 cs_enter_tainted_rejected++;
2107 } else {
2108 /* proceed with the tainted page */
2109 kr = KERN_SUCCESS;
2110 cs_enter_tainted_accepted++;
2111 }
2112 #if CONFIG_ENFORCE_SIGNED_CODE
2113 }
2114 #endif
2115 if (cs_debug || kr != KERN_SUCCESS) {
2116 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2117 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2118 (long long)vaddr, m, m->object, m->offset);
2119 }
2120 } else {
2121 /* proceed with the valid page */
2122 kr = KERN_SUCCESS;
2123 }
2124
2125 if (kr == KERN_SUCCESS) {
2126 /*
2127 * NOTE: we may only hold the vm_object lock SHARED
2128 * at this point, but the update of pmapped is ok
2129 * since this is the ONLY bit updated behind the SHARED
2130 * lock... however, we need to figure out how to do an atomic
2131 * update on a bit field to make this less fragile... right
2132 * now I don't know how to coerce 'C' to give me the offset info
2133 * that's needed for an AtomicCompareAndSwap
2134 */
2135 m->pmapped = TRUE;
2136 if (prot & VM_PROT_WRITE) {
2137 vm_object_lock_assert_exclusive(m->object);
2138 m->wpmapped = TRUE;
2139 }
2140
2141 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2142 }
2143
2144 /*
2145 * Hold queues lock to manipulate
2146 * the page queues. Change wiring
2147 * case is obvious.
2148 */
2149 if (change_wiring) {
2150 vm_page_lockspin_queues();
2151
2152 if (wired) {
2153 if (kr == KERN_SUCCESS) {
2154 vm_page_wire(m);
2155 }
2156 } else {
2157 vm_page_unwire(m);
2158 }
2159 vm_page_unlock_queues();
2160
2161 } else {
2162 if (kr != KERN_SUCCESS) {
2163 vm_page_lock_queues();
2164 vm_page_deactivate(m);
2165 vm_page_unlock_queues();
2166 } else {
2167 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2168 vm_page_lockspin_queues();
2169 /*
2170 * test again now that we hold the page queue lock
2171 */
2172 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2173
2174 /*
2175 * If this is a no_cache mapping and the page has never been
2176 * mapped before or was previously a no_cache page, then we
2177 * want to leave pages in the speculative state so that they
2178 * can be readily recycled if free memory runs low. Otherwise
2179 * the page is activated as normal.
2180 */
2181
2182 if (no_cache && (!previously_pmapped || m->no_cache)) {
2183 m->no_cache = TRUE;
2184
2185 if (m->active || m->inactive)
2186 VM_PAGE_QUEUES_REMOVE(m);
2187
2188 if (!m->speculative)
2189 vm_page_speculate(m, TRUE);
2190
2191 } else if (!m->active && !m->inactive)
2192 vm_page_activate(m);
2193
2194 }
2195
2196 vm_page_unlock_queues();
2197 }
2198 }
2199 }
2200 return kr;
2201 }
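/*
 * Usage sketch for vm_fault_enter() (derived from the preconditions
 * documented above and the call sites in vm_fault() below; shown only
 * as a reminder, not a new call site):
 *
 *	vm_object_lock(m->object);	// a shared lock is enough for soft faults
 *	kr = vm_fault_enter(m, pmap, vaddr, prot,
 *			    wired, change_wiring, fault_info.no_cache,
 *			    &type_of_fault);
 *	vm_object_unlock(m->object);
 */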
2202
2203
2204 /*
2205 * Routine: vm_fault
2206 * Purpose:
2207 * Handle page faults, including pseudo-faults
2208 * used to change the wiring status of pages.
2209 * Returns:
2210 * Explicit continuations have been removed.
2211 * Implementation:
2212 * vm_fault and vm_fault_page save mucho state
2213 * in the moral equivalent of a closure. The state
2214 * structure is allocated when first entering vm_fault
2215 * and deallocated when leaving vm_fault.
2216 */
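/*
 * Typical invocation (a hedged sketch; the real trap-handler call sites
 * live outside this file and may differ):
 *
 *	kr = vm_fault(map, vaddr, VM_PROT_READ,
 *		      FALSE,			// change_wiring
 *		      THREAD_ABORTSAFE,		// interruptible
 *		      NULL, 0);			// caller_pmap, caller_pmap_addr
 */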
2217
2218 extern int _map_enter_debug;
2219
2220 unsigned long vm_fault_collapse_total = 0;
2221 unsigned long vm_fault_collapse_skipped = 0;
2222
2223 kern_return_t
2224 vm_fault(
2225 vm_map_t map,
2226 vm_map_offset_t vaddr,
2227 vm_prot_t fault_type,
2228 boolean_t change_wiring,
2229 int interruptible,
2230 pmap_t caller_pmap,
2231 vm_map_offset_t caller_pmap_addr)
2232 {
2233 vm_map_version_t version; /* Map version for verification */
2234 boolean_t wired; /* Should mapping be wired down? */
2235 vm_object_t object; /* Top-level object */
2236 vm_object_offset_t offset; /* Top-level offset */
2237 vm_prot_t prot; /* Protection for mapping */
2238 vm_object_t old_copy_object; /* Saved copy object */
2239 vm_page_t result_page; /* Result of vm_fault_page */
2240 vm_page_t top_page; /* Placeholder page */
2241 kern_return_t kr;
2242
2243 vm_page_t m; /* Fast access to result_page */
2244 kern_return_t error_code;
2245 vm_object_t cur_object;
2246 vm_object_offset_t cur_offset;
2247 vm_page_t cur_m;
2248 vm_object_t new_object;
2249 int type_of_fault;
2250 pmap_t pmap;
2251 boolean_t interruptible_state;
2252 vm_map_t real_map = map;
2253 vm_map_t original_map = map;
2254 vm_prot_t original_fault_type;
2255 struct vm_object_fault_info fault_info;
2256 boolean_t need_collapse = FALSE;
2257 int object_lock_type = 0;
2258 int cur_object_lock_type;
2259 vm_object_t top_object = VM_OBJECT_NULL;
2260
2261
2262 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2263 (int)((uint64_t)vaddr >> 32),
2264 (int)vaddr,
2265 0,
2266 0,
2267 0);
2268
2269 if (get_preemption_level() != 0) {
2270 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2271 (int)((uint64_t)vaddr >> 32),
2272 (int)vaddr,
2273 KERN_FAILURE,
2274 0,
2275 0);
2276
2277 return (KERN_FAILURE);
2278 }
2279 interruptible_state = thread_interrupt_level(interruptible);
2280
2281 VM_STAT_INCR(faults);
2282 current_task()->faults++;
2283 original_fault_type = fault_type;
2284
2285 if (fault_type & VM_PROT_WRITE)
2286 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2287 else
2288 object_lock_type = OBJECT_LOCK_SHARED;
2289
2290 cur_object_lock_type = OBJECT_LOCK_SHARED;
2291
2292 RetryFault:
2293 /*
2294 * assume we will hit a page in the cache;
2295 * otherwise, explicitly override with
2296 * the real fault type once we determine it
2297 */
2298 type_of_fault = DBG_CACHE_HIT_FAULT;
2299
2300 /*
2301 * Find the backing store object and offset into
2302 * it to begin the search.
2303 */
2304 fault_type = original_fault_type;
2305 map = original_map;
2306 vm_map_lock_read(map);
2307
2308 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2309 object_lock_type, &version,
2310 &object, &offset, &prot, &wired,
2311 &fault_info,
2312 &real_map);
2313
2314 if (kr != KERN_SUCCESS) {
2315 vm_map_unlock_read(map);
2316 goto done;
2317 }
2318 pmap = real_map->pmap;
2319 fault_info.interruptible = interruptible;
2320
2321 /*
2322 * If the page is wired, we must fault for the current protection
2323 * value, to avoid further faults.
2324 */
2325 if (wired) {
2326 fault_type = prot | VM_PROT_WRITE;
2327 /*
2328 * since we're treating this fault as a 'write'
2329 * we must hold the top object lock exclusively
2330 */
2331 if (object_lock_type == OBJECT_LOCK_SHARED) {
2332
2333 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2334
2335 if (vm_object_lock_upgrade(object) == FALSE) {
2336 /*
2337 * couldn't upgrade, so explicitly
2338 * take the lock exclusively
2339 */
2340 vm_object_lock(object);
2341 }
2342 }
2343 }
2344
2345 #if VM_FAULT_CLASSIFY
2346 /*
2347 * Temporary data gathering code
2348 */
2349 vm_fault_classify(object, offset, fault_type);
2350 #endif
2351 /*
2352 * Fast fault code. The basic idea is to do as much as
2353 * possible while holding the map lock and object locks.
2354 * Busy pages are not used until the object lock has to
2355 * be dropped to do something (copy, zero fill, pmap enter).
2356 * Similarly, paging references aren't acquired until that
2357 * point, and object references aren't used.
2358 *
2359 * If we can figure out what to do
2360 * (zero fill, copy on write, pmap enter) while holding
2361 * the locks, then it gets done. Otherwise, we give up,
2362 * and use the original fault path (which doesn't hold
2363 * the map lock, and relies on busy pages).
2364 * The give up cases include:
2365 * - Have to talk to pager.
2366 * - Page is busy, absent or in error.
2367 * - Pager has locked out desired access.
2368 * - Fault needs to be restarted.
2369 * - Have to push page into copy object.
2370 *
2371 * The code is an infinite loop that moves one level down
2372 * the shadow chain each time. cur_object and cur_offset
2373 * refer to the current object being examined. object and offset
2374 * are the original object from the map. The loop is at the
2375 * top level if and only if object and cur_object are the same.
2376 *
2377 * Invariants: Map lock is held throughout. Lock is held on
2378 * original object and cur_object (if different) when
2379 * continuing or exiting loop.
2380 *
2381 */
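/*
 * Schematic of the fast path below (a comment-only sketch of the loop
 * that follows; no additional logic):
 *
 *	cur_object = object; cur_offset = offset;
 *	while (TRUE) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m != VM_PAGE_NULL) {
 *			// busy, guard, unusual or encrypted pages:
 *			//   wait/retry, or "break" to the slow path
 *			// read fault, or no copy object:
 *			//   goto FastPmapEnter
 *			// write fault with cur_object != object:
 *			//   copy the page up into "object", then FastPmapEnter
 *		} else if (pager has no data and there is no shadow) {
 *			// zero-fill a fresh page in "object", then FastPmapEnter
 *		} else {
 *			// descend one level down the shadow chain
 *		}
 *	}
 */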
2382
2383
2384 /*
2385 * If this page is to be inserted in a copy delay object
2386 * for writing, and if the object has a copy, then the
2387 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
2388 */
2389 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2390 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2391 goto handle_copy_delay;
2392
2393 cur_object = object;
2394 cur_offset = offset;
2395
2396 while (TRUE) {
2397 m = vm_page_lookup(cur_object, cur_offset);
2398
2399 if (m != VM_PAGE_NULL) {
2400 if (m->busy) {
2401 wait_result_t result;
2402
2403 /*
2404 * in order to do the PAGE_ASSERT_WAIT, we must
2405 * have the object that 'm' belongs to locked exclusively
2406 */
2407 if (object != cur_object) {
2408 vm_object_unlock(object);
2409
2410 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2411
2412 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2413
2414 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2415 /*
2416 * couldn't upgrade so go do a full retry
2417 * immediately since we've already dropped
2418 * the top object lock associated with this page
2419 * and the current one got dropped due to the
2420 * failed upgrade... the state is no longer valid
2421 */
2422 vm_map_unlock_read(map);
2423 if (real_map != map)
2424 vm_map_unlock(real_map);
2425
2426 goto RetryFault;
2427 }
2428 }
2429 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2430
2431 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2432
2433 if (vm_object_lock_upgrade(object) == FALSE) {
2434 /*
2435 * couldn't upgrade, so explicitly take the lock
2436 * exclusively and go relookup the page since we
2437 * will have dropped the object lock and
2438 * a different thread could have inserted
2439 * a page at this offset
2440 * no need for a full retry since we're
2441 * at the top level of the object chain
2442 */
2443 vm_object_lock(object);
2444
2445 continue;
2446 }
2447 }
2448 vm_map_unlock_read(map);
2449 if (real_map != map)
2450 vm_map_unlock(real_map);
2451
2452 result = PAGE_ASSERT_WAIT(m, interruptible);
2453
2454 vm_object_unlock(cur_object);
2455
2456 if (result == THREAD_WAITING) {
2457 result = thread_block(THREAD_CONTINUE_NULL);
2458
2459 counter(c_vm_fault_page_block_busy_kernel++);
2460 }
2461 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2462 goto RetryFault;
2463
2464 kr = KERN_ABORTED;
2465 goto done;
2466 }
2467 if (m->phys_page == vm_page_guard_addr) {
2468 /*
2469 * Guard page: let the slow path deal with it
2470 */
2471 break;
2472 }
2473 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2474 /*
2475 * Unusual case... let the slow path deal with it
2476 */
2477 break;
2478 }
2479 if (m->encrypted) {
2480 /*
2481 * ENCRYPTED SWAP:
2482 * We've soft-faulted (because it's not in the page
2483 * table) on an encrypted page.
2484 * Keep the page "busy" so that no one messes with
2485 * it during the decryption.
2486 * Release the extra locks we're holding, keep only
2487 * the page's VM object lock.
2488 *
2489 * in order to set 'busy' on 'm', we must
2490 * have the object that 'm' belongs to locked exclusively
2491 */
2492 if (object != cur_object) {
2493 vm_object_unlock(object);
2494
2495 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2496
2497 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2498
2499 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2500 /*
2501 * couldn't upgrade so go do a full retry
2502 * immediately since we've already dropped
2503 * the top object lock associated with this page
2504 * and the current one got dropped due to the
2505 * failed upgrade... the state is no longer valid
2506 */
2507 vm_map_unlock_read(map);
2508 if (real_map != map)
2509 vm_map_unlock(real_map);
2510
2511 goto RetryFault;
2512 }
2513 }
2514 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2515
2516 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2517
2518 if (vm_object_lock_upgrade(object) == FALSE) {
2519 /*
2520 * couldn't upgrade, so explicitly take the lock
2521 * exclusively and go relookup the page since we
2522 * will have dropped the object lock and
2523 * a different thread could have inserted
2524 * a page at this offset
2525 * no need for a full retry since we're
2526 * at the top level of the object chain
2527 */
2528 vm_object_lock(object);
2529
2530 continue;
2531 }
2532 }
2533 m->busy = TRUE;
2534
2535 vm_map_unlock_read(map);
2536 if (real_map != map)
2537 vm_map_unlock(real_map);
2538
2539 vm_page_decrypt(m, 0);
2540
2541 assert(m->busy);
2542 PAGE_WAKEUP_DONE(m);
2543
2544 vm_object_unlock(cur_object);
2545 /*
2546 * Retry from the top, in case anything
2547 * changed while we were decrypting...
2548 */
2549 goto RetryFault;
2550 }
2551 ASSERT_PAGE_DECRYPTED(m);
2552
2553 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
2554 /*
2555 * We might need to validate this page
2556 * against its code signature, so we
2557 * want to hold the VM object exclusively.
2558 */
2559 if (object != cur_object) {
2560 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2561 vm_object_unlock(object);
2562 vm_object_unlock(cur_object);
2563
2564 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2565
2566 vm_map_unlock_read(map);
2567 if (real_map != map)
2568 vm_map_unlock(real_map);
2569
2570 goto RetryFault;
2571 }
2572
2573 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2574
2575 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2576
2577 if (vm_object_lock_upgrade(object) == FALSE) {
2578 /*
2579 * couldn't upgrade, so explicitly take the lock
2580 * exclusively and go relookup the page since we
2581 * will have dropped the object lock and
2582 * a different thread could have inserted
2583 * a page at this offset
2584 * no need for a full retry since we're
2585 * at the top level of the object chain
2586 */
2587 vm_object_lock(object);
2588
2589 continue;
2590 }
2591 }
2592 }
2593 /*
2594 * Two cases of map-in faults:
2595 * - At top level w/o copy object.
2596 * - Read fault anywhere.
2597 * --> must disallow write.
2598 */
2599
2600 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2601 if ((fault_type & VM_PROT_WRITE) == 0) {
2602 /*
2603 * This is not a "write" fault, so we
2604 * might not have taken the object lock
2605 * exclusively and we might not be able
2606 * to update the "wpmapped" bit in
2607 * vm_fault_enter().
2608 * Let's just grant read access to
2609 * the page for now and we'll
2610 * soft-fault again if we need write
2611 * access later...
2612 */
2613 prot &= ~VM_PROT_WRITE;
2614 }
2615 goto FastPmapEnter;
2616 }
2617
2618 if ((fault_type & VM_PROT_WRITE) == 0) {
2619
2620 prot &= ~VM_PROT_WRITE;
2621
2622 if (object != cur_object) {
2623 /*
2624 * We still need to hold the top object
2625 * lock here to prevent a race between
2626 * a read fault (taking only "shared"
2627 * locks) and a write fault (taking
2628 * an "exclusive" lock on the top
2629 * object).
2630 * Otherwise, as soon as we release the
2631 * top lock, the write fault could
2632 * proceed and actually complete before
2633 * the read fault, and the copied page's
2634 * translation could then be overwritten
2635 * by the read fault's translation for
2636 * the original page.
2637 *
2638 * Let's just record what the top object
2639 * is and we'll release it later.
2640 */
2641 top_object = object;
2642
2643 /*
2644 * switch to the object that has the new page
2645 */
2646 object = cur_object;
2647 object_lock_type = cur_object_lock_type;
2648 }
2649 FastPmapEnter:
2650 /*
2651 * prepare for the pmap_enter...
2652 * object and map are both locked
2653 * m contains valid data
2654 * object == m->object
2655 * cur_object == NULL or it's been unlocked
2656 * no paging references on either object or cur_object
2657 */
2658 #if MACH_KDB
2659 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2660 prot &= ~VM_PROT_WRITE;
2661 #endif
2662 if (caller_pmap) {
2663 kr = vm_fault_enter(m,
2664 caller_pmap,
2665 caller_pmap_addr,
2666 prot,
2667 wired,
2668 change_wiring,
2669 fault_info.no_cache,
2670 &type_of_fault);
2671 } else {
2672 kr = vm_fault_enter(m,
2673 pmap,
2674 vaddr,
2675 prot,
2676 wired,
2677 change_wiring,
2678 fault_info.no_cache,
2679 &type_of_fault);
2680 }
2681
2682 if (top_object != VM_OBJECT_NULL) {
2683 /*
2684 * It's safe to drop the top object
2685 * now that we've done our
2686 * vm_fault_enter(). Any other fault
2687 * in progress for that virtual
2688 * address will either find our page
2689 * and translation or put in a new page
2690 * and translation.
2691 */
2692 vm_object_unlock(top_object);
2693 top_object = VM_OBJECT_NULL;
2694 }
2695
2696 if (need_collapse == TRUE)
2697 vm_object_collapse(object, offset, TRUE);
2698
2699 if (type_of_fault == DBG_PAGEIN_FAULT) {
2700 /*
2701 * evaluate access pattern and update state;
2702 * vm_fault_deactivate_behind depends on the
2703 * state being up to date
2704 */
2705 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2706
2707 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2708 }
2709 /*
2710 * That's it, clean up and return.
2711 */
2712 if (m->busy)
2713 PAGE_WAKEUP_DONE(m);
2714
2715 vm_object_unlock(object);
2716
2717 vm_map_unlock_read(map);
2718 if (real_map != map)
2719 vm_map_unlock(real_map);
2720
2721 goto done;
2722 }
2723 /*
2724 * COPY ON WRITE FAULT
2725 *
2726 * If objects match, then
2727 * object->copy must not be NULL (else control
2728 * would be in previous code block), and we
2729 * have a potential push into the copy object
2730 * which we can't cope with here.
2731 */
2732 if (cur_object == object) {
2733 /*
2734 * must take the slow path to
2735 * deal with the copy push
2736 */
2737 break;
2738 }
2739 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2740
2741 /*
2742 * This is now a shadow based copy on write
2743 * fault -- it requires a copy up the shadow
2744 * chain.
2745 *
2746 * Allocate a page in the original top level
2747 * object. Give up if allocate fails. Also
2748 * need to remember current page, as it's the
2749 * source of the copy.
2750 *
2751 * at this point we hold locks on both
2752 * object and cur_object... no need to take
2753 * paging refs or mark pages BUSY since
2754 * we don't drop either object lock until
2755 * the page has been copied and inserted
2756 */
2757 cur_m = m;
2758 m = vm_page_grab();
2759
2760 if (m == VM_PAGE_NULL) {
2761 /*
2762 * no free page currently available...
2763 * must take the slow path
2764 */
2765 break;
2766 }
2767 /*
2768 * Now do the copy. Mark the source page busy...
2769 *
2770 * NOTE: This code holds the map lock across
2771 * the page copy.
2772 */
2773 vm_page_copy(cur_m, m);
2774 vm_page_insert(m, object, offset);
2775 m->dirty = TRUE;
2776
2777 /*
2778 * Now cope with the source page and object
2779 */
2780 if (object->ref_count > 1 && cur_m->pmapped)
2781 pmap_disconnect(cur_m->phys_page);
2782
2783 need_collapse = TRUE;
2784
2785 if (!cur_object->internal &&
2786 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2787 /*
2788 * The object from which we've just
2789 * copied a page is most probably backed
2790 * by a vnode. We don't want to waste too
2791 * much time trying to collapse the VM objects
2792 * and create a bottleneck when several tasks
2793 * map the same file.
2794 */
2795 if (cur_object->copy == object) {
2796 /*
2797 * Shared mapping or no COW yet.
2798 * We can never collapse a copy
2799 * object into its backing object.
2800 */
2801 need_collapse = FALSE;
2802 } else if (cur_object->copy == object->shadow &&
2803 object->shadow->resident_page_count == 0) {
2804 /*
2805 * Shared mapping after a COW occurred.
2806 */
2807 need_collapse = FALSE;
2808 }
2809 }
2810 vm_object_unlock(cur_object);
2811
2812 if (need_collapse == FALSE)
2813 vm_fault_collapse_skipped++;
2814 vm_fault_collapse_total++;
2815
2816 type_of_fault = DBG_COW_FAULT;
2817 VM_STAT_INCR(cow_faults);
2818 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2819 current_task()->cow_faults++;
2820
2821 goto FastPmapEnter;
2822
2823 } else {
2824 /*
2825 * No page at cur_object, cur_offset... m == NULL
2826 */
2827 if (cur_object->pager_created) {
2828 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2829 /*
2830 * May have to talk to a pager...
2831 * take the slow path.
2832 */
2833 break;
2834 }
2835 /*
2836 * existence map present and indicates
2837 * that the pager doesn't have this page
2838 */
2839 }
2840 if (cur_object->shadow == VM_OBJECT_NULL) {
2841 /*
2842 * Zero fill fault. Page gets
2843 * inserted into the original object.
2844 */
2845 if (cur_object->shadow_severed) {
2846
2847 if (object != cur_object)
2848 vm_object_unlock(cur_object);
2849 vm_object_unlock(object);
2850
2851 vm_map_unlock_read(map);
2852 if (real_map != map)
2853 vm_map_unlock(real_map);
2854
2855 kr = KERN_MEMORY_ERROR;
2856 goto done;
2857 }
2858 if (VM_PAGE_ZFILL_THROTTLED()) {
2859 /*
2860 * drop all of our locks...
2861 * wait until the free queue is
2862 * pumped back up and then
2863 * redrive the fault
2864 */
2865 if (object != cur_object)
2866 vm_object_unlock(cur_object);
2867 vm_object_unlock(object);
2868 vm_map_unlock_read(map);
2869 if (real_map != map)
2870 vm_map_unlock(real_map);
2871
2872 if (vm_page_wait((change_wiring) ?
2873 THREAD_UNINT :
2874 THREAD_ABORTSAFE))
2875 goto RetryFault;
2876
2877 kr = KERN_ABORTED;
2878 goto done;
2879 }
2880 if (vm_backing_store_low) {
2881 /*
2882 * we are protecting the system from
2883 * backing store exhaustion...
2884 * must take the slow path if we're
2885 * not privileged
2886 */
2887 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2888 break;
2889 }
2890 if (cur_object != object) {
2891 vm_object_unlock(cur_object);
2892
2893 cur_object = object;
2894 }
2895 if (object_lock_type == OBJECT_LOCK_SHARED) {
2896
2897 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2898
2899 if (vm_object_lock_upgrade(object) == FALSE) {
2900 /*
2901 * couldn't upgrade so do a full retry on the fault
2902 * since we dropped the object lock which
2903 * could allow another thread to insert
2904 * a page at this offset
2905 */
2906 vm_map_unlock_read(map);
2907 if (real_map != map)
2908 vm_map_unlock(real_map);
2909
2910 goto RetryFault;
2911 }
2912 }
2913 m = vm_page_alloc(object, offset);
2914
2915 if (m == VM_PAGE_NULL) {
2916 /*
2917 * no free page currently available...
2918 * must take the slow path
2919 */
2920 break;
2921 }
2922
2923 /*
2924 * Now zero fill page...
2925 * the page is probably going to
2926 * be written soon, so don't bother
2927 * to clear the modified bit
2928 *
2929 * NOTE: This code holds the map
2930 * lock across the zero fill.
2931 */
2932 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2933
2934 goto FastPmapEnter;
2935 }
2936 /*
2937 * On to the next level in the shadow chain
2938 */
2939 cur_offset += cur_object->shadow_offset;
2940 new_object = cur_object->shadow;
2941
2942 /*
2943 * take the new_object's lock with the indicated state
2944 */
2945 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2946 vm_object_lock_shared(new_object);
2947 else
2948 vm_object_lock(new_object);
2949
2950 if (cur_object != object)
2951 vm_object_unlock(cur_object);
2952
2953 cur_object = new_object;
2954
2955 continue;
2956 }
2957 }
2958 /*
2959 * Cleanup from fast fault failure. Drop any object
2960 * lock other than original and drop map lock.
2961 */
2962 if (object != cur_object)
2963 vm_object_unlock(cur_object);
2964
2965 /*
2966 * must own the object lock exclusively at this point
2967 */
2968 if (object_lock_type == OBJECT_LOCK_SHARED) {
2969 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2970
2971 if (vm_object_lock_upgrade(object) == FALSE) {
2972 /*
2973 * couldn't upgrade, so explicitly
2974 * take the lock exclusively
2975 * no need to retry the fault at this
2976 * point since "vm_fault_page" will
2977 * completely re-evaluate the state
2978 */
2979 vm_object_lock(object);
2980 }
2981 }
2982
2983 handle_copy_delay:
2984 vm_map_unlock_read(map);
2985 if (real_map != map)
2986 vm_map_unlock(real_map);
2987
2988 /*
2989 * Make a reference to this object to
2990 * prevent its disposal while we are messing with
2991 * it. Once we have the reference, the map is free
2992 * to be diddled. Since objects reference their
2993 * shadows (and copies), they will stay around as well.
2994 */
2995 vm_object_reference_locked(object);
2996 vm_object_paging_begin(object);
2997
2998 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2999
3000 error_code = 0;
3001
3002 kr = vm_fault_page(object, offset, fault_type,
3003 (change_wiring && !wired),
3004 &prot, &result_page, &top_page,
3005 &type_of_fault,
3006 &error_code, map->no_zero_fill,
3007 FALSE, &fault_info);
3008
3009 /*
3010 * if kr != VM_FAULT_SUCCESS, then the paging reference
3011 * has been dropped and the object unlocked... the ref_count
3012 * is still held
3013 *
3014 * if kr == VM_FAULT_SUCCESS, then the paging reference
3015 * is still held along with the ref_count on the original object
3016 *
3017 * if m != NULL, then the object it belongs to
3018 * is returned locked with a paging reference
3019 *
3020 * if top_page != NULL, then it's BUSY and the
3021 * object it belongs to has a paging reference
3022 * but is returned unlocked
3023 */
3024 if (kr != VM_FAULT_SUCCESS) {
3025 /*
3026 * we didn't succeed, lose the object reference immediately.
3027 */
3028 vm_object_deallocate(object);
3029
3030 /*
3031 * See why we failed, and take corrective action.
3032 */
3033 switch (kr) {
3034 case VM_FAULT_MEMORY_SHORTAGE:
3035 if (vm_page_wait((change_wiring) ?
3036 THREAD_UNINT :
3037 THREAD_ABORTSAFE))
3038 goto RetryFault;
3039 /*
3040 * fall thru
3041 */
3042 case VM_FAULT_INTERRUPTED:
3043 kr = KERN_ABORTED;
3044 goto done;
3045 case VM_FAULT_RETRY:
3046 goto RetryFault;
3047 case VM_FAULT_MEMORY_ERROR:
3048 if (error_code)
3049 kr = error_code;
3050 else
3051 kr = KERN_MEMORY_ERROR;
3052 goto done;
3053 }
3054 }
3055 m = result_page;
3056
3057 if (m != VM_PAGE_NULL) {
3058 assert((change_wiring && !wired) ?
3059 (top_page == VM_PAGE_NULL) :
3060 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3061 }
3062
3063 /*
3064 * What to do with the resulting page from vm_fault_page
3065 * if it doesn't get entered into the physical map:
3066 */
3067 #define RELEASE_PAGE(m) \
3068 MACRO_BEGIN \
3069 PAGE_WAKEUP_DONE(m); \
3070 vm_page_lockspin_queues(); \
3071 if (!m->active && !m->inactive && !m->throttled)\
3072 vm_page_activate(m); \
3073 vm_page_unlock_queues(); \
3074 MACRO_END
3075
3076 /*
3077 * We must verify that the maps have not changed
3078 * since our last lookup.
3079 */
3080 if (m != VM_PAGE_NULL) {
3081 old_copy_object = m->object->copy;
3082 vm_object_unlock(m->object);
3083 } else
3084 old_copy_object = VM_OBJECT_NULL;
3085
3086 /*
3087 * no object locks are held at this point
3088 */
3089 if ((map != original_map) || !vm_map_verify(map, &version)) {
3090 vm_object_t retry_object;
3091 vm_object_offset_t retry_offset;
3092 vm_prot_t retry_prot;
3093
3094 /*
3095 * To avoid trying to write_lock the map while another
3096 * thread has it read_locked (in vm_map_pageable), we
3097 * do not try for write permission. If the page is
3098 * still writable, we will get write permission. If it
3099 * is not, or has been marked needs_copy, we enter the
3100 * mapping without write permission, and will merely
3101 * take another fault.
3102 */
3103 map = original_map;
3104 vm_map_lock_read(map);
3105
3106 kr = vm_map_lookup_locked(&map, vaddr,
3107 fault_type & ~VM_PROT_WRITE,
3108 OBJECT_LOCK_EXCLUSIVE, &version,
3109 &retry_object, &retry_offset, &retry_prot,
3110 &wired,
3111 &fault_info,
3112 &real_map);
3113 pmap = real_map->pmap;
3114
3115 if (kr != KERN_SUCCESS) {
3116 vm_map_unlock_read(map);
3117
3118 if (m != VM_PAGE_NULL) {
3119 /*
3120 * retake the lock so that
3121 * we can drop the paging reference
3122 * in vm_fault_cleanup and do the
3123 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3124 */
3125 vm_object_lock(m->object);
3126
3127 RELEASE_PAGE(m);
3128
3129 vm_fault_cleanup(m->object, top_page);
3130 } else {
3131 /*
3132 * retake the lock so that
3133 * we can drop the paging reference
3134 * in vm_fault_cleanup
3135 */
3136 vm_object_lock(object);
3137
3138 vm_fault_cleanup(object, top_page);
3139 }
3140 vm_object_deallocate(object);
3141
3142 goto done;
3143 }
3144 vm_object_unlock(retry_object);
3145
3146 if ((retry_object != object) || (retry_offset != offset)) {
3147
3148 vm_map_unlock_read(map);
3149 if (real_map != map)
3150 vm_map_unlock(real_map);
3151
3152 if (m != VM_PAGE_NULL) {
3153 /*
3154 * retake the lock so that
3155 * we can drop the paging reference
3156 * in vm_fault_cleanup and do the
3157 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3158 */
3159 vm_object_lock(m->object);
3160
3161 RELEASE_PAGE(m);
3162
3163 vm_fault_cleanup(m->object, top_page);
3164 } else {
3165 /*
3166 * retake the lock so that
3167 * we can drop the paging reference
3168 * in vm_fault_cleanup
3169 */
3170 vm_object_lock(object);
3171
3172 vm_fault_cleanup(object, top_page);
3173 }
3174 vm_object_deallocate(object);
3175
3176 goto RetryFault;
3177 }
3178 /*
3179 * Check whether the protection has changed or the object
3180 * has been copied while we left the map unlocked.
3181 */
3182 prot &= retry_prot;
3183 }
3184 if (m != VM_PAGE_NULL) {
3185 vm_object_lock(m->object);
3186
3187 if (m->object->copy != old_copy_object) {
3188 /*
3189 * The copy object changed while the top-level object
3190 * was unlocked, so take away write permission.
3191 */
3192 prot &= ~VM_PROT_WRITE;
3193 }
3194 } else
3195 vm_object_lock(object);
3196
3197 /*
3198 * If we want to wire down this page, but no longer have
3199 * adequate permissions, we must start all over.
3200 */
3201 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3202
3203 vm_map_verify_done(map, &version);
3204 if (real_map != map)
3205 vm_map_unlock(real_map);
3206
3207 if (m != VM_PAGE_NULL) {
3208 RELEASE_PAGE(m);
3209
3210 vm_fault_cleanup(m->object, top_page);
3211 } else
3212 vm_fault_cleanup(object, top_page);
3213
3214 vm_object_deallocate(object);
3215
3216 goto RetryFault;
3217 }
3218 if (m != VM_PAGE_NULL) {
3219 /*
3220 * Put this page into the physical map.
3221 * We had to do the unlock above because pmap_enter
3222 * may cause other faults. The page may be on
3223 * the pageout queues. If the pageout daemon comes
3224 * across the page, it will remove it from the queues.
3225 */
3226 if (caller_pmap) {
3227 kr = vm_fault_enter(m,
3228 caller_pmap,
3229 caller_pmap_addr,
3230 prot,
3231 wired,
3232 change_wiring,
3233 fault_info.no_cache,
3234 &type_of_fault);
3235 } else {
3236 kr = vm_fault_enter(m,
3237 pmap,
3238 vaddr,
3239 prot,
3240 wired,
3241 change_wiring,
3242 fault_info.no_cache,
3243 &type_of_fault);
3244 }
3245 if (kr != KERN_SUCCESS) {
3246 /* abort this page fault */
3247 vm_map_verify_done(map, &version);
3248 if (real_map != map)
3249 vm_map_unlock(real_map);
3250 PAGE_WAKEUP_DONE(m);
3251 vm_fault_cleanup(m->object, top_page);
3252 vm_object_deallocate(object);
3253 goto done;
3254 }
3255 } else {
3256
3257 vm_map_entry_t entry;
3258 vm_map_offset_t laddr;
3259 vm_map_offset_t ldelta, hdelta;
3260
3261 /*
3262 * do a pmap block mapping from the physical address
3263 * in the object
3264 */
3265
3266 #ifdef ppc
3267 /* While we do not worry about execution protection in */
3268 /* general, certain pages may have instruction execution */
3269 /* disallowed. We will check here, and if not allowed */
3270 /* to execute, we return with a protection failure. */
3271
3272 if ((fault_type & VM_PROT_EXECUTE) &&
3273 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3274
3275 vm_map_verify_done(map, &version);
3276
3277 if (real_map != map)
3278 vm_map_unlock(real_map);
3279
3280 vm_fault_cleanup(object, top_page);
3281 vm_object_deallocate(object);
3282
3283 kr = KERN_PROTECTION_FAILURE;
3284 goto done;
3285 }
3286 #endif /* ppc */
3287
3288 if (real_map != map)
3289 vm_map_unlock(real_map);
3290
3291 if (original_map != map) {
3292 vm_map_unlock_read(map);
3293 vm_map_lock_read(original_map);
3294 map = original_map;
3295 }
3296 real_map = map;
3297
3298 laddr = vaddr;
3299 hdelta = 0xFFFFF000;
3300 ldelta = 0xFFFFF000;
3301
3302 while (vm_map_lookup_entry(map, laddr, &entry)) {
3303 if (ldelta > (laddr - entry->vme_start))
3304 ldelta = laddr - entry->vme_start;
3305 if (hdelta > (entry->vme_end - laddr))
3306 hdelta = entry->vme_end - laddr;
3307 if (entry->is_sub_map) {
3308
3309 laddr = (laddr - entry->vme_start)
3310 + entry->offset;
3311 vm_map_lock_read(entry->object.sub_map);
3312
3313 if (map != real_map)
3314 vm_map_unlock_read(map);
3315 if (entry->use_pmap) {
3316 vm_map_unlock_read(real_map);
3317 real_map = entry->object.sub_map;
3318 }
3319 map = entry->object.sub_map;
3320
3321 } else {
3322 break;
3323 }
3324 }
3325
3326 if (vm_map_lookup_entry(map, laddr, &entry) &&
3327 (entry->object.vm_object != NULL) &&
3328 (entry->object.vm_object == object)) {
3329
3330 if (caller_pmap) {
3331 /*
3332 * Set up a block mapped area
3333 */
3334 pmap_map_block(caller_pmap,
3335 (addr64_t)(caller_pmap_addr - ldelta),
3336 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3337 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3338 ((ldelta + hdelta) >> 12), prot,
3339 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3340 } else {
3341 /*
3342 * Set up a block mapped area
3343 */
3344 pmap_map_block(real_map->pmap,
3345 (addr64_t)(vaddr - ldelta),
3346 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3347 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3348 ((ldelta + hdelta) >> 12), prot,
3349 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3350 }
3351 }
3352 }
3353
3354 /*
3355 * Unlock everything, and return
3356 */
3357 vm_map_verify_done(map, &version);
3358 if (real_map != map)
3359 vm_map_unlock(real_map);
3360
3361 if (m != VM_PAGE_NULL) {
3362 PAGE_WAKEUP_DONE(m);
3363
3364 vm_fault_cleanup(m->object, top_page);
3365 } else
3366 vm_fault_cleanup(object, top_page);
3367
3368 vm_object_deallocate(object);
3369
3370 #undef RELEASE_PAGE
3371
3372 kr = KERN_SUCCESS;
3373 done:
3374 thread_interrupt_level(interruptible_state);
3375
3376 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3377 (int)((uint64_t)vaddr >> 32),
3378 (int)vaddr,
3379 kr,
3380 type_of_fault,
3381 0);
3382
3383 return (kr);
3384 }
3385
3386 /*
3387 * vm_fault_wire:
3388 *
3389 * Wire down a range of virtual addresses in a map.
3390 */
3391 kern_return_t
3392 vm_fault_wire(
3393 vm_map_t map,
3394 vm_map_entry_t entry,
3395 pmap_t pmap,
3396 vm_map_offset_t pmap_addr)
3397 {
3398
3399 register vm_map_offset_t va;
3400 register vm_map_offset_t end_addr = entry->vme_end;
3401 register kern_return_t rc;
3402
3403 assert(entry->in_transition);
3404
3405 if ((entry->object.vm_object != NULL) &&
3406 !entry->is_sub_map &&
3407 entry->object.vm_object->phys_contiguous) {
3408 return KERN_SUCCESS;
3409 }
3410
3411 /*
3412 * Inform the physical mapping system that the
3413 * range of addresses may not fault, so that
3414 * page tables and such can be locked down as well.
3415 */
3416
3417 pmap_pageable(pmap, pmap_addr,
3418 pmap_addr + (end_addr - entry->vme_start), FALSE);
3419
3420 /*
3421 * We simulate a fault to get the page and enter it
3422 * in the physical map.
3423 */
3424
3425 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3426 if ((rc = vm_fault_wire_fast(
3427 map, va, entry, pmap,
3428 pmap_addr + (va - entry->vme_start)
3429 )) != KERN_SUCCESS) {
3430 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3431 (pmap == kernel_pmap) ?
3432 THREAD_UNINT : THREAD_ABORTSAFE,
3433 pmap, pmap_addr + (va - entry->vme_start));
3434 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3435 }
3436
3437 if (rc != KERN_SUCCESS) {
3438 struct vm_map_entry tmp_entry = *entry;
3439
3440 /* unwire wired pages */
3441 tmp_entry.vme_end = va;
3442 vm_fault_unwire(map,
3443 &tmp_entry, FALSE, pmap, pmap_addr);
3444
3445 return rc;
3446 }
3447 }
3448 return KERN_SUCCESS;
3449 }
3450
3451 /*
3452 * vm_fault_unwire:
3453 *
3454 * Unwire a range of virtual addresses in a map.
3455 */
3456 void
3457 vm_fault_unwire(
3458 vm_map_t map,
3459 vm_map_entry_t entry,
3460 boolean_t deallocate,
3461 pmap_t pmap,
3462 vm_map_offset_t pmap_addr)
3463 {
3464 register vm_map_offset_t va;
3465 register vm_map_offset_t end_addr = entry->vme_end;
3466 vm_object_t object;
3467 struct vm_object_fault_info fault_info;
3468
3469 object = (entry->is_sub_map)
3470 ? VM_OBJECT_NULL : entry->object.vm_object;
3471
3472 /*
3473 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3474 * do anything since such memory is wired by default. So we don't have
3475 * anything to undo here.
3476 */
3477
3478 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3479 return;
3480
3481 fault_info.interruptible = THREAD_UNINT;
3482 fault_info.behavior = entry->behavior;
3483 fault_info.user_tag = entry->alias;
3484 fault_info.lo_offset = entry->offset;
3485 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3486 fault_info.no_cache = entry->no_cache;
3487
3488 /*
3489 * Since the pages are wired down, we must be able to
3490 * get their mappings from the physical map system.
3491 */
3492
3493 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3494
3495 if (object == VM_OBJECT_NULL) {
3496 if (pmap) {
3497 pmap_change_wiring(pmap,
3498 pmap_addr + (va - entry->vme_start), FALSE);
3499 }
3500 (void) vm_fault(map, va, VM_PROT_NONE,
3501 TRUE, THREAD_UNINT, pmap, pmap_addr);
3502 } else {
3503 vm_prot_t prot;
3504 vm_page_t result_page;
3505 vm_page_t top_page;
3506 vm_object_t result_object;
3507 vm_fault_return_t result;
3508
3509 fault_info.cluster_size = end_addr - va;
3510
3511 do {
3512 prot = VM_PROT_NONE;
3513
3514 vm_object_lock(object);
3515 vm_object_paging_begin(object);
3516 XPR(XPR_VM_FAULT,
3517 "vm_fault_unwire -> vm_fault_page\n",
3518 0,0,0,0,0);
3519 result = vm_fault_page(
3520 object,
3521 entry->offset + (va - entry->vme_start),
3522 VM_PROT_NONE, TRUE,
3523 &prot, &result_page, &top_page,
3524 (int *)0,
3525 NULL, map->no_zero_fill,
3526 FALSE, &fault_info);
3527 } while (result == VM_FAULT_RETRY);
3528
3529 /*
3530 * If this was a mapping to a file on a device that has been forcibly
3531 * unmounted, then we won't get a page back from vm_fault_page(). Just
3532 * move on to the next one in case the remaining pages are mapped from
3533 * different objects. During a forced unmount, the object is terminated
3534 * so the alive flag will be false if this happens. A forced unmount
3535 * will occur when an external disk is unplugged before the user does an
3536 * eject, so we don't want to panic in that situation.
3537 */
3538
3539 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3540 continue;
3541
3542 if (result != VM_FAULT_SUCCESS)
3543 panic("vm_fault_unwire: failure");
3544
3545 result_object = result_page->object;
3546
3547 if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) {
3548 pmap_change_wiring(pmap,
3549 pmap_addr + (va - entry->vme_start), FALSE);
3550 }
3551 if (deallocate) {
3552 assert(result_page->phys_page !=
3553 vm_page_fictitious_addr);
3554 pmap_disconnect(result_page->phys_page);
3555 VM_PAGE_FREE(result_page);
3556 } else {
3557 vm_page_lockspin_queues();
3558 vm_page_unwire(result_page);
3559 vm_page_unlock_queues();
3560 PAGE_WAKEUP_DONE(result_page);
3561 }
3562 vm_fault_cleanup(result_object, top_page);
3563 }
3564 }
3565
3566 /*
3567 * Inform the physical mapping system that the range
3568 * of addresses may fault, so that page tables and
3569 * such may be unwired themselves.
3570 */
3571
3572 pmap_pageable(pmap, pmap_addr,
3573 pmap_addr + (end_addr - entry->vme_start), TRUE);
3574
3575 }
3576
3577 /*
3578 * vm_fault_wire_fast:
3579 *
3580 * Handle common case of a wire down page fault at the given address.
3581 * If successful, the page is inserted into the associated physical map.
3582 * The map entry is passed in to avoid the overhead of a map lookup.
3583 *
3584 * NOTE: the given address should be truncated to the
3585 * proper page address.
3586 *
3587 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3588 * a standard error specifying why the fault is fatal is returned.
3589 *
3590 * The map in question must be referenced, and remains so.
3591 * Caller has a read lock on the map.
3592 *
3593 * This is a stripped version of vm_fault() for wiring pages. Anything
3594 * other than the common case will return KERN_FAILURE, and the caller
3595 * is expected to call vm_fault().
3596 */
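/*
 * Fallback contract (as exercised by vm_fault_wire() above; repeated here
 * only as a sketch):
 *
 *	if (vm_fault_wire_fast(map, va, entry, pmap, pmap_addr) != KERN_SUCCESS)
 *		rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
 *			      (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
 *			      pmap, pmap_addr + (va - entry->vme_start));
 */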
3597 kern_return_t
3598 vm_fault_wire_fast(
3599 __unused vm_map_t map,
3600 vm_map_offset_t va,
3601 vm_map_entry_t entry,
3602 pmap_t pmap,
3603 vm_map_offset_t pmap_addr)
3604 {
3605 vm_object_t object;
3606 vm_object_offset_t offset;
3607 register vm_page_t m;
3608 vm_prot_t prot;
3609 thread_t thread = current_thread();
3610 int type_of_fault;
3611 kern_return_t kr;
3612
3613 VM_STAT_INCR(faults);
3614
3615 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3616 thread->task->faults++;
3617
3618 /*
3619 * Recovery actions
3620 */
3621
3622 #undef RELEASE_PAGE
3623 #define RELEASE_PAGE(m) { \
3624 PAGE_WAKEUP_DONE(m); \
3625 vm_page_lockspin_queues(); \
3626 vm_page_unwire(m); \
3627 vm_page_unlock_queues(); \
3628 }
3629
3630
3631 #undef UNLOCK_THINGS
3632 #define UNLOCK_THINGS { \
3633 vm_object_paging_end(object); \
3634 vm_object_unlock(object); \
3635 }
3636
3637 #undef UNLOCK_AND_DEALLOCATE
3638 #define UNLOCK_AND_DEALLOCATE { \
3639 UNLOCK_THINGS; \
3640 vm_object_deallocate(object); \
3641 }
3642 /*
3643 * Give up and have caller do things the hard way.
3644 */
3645
3646 #define GIVE_UP { \
3647 UNLOCK_AND_DEALLOCATE; \
3648 return(KERN_FAILURE); \
3649 }
3650
3651
3652 /*
3653 * If this entry is not directly to a vm_object, bail out.
3654 */
3655 if (entry->is_sub_map)
3656 return(KERN_FAILURE);
3657
3658 /*
3659 * Find the backing store object and offset into it.
3660 */
3661
3662 object = entry->object.vm_object;
3663 offset = (va - entry->vme_start) + entry->offset;
3664 prot = entry->protection;
3665
3666 /*
3667 * Make a reference to this object to prevent its
3668 * disposal while we are messing with it.
3669 */
3670
3671 vm_object_lock(object);
3672 vm_object_reference_locked(object);
3673 vm_object_paging_begin(object);
3674
3675 /*
3676 * INVARIANTS (through entire routine):
3677 *
3678 * 1) At all times, we must either have the object
3679 * lock or a busy page in some object to prevent
3680 * some other thread from trying to bring in
3681 * the same page.
3682 *
3683 * 2) Once we have a busy page, we must remove it from
3684 * the pageout queues, so that the pageout daemon
3685 * will not grab it away.
3686 *
3687 */
3688
3689 /*
3690 * Look for page in top-level object. If it's not there or
3691 * there's something going on, give up.
3692 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3693 * decrypt the page before wiring it down.
3694 */
3695 m = vm_page_lookup(object, offset);
3696 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3697 (m->unusual && ( m->error || m->restart || m->absent))) {
3698
3699 GIVE_UP;
3700 }
3701 ASSERT_PAGE_DECRYPTED(m);
3702
3703 if (m->fictitious &&
3704 m->phys_page == vm_page_guard_addr) {
3705 /*
3706 * Guard pages are fictitious pages and are never
3707 * entered into a pmap, so let's say it's been wired...
3708 */
3709 kr = KERN_SUCCESS;
3710 goto done;
3711 }
3712
3713 /*
3714 * Wire the page down now. All bail outs beyond this
3715 * point must unwire the page.
3716 */
3717
3718 vm_page_lockspin_queues();
3719 vm_page_wire(m);
3720 vm_page_unlock_queues();
3721
3722 /*
3723 * Mark page busy for other threads.
3724 */
3725 assert(!m->busy);
3726 m->busy = TRUE;
3727 assert(!m->absent);
3728
3729 /*
3730 * Give up if the page is being written and there's a copy object
3731 */
3732 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3733 RELEASE_PAGE(m);
3734 GIVE_UP;
3735 }
3736
3737 /*
3738 * Put this page into the physical map.
3739 */
3740 type_of_fault = DBG_CACHE_HIT_FAULT;
3741 kr = vm_fault_enter(m,
3742 pmap,
3743 pmap_addr,
3744 prot,
3745 TRUE,
3746 FALSE,
3747 FALSE,
3748 &type_of_fault);
3749
3750 done:
3751 /*
3752 * Unlock everything, and return
3753 */
3754
3755 PAGE_WAKEUP_DONE(m);
3756 UNLOCK_AND_DEALLOCATE;
3757
3758 return kr;
3759
3760 }
3761
3762 /*
3763 * Routine: vm_fault_copy_cleanup
3764 * Purpose:
3765 * Release a page used by vm_fault_copy.
3766 */
3767
3768 void
3769 vm_fault_copy_cleanup(
3770 vm_page_t page,
3771 vm_page_t top_page)
3772 {
3773 vm_object_t object = page->object;
3774
3775 vm_object_lock(object);
3776 PAGE_WAKEUP_DONE(page);
3777 vm_page_lockspin_queues();
3778 if (!page->active && !page->inactive && !page->throttled)
3779 vm_page_activate(page);
3780 vm_page_unlock_queues();
3781 vm_fault_cleanup(object, top_page);
3782 }
3783
3784 void
3785 vm_fault_copy_dst_cleanup(
3786 vm_page_t page)
3787 {
3788 vm_object_t object;
3789
3790 if (page != VM_PAGE_NULL) {
3791 object = page->object;
3792 vm_object_lock(object);
3793 vm_page_lockspin_queues();
3794 vm_page_unwire(page);
3795 vm_page_unlock_queues();
3796 vm_object_paging_end(object);
3797 vm_object_unlock(object);
3798 }
3799 }
3800
3801 /*
3802 * Routine: vm_fault_copy
3803 *
3804 * Purpose:
3805 * Copy pages from one virtual memory object to another --
3806 * neither the source nor destination pages need be resident.
3807 *
3808 * Before actually copying a page, the version associated with
3809 * the destination address map will be verified.
3810 *
3811 * In/out conditions:
3812 * The caller must hold a reference, but not a lock, to
3813 * each of the source and destination objects and to the
3814 * destination map.
3815 *
3816 * Results:
3817 * Returns KERN_SUCCESS if no errors were encountered in
3818 * reading or writing the data. Returns KERN_INTERRUPTED if
3819 * the operation was interrupted (only possible if the
3820 * "interruptible" argument is asserted). Other return values
3821 * indicate a permanent error in copying the data.
3822 *
3823 * The actual amount of data copied will be returned in the
3824 * "copy_size" argument. In the event that the destination map
3825 * verification failed, this amount may be less than the amount
3826 * requested.
3827 */
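/*
 * Usage sketch (a hypothetical caller; real callers obtain dst_version
 * from a prior map lookup and may get less than requested copied if the
 * map verification fails):
 *
 *	vm_map_size_t len = nbytes;
 *	kr = vm_fault_copy(src_object, src_offset, &len,
 *			   dst_object, dst_offset,
 *			   dst_map, &dst_version, THREAD_UNINT);
 *	// on return, "len" holds the number of bytes actually copied
 */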
3828 kern_return_t
3829 vm_fault_copy(
3830 vm_object_t src_object,
3831 vm_object_offset_t src_offset,
3832 vm_map_size_t *copy_size, /* INOUT */
3833 vm_object_t dst_object,
3834 vm_object_offset_t dst_offset,
3835 vm_map_t dst_map,
3836 vm_map_version_t *dst_version,
3837 int interruptible)
3838 {
3839 vm_page_t result_page;
3840
3841 vm_page_t src_page;
3842 vm_page_t src_top_page;
3843 vm_prot_t src_prot;
3844
3845 vm_page_t dst_page;
3846 vm_page_t dst_top_page;
3847 vm_prot_t dst_prot;
3848
3849 vm_map_size_t amount_left;
3850 vm_object_t old_copy_object;
3851 kern_return_t error = 0;
3852
3853 vm_map_size_t part_size;
3854 struct vm_object_fault_info fault_info_src;
3855 struct vm_object_fault_info fault_info_dst;
3856
3857 /*
3858 * In order not to confuse the clustered pageins, align
3859 * the different offsets on a page boundary.
3860 */
3861
3862 #define RETURN(x) \
3863 MACRO_BEGIN \
3864 *copy_size -= amount_left; \
3865 MACRO_RETURN(x); \
3866 MACRO_END
3867
3868 amount_left = *copy_size;
3869
3870 fault_info_src.interruptible = interruptible;
3871 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3872 fault_info_src.user_tag = 0;
3873 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3874 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3875 fault_info_src.no_cache = FALSE;
3876
3877 fault_info_dst.interruptible = interruptible;
3878 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3879 fault_info_dst.user_tag = 0;
3880 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3881 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3882 fault_info_dst.no_cache = FALSE;
3883
3884 do { /* while (amount_left > 0) */
3885 /*
3886 * There may be a deadlock if both source and destination
3887 * pages are the same. To avoid this deadlock, the copy must
3888 * start by getting the destination page in order to apply
3889 * COW semantics if any.
3890 */
3891
3892 RetryDestinationFault: ;
3893
3894 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3895
3896 vm_object_lock(dst_object);
3897 vm_object_paging_begin(dst_object);
3898
3899 fault_info_dst.cluster_size = amount_left;
3900
3901 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3902 switch (vm_fault_page(dst_object,
3903 vm_object_trunc_page(dst_offset),
3904 VM_PROT_WRITE|VM_PROT_READ,
3905 FALSE,
3906 &dst_prot, &dst_page, &dst_top_page,
3907 (int *)0,
3908 &error,
3909 dst_map->no_zero_fill,
3910 FALSE, &fault_info_dst)) {
3911 case VM_FAULT_SUCCESS:
3912 break;
3913 case VM_FAULT_RETRY:
3914 goto RetryDestinationFault;
3915 case VM_FAULT_MEMORY_SHORTAGE:
3916 if (vm_page_wait(interruptible))
3917 goto RetryDestinationFault;
3918 /* fall thru */
3919 case VM_FAULT_INTERRUPTED:
3920 RETURN(MACH_SEND_INTERRUPTED);
3921 case VM_FAULT_MEMORY_ERROR:
3922 if (error)
3923 return (error);
3924 else
3925 return(KERN_MEMORY_ERROR);
3926 }
3927 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3928
3929 old_copy_object = dst_page->object->copy;
3930
3931 /*
3932 * There exists the possibility that the source and
3933 * destination page are the same. But we can't
3934 * easily determine that now. If they are the
3935 * same, the call to vm_fault_page() for the
3936 * destination page will deadlock. To prevent this we
3937 * wire the page so we can drop busy without having
3938 * the page daemon steal the page. We clean up the
3939 * top page but keep the paging reference on the object
3940 * holding the dest page so it doesn't go away.
3941 */
3942
3943 vm_page_lockspin_queues();
3944 vm_page_wire(dst_page);
3945 vm_page_unlock_queues();
3946 PAGE_WAKEUP_DONE(dst_page);
3947 vm_object_unlock(dst_page->object);
3948
3949 if (dst_top_page != VM_PAGE_NULL) {
3950 vm_object_lock(dst_object);
3951 VM_PAGE_FREE(dst_top_page);
3952 vm_object_paging_end(dst_object);
3953 vm_object_unlock(dst_object);
3954 }
3955
3956 RetrySourceFault: ;
3957
3958 if (src_object == VM_OBJECT_NULL) {
3959 /*
3960 * No source object. We will just
3961 * zero-fill the page in dst_object.
3962 */
3963 src_page = VM_PAGE_NULL;
3964 result_page = VM_PAGE_NULL;
3965 } else {
3966 vm_object_lock(src_object);
3967 src_page = vm_page_lookup(src_object,
3968 vm_object_trunc_page(src_offset));
3969 if (src_page == dst_page) {
3970 src_prot = dst_prot;
3971 result_page = VM_PAGE_NULL;
3972 } else {
3973 src_prot = VM_PROT_READ;
3974 vm_object_paging_begin(src_object);
3975
3976 fault_info_src.cluster_size = amount_left;
3977
3978 XPR(XPR_VM_FAULT,
3979 "vm_fault_copy(2) -> vm_fault_page\n",
3980 0,0,0,0,0);
3981 switch (vm_fault_page(
3982 src_object,
3983 vm_object_trunc_page(src_offset),
3984 VM_PROT_READ, FALSE,
3985 &src_prot,
3986 &result_page, &src_top_page,
3987 (int *)0, &error, FALSE,
3988 FALSE, &fault_info_src)) {
3989
3990 case VM_FAULT_SUCCESS:
3991 break;
3992 case VM_FAULT_RETRY:
3993 goto RetrySourceFault;
3994 case VM_FAULT_MEMORY_SHORTAGE:
3995 if (vm_page_wait(interruptible))
3996 goto RetrySourceFault;
3997 /* fall thru */
3998 case VM_FAULT_INTERRUPTED:
3999 vm_fault_copy_dst_cleanup(dst_page);
4000 RETURN(MACH_SEND_INTERRUPTED);
4001 case VM_FAULT_MEMORY_ERROR:
4002 vm_fault_copy_dst_cleanup(dst_page);
4003 if (error)
4004 return (error);
4005 else
4006 return(KERN_MEMORY_ERROR);
4007 }
4008
4009
4010 assert((src_top_page == VM_PAGE_NULL) ==
4011 (result_page->object == src_object));
4012 }
4013 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4014 /* if src_page == dst_page, no source fault was taken, so src_object is still the locked object */
if (result_page == VM_PAGE_NULL)
vm_object_unlock(src_object);
else
vm_object_unlock(result_page->object);
4015 }
4016
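/*
 * The destination map is not kept locked across the faults above,
 * so re-check it: vm_map_verify() takes the map read lock and
 * checks that dst_version is still current, leaving the map locked
 * if it is. If the map has changed, release the pages and give up
 * on this portion of the copy.
 */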
4017 if (!vm_map_verify(dst_map, dst_version)) {
4018 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4019 vm_fault_copy_cleanup(result_page, src_top_page);
4020 vm_fault_copy_dst_cleanup(dst_page);
4021 break;
4022 }
4023
4024 vm_object_lock(dst_page->object);
4025
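/*
 * With the destination page's object locked again, make sure no
 * copy object was created for (or swapped under) it while the page
 * was unbusied; writing to the page now could bypass a needed
 * copy-on-write push, so give the page back and stop here instead.
 */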
4026 if (dst_page->object->copy != old_copy_object) {
4027 vm_object_unlock(dst_page->object);
4028 vm_map_verify_done(dst_map, dst_version);
4029 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4030 vm_fault_copy_cleanup(result_page, src_top_page);
4031 vm_fault_copy_dst_cleanup(dst_page);
4032 break;
4033 }
4034 vm_object_unlock(dst_page->object);
4035
4036 /*
4037 * Copy the page, and note that it is dirty
4038 * immediately.
4039 */
4040
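/*
 * Unaligned case: the copy can only proceed up to whichever page
 * boundary (source or destination) comes first.  part_size runs
 * from the larger of the two intra-page offsets to the end of the
 * page, clipped to amount_left.  (For example, with 4K pages,
 * src_po = 0x200 and dst_po = 0x600 give part_size = 0xa00.)
 */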
4041 if (!page_aligned(src_offset) ||
4042 !page_aligned(dst_offset) ||
4043 !page_aligned(amount_left)) {
4044
4045 vm_object_offset_t src_po,
4046 dst_po;
4047
4048 src_po = src_offset - vm_object_trunc_page(src_offset);
4049 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4050
4051 if (dst_po > src_po) {
4052 part_size = PAGE_SIZE - dst_po;
4053 } else {
4054 part_size = PAGE_SIZE - src_po;
4055 }
4056 if (part_size > amount_left) {
4057 part_size = amount_left;
4058 }
4059
4060 if (result_page == VM_PAGE_NULL) {
4061 vm_page_part_zero_fill(dst_page,
4062 dst_po, part_size);
4063 } else {
4064 vm_page_part_copy(result_page, src_po,
4065 dst_page, dst_po, part_size);
4066 if (!dst_page->dirty) {
4067 vm_object_lock(dst_page->object);
4068 dst_page->dirty = TRUE;
4069 vm_object_unlock(dst_page->object);
4070 }
4071
4072 }
4073 } else {
4074 part_size = PAGE_SIZE;
4075
4076 if (result_page == VM_PAGE_NULL)
4077 vm_page_zero_fill(dst_page);
4078 else {
4079 vm_page_copy(result_page, dst_page);
4080 if (!dst_page->dirty) {
4081 vm_object_lock(dst_page->object);
4082 dst_page->dirty = TRUE;
4083 vm_object_unlock(dst_page->object);
4084 }
4085 }
4086
4087 }
4088
4089 /*
4090 * Unlock everything, and move on to the next part of the copy
4091 */
4092
4093 vm_map_verify_done(dst_map, dst_version);
4094
4095 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4096 vm_fault_copy_cleanup(result_page, src_top_page);
4097 vm_fault_copy_dst_cleanup(dst_page);
4098
4099 amount_left -= part_size;
4100 src_offset += part_size;
4101 dst_offset += part_size;
4102 } while (amount_left > 0);
4103
4104 RETURN(KERN_SUCCESS);
4105 #undef RETURN
4106
4107 /*NOTREACHED*/
4108 }
4109
4110 #if VM_FAULT_CLASSIFY
4111 /*
4112 * Temporary statistics gathering support.
4113 */
4114
4115 /*
4116 * Statistics arrays:
4117 */
4118 #define VM_FAULT_TYPES_MAX 5
4119 #define VM_FAULT_LEVEL_MAX 8
4120
4121 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4122
4123 #define VM_FAULT_TYPE_ZERO_FILL 0
4124 #define VM_FAULT_TYPE_MAP_IN 1
4125 #define VM_FAULT_TYPE_PAGER 2
4126 #define VM_FAULT_TYPE_COPY 3
4127 #define VM_FAULT_TYPE_OTHER 4
4128
4129
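/*
 * Classify a fault by walking the shadow chain the way
 * vm_fault_page() would: record, per fault type (zero fill,
 * map in, pager, copy, other) and per shadow depth, how the
 * fault would have been resolved.
 */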
4130 void
4131 vm_fault_classify(vm_object_t object,
4132 vm_object_offset_t offset,
4133 vm_prot_t fault_type)
4134 {
4135 int type, level = 0;
4136 vm_page_t m;
4137
4138 while (TRUE) {
4139 m = vm_page_lookup(object, offset);
4140 if (m != VM_PAGE_NULL) {
4141 if (m->busy || m->error || m->restart || m->absent) {
4142 type = VM_FAULT_TYPE_OTHER;
4143 break;
4144 }
4145 if (((fault_type & VM_PROT_WRITE) == 0) ||
4146 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4147 type = VM_FAULT_TYPE_MAP_IN;
4148 break;
4149 }
4150 type = VM_FAULT_TYPE_COPY;
4151 break;
4152 }
4153 else {
4154 if (object->pager_created) {
4155 type = VM_FAULT_TYPE_PAGER;
4156 break;
4157 }
4158 if (object->shadow == VM_OBJECT_NULL) {
4159 type = VM_FAULT_TYPE_ZERO_FILL;
4160 break;
4161 }
4162
4163 offset += object->shadow_offset;
4164 object = object->shadow;
4165 level++;
4166 continue;
4167 }
4168 }
4169
4170 if (level >= VM_FAULT_LEVEL_MAX)
4171 level = VM_FAULT_LEVEL_MAX - 1;
4172
4173 vm_fault_stats[type][level] += 1;
4174
4175 return;
4176 }
4177
4178 /* cleanup routine to call from debugger */
4179
4180 void
4181 vm_fault_classify_init(void)
4182 {
4183 int type, level;
4184
4185 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4186 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4187 vm_fault_stats[type][level] = 0;
4188 }
4189 }
4190
4191 return;
4192 }
4193 #endif /* VM_FAULT_CLASSIFY */
4194
4195
4196 extern int cs_validation;
4197
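/*
 * Validate the code signature of a page that is already mapped
 * at kaddr.  The caller must hold the page busy and must hold
 * the page's object lock exclusively.
 */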
4198 void
4199 vm_page_validate_cs_mapped(
4200 vm_page_t page,
4201 const void *kaddr)
4202 {
4203 vm_object_t object;
4204 vm_object_offset_t offset;
4205 kern_return_t kr;
4206 memory_object_t pager;
4207 void *blobs;
4208 boolean_t validated, tainted;
4209
4210 assert(page->busy);
4211 vm_object_lock_assert_exclusive(page->object);
4212
4213 if (!cs_validation) {
4214 return;
4215 }
4216
4217 if (page->wpmapped && !page->cs_tainted) {
4218 /*
4219 * This page was mapped for "write" access sometime in the
4220 * past and could still be modifiable in the future.
4221 * Consider it tainted.
4222 * [ If the page was already found to be "tainted", no
4223 * need to re-validate. ]
4224 */
4225 page->cs_validated = TRUE;
4226 page->cs_tainted = TRUE;
4227 if (cs_debug) {
4228 printf("CODESIGNING: vm_page_validate_cs: "
4229 "page %p obj %p off 0x%llx "
4230 "was modified\n",
4231 page, page->object, page->offset);
4232 }
4233 vm_cs_validated_dirtied++;
4234 }
4235
4236 if (page->cs_validated) {
4237 return;
4238 }
4239
4240 vm_cs_validates++;
4241
4242 object = page->object;
4243 assert(object->code_signed);
4244 offset = page->offset;
4245
4246 if (!object->alive || object->terminating || object->pager == NULL) {
4247 /*
4248 * The object is dead or terminating, or we don't have its
4249 * pager, so we can't validate the data...
4250 */
4251 return;
4252 }
4253 /*
4254 * Since we get here to validate a page that was brought in by
4255 * the pager, we know that this pager is fully set up and ready
4256 * by now.
4257 */
4258 assert(!object->internal);
4259 assert(object->pager != NULL);
4260 assert(object->pager_ready);
4261
4262 pager = object->pager;
4263
4264 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4265 if (kr != KERN_SUCCESS) {
4266 blobs = NULL;
4267 }
4268
4269 /* verify the SHA1 hash for this page */
4270 validated = cs_validate_page(blobs,
4271 offset + object->paging_offset,
4272 (const void *)kaddr,
4273 &tainted);
4274
4275 page->cs_validated = validated;
4276 if (validated) {
4277 page->cs_tainted = tainted;
4278 }
4279 }
4280
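/*
 * Validate the code signature of a page that may not be mapped
 * yet: take a paging reference on its object, map the page into
 * the kernel address space, validate it with
 * vm_page_validate_cs_mapped(), then tear the mapping down.
 */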
4281 void
4282 vm_page_validate_cs(
4283 vm_page_t page)
4284 {
4285 vm_object_t object;
4286 vm_object_offset_t offset;
4287 vm_map_offset_t koffset;
4288 vm_map_size_t ksize;
4289 vm_offset_t kaddr;
4290 kern_return_t kr;
4291 boolean_t busy_page;
4292
4293 vm_object_lock_assert_held(page->object);
4294
4295 if (!cs_validation) {
4296 return;
4297 }
4298
4299 if (page->wpmapped && !page->cs_tainted) {
4300 vm_object_lock_assert_exclusive(page->object);
4301
4302 /*
4303 * This page was mapped for "write" access sometime in the
4304 * past and could still be modifiable in the future.
4305 * Consider it tainted.
4306 * [ If the page was already found to be "tainted", no
4307 * need to re-validate. ]
4308 */
4309 page->cs_validated = TRUE;
4310 page->cs_tainted = TRUE;
4311 if (cs_debug) {
4312 printf("CODESIGNING: vm_page_validate_cs: "
4313 "page %p obj %p off 0x%llx "
4314 "was modified\n",
4315 page, page->object, page->offset);
4316 }
4317 vm_cs_validated_dirtied++;
4318 }
4319
4320 if (page->cs_validated) {
4321 return;
4322 }
4323
4324 vm_object_lock_assert_exclusive(page->object);
4325
4326 object = page->object;
4327 assert(object->code_signed);
4328 offset = page->offset;
4329
4330 busy_page = page->busy;
4331 if (!busy_page) {
4332 /* keep page busy while we map (and unlock) the VM object */
4333 page->busy = TRUE;
4334 }
4335
4336 /*
4337 * Take a paging reference on the VM object
4338 * to protect it from collapse or bypass,
4339 * and keep it from disappearing too.
4340 */
4341 vm_object_paging_begin(object);
4342
4343 /* map the page in the kernel address space */
4344 koffset = 0;
4345 ksize = PAGE_SIZE_64;
4346 kr = vm_paging_map_object(&koffset,
4347 page,
4348 object,
4349 offset,
4350 &ksize,
4351 VM_PROT_READ,
4352 FALSE); /* can't unlock object ! */
4353 if (kr != KERN_SUCCESS) {
4354 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4355 }
4356 kaddr = CAST_DOWN(vm_offset_t, koffset);
4357
4358 /* validate the mapped page */
4359 vm_page_validate_cs_mapped(page, (const void *) kaddr);
4360
4361 assert(page->busy);
4362 assert(object == page->object);
4363 vm_object_lock_assert_exclusive(object);
4364
4365 if (!busy_page) {
4366 PAGE_WAKEUP_DONE(page);
4367 }
4368 if (koffset != 0) {
4369 /* unmap the page from the kernel address space */
4370 vm_paging_unmap_object(object, koffset, koffset + ksize);
4371 koffset = 0;
4372 ksize = 0;
4373 kaddr = 0;
4374 }
4375 vm_object_paging_end(object);
4376 }