1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
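/* Note (illustrative, not part of the original source): 0x200000 bytes == 2 MB,
 * i.e. only zero-fill pages belonging to objects larger than 2 MB get tagged
 * "m->zero_fill" (see vm_fault_zero_page below). */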
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152
153 unsigned long vm_cs_validates = 0;
154 unsigned long vm_cs_revalidates = 0;
155 unsigned long vm_cs_query_modified = 0;
156 unsigned long vm_cs_validated_dirtied = 0;
157
158 /*
159 * Routine: vm_fault_init
160 * Purpose:
161 * Initialize our private data structures.
162 */
163 void
164 vm_fault_init(void)
165 {
166 }
167
168 /*
169 * Routine: vm_fault_cleanup
170 * Purpose:
171 * Clean up the result of vm_fault_page.
172 * Results:
173 * The paging reference for "object" is released.
174 * "object" is unlocked.
175 * If "top_page" is not null, "top_page" is
176 * freed and the paging reference for the object
177 * containing it is released.
178 *
179 * In/out conditions:
180 * "object" must be locked.
181 */
182 void
183 vm_fault_cleanup(
184 register vm_object_t object,
185 register vm_page_t top_page)
186 {
187 vm_object_paging_end(object);
188 vm_object_unlock(object);
189
190 if (top_page != VM_PAGE_NULL) {
191 object = top_page->object;
192
193 vm_object_lock(object);
194 VM_PAGE_FREE(top_page);
195 vm_object_paging_end(object);
196 vm_object_unlock(object);
197 }
198 }
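/*
 * Illustrative sketch (not part of the original source): a minimal
 * caller-side use of vm_fault_cleanup(), assuming "object", "result_page"
 * and "top_page" came back from a successful vm_fault_page() call, so that
 * "object" is locked and holds a paging reference:
 *
 *	PAGE_WAKEUP_DONE(result_page);		// caller is done with the busy page
 *	vm_fault_cleanup(object, top_page);	// drop the paging ref, unlock "object",
 *						// and free "top_page" if one was returned
 */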
199
200 #if MACH_CLUSTER_STATS
201 #define MAXCLUSTERPAGES 16
202 struct {
203 unsigned long pages_in_cluster;
204 unsigned long pages_at_higher_offsets;
205 unsigned long pages_at_lower_offsets;
206 } cluster_stats_in[MAXCLUSTERPAGES];
207 #define CLUSTER_STAT(clause) clause
208 #define CLUSTER_STAT_HIGHER(x) \
209 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
210 #define CLUSTER_STAT_LOWER(x) \
211 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
212 #define CLUSTER_STAT_CLUSTER(x) \
213 ((cluster_stats_in[(x)].pages_in_cluster)++)
214 #else /* MACH_CLUSTER_STATS */
215 #define CLUSTER_STAT(clause)
216 #endif /* MACH_CLUSTER_STATS */
217
218 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
219
220
221 boolean_t vm_page_deactivate_behind = TRUE;
222 /*
223 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
224 */
225 int vm_default_ahead = 0;
226 int vm_default_behind = MAX_UPL_TRANSFER;
227
228 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
229
230 /*
231  * vm_fault_is_sequential
232 *
233 * Determine if sequential access is in progress
234 * in accordance with the behavior specified.
235 * Update state to indicate current access pattern.
236 *
237 * object must have at least the shared lock held
238 */
239 static
240 void
241 vm_fault_is_sequential(
242 vm_object_t object,
243 vm_object_offset_t offset,
244 vm_behavior_t behavior)
245 {
246 vm_object_offset_t last_alloc;
247 int sequential;
248 int orig_sequential;
249
250 last_alloc = object->last_alloc;
251 sequential = object->sequential;
252 orig_sequential = sequential;
253
254 switch (behavior) {
255 case VM_BEHAVIOR_RANDOM:
256 /*
257 * reset indicator of sequential behavior
258 */
259 sequential = 0;
260 break;
261
262 case VM_BEHAVIOR_SEQUENTIAL:
263 if (offset && last_alloc == offset - PAGE_SIZE_64) {
264 /*
265 * advance indicator of sequential behavior
266 */
267 if (sequential < MAX_SEQUENTIAL_RUN)
268 sequential += PAGE_SIZE;
269 } else {
270 /*
271 * reset indicator of sequential behavior
272 */
273 sequential = 0;
274 }
275 break;
276
277 case VM_BEHAVIOR_RSEQNTL:
278 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
279 /*
280 * advance indicator of sequential behavior
281 */
282 if (sequential > -MAX_SEQUENTIAL_RUN)
283 sequential -= PAGE_SIZE;
284 } else {
285 /*
286 * reset indicator of sequential behavior
287 */
288 sequential = 0;
289 }
290 break;
291
292 case VM_BEHAVIOR_DEFAULT:
293 default:
294 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
295 /*
296 * advance indicator of sequential behavior
297 */
298 if (sequential < 0)
299 sequential = 0;
300 if (sequential < MAX_SEQUENTIAL_RUN)
301 sequential += PAGE_SIZE;
302
303 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
304 /*
305 * advance indicator of sequential behavior
306 */
307 if (sequential > 0)
308 sequential = 0;
309 if (sequential > -MAX_SEQUENTIAL_RUN)
310 sequential -= PAGE_SIZE;
311 } else {
312 /*
313 * reset indicator of sequential behavior
314 */
315 sequential = 0;
316 }
317 break;
318 }
319 if (sequential != orig_sequential) {
320 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
321 /*
322 * if someone else has already updated object->sequential
323 * don't bother trying to update it or object->last_alloc
324 */
325 return;
326 }
327 }
328 /*
329 * I'd like to do this with a OSCompareAndSwap64, but that
330 * doesn't exist for PPC... however, it shouldn't matter
331 * that much... last_alloc is maintained so that we can determine
332 * if a sequential access pattern is taking place... if only
333 * one thread is banging on this object, no problem with the unprotected
334 * update... if 2 or more threads are banging away, we run the risk of
335 * someone seeing a mangled update... however, in the face of multiple
336 * accesses, no sequential access pattern can develop anyway, so we
337 * haven't lost any real info.
338 */
339 object->last_alloc = offset;
340 }
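/*
 * Worked example (illustrative only): starting from a fresh object
 * (sequential == 0, last_alloc == 0) with VM_BEHAVIOR_DEFAULT, faults at
 * offsets 0, PAGE_SIZE and 2*PAGE_SIZE leave object->sequential at
 * 2*PAGE_SIZE and object->last_alloc at 2*PAGE_SIZE, because each fault
 * after the first sees last_alloc == offset - PAGE_SIZE_64 and grows the
 * run (capped at MAX_SEQUENTIAL_RUN).  A later fault whose offset matches
 * neither neighboring page resets object->sequential to 0.
 */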
341
342
343 /*
344  * vm_fault_deactivate_behind
345 *
346 * Determine if sequential access is in progress
347 * in accordance with the behavior specified. If
348 * so, compute a potential page to deactivate and
349 * deactivate it.
350 *
351 * object must be locked.
352 *
353 * return TRUE if we actually deactivate a page
354 */
355 static
356 boolean_t
357 vm_fault_deactivate_behind(
358 vm_object_t object,
359 vm_object_offset_t offset,
360 vm_behavior_t behavior)
361 {
362 vm_page_t m = NULL;
363 int sequential_run;
364 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
365
366 #if TRACEFAULTPAGE
367 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
368 #endif
369
370 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
371 /*
372 * Do not deactivate pages from the kernel object: they
373 * are not intended to become pageable.
374 * or we've disabled the deactivate behind mechanism
375 */
376 return FALSE;
377 }
378 if ((sequential_run = object->sequential)) {
379 if (sequential_run < 0) {
380 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
381 sequential_run = 0 - sequential_run;
382 } else {
383 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
384 }
385 }
386 switch (behavior) {
387 case VM_BEHAVIOR_RANDOM:
388 break;
389 case VM_BEHAVIOR_SEQUENTIAL:
390 if (sequential_run >= (int)PAGE_SIZE)
391 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
392 break;
393 case VM_BEHAVIOR_RSEQNTL:
394 if (sequential_run >= (int)PAGE_SIZE)
395 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
396 break;
397 case VM_BEHAVIOR_DEFAULT:
398 default:
399 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
400
401 /*
402                  * determine if the run of sequential access has been
403 * long enough on an object with default access behavior
404 * to consider it for deactivation
405 */
406 if ((uint64_t)sequential_run >= behind) {
407 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
408 if (offset >= behind)
409 m = vm_page_lookup(object, offset - behind);
410 } else {
411 if (offset < -behind)
412 m = vm_page_lookup(object, offset + behind);
413 }
414 }
415 break;
416 }
417 }
418 if (m) {
419 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
420 pmap_clear_reference(m->phys_page);
421 m->deactivated = TRUE;
422 #if TRACEFAULTPAGE
423 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
424 #endif
425 return TRUE;
426 }
427 }
428 return FALSE;
429 }
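/*
 * Worked example (illustrative only): under VM_BEHAVIOR_DEFAULT, once a
 * forward sequential run reaches vm_default_behind pages, a fault at
 * "offset" looks up the page at (offset - vm_default_behind * PAGE_SIZE_64);
 * if that page is resident and not busy, absent, fictitious, throttled or
 * no_cache, its hardware reference bit is cleared and it is flagged
 * "deactivated", making the trailing edge of the run an early candidate
 * for reclamation.
 */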
430
431
432 /*
433 * check for various conditions that would
434 * prevent us from creating a ZF page...
435 * cleanup is based on being called from vm_fault_page
436 *
437 * object must be locked
438 * object == m->object
439 */
440 static vm_fault_return_t
441 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
442 {
443 if (object->shadow_severed) {
444 /*
445 * the shadow chain was severed
446 * just have to return an error at this point
447 */
448 if (m != VM_PAGE_NULL)
449 VM_PAGE_FREE(m);
450 vm_fault_cleanup(object, first_m);
451
452 thread_interrupt_level(interruptible_state);
453
454 return (VM_FAULT_MEMORY_ERROR);
455 }
456 if (vm_backing_store_low) {
457 /*
458                  * Are we protecting the system from
459                  * backing store exhaustion?  If so,
460                  * sleep unless we are privileged.
461 */
462 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
463
464 if (m != VM_PAGE_NULL)
465 VM_PAGE_FREE(m);
466 vm_fault_cleanup(object, first_m);
467
468 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
469
470 thread_block(THREAD_CONTINUE_NULL);
471 thread_interrupt_level(interruptible_state);
472
473 return (VM_FAULT_RETRY);
474 }
475 }
476 if (VM_PAGE_ZFILL_THROTTLED()) {
477 /*
478 * we're throttling zero-fills...
479 * treat this as if we couldn't grab a page
480 */
481 if (m != VM_PAGE_NULL)
482 VM_PAGE_FREE(m);
483 vm_fault_cleanup(object, first_m);
484
485 thread_interrupt_level(interruptible_state);
486
487 return (VM_FAULT_MEMORY_SHORTAGE);
488 }
489 return (VM_FAULT_SUCCESS);
490 }
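/*
 * Illustrative sketch (not part of the original source): how the check
 * above is used from vm_fault_page() before creating a zero-fill page.
 * On any result other than VM_FAULT_SUCCESS the cleanup (including the
 * thread_interrupt_level reset) has already been done, so the caller
 * simply propagates the error:
 *
 *	error = vm_fault_check(object, m, first_m, interruptible_state);
 *	if (error != VM_FAULT_SUCCESS)
 *		return (error);
 *	my_fault = vm_fault_zero_page(m, no_zero_fill);
 */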
491
492
493 /*
494 * do the work to zero fill a page and
495 * inject it into the correct paging queue
496 *
497 * m->object must be locked
498 * page queue lock must NOT be held
499 */
500 static int
501 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
502 {
503 int my_fault = DBG_ZERO_FILL_FAULT;
504
505 /*
506          * This is a zero-fill page fault...
507 *
508 * Checking the page lock is a waste of
509 * time; this page was absent, so
510 * it can't be page locked by a pager.
511 *
512 * we also consider it undefined
513 * with respect to instruction
514 * execution. i.e. it is the responsibility
515 * of higher layers to call for an instruction
516 * sync after changing the contents and before
517 * sending a program into this area. We
518 * choose this approach for performance
519 */
520 m->pmapped = TRUE;
521
522 m->cs_validated = FALSE;
523 m->cs_tainted = FALSE;
524
525 if (no_zero_fill == TRUE)
526 my_fault = DBG_NZF_PAGE_FAULT;
527 else {
528 vm_page_zero_fill(m);
529
530 VM_STAT_INCR(zero_fill_count);
531 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
532 }
533 assert(!m->laundry);
534 assert(m->object != kernel_object);
535 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
536
537 if (!IP_VALID(memory_manager_default) &&
538 (m->object->purgable == VM_PURGABLE_DENY ||
539 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
540 m->object->purgable == VM_PURGABLE_VOLATILE )) {
541 vm_page_lock_queues();
542
543 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
544 m->throttled = TRUE;
545 vm_page_throttled_count++;
546
547 vm_page_unlock_queues();
548 } else {
549 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
550 m->zero_fill = TRUE;
551 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
552 }
553 }
554 return (my_fault);
555 }
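/*
 * Illustrative sketch (not part of the original source): the caller holds
 * m->object's lock, does not hold the page queue lock, and uses the return
 * value as the fault type reported in the trace point:
 *
 *	my_fault = vm_fault_zero_page(m, no_zero_fill);
 *	// DBG_ZERO_FILL_FAULT, or DBG_NZF_PAGE_FAULT when no_zero_fill
 *	// was TRUE and the page contents were left untouched
 */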
556
557
558 /*
559 * Routine: vm_fault_page
560 * Purpose:
561 * Find the resident page for the virtual memory
562 * specified by the given virtual memory object
563 * and offset.
564 * Additional arguments:
565  *              The required permissions for the page are given
566 * in "fault_type". Desired permissions are included
567 * in "protection".
568 * fault_info is passed along to determine pagein cluster
569 * limits... it contains the expected reference pattern,
570 * cluster size if available, etc...
571 *
572 * If the desired page is known to be resident (for
573 * example, because it was previously wired down), asserting
574  *              the "must_be_resident" parameter will speed the search.
575 *
576 * If the operation can be interrupted (by thread_abort
577 * or thread_terminate), then the "interruptible"
578 * parameter should be asserted.
579 *
580 * Results:
581 * The page containing the proper data is returned
582 * in "result_page".
583 *
584 * In/out conditions:
585 * The source object must be locked and referenced,
586 * and must donate one paging reference. The reference
587 * is not affected. The paging reference and lock are
588 * consumed.
589 *
590 * If the call succeeds, the object in which "result_page"
591 * resides is left locked and holding a paging reference.
592 * If this is not the original object, a busy page in the
593 * original object is returned in "top_page", to prevent other
594 * callers from pursuing this same data, along with a paging
595 * reference for the original object. The "top_page" should
596 * be destroyed when this guarantee is no longer required.
597 * The "result_page" is also left busy. It is not removed
598 * from the pageout queues.
599 */
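/*
 * Illustrative caller sketch (not part of the original source), restating
 * the contract above: the caller donates a locked, referenced object with
 * one paging reference; on VM_FAULT_SUCCESS it gets back a busy
 * "result_page" in a locked object, plus an optional "top_page" to dispose
 * of via vm_fault_cleanup():
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);
 *	result = vm_fault_page(object, offset, fault_type, FALSE,
 *			       &prot, &result_page, &top_page,
 *			       &type_of_fault, &error_code,
 *			       no_zero_fill, FALSE, &fault_info);
 *	if (result == VM_FAULT_SUCCESS) {
 *		... consume result_page ...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *		vm_object_deallocate(object);
 *	}
 */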
600
601 vm_fault_return_t
602 vm_fault_page(
603 /* Arguments: */
604 vm_object_t first_object, /* Object to begin search */
605 vm_object_offset_t first_offset, /* Offset into object */
606 vm_prot_t fault_type, /* What access is requested */
607 boolean_t must_be_resident,/* Must page be resident? */
608 /* Modifies in place: */
609 vm_prot_t *protection, /* Protection for mapping */
610 /* Returns: */
611 vm_page_t *result_page, /* Page found, if successful */
612 vm_page_t *top_page, /* Page in top object, if
613 * not result_page. */
614 int *type_of_fault, /* if non-null, fill in with type of fault
615 * COW, zero-fill, etc... returned in trace point */
616 /* More arguments: */
617 kern_return_t *error_code, /* code if page is in error */
618 boolean_t no_zero_fill, /* don't zero fill absent pages */
619 #if MACH_PAGEMAP
620 boolean_t data_supply, /* treat as data_supply if
621 * it is a write fault and a full
622 * page is provided */
623 #else
624 __unused boolean_t data_supply,
625 #endif
626 vm_object_fault_info_t fault_info)
627 {
628 vm_page_t m;
629 vm_object_t object;
630 vm_object_offset_t offset;
631 vm_page_t first_m;
632 vm_object_t next_object;
633 vm_object_t copy_object;
634 boolean_t look_for_page;
635 vm_prot_t access_required = fault_type;
636 vm_prot_t wants_copy_flag;
637 CLUSTER_STAT(int pages_at_higher_offsets;)
638 CLUSTER_STAT(int pages_at_lower_offsets;)
639 kern_return_t wait_result;
640 boolean_t interruptible_state;
641 vm_fault_return_t error;
642 int my_fault;
643 uint32_t try_failed_count;
644         int                     interruptible; /* how may the fault be interrupted? */
645 memory_object_t pager;
646
647 /*
648 * MACH page map - an optional optimization where a bit map is maintained
649 * by the VM subsystem for internal objects to indicate which pages of
650 * the object currently reside on backing store. This existence map
651 * duplicates information maintained by the vnode pager. It is
652 * created at the time of the first pageout against the object, i.e.
653 * at the same time pager for the object is created. The optimization
654 * is designed to eliminate pager interaction overhead, if it is
655 * 'known' that the page does not exist on backing store.
656 *
657 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
658 * either marked as paged out in the existence map for the object or no
659 * existence map exists for the object. MUST_ASK_PAGER() is one of the
660 * criteria in the decision to invoke the pager. It is also used as one
661 * of the criteria to terminate the scan for adjacent pages in a clustered
662 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
663 * permanent objects. Note also that if the pager for an internal object
664 * has not been created, the pager is not invoked regardless of the value
665 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
666 * for which a pager has been created.
667 *
668 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
669  * is marked as paged out in the existence map for the object.
670  * PAGED_OUT() is used to determine if a page has already been pushed
671 * into a copy object in order to avoid a redundant page out operation.
672 */
673 #if MACH_PAGEMAP
674 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
675 != VM_EXTERNAL_STATE_ABSENT)
676 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
677 == VM_EXTERNAL_STATE_EXISTS)
678 #else
679 #define MUST_ASK_PAGER(o, f) (TRUE)
680 #define PAGED_OUT(o, f) (FALSE)
681 #endif
682
683 /*
684 * Recovery actions
685 */
686 #define PREPARE_RELEASE_PAGE(m) \
687 MACRO_BEGIN \
688 vm_page_lock_queues(); \
689 MACRO_END
690
691 #define DO_RELEASE_PAGE(m) \
692 MACRO_BEGIN \
693 PAGE_WAKEUP_DONE(m); \
694 if (!m->active && !m->inactive && !m->throttled)\
695 vm_page_activate(m); \
696 vm_page_unlock_queues(); \
697 MACRO_END
698
699 #define RELEASE_PAGE(m) \
700 MACRO_BEGIN \
701 PREPARE_RELEASE_PAGE(m); \
702 DO_RELEASE_PAGE(m); \
703 MACRO_END
704
705 #if TRACEFAULTPAGE
706 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
707 #endif
708
709
710 #if MACH_KDB
711 /*
712 * If there are watchpoints set, then
713 * we don't want to give away write permission
714 * on a read fault. Make the task write fault,
715 * so that the watchpoint code notices the access.
716 */
717 if (db_watchpoint_list) {
718 /*
719 * If we aren't asking for write permission,
720 * then don't give it away. We're using write
721 * faults to set the dirty bit.
722 */
723 if (!(fault_type & VM_PROT_WRITE))
724 *protection &= ~VM_PROT_WRITE;
725 }
726 #endif /* MACH_KDB */
727
728 interruptible = fault_info->interruptible;
729 interruptible_state = thread_interrupt_level(interruptible);
730
731 /*
732 * INVARIANTS (through entire routine):
733 *
734 * 1) At all times, we must either have the object
735 * lock or a busy page in some object to prevent
736 * some other thread from trying to bring in
737 * the same page.
738 *
739 * Note that we cannot hold any locks during the
740 * pager access or when waiting for memory, so
741 * we use a busy page then.
742 *
743 * 2) To prevent another thread from racing us down the
744 * shadow chain and entering a new page in the top
745 * object before we do, we must keep a busy page in
746 * the top object while following the shadow chain.
747 *
748 * 3) We must increment paging_in_progress on any object
749 * for which we have a busy page before dropping
750 * the object lock
751 *
752 * 4) We leave busy pages on the pageout queues.
753 * If the pageout daemon comes across a busy page,
754 * it will remove the page from the pageout queues.
755 */
756
757 object = first_object;
758 offset = first_offset;
759 first_m = VM_PAGE_NULL;
760 access_required = fault_type;
761
762
763 XPR(XPR_VM_FAULT,
764 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
765 (integer_t)object, offset, fault_type, *protection, 0);
766
767 /*
768 * default type of fault
769 */
770 my_fault = DBG_CACHE_HIT_FAULT;
771
772 while (TRUE) {
773 #if TRACEFAULTPAGE
774 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
775 #endif
776 if (!object->alive) {
777 /*
778 * object is no longer valid
779 * clean up and return error
780 */
781 vm_fault_cleanup(object, first_m);
782 thread_interrupt_level(interruptible_state);
783
784 return (VM_FAULT_MEMORY_ERROR);
785 }
786
787 /*
788 * See whether the page at 'offset' is resident
789 */
790 m = vm_page_lookup(object, offset);
791 #if TRACEFAULTPAGE
792 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
793 #endif
794 if (m != VM_PAGE_NULL) {
795
796 if (m->busy) {
797 /*
798 * The page is being brought in,
799 * wait for it and then retry.
800 *
801 * A possible optimization: if the page
802 * is known to be resident, we can ignore
803 * pages that are absent (regardless of
804 * whether they're busy).
805 */
806 #if TRACEFAULTPAGE
807 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
808 #endif
809 wait_result = PAGE_SLEEP(object, m, interruptible);
810 XPR(XPR_VM_FAULT,
811 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
812 (integer_t)object, offset,
813 (integer_t)m, 0, 0);
814 counter(c_vm_fault_page_block_busy_kernel++);
815
816 if (wait_result != THREAD_AWAKENED) {
817 vm_fault_cleanup(object, first_m);
818 thread_interrupt_level(interruptible_state);
819
820 if (wait_result == THREAD_RESTART)
821 return (VM_FAULT_RETRY);
822 else
823 return (VM_FAULT_INTERRUPTED);
824 }
825 continue;
826 }
827
828 if (m->phys_page == vm_page_guard_addr) {
829 /*
830 * Guard page: off limits !
831 */
832 if (fault_type == VM_PROT_NONE) {
833 /*
834 * The fault is not requesting any
835 * access to the guard page, so it must
836 * be just to wire or unwire it.
837 * Let's pretend it succeeded...
838 */
839 m->busy = TRUE;
840 *result_page = m;
841 assert(first_m == VM_PAGE_NULL);
842 *top_page = first_m;
843 if (type_of_fault)
844 *type_of_fault = DBG_GUARD_FAULT;
845 return VM_FAULT_SUCCESS;
846 } else {
847 /*
848 * The fault requests access to the
849 * guard page: let's deny that !
850 */
851 vm_fault_cleanup(object, first_m);
852 thread_interrupt_level(interruptible_state);
853 return VM_FAULT_MEMORY_ERROR;
854 }
855 }
856
857 if (m->error) {
858 /*
859 * The page is in error, give up now.
860 */
861 #if TRACEFAULTPAGE
862 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
863 #endif
864 if (error_code)
865 *error_code = KERN_MEMORY_ERROR;
866 VM_PAGE_FREE(m);
867
868 vm_fault_cleanup(object, first_m);
869 thread_interrupt_level(interruptible_state);
870
871 return (VM_FAULT_MEMORY_ERROR);
872 }
873 if (m->restart) {
874 /*
875 * The pager wants us to restart
876 * at the top of the chain,
877 * typically because it has moved the
878 * page to another pager, then do so.
879 */
880 #if TRACEFAULTPAGE
881 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
882 #endif
883 VM_PAGE_FREE(m);
884
885 vm_fault_cleanup(object, first_m);
886 thread_interrupt_level(interruptible_state);
887
888 return (VM_FAULT_RETRY);
889 }
890 if (m->absent) {
891 /*
892 * The page isn't busy, but is absent,
893 * therefore it's deemed "unavailable".
894 *
895 * Remove the non-existent page (unless it's
896 * in the top object) and move on down to the
897 * next object (if there is one).
898 */
899 #if TRACEFAULTPAGE
900 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
901 #endif
902 next_object = object->shadow;
903
904 if (next_object == VM_OBJECT_NULL) {
905 /*
906 * Absent page at bottom of shadow
907 * chain; zero fill the page we left
908 * busy in the first object, and free
909 * the absent page.
910 */
911 assert(!must_be_resident);
912
913 /*
914 * check for any conditions that prevent
915 * us from creating a new zero-fill page
916 * vm_fault_check will do all of the
917 * fault cleanup in the case of an error condition
918 * including resetting the thread_interrupt_level
919 */
920 error = vm_fault_check(object, m, first_m, interruptible_state);
921
922 if (error != VM_FAULT_SUCCESS)
923 return (error);
924
925 XPR(XPR_VM_FAULT,
926 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
927 (integer_t)object, offset,
928 (integer_t)m,
929 (integer_t)first_object, 0);
930
931 if (object != first_object) {
932 /*
933 * free the absent page we just found
934 */
935 VM_PAGE_FREE(m);
936
937 /*
938 * drop reference and lock on current object
939 */
940 vm_object_paging_end(object);
941 vm_object_unlock(object);
942
943 /*
944 * grab the original page we
945 * 'soldered' in place and
946 * retake lock on 'first_object'
947 */
948 m = first_m;
949 first_m = VM_PAGE_NULL;
950
951 object = first_object;
952 offset = first_offset;
953
954 vm_object_lock(object);
955 } else {
956 /*
957 * we're going to use the absent page we just found
958 * so convert it to a 'busy' page
959 */
960 m->absent = FALSE;
961 m->busy = TRUE;
962 }
963 /*
964 * zero-fill the page and put it on
965 * the correct paging queue
966 */
967 my_fault = vm_fault_zero_page(m, no_zero_fill);
968
969 break;
970 } else {
971 if (must_be_resident)
972 vm_object_paging_end(object);
973 else if (object != first_object) {
974 vm_object_paging_end(object);
975 VM_PAGE_FREE(m);
976 } else {
977 first_m = m;
978 m->absent = FALSE;
979 m->busy = TRUE;
980
981 vm_page_lockspin_queues();
982 VM_PAGE_QUEUES_REMOVE(m);
983 vm_page_unlock_queues();
984 }
985 XPR(XPR_VM_FAULT,
986 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
987 (integer_t)object, offset,
988 (integer_t)next_object,
989 offset+object->shadow_offset,0);
990
991 offset += object->shadow_offset;
992 fault_info->lo_offset += object->shadow_offset;
993 fault_info->hi_offset += object->shadow_offset;
994 access_required = VM_PROT_READ;
995
996 vm_object_lock(next_object);
997 vm_object_unlock(object);
998 object = next_object;
999 vm_object_paging_begin(object);
1000
1001 /*
1002 * reset to default type of fault
1003 */
1004 my_fault = DBG_CACHE_HIT_FAULT;
1005
1006 continue;
1007 }
1008 }
1009 if ((m->cleaning)
1010 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1011 && (fault_type & VM_PROT_WRITE)) {
1012 /*
1013 * This is a copy-on-write fault that will
1014 * cause us to revoke access to this page, but
1015 * this page is in the process of being cleaned
1016 * in a clustered pageout. We must wait until
1017 * the cleaning operation completes before
1018 * revoking access to the original page,
1019 * otherwise we might attempt to remove a
1020 * wired mapping.
1021 */
1022 #if TRACEFAULTPAGE
1023 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1024 #endif
1025 XPR(XPR_VM_FAULT,
1026 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1027 (integer_t)object, offset,
1028 (integer_t)m, 0, 0);
1029 /*
1030 * take an extra ref so that object won't die
1031 */
1032 vm_object_reference_locked(object);
1033
1034 vm_fault_cleanup(object, first_m);
1035
1036 counter(c_vm_fault_page_block_backoff_kernel++);
1037 vm_object_lock(object);
1038 assert(object->ref_count > 0);
1039
1040 m = vm_page_lookup(object, offset);
1041
1042 if (m != VM_PAGE_NULL && m->cleaning) {
1043 PAGE_ASSERT_WAIT(m, interruptible);
1044
1045 vm_object_unlock(object);
1046 wait_result = thread_block(THREAD_CONTINUE_NULL);
1047 vm_object_deallocate(object);
1048
1049 goto backoff;
1050 } else {
1051 vm_object_unlock(object);
1052
1053 vm_object_deallocate(object);
1054 thread_interrupt_level(interruptible_state);
1055
1056 return (VM_FAULT_RETRY);
1057 }
1058 }
1059 if (type_of_fault == NULL && m->speculative) {
1060 /*
1061 * If we were passed a non-NULL pointer for
1062                          * "type_of_fault", then we came from
1063 * vm_fault... we'll let it deal with
1064 * this condition, since it
1065 * needs to see m->speculative to correctly
1066 * account the pageins, otherwise...
1067 * take it off the speculative queue, we'll
1068 * let the caller of vm_fault_page deal
1069 * with getting it onto the correct queue
1070 */
1071 vm_page_lockspin_queues();
1072 VM_PAGE_QUEUES_REMOVE(m);
1073 vm_page_unlock_queues();
1074 }
1075
1076 if (m->encrypted) {
1077 /*
1078 * ENCRYPTED SWAP:
1079 * the user needs access to a page that we
1080 * encrypted before paging it out.
1081 * Decrypt the page now.
1082 * Keep it busy to prevent anyone from
1083 * accessing it during the decryption.
1084 */
1085 m->busy = TRUE;
1086 vm_page_decrypt(m, 0);
1087 assert(object == m->object);
1088 assert(m->busy);
1089 PAGE_WAKEUP_DONE(m);
1090
1091 /*
1092 * Retry from the top, in case
1093 * something changed while we were
1094 * decrypting.
1095 */
1096 continue;
1097 }
1098 ASSERT_PAGE_DECRYPTED(m);
1099
1100 if (m->object->code_signed) {
1101 /*
1102 * CODE SIGNING:
1103 * We just paged in a page from a signed
1104 * memory object but we don't need to
1105                          * validate it now.  We'll validate it
1106 * when it gets mapped into a user address
1107 * space for the first time or when the page
1108 * gets copied to another object as a result
1109 * of a copy-on-write.
1110 */
1111 }
1112
1113 /*
1114 * We mark the page busy and leave it on
1115 * the pageout queues. If the pageout
1116                  * daemon comes across it, then it will
1117 * remove the page from the queue, but not the object
1118 */
1119 #if TRACEFAULTPAGE
1120 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1121 #endif
1122 XPR(XPR_VM_FAULT,
1123 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1124 (integer_t)object, offset, (integer_t)m, 0, 0);
1125 assert(!m->busy);
1126 assert(!m->absent);
1127
1128 m->busy = TRUE;
1129 break;
1130 }
1131
1132
1133 /*
1134 * we get here when there is no page present in the object at
1135 * the offset we're interested in... we'll allocate a page
1136 * at this point if the pager associated with
1137 * this object can provide the data or we're the top object...
1138 * object is locked; m == NULL
1139 */
1140 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1141
1142 #if TRACEFAULTPAGE
1143 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1144 #endif
1145 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1146 /*
1147 * Allocate a new page for this object/offset pair
1148 */
1149 m = vm_page_grab();
1150 #if TRACEFAULTPAGE
1151 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1152 #endif
1153 if (m == VM_PAGE_NULL) {
1154
1155 vm_fault_cleanup(object, first_m);
1156 thread_interrupt_level(interruptible_state);
1157
1158 return (VM_FAULT_MEMORY_SHORTAGE);
1159 }
1160 vm_page_insert(m, object, offset);
1161 }
1162 if (look_for_page && !must_be_resident) {
1163 kern_return_t rc;
1164
1165 /*
1166 * If the memory manager is not ready, we
1167 * cannot make requests.
1168 */
1169 if (!object->pager_ready) {
1170 #if TRACEFAULTPAGE
1171 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1172 #endif
1173 if (m != VM_PAGE_NULL)
1174 VM_PAGE_FREE(m);
1175
1176 XPR(XPR_VM_FAULT,
1177 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1178 (integer_t)object, offset, 0, 0, 0);
1179
1180 /*
1181 * take an extra ref so object won't die
1182 */
1183 vm_object_reference_locked(object);
1184 vm_fault_cleanup(object, first_m);
1185 counter(c_vm_fault_page_block_backoff_kernel++);
1186
1187 vm_object_lock(object);
1188 assert(object->ref_count > 0);
1189
1190 if (!object->pager_ready) {
1191 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1192
1193 vm_object_unlock(object);
1194 if (wait_result == THREAD_WAITING)
1195 wait_result = thread_block(THREAD_CONTINUE_NULL);
1196 vm_object_deallocate(object);
1197
1198 goto backoff;
1199 } else {
1200 vm_object_unlock(object);
1201 vm_object_deallocate(object);
1202 thread_interrupt_level(interruptible_state);
1203
1204 return (VM_FAULT_RETRY);
1205 }
1206 }
1207 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1208 /*
1209 * If there are too many outstanding page
1210 * requests pending on this external object, we
1211 * wait for them to be resolved now.
1212 */
1213 #if TRACEFAULTPAGE
1214 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1215 #endif
1216 if (m != VM_PAGE_NULL)
1217 VM_PAGE_FREE(m);
1218 /*
1219 * take an extra ref so object won't die
1220 */
1221 vm_object_reference_locked(object);
1222
1223 vm_fault_cleanup(object, first_m);
1224
1225 counter(c_vm_fault_page_block_backoff_kernel++);
1226
1227 vm_object_lock(object);
1228 assert(object->ref_count > 0);
1229
1230 if (object->paging_in_progress > vm_object_pagein_throttle) {
1231 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1232
1233 vm_object_unlock(object);
1234 wait_result = thread_block(THREAD_CONTINUE_NULL);
1235 vm_object_deallocate(object);
1236
1237 goto backoff;
1238 } else {
1239 vm_object_unlock(object);
1240 vm_object_deallocate(object);
1241 thread_interrupt_level(interruptible_state);
1242
1243 return (VM_FAULT_RETRY);
1244 }
1245 }
1246 if (m != VM_PAGE_NULL) {
1247 /*
1248 * Indicate that the page is waiting for data
1249 * from the memory manager.
1250 */
1251 m->list_req_pending = TRUE;
1252 m->absent = TRUE;
1253 }
1254
1255 #if TRACEFAULTPAGE
1256 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1257 #endif
1258
1259 /*
1260 * It's possible someone called vm_object_destroy while we weren't
1261 * holding the object lock. If that has happened, then bail out
1262 * here.
1263 */
1264
1265 pager = object->pager;
1266
1267 if (pager == MEMORY_OBJECT_NULL) {
1268 vm_fault_cleanup(object, first_m);
1269 thread_interrupt_level(interruptible_state);
1270 return VM_FAULT_MEMORY_ERROR;
1271 }
1272
1273 /*
1274 * We have an absent page in place for the faulting offset,
1275 * so we can release the object lock.
1276 */
1277
1278 vm_object_unlock(object);
1279
1280 /*
1281 * If this object uses a copy_call strategy,
1282 * and we are interested in a copy of this object
1283 * (having gotten here only by following a
1284 * shadow chain), then tell the memory manager
1285 * via a flag added to the desired_access
1286 * parameter, so that it can detect a race
1287 * between our walking down the shadow chain
1288 * and its pushing pages up into a copy of
1289 * the object that it manages.
1290 */
1291 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1292 wants_copy_flag = VM_PROT_WANTS_COPY;
1293 else
1294 wants_copy_flag = VM_PROT_NONE;
1295
1296 XPR(XPR_VM_FAULT,
1297 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1298 (integer_t)object, offset, (integer_t)m,
1299 access_required | wants_copy_flag, 0);
1300
1301 /*
1302 * Call the memory manager to retrieve the data.
1303 */
1304 rc = memory_object_data_request(
1305 pager,
1306 offset + object->paging_offset,
1307 PAGE_SIZE,
1308 access_required | wants_copy_flag,
1309 (memory_object_fault_info_t)fault_info);
1310
1311 #if TRACEFAULTPAGE
1312 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1313 #endif
1314 vm_object_lock(object);
1315
1316 if (rc != KERN_SUCCESS) {
1317
1318 vm_fault_cleanup(object, first_m);
1319 thread_interrupt_level(interruptible_state);
1320
1321 return ((rc == MACH_SEND_INTERRUPTED) ?
1322 VM_FAULT_INTERRUPTED :
1323 VM_FAULT_MEMORY_ERROR);
1324 }
1325 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1326
1327 vm_fault_cleanup(object, first_m);
1328 thread_interrupt_level(interruptible_state);
1329
1330 return (VM_FAULT_INTERRUPTED);
1331 }
1332 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1333 /*
1334 * No page here means that the object we
1335 * initially looked up was "physically
1336 * contiguous" (i.e. device memory). However,
1337 * with Virtual VRAM, the object might not
1338 * be backed by that device memory anymore,
1339 * so we're done here only if the object is
1340 * still "phys_contiguous".
1341 * Otherwise, if the object is no longer
1342 * "phys_contiguous", we need to retry the
1343 * page fault against the object's new backing
1344 * store (different memory object).
1345 */
1346 break;
1347 }
1348 /*
1349 * potentially a pagein fault
1350 * if we make it through the state checks
1351                  * above, then we'll count it as such
1352 */
1353 my_fault = DBG_PAGEIN_FAULT;
1354
1355 /*
1356 * Retry with same object/offset, since new data may
1357 * be in a different page (i.e., m is meaningless at
1358 * this point).
1359 */
1360 continue;
1361 }
1362
1363 /*
1364 * We get here if the object has no pager, or an existence map
1365 * exists and indicates the page isn't present on the pager
1366 * or we're unwiring a page. If a pager exists, but there
1367 * is no existence map, then the m->absent case above handles
1368 * the ZF case when the pager can't provide the page
1369 */
1370 #if TRACEFAULTPAGE
1371 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1372 #endif
1373 if (object == first_object)
1374 first_m = m;
1375 else
1376 assert(m == VM_PAGE_NULL);
1377
1378 XPR(XPR_VM_FAULT,
1379 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1380 (integer_t)object, offset, (integer_t)m,
1381 (integer_t)object->shadow, 0);
1382
1383 next_object = object->shadow;
1384
1385 if (next_object == VM_OBJECT_NULL) {
1386 /*
1387                  * we've hit the bottom of the shadow chain,
1388 * fill the page in the top object with zeros.
1389 */
1390 assert(!must_be_resident);
1391
1392 if (object != first_object) {
1393 vm_object_paging_end(object);
1394 vm_object_unlock(object);
1395
1396 object = first_object;
1397 offset = first_offset;
1398 vm_object_lock(object);
1399 }
1400 m = first_m;
1401 assert(m->object == object);
1402 first_m = VM_PAGE_NULL;
1403
1404 /*
1405 * check for any conditions that prevent
1406 * us from creating a new zero-fill page
1407 * vm_fault_check will do all of the
1408 * fault cleanup in the case of an error condition
1409 * including resetting the thread_interrupt_level
1410 */
1411 error = vm_fault_check(object, m, first_m, interruptible_state);
1412
1413 if (error != VM_FAULT_SUCCESS)
1414 return (error);
1415
1416 if (m == VM_PAGE_NULL) {
1417 m = vm_page_grab();
1418
1419 if (m == VM_PAGE_NULL) {
1420 vm_fault_cleanup(object, VM_PAGE_NULL);
1421 thread_interrupt_level(interruptible_state);
1422
1423 return (VM_FAULT_MEMORY_SHORTAGE);
1424 }
1425 vm_page_insert(m, object, offset);
1426 }
1427 my_fault = vm_fault_zero_page(m, no_zero_fill);
1428
1429 break;
1430
1431 } else {
1432 /*
1433 * Move on to the next object. Lock the next
1434 * object before unlocking the current one.
1435 */
1436 if ((object != first_object) || must_be_resident)
1437 vm_object_paging_end(object);
1438
1439 offset += object->shadow_offset;
1440 fault_info->lo_offset += object->shadow_offset;
1441 fault_info->hi_offset += object->shadow_offset;
1442 access_required = VM_PROT_READ;
1443
1444 vm_object_lock(next_object);
1445 vm_object_unlock(object);
1446
1447 object = next_object;
1448 vm_object_paging_begin(object);
1449 }
1450 }
1451
1452 /*
1453 * PAGE HAS BEEN FOUND.
1454 *
1455 * This page (m) is:
1456 * busy, so that we can play with it;
1457 * not absent, so that nobody else will fill it;
1458 * possibly eligible for pageout;
1459 *
1460 * The top-level page (first_m) is:
1461 * VM_PAGE_NULL if the page was found in the
1462 * top-level object;
1463 * busy, not absent, and ineligible for pageout.
1464 *
1465 * The current object (object) is locked. A paging
1466 * reference is held for the current and top-level
1467 * objects.
1468 */
1469
1470 #if TRACEFAULTPAGE
1471 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1472 #endif
1473 #if EXTRA_ASSERTIONS
1474 if (m != VM_PAGE_NULL) {
1475 assert(m->busy && !m->absent);
1476 assert((first_m == VM_PAGE_NULL) ||
1477 (first_m->busy && !first_m->absent &&
1478 !first_m->active && !first_m->inactive));
1479 }
1480 #endif /* EXTRA_ASSERTIONS */
1481
1482 /*
1483 * ENCRYPTED SWAP:
1484 * If we found a page, we must have decrypted it before we
1485 * get here...
1486 */
1487 if (m != VM_PAGE_NULL) {
1488 ASSERT_PAGE_DECRYPTED(m);
1489 }
1490
1491 XPR(XPR_VM_FAULT,
1492 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1493 (integer_t)object, offset, (integer_t)m,
1494 (integer_t)first_object, (integer_t)first_m);
1495
1496 /*
1497 * If the page is being written, but isn't
1498 * already owned by the top-level object,
1499 * we have to copy it into a new page owned
1500 * by the top-level object.
1501 */
1502 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1503
1504 #if TRACEFAULTPAGE
1505 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1506 #endif
1507 if (fault_type & VM_PROT_WRITE) {
1508 vm_page_t copy_m;
1509
1510 /*
1511 * We only really need to copy if we
1512 * want to write it.
1513 */
1514 assert(!must_be_resident);
1515
1516 /*
1517                          * Are we protecting the system from
1518                          * backing store exhaustion?  If so,
1519                          * sleep unless we are privileged.
1520 */
1521 if (vm_backing_store_low) {
1522 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1523
1524 RELEASE_PAGE(m);
1525 vm_fault_cleanup(object, first_m);
1526
1527 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1528
1529 thread_block(THREAD_CONTINUE_NULL);
1530 thread_interrupt_level(interruptible_state);
1531
1532 return (VM_FAULT_RETRY);
1533 }
1534 }
1535 /*
1536 * If we try to collapse first_object at this
1537 * point, we may deadlock when we try to get
1538 * the lock on an intermediate object (since we
1539 * have the bottom object locked). We can't
1540 * unlock the bottom object, because the page
1541 * we found may move (by collapse) if we do.
1542 *
1543 * Instead, we first copy the page. Then, when
1544 * we have no more use for the bottom object,
1545 * we unlock it and try to collapse.
1546 *
1547 * Note that we copy the page even if we didn't
1548 * need to... that's the breaks.
1549 */
1550
1551 /*
1552 * Allocate a page for the copy
1553 */
1554 copy_m = vm_page_grab();
1555
1556 if (copy_m == VM_PAGE_NULL) {
1557 RELEASE_PAGE(m);
1558
1559 vm_fault_cleanup(object, first_m);
1560 thread_interrupt_level(interruptible_state);
1561
1562 return (VM_FAULT_MEMORY_SHORTAGE);
1563 }
1564 XPR(XPR_VM_FAULT,
1565 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1566 (integer_t)object, offset,
1567 (integer_t)m, (integer_t)copy_m, 0);
1568
1569 vm_page_copy(m, copy_m);
1570
1571 /*
1572 * If another map is truly sharing this
1573 * page with us, we have to flush all
1574 * uses of the original page, since we
1575 * can't distinguish those which want the
1576 * original from those which need the
1577 * new copy.
1578 *
1579 * XXXO If we know that only one map has
1580 * access to this page, then we could
1581 * avoid the pmap_disconnect() call.
1582 */
1583 if (m->pmapped)
1584 pmap_disconnect(m->phys_page);
1585
1586 assert(!m->cleaning);
1587
1588 /*
1589 * We no longer need the old page or object.
1590 */
1591 PAGE_WAKEUP_DONE(m);
1592 vm_object_paging_end(object);
1593 vm_object_unlock(object);
1594
1595 my_fault = DBG_COW_FAULT;
1596 VM_STAT_INCR(cow_faults);
1597 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1598 current_task()->cow_faults++;
1599
1600 object = first_object;
1601 offset = first_offset;
1602
1603 vm_object_lock(object);
1604 /*
1605 * get rid of the place holder
1606 * page that we soldered in earlier
1607 */
1608 VM_PAGE_FREE(first_m);
1609 first_m = VM_PAGE_NULL;
1610
1611 /*
1612 * and replace it with the
1613 * page we just copied into
1614 */
1615 assert(copy_m->busy);
1616 vm_page_insert(copy_m, object, offset);
1617 copy_m->dirty = TRUE;
1618
1619 m = copy_m;
1620 /*
1621 * Now that we've gotten the copy out of the
1622 * way, let's try to collapse the top object.
1623 * But we have to play ugly games with
1624 * paging_in_progress to do that...
1625 */
1626 vm_object_paging_end(object);
1627 vm_object_collapse(object, offset, TRUE);
1628 vm_object_paging_begin(object);
1629
1630 } else
1631 *protection &= (~VM_PROT_WRITE);
1632 }
1633 /*
1634 * Now check whether the page needs to be pushed into the
1635 * copy object. The use of asymmetric copy on write for
1636 * shared temporary objects means that we may do two copies to
1637 * satisfy the fault; one above to get the page from a
1638 * shadowed object, and one here to push it into the copy.
1639 */
1640 try_failed_count = 0;
1641
1642 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1643 vm_object_offset_t copy_offset;
1644 vm_page_t copy_m;
1645
1646 #if TRACEFAULTPAGE
1647 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1648 #endif
1649 /*
1650 * If the page is being written, but hasn't been
1651 * copied to the copy-object, we have to copy it there.
1652 */
1653 if ((fault_type & VM_PROT_WRITE) == 0) {
1654 *protection &= ~VM_PROT_WRITE;
1655 break;
1656 }
1657
1658 /*
1659 * If the page was guaranteed to be resident,
1660 * we must have already performed the copy.
1661 */
1662 if (must_be_resident)
1663 break;
1664
1665 /*
1666 * Try to get the lock on the copy_object.
1667 */
1668 if (!vm_object_lock_try(copy_object)) {
1669
1670 vm_object_unlock(object);
1671 try_failed_count++;
1672
1673 mutex_pause(try_failed_count); /* wait a bit */
1674 vm_object_lock(object);
1675
1676 continue;
1677 }
1678 try_failed_count = 0;
1679
1680 /*
1681 * Make another reference to the copy-object,
1682 * to keep it from disappearing during the
1683 * copy.
1684 */
1685 vm_object_reference_locked(copy_object);
1686
1687 /*
1688 * Does the page exist in the copy?
1689 */
1690 copy_offset = first_offset - copy_object->shadow_offset;
1691
1692 if (copy_object->size <= copy_offset)
1693 /*
1694 * Copy object doesn't cover this page -- do nothing.
1695 */
1696 ;
1697 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1698 /*
1699 * Page currently exists in the copy object
1700 */
1701 if (copy_m->busy) {
1702 /*
1703 * If the page is being brought
1704 * in, wait for it and then retry.
1705 */
1706 RELEASE_PAGE(m);
1707
1708 /*
1709 * take an extra ref so object won't die
1710 */
1711 vm_object_reference_locked(copy_object);
1712 vm_object_unlock(copy_object);
1713 vm_fault_cleanup(object, first_m);
1714 counter(c_vm_fault_page_block_backoff_kernel++);
1715
1716 vm_object_lock(copy_object);
1717 assert(copy_object->ref_count > 0);
1718 VM_OBJ_RES_DECR(copy_object);
1719 vm_object_lock_assert_exclusive(copy_object);
1720 copy_object->ref_count--;
1721 assert(copy_object->ref_count > 0);
1722 copy_m = vm_page_lookup(copy_object, copy_offset);
1723 /*
1724 * ENCRYPTED SWAP:
1725 * it's OK if the "copy_m" page is encrypted,
1726 * because we're not moving it nor handling its
1727 * contents.
1728 */
1729 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1730 PAGE_ASSERT_WAIT(copy_m, interruptible);
1731
1732 vm_object_unlock(copy_object);
1733 wait_result = thread_block(THREAD_CONTINUE_NULL);
1734 vm_object_deallocate(copy_object);
1735
1736 goto backoff;
1737 } else {
1738 vm_object_unlock(copy_object);
1739 vm_object_deallocate(copy_object);
1740 thread_interrupt_level(interruptible_state);
1741
1742 return (VM_FAULT_RETRY);
1743 }
1744 }
1745 }
1746 else if (!PAGED_OUT(copy_object, copy_offset)) {
1747 /*
1748 * If PAGED_OUT is TRUE, then the page used to exist
1749 * in the copy-object, and has already been paged out.
1750 * We don't need to repeat this. If PAGED_OUT is
1751 * FALSE, then either we don't know (!pager_created,
1752 * for example) or it hasn't been paged out.
1753 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1754 * We must copy the page to the copy object.
1755 */
1756
1757 if (vm_backing_store_low) {
1758 /*
1759                          * we are protecting the system from
1760                          * backing store exhaustion, so
1761                          * sleep unless we are privileged.
1762 */
1763 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1764 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1765
1766 RELEASE_PAGE(m);
1767 VM_OBJ_RES_DECR(copy_object);
1768 vm_object_lock_assert_exclusive(copy_object);
1769 copy_object->ref_count--;
1770 assert(copy_object->ref_count > 0);
1771
1772 vm_object_unlock(copy_object);
1773 vm_fault_cleanup(object, first_m);
1774 thread_block(THREAD_CONTINUE_NULL);
1775 thread_interrupt_level(interruptible_state);
1776
1777 return (VM_FAULT_RETRY);
1778 }
1779 }
1780 /*
1781 * Allocate a page for the copy
1782 */
1783 copy_m = vm_page_alloc(copy_object, copy_offset);
1784
1785 if (copy_m == VM_PAGE_NULL) {
1786 RELEASE_PAGE(m);
1787
1788 VM_OBJ_RES_DECR(copy_object);
1789 vm_object_lock_assert_exclusive(copy_object);
1790 copy_object->ref_count--;
1791 assert(copy_object->ref_count > 0);
1792
1793 vm_object_unlock(copy_object);
1794 vm_fault_cleanup(object, first_m);
1795 thread_interrupt_level(interruptible_state);
1796
1797 return (VM_FAULT_MEMORY_SHORTAGE);
1798 }
1799 /*
1800 * Must copy page into copy-object.
1801 */
1802 vm_page_copy(m, copy_m);
1803
1804 /*
1805 * If the old page was in use by any users
1806 * of the copy-object, it must be removed
1807 * from all pmaps. (We can't know which
1808 * pmaps use it.)
1809 */
1810 if (m->pmapped)
1811 pmap_disconnect(m->phys_page);
1812
1813 /*
1814 * If there's a pager, then immediately
1815 * page out this page, using the "initialize"
1816 * option. Else, we use the copy.
1817 */
1818 if ((!copy_object->pager_created)
1819 #if MACH_PAGEMAP
1820 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1821 #endif
1822 ) {
1823
1824 vm_page_lockspin_queues();
1825 assert(!m->cleaning);
1826 vm_page_activate(copy_m);
1827 vm_page_unlock_queues();
1828
1829 copy_m->dirty = TRUE;
1830 PAGE_WAKEUP_DONE(copy_m);
1831 }
1832 else {
1833 assert(copy_m->busy == TRUE);
1834 assert(!m->cleaning);
1835
1836 /*
1837 * dirty is protected by the object lock
1838 */
1839 copy_m->dirty = TRUE;
1840
1841 /*
1842 * The page is already ready for pageout:
1843 * not on pageout queues and busy.
1844 * Unlock everything except the
1845 * copy_object itself.
1846 */
1847 vm_object_unlock(object);
1848
1849 /*
1850 * Write the page to the copy-object,
1851 * flushing it from the kernel.
1852 */
1853 vm_pageout_initialize_page(copy_m);
1854
1855 /*
1856 * Since the pageout may have
1857 * temporarily dropped the
1858 * copy_object's lock, we
1859 * check whether we'll have
1860 * to deallocate the hard way.
1861 */
1862 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1863 vm_object_unlock(copy_object);
1864 vm_object_deallocate(copy_object);
1865 vm_object_lock(object);
1866
1867 continue;
1868 }
1869 /*
1870 * Pick back up the old object's
1871 * lock. [It is safe to do so,
1872 * since it must be deeper in the
1873 * object tree.]
1874 */
1875 vm_object_lock(object);
1876 }
1877 /*
1878 * Because we're pushing a page upward
1879 * in the object tree, we must restart
1880 * any faults that are waiting here.
1881 * [Note that this is an expansion of
1882 * PAGE_WAKEUP that uses the THREAD_RESTART
1883 * wait result]. Can't turn off the page's
1884 * busy bit because we're not done with it.
1885 */
1886 if (m->wanted) {
1887 m->wanted = FALSE;
1888 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1889 }
1890 }
1891 /*
1892 * The reference count on copy_object must be
1893 * at least 2: one for our extra reference,
1894 * and at least one from the outside world
1895 * (we checked that when we last locked
1896 * copy_object).
1897 */
1898 vm_object_lock_assert_exclusive(copy_object);
1899 copy_object->ref_count--;
1900 assert(copy_object->ref_count > 0);
1901
1902 VM_OBJ_RES_DECR(copy_object);
1903 vm_object_unlock(copy_object);
1904
1905 break;
1906 }
1907 *result_page = m;
1908 *top_page = first_m;
1909
1910 XPR(XPR_VM_FAULT,
1911 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1912 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1913
1914 if (m != VM_PAGE_NULL) {
1915 if (my_fault == DBG_PAGEIN_FAULT) {
1916
1917 VM_STAT_INCR(pageins);
1918 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1919 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1920 current_task()->pageins++;
1921
1922 if (m->object->internal) {
1923 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1924 } else {
1925 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1926 }
1927
1928 /*
1929 * evaluate access pattern and update state
1930 * vm_fault_deactivate_behind depends on the
1931 * state being up to date
1932 */
1933 vm_fault_is_sequential(object, offset, fault_info->behavior);
1934
1935 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1936 }
1937 if (type_of_fault)
1938 *type_of_fault = my_fault;
1939 } else
1940 vm_object_unlock(object);
1941
1942 thread_interrupt_level(interruptible_state);
1943
1944 #if TRACEFAULTPAGE
1945 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1946 #endif
1947 return (VM_FAULT_SUCCESS);
1948
1949 backoff:
1950 thread_interrupt_level(interruptible_state);
1951
1952 if (wait_result == THREAD_INTERRUPTED)
1953 return (VM_FAULT_INTERRUPTED);
1954 return (VM_FAULT_RETRY);
1955
1956 #undef RELEASE_PAGE
1957 }
1958
1959
1960
1961 /*
1962 * page queue lock must NOT be held
1963 * m->object must be locked
1964 *
1965 * NOTE: m->object could be locked "shared" only if we are called
1966 * from vm_fault() as part of a soft fault. If so, we must be
1967 * careful not to modify the VM object in any way that is not
1968 * legal under a shared lock...
1969 */
1970 unsigned long cs_enter_tainted_rejected = 0;
1971 unsigned long cs_enter_tainted_accepted = 0;
1972 kern_return_t
1973 vm_fault_enter(vm_page_t m,
1974 pmap_t pmap,
1975 vm_map_offset_t vaddr,
1976 vm_prot_t prot,
1977 boolean_t wired,
1978 boolean_t change_wiring,
1979 boolean_t no_cache,
1980 int *type_of_fault)
1981 {
1982 unsigned int cache_attr;
1983 kern_return_t kr;
1984 boolean_t previously_pmapped = m->pmapped;
1985
1986 vm_object_lock_assert_held(m->object);
1987 #if DEBUG
1988 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
1989 #endif /* DEBUG */
1990
1991 if (m->phys_page == vm_page_guard_addr) {
1992 assert(m->fictitious);
1993 return KERN_SUCCESS;
1994 }
1995
1996 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
1997
1998 if (m->object->code_signed && pmap != kernel_pmap &&
1999 (!m->cs_validated || m->wpmapped)) {
2000 vm_object_lock_assert_exclusive(m->object);
2001
2002 if (m->cs_validated && m->wpmapped) {
2003 vm_cs_revalidates++;
2004 }
2005
2006 /*
2007 * CODE SIGNING:
2008 * This page comes from a VM object backed by a signed
2009 * memory object. We are about to enter it into a process
2010 * address space, so we need to validate its signature.
2011 */
2012 /* VM map is locked, so 1 ref will remain on VM object */
2013 vm_page_validate_cs(m);
2014 }
2015
2016 if (m->pmapped == FALSE) {
2017 /*
2018 * This is the first time this page is being
2019 * mapped in an address space (pmapped == FALSE).
2020 *
2021 * Part of that page may still be in the data cache
2022 * and not flushed to memory. In case we end up
2023 * accessing that page via the instruction cache,
2024 * we need to ensure that the 2 caches are in sync.
2025 */
2026 pmap_sync_page_data_phys(m->phys_page);
2027
2028 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2029 /*
2030 * found it in the cache, but this
2031 * is the first fault-in of the page (m->pmapped == FALSE)
2032 * so it must have come in as part of
2033 * a cluster... account 1 pagein against it
2034 */
2035 VM_STAT_INCR(pageins);
2036 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2037
2038 if (m->object->internal) {
2039 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2040 } else {
2041 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2042 }
2043
2044 current_task()->pageins++;
2045
2046 *type_of_fault = DBG_PAGEIN_FAULT;
2047 }
2048 VM_PAGE_CONSUME_CLUSTERED(m);
2049
2050 } else if (cache_attr != VM_WIMG_DEFAULT)
2051 pmap_sync_page_attributes_phys(m->phys_page);
2052
2053 if (*type_of_fault != DBG_COW_FAULT) {
2054 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2055
2056 if (pmap == kernel_pmap) {
2057 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2058 }
2059 }
2060
2061 if (m->cs_tainted) {
2062 /*
2063 * CODE SIGNING:
2064 * This page has been tainted and cannot be trusted.
2065 * Let's notify the current process and let it take any
2066 * necessary precautions before we enter the tainted page
2067 * into its address space.
2068 */
2069 if (cs_invalid_page()) {
2070 /* reject the tainted page: abort the page fault */
2071 kr = KERN_MEMORY_ERROR;
2072 cs_enter_tainted_rejected++;
2073 } else {
2074 /* proceed with the tainted page */
2075 kr = KERN_SUCCESS;
2076 cs_enter_tainted_accepted++;
2077 }
2078 if (cs_debug || kr != KERN_SUCCESS) {
2079 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2080 "page %p obj %p off 0x%llx *** TAINTED ***\n",
2081 (long long)vaddr, m, m->object, m->offset);
2082 }
2083 } else {
2084 /* proceed with the valid page */
2085 kr = KERN_SUCCESS;
2086 }
2087
2088 if (kr == KERN_SUCCESS) {
2089 /*
2090 * NOTE: we may only hold the vm_object lock SHARED
2091 * at this point, but the update of pmapped is ok
2092 * since this is the ONLY bit updated behind the SHARED
2093 * lock... however, we need to figure out how to do an atomic
2094 * update on a bit field to make this less fragile... right
2095 * now I don't know how to coerce 'C' to give me the offset info
2096 * that's needed for an AtomicCompareAndSwap
2097 */
2098 m->pmapped = TRUE;
2099 if (prot & VM_PROT_WRITE) {
2100 vm_object_lock_assert_exclusive(m->object);
2101 m->wpmapped = TRUE;
2102 }
2103
2104 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2105 }
2106
2107 /*
2108 * Hold queues lock to manipulate
2109 * the page queues. Change wiring
2110 * case is obvious.
2111 */
2112 if (change_wiring) {
2113 vm_page_lockspin_queues();
2114
2115 if (wired) {
2116 if (kr == KERN_SUCCESS) {
2117 vm_page_wire(m);
2118 }
2119 } else {
2120 vm_page_unwire(m);
2121 }
2122 vm_page_unlock_queues();
2123
2124 } else {
2125 if (kr != KERN_SUCCESS) {
2126 vm_page_lock_queues();
2127 vm_page_deactivate(m);
2128 vm_page_unlock_queues();
2129 } else {
2130 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2131 vm_page_lockspin_queues();
2132 /*
2133 * test again now that we hold the page queue lock
2134 */
2135 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2136
2137 /*
2138 * If this is a no_cache mapping and the page has never been
2139 * mapped before or was previously a no_cache page, then we
2140 * want to leave pages in the speculative state so that they
2141 * can be readily recycled if free memory runs low. Otherwise
2142 * the page is activated as normal.
2143 */
2144
2145 if (no_cache && (!previously_pmapped || m->no_cache)) {
2146 m->no_cache = TRUE;
2147
2148 if (m->active || m->inactive)
2149 VM_PAGE_QUEUES_REMOVE(m);
2150
2151 if (!m->speculative)
2152 vm_page_speculate(m, TRUE);
2153
2154 } else if (!m->active && !m->inactive)
2155 vm_page_activate(m);
2156
2157 }
2158
2159 vm_page_unlock_queues();
2160 }
2161 }
2162 }
2163 return kr;
2164 }
2165
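/*
 * A minimal, compiled-out sketch related to the NOTE in vm_fault_enter()
 * above: if "pmapped" were widened from a bit field to its own 32-bit
 * word (it is not today, so this cannot be used as-is), the lock-free
 * update could be expressed with OSCompareAndSwap() from
 * <libkern/OSAtomic.h>.  The helper name and flag layout below are
 * hypothetical, for illustration only.
 */
#if 0
static void
vm_page_set_flag_atomic(volatile UInt32 *flag_word, UInt32 flag_bit)
{
	UInt32	old_value;

	do {
		old_value = *flag_word;
	} while (!OSCompareAndSwap(old_value, old_value | flag_bit, flag_word));
}
#endif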
2166
2167 /*
2168 * Routine: vm_fault
2169 * Purpose:
2170 * Handle page faults, including pseudo-faults
2171 * used to change the wiring status of pages.
2172 * Returns:
2173 * Explicit continuations have been removed.
2174 * Implementation:
2175 * vm_fault and vm_fault_page save mucho state
2176 * in the moral equivalent of a closure. The state
2177 * structure is allocated when first entering vm_fault
2178 * and deallocated when leaving vm_fault.
2179 */
2180
2181 extern int _map_enter_debug;
2182
2183 unsigned long vm_fault_collapse_total = 0;
2184 unsigned long vm_fault_collapse_skipped = 0;
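
/*
 * For reference only: a machine-dependent trap handler typically drives
 * vm_fault() roughly as sketched below.  The function and variable names
 * here are illustrative, not the actual arch-specific code.
 */
#if 0
static kern_return_t
handle_user_page_fault(vm_map_t map, vm_map_offset_t vaddr, vm_prot_t prot)
{
	return vm_fault(map,
			vm_map_trunc_page(vaddr),	/* fault on the page boundary */
			prot,				/* VM_PROT_READ and/or VM_PROT_WRITE */
			FALSE,				/* not a wiring change */
			THREAD_ABORTSAFE,		/* user faults may be aborted */
			NULL,				/* no caller-supplied pmap */
			0);
}
#endif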
2185
2186 kern_return_t
2187 vm_fault(
2188 vm_map_t map,
2189 vm_map_offset_t vaddr,
2190 vm_prot_t fault_type,
2191 boolean_t change_wiring,
2192 int interruptible,
2193 pmap_t caller_pmap,
2194 vm_map_offset_t caller_pmap_addr)
2195 {
2196 vm_map_version_t version; /* Map version for verification */
2197 boolean_t wired; /* Should mapping be wired down? */
2198 vm_object_t object; /* Top-level object */
2199 vm_object_offset_t offset; /* Top-level offset */
2200 vm_prot_t prot; /* Protection for mapping */
2201 vm_object_t old_copy_object; /* Saved copy object */
2202 vm_page_t result_page; /* Result of vm_fault_page */
2203 vm_page_t top_page; /* Placeholder page */
2204 kern_return_t kr;
2205
2206 vm_page_t m; /* Fast access to result_page */
2207 kern_return_t error_code;
2208 vm_object_t cur_object;
2209 vm_object_offset_t cur_offset;
2210 vm_page_t cur_m;
2211 vm_object_t new_object;
2212 int type_of_fault;
2213 pmap_t pmap;
2214 boolean_t interruptible_state;
2215 vm_map_t real_map = map;
2216 vm_map_t original_map = map;
2217 vm_prot_t original_fault_type;
2218 struct vm_object_fault_info fault_info;
2219 boolean_t need_collapse = FALSE;
2220 int object_lock_type = 0;
2221 int cur_object_lock_type;
2222
2223
2224 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2225 (int)((uint64_t)vaddr >> 32),
2226 (int)vaddr,
2227 0,
2228 0,
2229 0);
2230
2231 if (get_preemption_level() != 0) {
2232 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2233 (int)((uint64_t)vaddr >> 32),
2234 (int)vaddr,
2235 KERN_FAILURE,
2236 0,
2237 0);
2238
2239 return (KERN_FAILURE);
2240 }
2241 interruptible_state = thread_interrupt_level(interruptible);
2242
2243 VM_STAT_INCR(faults);
2244 current_task()->faults++;
2245 original_fault_type = fault_type;
2246
2247 if (fault_type & VM_PROT_WRITE)
2248 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2249 else
2250 object_lock_type = OBJECT_LOCK_SHARED;
2251
2252 cur_object_lock_type = OBJECT_LOCK_SHARED;
2253
2254 RetryFault:
2255 /*
2256 * assume we will hit a page in the cache
2257 * otherwise, explicitly override with
2258 * the real fault type once we determine it
2259 */
2260 type_of_fault = DBG_CACHE_HIT_FAULT;
2261
2262 /*
2263 * Find the backing store object and offset into
2264 * it to begin the search.
2265 */
2266 fault_type = original_fault_type;
2267 map = original_map;
2268 vm_map_lock_read(map);
2269
2270 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2271 object_lock_type, &version,
2272 &object, &offset, &prot, &wired,
2273 &fault_info,
2274 &real_map);
2275
2276 if (kr != KERN_SUCCESS) {
2277 vm_map_unlock_read(map);
2278 goto done;
2279 }
2280 pmap = real_map->pmap;
2281 fault_info.interruptible = interruptible;
2282
2283 /*
2284 * If the page is wired, we must fault for the current protection
2285 * value, to avoid further faults.
2286 */
2287 if (wired) {
2288 fault_type = prot | VM_PROT_WRITE;
2289 /*
2290 * since we're treating this fault as a 'write'
2291 * we must hold the top object lock exclusively
2292 */
2293 if (object_lock_type == OBJECT_LOCK_SHARED) {
2294
2295 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2296
2297 if (vm_object_lock_upgrade(object) == FALSE) {
2298 /*
2299 * couldn't upgrade, so explicitly
2300 * take the lock exclusively
2301 */
2302 vm_object_lock(object);
2303 }
2304 }
2305 }
2306
2307 #if VM_FAULT_CLASSIFY
2308 /*
2309 * Temporary data gathering code
2310 */
2311 vm_fault_classify(object, offset, fault_type);
2312 #endif
2313 /*
2314 * Fast fault code. The basic idea is to do as much as
2315 * possible while holding the map lock and object locks.
2316 * Busy pages are not used until the object lock has to
2317 * be dropped to do something (copy, zero fill, pmap enter).
2318 * Similarly, paging references aren't acquired until that
2319 * point, and object references aren't used.
2320 *
2321 * If we can figure out what to do
2322 * (zero fill, copy on write, pmap enter) while holding
2323 * the locks, then it gets done. Otherwise, we give up,
2324 * and use the original fault path (which doesn't hold
2325 * the map lock, and relies on busy pages).
2326 * The give up cases include:
2327 * - Have to talk to pager.
2328 * - Page is busy, absent or in error.
2329 * - Pager has locked out desired access.
2330 * - Fault needs to be restarted.
2331 * - Have to push page into copy object.
2332 *
2333 * The code is an infinite loop that moves one level down
2334 * the shadow chain each time. cur_object and cur_offset
2335 * refer to the current object being examined. object and offset
2336 * are the original object from the map. The loop is at the
2337 * top level if and only if object and cur_object are the same.
2338 *
2339 * Invariants: Map lock is held throughout. Lock is held on
2340 * original object and cur_object (if different) when
2341 * continuing or exiting loop.
2342 *
2343 */
2344
2345
2346 /*
2347 * If this page is to be inserted in a copy delay object
2348 * for writing, and if the object has a copy, then the
2349 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
2350 */
2351 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2352 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2353 goto handle_copy_delay;
2354
2355 cur_object = object;
2356 cur_offset = offset;
2357
2358 while (TRUE) {
2359 m = vm_page_lookup(cur_object, cur_offset);
2360
2361 if (m != VM_PAGE_NULL) {
2362 if (m->busy) {
2363 wait_result_t result;
2364
2365 /*
2366 * in order to do the PAGE_ASSERT_WAIT, we must
2367 * have object that 'm' belongs to locked exclusively
2368 */
2369 if (object != cur_object) {
2370 vm_object_unlock(object);
2371
2372 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2373
2374 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2375
2376 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2377 /*
2378 * couldn't upgrade so go do a full retry
2379 * immediately since we've already dropped
2380 * the top object lock associated with this page
2381 * and the current one got dropped due to the
2382 * failed upgrade... the state is no longer valid
2383 */
2384 vm_map_unlock_read(map);
2385 if (real_map != map)
2386 vm_map_unlock(real_map);
2387
2388 goto RetryFault;
2389 }
2390 }
2391 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2392
2393 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2394
2395 if (vm_object_lock_upgrade(object) == FALSE) {
2396 /*
2397 * couldn't upgrade, so explicitly take the lock
2398 * exclusively and go relookup the page since we
2399 * will have dropped the object lock and
2400 * a different thread could have inserted
2401 * a page at this offset
2402 * no need for a full retry since we're
2403 * at the top level of the object chain
2404 */
2405 vm_object_lock(object);
2406
2407 continue;
2408 }
2409 }
2410 vm_map_unlock_read(map);
2411 if (real_map != map)
2412 vm_map_unlock(real_map);
2413
2414 result = PAGE_ASSERT_WAIT(m, interruptible);
2415
2416 vm_object_unlock(cur_object);
2417
2418 if (result == THREAD_WAITING) {
2419 result = thread_block(THREAD_CONTINUE_NULL);
2420
2421 counter(c_vm_fault_page_block_busy_kernel++);
2422 }
2423 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2424 goto RetryFault;
2425
2426 kr = KERN_ABORTED;
2427 goto done;
2428 }
2429 if (m->phys_page == vm_page_guard_addr) {
2430 /*
2431 * Guard page: let the slow path deal with it
2432 */
2433 break;
2434 }
2435 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2436 /*
2437 * Unusual case... let the slow path deal with it
2438 */
2439 break;
2440 }
2441 if (m->encrypted) {
2442 /*
2443 * ENCRYPTED SWAP:
2444 * We've soft-faulted (because it's not in the page
2445 * table) on an encrypted page.
2446 * Keep the page "busy" so that no one messes with
2447 * it during the decryption.
2448 * Release the extra locks we're holding, keep only
2449 * the page's VM object lock.
2450 *
2451 * in order to set 'busy' on 'm', we must
2452 * have object that 'm' belongs to locked exclusively
2453 */
2454 if (object != cur_object) {
2455 vm_object_unlock(object);
2456
2457 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2458
2459 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2460
2461 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2462 /*
2463 * couldn't upgrade so go do a full retry
2464 * immediately since we've already dropped
2465 * the top object lock associated with this page
2466 * and the current one got dropped due to the
2467 * failed upgrade... the state is no longer valid
2468 */
2469 vm_map_unlock_read(map);
2470 if (real_map != map)
2471 vm_map_unlock(real_map);
2472
2473 goto RetryFault;
2474 }
2475 }
2476 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2477
2478 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2479
2480 if (vm_object_lock_upgrade(object) == FALSE) {
2481 /*
2482 * couldn't upgrade, so explicitly take the lock
2483 * exclusively and go relookup the page since we
2484 * will have dropped the object lock and
2485 * a different thread could have inserted
2486 * a page at this offset
2487 * no need for a full retry since we're
2488 * at the top level of the object chain
2489 */
2490 vm_object_lock(object);
2491
2492 continue;
2493 }
2494 }
2495 m->busy = TRUE;
2496
2497 vm_map_unlock_read(map);
2498 if (real_map != map)
2499 vm_map_unlock(real_map);
2500
2501 vm_page_decrypt(m, 0);
2502
2503 assert(m->busy);
2504 PAGE_WAKEUP_DONE(m);
2505
2506 vm_object_unlock(cur_object);
2507 /*
2508 * Retry from the top, in case anything
2509 * changed while we were decrypting...
2510 */
2511 goto RetryFault;
2512 }
2513 ASSERT_PAGE_DECRYPTED(m);
2514
2515 if (m->object->code_signed && map != kernel_map &&
2516 (!m->cs_validated || m->wpmapped)) {
2517 /*
2518 * We might need to validate this page
2519 * against its code signature, so we
2520 * want to hold the VM object exclusively.
2521 */
2522 if (object != cur_object) {
2523 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2524 vm_object_unlock(object);
2525 vm_object_unlock(cur_object);
2526
2527 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2528
2529 vm_map_unlock_read(map);
2530 if (real_map != map)
2531 vm_map_unlock(real_map);
2532
2533 goto RetryFault;
2534 }
2535
2536 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2537
2538 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2539
2540 if (vm_object_lock_upgrade(object) == FALSE) {
2541 /*
2542 * couldn't upgrade, so explicitly take the lock
2543 * exclusively and go relookup the page since we
2544 * will have dropped the object lock and
2545 * a different thread could have inserted
2546 * a page at this offset
2547 * no need for a full retry since we're
2548 * at the top level of the object chain
2549 */
2550 vm_object_lock(object);
2551
2552 continue;
2553 }
2554 }
2555 }
2556 /*
2557 * Two cases of map in faults:
2558 * - At top level w/o copy object.
2559 * - Read fault anywhere.
2560 * --> must disallow write.
2561 */
2562
2563 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2564 if ((fault_type & VM_PROT_WRITE) == 0) {
2565 /*
2566 * This is not a "write" fault, so we
2567 * might not have taken the object lock
2568 * exclusively and we might not be able
2569 * to update the "wpmapped" bit in
2570 * vm_fault_enter().
2571 * Let's just grant read access to
2572 * the page for now and we'll
2573 * soft-fault again if we need write
2574 * access later...
2575 */
2576 prot &= ~VM_PROT_WRITE;
2577 }
2578 goto FastPmapEnter;
2579 }
2580
2581 if ((fault_type & VM_PROT_WRITE) == 0) {
2582
2583 prot &= ~VM_PROT_WRITE;
2584
2585 /*
2586 * Set up to map the page...
2587 * mark the page busy, drop
2588 * unneeded object lock
2589 */
2590 if (object != cur_object) {
2591 /*
2592 * don't need the original object anymore
2593 */
2594 vm_object_unlock(object);
2595
2596 /*
2597 * switch to the object that has the new page
2598 */
2599 object = cur_object;
2600 object_lock_type = cur_object_lock_type;
2601 }
2602 FastPmapEnter:
2603 /*
2604 * prepare for the pmap_enter...
2605 * object and map are both locked
2606 * m contains valid data
2607 * object == m->object
2608 * cur_object == NULL or it's been unlocked
2609 * no paging references on either object or cur_object
2610 */
2611 #if MACH_KDB
2612 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2613 prot &= ~VM_PROT_WRITE;
2614 #endif
2615 if (caller_pmap) {
2616 kr = vm_fault_enter(m,
2617 caller_pmap,
2618 caller_pmap_addr,
2619 prot,
2620 wired,
2621 change_wiring,
2622 fault_info.no_cache,
2623 &type_of_fault);
2624 } else {
2625 kr = vm_fault_enter(m,
2626 pmap,
2627 vaddr,
2628 prot,
2629 wired,
2630 change_wiring,
2631 fault_info.no_cache,
2632 &type_of_fault);
2633 }
2634
2635 if (need_collapse == TRUE)
2636 vm_object_collapse(object, offset, TRUE);
2637
2638 if (type_of_fault == DBG_PAGEIN_FAULT) {
2639 /*
2640 * evaluate access pattern and update state
2641 * vm_fault_deactivate_behind depends on the
2642 * state being up to date
2643 */
2644 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2645
2646 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2647 }
2648 /*
2649 * That's it, clean up and return.
2650 */
2651 if (m->busy)
2652 PAGE_WAKEUP_DONE(m);
2653
2654 vm_object_unlock(object);
2655
2656 vm_map_unlock_read(map);
2657 if (real_map != map)
2658 vm_map_unlock(real_map);
2659
2660 goto done;
2661 }
2662 /*
2663 * COPY ON WRITE FAULT
2664 *
2665 * If objects match, then
2666 * object->copy must not be NULL (else control
2667 * would be in previous code block), and we
2668 * have a potential push into the copy object
2669 * which we can't cope with here.
2670 */
2671 if (cur_object == object) {
2672 /*
2673 * must take the slow path to
2674 * deal with the copy push
2675 */
2676 break;
2677 }
2678 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2679
2680 /*
2681 * This is now a shadow based copy on write
2682 * fault -- it requires a copy up the shadow
2683 * chain.
2684 *
2685 * Allocate a page in the original top level
2686 * object. Give up if allocate fails. Also
2687 * need to remember current page, as it's the
2688 * source of the copy.
2689 *
2690 * at this point we hold locks on both
2691 * object and cur_object... no need to take
2692 * paging refs or mark pages BUSY since
2693 * we don't drop either object lock until
2694 * the page has been copied and inserted
2695 */
2696 cur_m = m;
2697 m = vm_page_grab();
2698
2699 if (m == VM_PAGE_NULL) {
2700 /*
2701 * no free page currently available...
2702 * must take the slow path
2703 */
2704 break;
2705 }
2706 /*
2707 * Now do the copy. Mark the source page busy...
2708 *
2709 * NOTE: This code holds the map lock across
2710 * the page copy.
2711 */
2712 vm_page_copy(cur_m, m);
2713 vm_page_insert(m, object, offset);
2714 m->dirty = TRUE;
2715
2716 /*
2717 * Now cope with the source page and object
2718 */
2719 if (object->ref_count > 1 && cur_m->pmapped)
2720 pmap_disconnect(cur_m->phys_page);
2721
2722 need_collapse = TRUE;
2723
2724 if (!cur_object->internal &&
2725 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2726 /*
2727 * The object from which we've just
2728 * copied a page is most probably backed
2729 * by a vnode. We don't want to waste too
2730 * much time trying to collapse the VM objects
2731 * and create a bottleneck when several tasks
2732 * map the same file.
2733 */
2734 if (cur_object->copy == object) {
2735 /*
2736 * Shared mapping or no COW yet.
2737 * We can never collapse a copy
2738 * object into its backing object.
2739 */
2740 need_collapse = FALSE;
2741 } else if (cur_object->copy == object->shadow &&
2742 object->shadow->resident_page_count == 0) {
2743 /*
2744 * Shared mapping after a COW occurred.
2745 */
2746 need_collapse = FALSE;
2747 }
2748 }
2749 vm_object_unlock(cur_object);
2750
2751 if (need_collapse == FALSE)
2752 vm_fault_collapse_skipped++;
2753 vm_fault_collapse_total++;
2754
2755 type_of_fault = DBG_COW_FAULT;
2756 VM_STAT_INCR(cow_faults);
2757 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2758 current_task()->cow_faults++;
2759
2760 goto FastPmapEnter;
2761
2762 } else {
2763 /*
2764 * No page at cur_object, cur_offset... m == NULL
2765 */
2766 if (cur_object->pager_created) {
2767 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2768 /*
2769 * May have to talk to a pager...
2770 * take the slow path.
2771 */
2772 break;
2773 }
2774 /*
2775 * existence map present and indicates
2776 * that the pager doesn't have this page
2777 */
2778 }
2779 if (cur_object->shadow == VM_OBJECT_NULL) {
2780 /*
2781 * Zero fill fault. Page gets
2782 * inserted into the original object.
2783 */
2784 if (cur_object->shadow_severed) {
2785
2786 if (object != cur_object)
2787 vm_object_unlock(cur_object);
2788 vm_object_unlock(object);
2789
2790 vm_map_unlock_read(map);
2791 if (real_map != map)
2792 vm_map_unlock(real_map);
2793
2794 kr = KERN_MEMORY_ERROR;
2795 goto done;
2796 }
2797 if (VM_PAGE_ZFILL_THROTTLED()) {
2798 /*
2799 * drop all of our locks...
2800 * wait until the free queue is
2801 * pumped back up and then
2802 * redrive the fault
2803 */
2804 if (object != cur_object)
2805 vm_object_unlock(cur_object);
2806 vm_object_unlock(object);
2807 vm_map_unlock_read(map);
2808 if (real_map != map)
2809 vm_map_unlock(real_map);
2810
2811 if (vm_page_wait((change_wiring) ?
2812 THREAD_UNINT :
2813 THREAD_ABORTSAFE))
2814 goto RetryFault;
2815
2816 kr = KERN_ABORTED;
2817 goto done;
2818 }
2819 if (vm_backing_store_low) {
2820 /*
2821 * we are protecting the system from
2822 * backing store exhaustion...
2823 * must take the slow path if we're
2824 * not privileged
2825 */
2826 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2827 break;
2828 }
2829 if (cur_object != object) {
2830 vm_object_unlock(cur_object);
2831
2832 cur_object = object;
2833 }
2834 if (object_lock_type == OBJECT_LOCK_SHARED) {
2835
2836 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2837
2838 if (vm_object_lock_upgrade(object) == FALSE) {
2839 /*
2840 * couldn't upgrade so do a full retry on the fault
2841 * since we dropped the object lock which
2842 * could allow another thread to insert
2843 * a page at this offset
2844 */
2845 vm_map_unlock_read(map);
2846 if (real_map != map)
2847 vm_map_unlock(real_map);
2848
2849 goto RetryFault;
2850 }
2851 }
2852 m = vm_page_alloc(object, offset);
2853
2854 if (m == VM_PAGE_NULL) {
2855 /*
2856 * no free page currently available...
2857 * must take the slow path
2858 */
2859 break;
2860 }
2861
2862 /*
2863 * Now zero fill page...
2864 * the page is probably going to
2865 * be written soon, so don't bother
2866 * to clear the modified bit
2867 *
2868 * NOTE: This code holds the map
2869 * lock across the zero fill.
2870 */
2871 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2872
2873 goto FastPmapEnter;
2874 }
2875 /*
2876 * On to the next level in the shadow chain
2877 */
2878 cur_offset += cur_object->shadow_offset;
2879 new_object = cur_object->shadow;
2880
2881 /*
2882 * take the new_object's lock with the indicated state
2883 */
2884 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2885 vm_object_lock_shared(new_object);
2886 else
2887 vm_object_lock(new_object);
2888
2889 if (cur_object != object)
2890 vm_object_unlock(cur_object);
2891
2892 cur_object = new_object;
2893
2894 continue;
2895 }
2896 }
2897 /*
2898 * Cleanup from fast fault failure. Drop any object
2899 * lock other than original and drop map lock.
2900 */
2901 if (object != cur_object)
2902 vm_object_unlock(cur_object);
2903
2904 /*
2905 * must own the object lock exclusively at this point
2906 */
2907 if (object_lock_type == OBJECT_LOCK_SHARED) {
2908 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2909
2910 if (vm_object_lock_upgrade(object) == FALSE) {
2911 /*
2912 * couldn't upgrade, so explicitly
2913 * take the lock exclusively
2914 * no need to retry the fault at this
2915 * point since "vm_fault_page" will
2916 * completely re-evaluate the state
2917 */
2918 vm_object_lock(object);
2919 }
2920 }
2921
2922 handle_copy_delay:
2923 vm_map_unlock_read(map);
2924 if (real_map != map)
2925 vm_map_unlock(real_map);
2926
2927 /*
2928 * Make a reference to this object to
2929 * prevent its disposal while we are messing with
2930 * it. Once we have the reference, the map is free
2931 * to be diddled. Since objects reference their
2932 * shadows (and copies), they will stay around as well.
2933 */
2934 vm_object_reference_locked(object);
2935 vm_object_paging_begin(object);
2936
2937 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2938
2939 error_code = 0;
2940
2941 kr = vm_fault_page(object, offset, fault_type,
2942 (change_wiring && !wired),
2943 &prot, &result_page, &top_page,
2944 &type_of_fault,
2945 &error_code, map->no_zero_fill,
2946 FALSE, &fault_info);
2947
2948 /*
2949 * if kr != VM_FAULT_SUCCESS, then the paging reference
2950 * has been dropped and the object unlocked... the ref_count
2951 * is still held
2952 *
2953 * if kr == VM_FAULT_SUCCESS, then the paging reference
2954 * is still held along with the ref_count on the original object
2955 *
2956 * if m != NULL, then the object it belongs to
2957 * is returned locked with a paging reference
2958 *
2959 * if top_page != NULL, then it's BUSY and the
2960 * object it belongs to has a paging reference
2961 * but is returned unlocked
2962 */
2963 if (kr != VM_FAULT_SUCCESS) {
2964 /*
2965 * we didn't succeed, lose the object reference immediately.
2966 */
2967 vm_object_deallocate(object);
2968
2969 /*
2970 * See why we failed, and take corrective action.
2971 */
2972 switch (kr) {
2973 case VM_FAULT_MEMORY_SHORTAGE:
2974 if (vm_page_wait((change_wiring) ?
2975 THREAD_UNINT :
2976 THREAD_ABORTSAFE))
2977 goto RetryFault;
2978 /*
2979 * fall thru
2980 */
2981 case VM_FAULT_INTERRUPTED:
2982 kr = KERN_ABORTED;
2983 goto done;
2984 case VM_FAULT_RETRY:
2985 goto RetryFault;
2986 case VM_FAULT_MEMORY_ERROR:
2987 if (error_code)
2988 kr = error_code;
2989 else
2990 kr = KERN_MEMORY_ERROR;
2991 goto done;
2992 }
2993 }
2994 m = result_page;
2995
2996 if (m != VM_PAGE_NULL) {
2997 assert((change_wiring && !wired) ?
2998 (top_page == VM_PAGE_NULL) :
2999 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3000 }
3001
3002 /*
3003 * What to do with the resulting page from vm_fault_page
3004 * if it doesn't get entered into the physical map:
3005 */
3006 #define RELEASE_PAGE(m) \
3007 MACRO_BEGIN \
3008 PAGE_WAKEUP_DONE(m); \
3009 vm_page_lockspin_queues(); \
3010 if (!m->active && !m->inactive && !m->throttled)\
3011 vm_page_activate(m); \
3012 vm_page_unlock_queues(); \
3013 MACRO_END
3014
3015 /*
3016 * We must verify that the maps have not changed
3017 * since our last lookup.
3018 */
3019 if (m != VM_PAGE_NULL) {
3020 old_copy_object = m->object->copy;
3021 vm_object_unlock(m->object);
3022 } else
3023 old_copy_object = VM_OBJECT_NULL;
3024
3025 /*
3026 * no object locks are held at this point
3027 */
3028 if ((map != original_map) || !vm_map_verify(map, &version)) {
3029 vm_object_t retry_object;
3030 vm_object_offset_t retry_offset;
3031 vm_prot_t retry_prot;
3032
3033 /*
3034 * To avoid trying to write_lock the map while another
3035 * thread has it read_locked (in vm_map_pageable), we
3036 * do not try for write permission. If the page is
3037 * still writable, we will get write permission. If it
3038 * is not, or has been marked needs_copy, we enter the
3039 * mapping without write permission, and will merely
3040 * take another fault.
3041 */
3042 map = original_map;
3043 vm_map_lock_read(map);
3044
3045 kr = vm_map_lookup_locked(&map, vaddr,
3046 fault_type & ~VM_PROT_WRITE,
3047 OBJECT_LOCK_EXCLUSIVE, &version,
3048 &retry_object, &retry_offset, &retry_prot,
3049 &wired,
3050 &fault_info,
3051 &real_map);
3052 pmap = real_map->pmap;
3053
3054 if (kr != KERN_SUCCESS) {
3055 vm_map_unlock_read(map);
3056
3057 if (m != VM_PAGE_NULL) {
3058 /*
3059 * retake the lock so that
3060 * we can drop the paging reference
3061 * in vm_fault_cleanup and do the
3062 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3063 */
3064 vm_object_lock(m->object);
3065
3066 RELEASE_PAGE(m);
3067
3068 vm_fault_cleanup(m->object, top_page);
3069 } else {
3070 /*
3071 * retake the lock so that
3072 * we can drop the paging reference
3073 * in vm_fault_cleanup
3074 */
3075 vm_object_lock(object);
3076
3077 vm_fault_cleanup(object, top_page);
3078 }
3079 vm_object_deallocate(object);
3080
3081 goto done;
3082 }
3083 vm_object_unlock(retry_object);
3084
3085 if ((retry_object != object) || (retry_offset != offset)) {
3086
3087 vm_map_unlock_read(map);
3088 if (real_map != map)
3089 vm_map_unlock(real_map);
3090
3091 if (m != VM_PAGE_NULL) {
3092 /*
3093 * retake the lock so that
3094 * we can drop the paging reference
3095 * in vm_fault_cleanup and do the
3096 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3097 */
3098 vm_object_lock(m->object);
3099
3100 RELEASE_PAGE(m);
3101
3102 vm_fault_cleanup(m->object, top_page);
3103 } else {
3104 /*
3105 * retake the lock so that
3106 * we can drop the paging reference
3107 * in vm_fault_cleanup
3108 */
3109 vm_object_lock(object);
3110
3111 vm_fault_cleanup(object, top_page);
3112 }
3113 vm_object_deallocate(object);
3114
3115 goto RetryFault;
3116 }
3117 /*
3118 * Check whether the protection has changed or the object
3119 * has been copied while we left the map unlocked.
3120 */
3121 prot &= retry_prot;
3122 }
3123 if (m != VM_PAGE_NULL) {
3124 vm_object_lock(m->object);
3125
3126 if (m->object->copy != old_copy_object) {
3127 /*
3128 * The copy object changed while the top-level object
3129 * was unlocked, so take away write permission.
3130 */
3131 prot &= ~VM_PROT_WRITE;
3132 }
3133 } else
3134 vm_object_lock(object);
3135
3136 /*
3137 * If we want to wire down this page, but no longer have
3138 * adequate permissions, we must start all over.
3139 */
3140 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3141
3142 vm_map_verify_done(map, &version);
3143 if (real_map != map)
3144 vm_map_unlock(real_map);
3145
3146 if (m != VM_PAGE_NULL) {
3147 RELEASE_PAGE(m);
3148
3149 vm_fault_cleanup(m->object, top_page);
3150 } else
3151 vm_fault_cleanup(object, top_page);
3152
3153 vm_object_deallocate(object);
3154
3155 goto RetryFault;
3156 }
3157 if (m != VM_PAGE_NULL) {
3158 /*
3159 * Put this page into the physical map.
3160 * We had to do the unlock above because pmap_enter
3161 * may cause other faults. The page may be on
3162 * the pageout queues. If the pageout daemon comes
3163 * across the page, it will remove it from the queues.
3164 */
3165 if (caller_pmap) {
3166 kr = vm_fault_enter(m,
3167 caller_pmap,
3168 caller_pmap_addr,
3169 prot,
3170 wired,
3171 change_wiring,
3172 fault_info.no_cache,
3173 &type_of_fault);
3174 } else {
3175 kr = vm_fault_enter(m,
3176 pmap,
3177 vaddr,
3178 prot,
3179 wired,
3180 change_wiring,
3181 fault_info.no_cache,
3182 &type_of_fault);
3183 }
3184 if (kr != KERN_SUCCESS) {
3185 /* abort this page fault */
3186 vm_map_verify_done(map, &version);
3187 if (real_map != map)
3188 vm_map_unlock(real_map);
3189 PAGE_WAKEUP_DONE(m);
3190 vm_fault_cleanup(m->object, top_page);
3191 vm_object_deallocate(object);
3192 goto done;
3193 }
3194 } else {
3195
3196 vm_map_entry_t entry;
3197 vm_map_offset_t laddr;
3198 vm_map_offset_t ldelta, hdelta;
3199
3200 /*
3201 * do a pmap block mapping from the physical address
3202 * in the object
3203 */
3204
3205 #ifdef ppc
3206 /* While we do not worry about execution protection in */
3207 /* general, certain pages may have instruction execution */
3208 /* disallowed. We will check here, and if not allowed */
3209 /* to execute, we return with a protection failure. */
3210
3211 if ((fault_type & VM_PROT_EXECUTE) &&
3212 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3213
3214 vm_map_verify_done(map, &version);
3215
3216 if (real_map != map)
3217 vm_map_unlock(real_map);
3218
3219 vm_fault_cleanup(object, top_page);
3220 vm_object_deallocate(object);
3221
3222 kr = KERN_PROTECTION_FAILURE;
3223 goto done;
3224 }
3225 #endif /* ppc */
3226
3227 if (real_map != map)
3228 vm_map_unlock(real_map);
3229
3230 if (original_map != map) {
3231 vm_map_unlock_read(map);
3232 vm_map_lock_read(original_map);
3233 map = original_map;
3234 }
3235 real_map = map;
3236
3237 laddr = vaddr;
3238 hdelta = 0xFFFFF000;
3239 ldelta = 0xFFFFF000;
3240
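/*
 * ldelta and hdelta start out "very large" and are trimmed by the walk
 * below to the distance from the faulting address to the start and end
 * of each map entry traversed, so the resulting block mapping spans
 * [vaddr - ldelta, vaddr + hdelta).  The ">> 12" shifts further down
 * convert byte offsets and lengths into 4K-page units for
 * pmap_map_block().
 */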
3241 while (vm_map_lookup_entry(map, laddr, &entry)) {
3242 if (ldelta > (laddr - entry->vme_start))
3243 ldelta = laddr - entry->vme_start;
3244 if (hdelta > (entry->vme_end - laddr))
3245 hdelta = entry->vme_end - laddr;
3246 if (entry->is_sub_map) {
3247
3248 laddr = (laddr - entry->vme_start)
3249 + entry->offset;
3250 vm_map_lock_read(entry->object.sub_map);
3251
3252 if (map != real_map)
3253 vm_map_unlock_read(map);
3254 if (entry->use_pmap) {
3255 vm_map_unlock_read(real_map);
3256 real_map = entry->object.sub_map;
3257 }
3258 map = entry->object.sub_map;
3259
3260 } else {
3261 break;
3262 }
3263 }
3264
3265 if (vm_map_lookup_entry(map, laddr, &entry) &&
3266 (entry->object.vm_object != NULL) &&
3267 (entry->object.vm_object == object)) {
3268
3269 if (caller_pmap) {
3270 /*
3271 * Set up a block mapped area
3272 */
3273 pmap_map_block(caller_pmap,
3274 (addr64_t)(caller_pmap_addr - ldelta),
3275 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3276 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3277 ((ldelta + hdelta) >> 12), prot,
3278 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3279 } else {
3280 /*
3281 * Set up a block mapped area
3282 */
3283 pmap_map_block(real_map->pmap,
3284 (addr64_t)(vaddr - ldelta),
3285 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3286 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3287 ((ldelta + hdelta) >> 12), prot,
3288 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3289 }
3290 }
3291 }
3292
3293 /*
3294 * Unlock everything, and return
3295 */
3296 vm_map_verify_done(map, &version);
3297 if (real_map != map)
3298 vm_map_unlock(real_map);
3299
3300 if (m != VM_PAGE_NULL) {
3301 PAGE_WAKEUP_DONE(m);
3302
3303 vm_fault_cleanup(m->object, top_page);
3304 } else
3305 vm_fault_cleanup(object, top_page);
3306
3307 vm_object_deallocate(object);
3308
3309 #undef RELEASE_PAGE
3310
3311 kr = KERN_SUCCESS;
3312 done:
3313 thread_interrupt_level(interruptible_state);
3314
3315 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3316 (int)((uint64_t)vaddr >> 32),
3317 (int)vaddr,
3318 kr,
3319 type_of_fault,
3320 0);
3321
3322 return (kr);
3323 }
3324
3325 /*
3326 * vm_fault_wire:
3327 *
3328 * Wire down a range of virtual addresses in a map.
3329 */
3330 kern_return_t
3331 vm_fault_wire(
3332 vm_map_t map,
3333 vm_map_entry_t entry,
3334 pmap_t pmap,
3335 vm_map_offset_t pmap_addr)
3336 {
3337
3338 register vm_map_offset_t va;
3339 register vm_map_offset_t end_addr = entry->vme_end;
3340 register kern_return_t rc;
3341
3342 assert(entry->in_transition);
3343
3344 if ((entry->object.vm_object != NULL) &&
3345 !entry->is_sub_map &&
3346 entry->object.vm_object->phys_contiguous) {
3347 return KERN_SUCCESS;
3348 }
3349
3350 /*
3351 * Inform the physical mapping system that the
3352 * range of addresses may not fault, so that
3353 * page tables and such can be locked down as well.
3354 */
3355
3356 pmap_pageable(pmap, pmap_addr,
3357 pmap_addr + (end_addr - entry->vme_start), FALSE);
3358
3359 /*
3360 * We simulate a fault to get the page and enter it
3361 * in the physical map.
3362 */
3363
3364 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3365 if ((rc = vm_fault_wire_fast(
3366 map, va, entry, pmap,
3367 pmap_addr + (va - entry->vme_start)
3368 )) != KERN_SUCCESS) {
3369 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3370 (pmap == kernel_pmap) ?
3371 THREAD_UNINT : THREAD_ABORTSAFE,
3372 pmap, pmap_addr + (va - entry->vme_start));
3373 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3374 }
3375
3376 if (rc != KERN_SUCCESS) {
3377 struct vm_map_entry tmp_entry = *entry;
3378
3379 /* unwire wired pages */
3380 tmp_entry.vme_end = va;
3381 vm_fault_unwire(map,
3382 &tmp_entry, FALSE, pmap, pmap_addr);
3383
3384 return rc;
3385 }
3386 }
3387 return KERN_SUCCESS;
3388 }
3389
3390 /*
3391 * vm_fault_unwire:
3392 *
3393 * Unwire a range of virtual addresses in a map.
3394 */
3395 void
3396 vm_fault_unwire(
3397 vm_map_t map,
3398 vm_map_entry_t entry,
3399 boolean_t deallocate,
3400 pmap_t pmap,
3401 vm_map_offset_t pmap_addr)
3402 {
3403 register vm_map_offset_t va;
3404 register vm_map_offset_t end_addr = entry->vme_end;
3405 vm_object_t object;
3406 struct vm_object_fault_info fault_info;
3407
3408 object = (entry->is_sub_map)
3409 ? VM_OBJECT_NULL : entry->object.vm_object;
3410
3411 /*
3412 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3413 * do anything since such memory is wired by default. So we don't have
3414 * anything to undo here.
3415 */
3416
3417 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3418 return;
3419
3420 fault_info.interruptible = THREAD_UNINT;
3421 fault_info.behavior = entry->behavior;
3422 fault_info.user_tag = entry->alias;
3423 fault_info.lo_offset = entry->offset;
3424 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3425 fault_info.no_cache = entry->no_cache;
3426
3427 /*
3428 * Since the pages are wired down, we must be able to
3429 * get their mappings from the physical map system.
3430 */
3431
3432 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3433
3434 if (pmap) {
3435 pmap_change_wiring(pmap,
3436 pmap_addr + (va - entry->vme_start), FALSE);
3437 }
3438 if (object == VM_OBJECT_NULL) {
3439 (void) vm_fault(map, va, VM_PROT_NONE,
3440 TRUE, THREAD_UNINT, pmap, pmap_addr);
3441 } else {
3442 vm_prot_t prot;
3443 vm_page_t result_page;
3444 vm_page_t top_page;
3445 vm_object_t result_object;
3446 vm_fault_return_t result;
3447
3448 fault_info.cluster_size = end_addr - va;
3449
3450 do {
3451 prot = VM_PROT_NONE;
3452
3453 vm_object_lock(object);
3454 vm_object_paging_begin(object);
3455 XPR(XPR_VM_FAULT,
3456 "vm_fault_unwire -> vm_fault_page\n",
3457 0,0,0,0,0);
3458 result = vm_fault_page(
3459 object,
3460 entry->offset + (va - entry->vme_start),
3461 VM_PROT_NONE, TRUE,
3462 &prot, &result_page, &top_page,
3463 (int *)0,
3464 NULL, map->no_zero_fill,
3465 FALSE, &fault_info);
3466 } while (result == VM_FAULT_RETRY);
3467
3468 /*
3469 * If this was a mapping to a file on a device that has been forcibly
3470 * unmounted, then we won't get a page back from vm_fault_page(). Just
3471 * move on to the next one in case the remaining pages are mapped from
3472 * different objects. During a forced unmount, the object is terminated
3473 * so the alive flag will be false if this happens. A forced unmount will
3474 * occur when an external disk is unplugged before the user does an
3475 * eject, so we don't want to panic in that situation.
3476 */
3477
3478 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3479 continue;
3480
3481 if (result != VM_FAULT_SUCCESS)
3482 panic("vm_fault_unwire: failure");
3483
3484 result_object = result_page->object;
3485
3486 if (deallocate) {
3487 assert(result_page->phys_page !=
3488 vm_page_fictitious_addr);
3489 pmap_disconnect(result_page->phys_page);
3490 VM_PAGE_FREE(result_page);
3491 } else {
3492 vm_page_lockspin_queues();
3493 vm_page_unwire(result_page);
3494 vm_page_unlock_queues();
3495 PAGE_WAKEUP_DONE(result_page);
3496 }
3497 vm_fault_cleanup(result_object, top_page);
3498 }
3499 }
3500
3501 /*
3502 * Inform the physical mapping system that the range
3503 * of addresses may fault, so that page tables and
3504 * such may be unwired themselves.
3505 */
3506
3507 pmap_pageable(pmap, pmap_addr,
3508 pmap_addr + (end_addr - entry->vme_start), TRUE);
3509
3510 }
3511
3512 /*
3513 * vm_fault_wire_fast:
3514 *
3515 * Handle common case of a wire down page fault at the given address.
3516 * If successful, the page is inserted into the associated physical map.
3517 * The map entry is passed in to avoid the overhead of a map lookup.
3518 *
3519 * NOTE: the given address should be truncated to the
3520 * proper page address.
3521 *
3522 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3523 * a standard error specifying why the fault is fatal is returned.
3524 *
3525 * The map in question must be referenced, and remains so.
3526 * Caller has a read lock on the map.
3527 *
3528 * This is a stripped version of vm_fault() for wiring pages. Anything
3529 * other than the common case will return KERN_FAILURE, and the caller
3530 * is expected to call vm_fault().
3531 */
3532 kern_return_t
3533 vm_fault_wire_fast(
3534 __unused vm_map_t map,
3535 vm_map_offset_t va,
3536 vm_map_entry_t entry,
3537 pmap_t pmap,
3538 vm_map_offset_t pmap_addr)
3539 {
3540 vm_object_t object;
3541 vm_object_offset_t offset;
3542 register vm_page_t m;
3543 vm_prot_t prot;
3544 thread_t thread = current_thread();
3545 int type_of_fault;
3546 kern_return_t kr;
3547
3548 VM_STAT_INCR(faults);
3549
3550 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3551 thread->task->faults++;
3552
3553 /*
3554 * Recovery actions
3555 */
3556
3557 #undef RELEASE_PAGE
3558 #define RELEASE_PAGE(m) { \
3559 PAGE_WAKEUP_DONE(m); \
3560 vm_page_lockspin_queues(); \
3561 vm_page_unwire(m); \
3562 vm_page_unlock_queues(); \
3563 }
3564
3565
3566 #undef UNLOCK_THINGS
3567 #define UNLOCK_THINGS { \
3568 vm_object_paging_end(object); \
3569 vm_object_unlock(object); \
3570 }
3571
3572 #undef UNLOCK_AND_DEALLOCATE
3573 #define UNLOCK_AND_DEALLOCATE { \
3574 UNLOCK_THINGS; \
3575 vm_object_deallocate(object); \
3576 }
3577 /*
3578 * Give up and have caller do things the hard way.
3579 */
3580
3581 #define GIVE_UP { \
3582 UNLOCK_AND_DEALLOCATE; \
3583 return(KERN_FAILURE); \
3584 }
3585
3586
3587 /*
3588 * If this entry is not directly to a vm_object, bail out.
3589 */
3590 if (entry->is_sub_map)
3591 return(KERN_FAILURE);
3592
3593 /*
3594 * Find the backing store object and offset into it.
3595 */
3596
3597 object = entry->object.vm_object;
3598 offset = (va - entry->vme_start) + entry->offset;
3599 prot = entry->protection;
3600
3601 /*
3602 * Make a reference to this object to prevent its
3603 * disposal while we are messing with it.
3604 */
3605
3606 vm_object_lock(object);
3607 vm_object_reference_locked(object);
3608 vm_object_paging_begin(object);
3609
3610 /*
3611 * INVARIANTS (through entire routine):
3612 *
3613 * 1) At all times, we must either have the object
3614 * lock or a busy page in some object to prevent
3615 * some other thread from trying to bring in
3616 * the same page.
3617 *
3618 * 2) Once we have a busy page, we must remove it from
3619 * the pageout queues, so that the pageout daemon
3620 * will not grab it away.
3621 *
3622 */
3623
3624 /*
3625 * Look for page in top-level object. If it's not there or
3626 * there's something going on, give up.
3627 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3628 * decrypt the page before wiring it down.
3629 */
3630 m = vm_page_lookup(object, offset);
3631 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3632 (m->unusual && ( m->error || m->restart || m->absent))) {
3633
3634 GIVE_UP;
3635 }
3636 ASSERT_PAGE_DECRYPTED(m);
3637
3638 if (m->fictitious &&
3639 m->phys_page == vm_page_guard_addr) {
3640 /*
3641 * Guard pages are fictitious pages and are never
3642 * entered into a pmap, so let's say it's been wired...
3643 */
3644 kr = KERN_SUCCESS;
3645 goto done;
3646 }
3647
3648 /*
3649 * Wire the page down now. All bail outs beyond this
3650 * point must unwire the page.
3651 */
3652
3653 vm_page_lockspin_queues();
3654 vm_page_wire(m);
3655 vm_page_unlock_queues();
3656
3657 /*
3658 * Mark page busy for other threads.
3659 */
3660 assert(!m->busy);
3661 m->busy = TRUE;
3662 assert(!m->absent);
3663
3664 /*
3665 * Give up if the page is being written and there's a copy object
3666 */
3667 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3668 RELEASE_PAGE(m);
3669 GIVE_UP;
3670 }
3671
3672 /*
3673 * Put this page into the physical map.
3674 */
3675 type_of_fault = DBG_CACHE_HIT_FAULT;
3676 kr = vm_fault_enter(m,
3677 pmap,
3678 pmap_addr,
3679 prot,
3680 TRUE,
3681 FALSE,
3682 FALSE,
3683 &type_of_fault);
3684
3685 done:
3686 /*
3687 * Unlock everything, and return
3688 */
3689
3690 PAGE_WAKEUP_DONE(m);
3691 UNLOCK_AND_DEALLOCATE;
3692
3693 return kr;
3694
3695 }
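
/*
 * For reference, callers are expected to pair the fast routine above with
 * the general vm_fault() path roughly as follows; this mirrors the loop in
 * vm_fault_wire() and is a compiled-out sketch, not additional behavior.
 */
#if 0
static void
wire_one_page(vm_map_t map, vm_map_offset_t va, vm_map_entry_t entry,
	      pmap_t pmap, vm_map_offset_t pmap_addr)
{
	if (vm_fault_wire_fast(map, va, entry, pmap, pmap_addr) != KERN_SUCCESS)
		(void) vm_fault(map, va, VM_PROT_NONE, TRUE,
				(pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
				pmap, pmap_addr);
}
#endif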
3696
3697 /*
3698 * Routine: vm_fault_copy_cleanup
3699 * Purpose:
3700 * Release a page used by vm_fault_copy.
3701 */
3702
3703 void
3704 vm_fault_copy_cleanup(
3705 vm_page_t page,
3706 vm_page_t top_page)
3707 {
3708 vm_object_t object = page->object;
3709
3710 vm_object_lock(object);
3711 PAGE_WAKEUP_DONE(page);
3712 vm_page_lockspin_queues();
3713 if (!page->active && !page->inactive && !page->throttled)
3714 vm_page_activate(page);
3715 vm_page_unlock_queues();
3716 vm_fault_cleanup(object, top_page);
3717 }
3718
3719 void
3720 vm_fault_copy_dst_cleanup(
3721 vm_page_t page)
3722 {
3723 vm_object_t object;
3724
3725 if (page != VM_PAGE_NULL) {
3726 object = page->object;
3727 vm_object_lock(object);
3728 vm_page_lockspin_queues();
3729 vm_page_unwire(page);
3730 vm_page_unlock_queues();
3731 vm_object_paging_end(object);
3732 vm_object_unlock(object);
3733 }
3734 }
3735
3736 /*
3737 * Routine: vm_fault_copy
3738 *
3739 * Purpose:
3740 * Copy pages from one virtual memory object to another --
3741 * neither the source nor destination pages need be resident.
3742 *
3743 * Before actually copying a page, the version associated with
3744 * the destination address map will be verified.
3745 *
3746 * In/out conditions:
3747 * The caller must hold a reference, but not a lock, to
3748 * each of the source and destination objects and to the
3749 * destination map.
3750 *
3751 * Results:
3752 * Returns KERN_SUCCESS if no errors were encountered in
3753 * reading or writing the data. Returns KERN_INTERRUPTED if
3754 * the operation was interrupted (only possible if the
3755 * "interruptible" argument is asserted). Other return values
3756 * indicate a permanent error in copying the data.
3757 *
3758 * The actual amount of data copied will be returned in the
3759 * "copy_size" argument. In the event that the destination map
3760 * verification failed, this amount may be less than the amount
3761 * requested.
3762 */
3763 kern_return_t
3764 vm_fault_copy(
3765 vm_object_t src_object,
3766 vm_object_offset_t src_offset,
3767 vm_map_size_t *copy_size, /* INOUT */
3768 vm_object_t dst_object,
3769 vm_object_offset_t dst_offset,
3770 vm_map_t dst_map,
3771 vm_map_version_t *dst_version,
3772 int interruptible)
3773 {
3774 vm_page_t result_page;
3775
3776 vm_page_t src_page;
3777 vm_page_t src_top_page;
3778 vm_prot_t src_prot;
3779
3780 vm_page_t dst_page;
3781 vm_page_t dst_top_page;
3782 vm_prot_t dst_prot;
3783
3784 vm_map_size_t amount_left;
3785 vm_object_t old_copy_object;
3786 kern_return_t error = 0;
3787
3788 vm_map_size_t part_size;
3789 struct vm_object_fault_info fault_info_src;
3790 struct vm_object_fault_info fault_info_dst;
3791
3792 /*
3793 * In order not to confuse the clustered pageins, align
3794 * the different offsets on a page boundary.
3795 */
3796
3797 #define RETURN(x) \
3798 MACRO_BEGIN \
3799 *copy_size -= amount_left; \
3800 MACRO_RETURN(x); \
3801 MACRO_END
3802
3803 amount_left = *copy_size;
3804
3805 fault_info_src.interruptible = interruptible;
3806 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3807 fault_info_src.user_tag = 0;
3808 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3809 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3810 fault_info_src.no_cache = FALSE;
3811
3812 fault_info_dst.interruptible = interruptible;
3813 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3814 fault_info_dst.user_tag = 0;
3815 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3816 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3817 fault_info_dst.no_cache = FALSE;
3818
3819 do { /* while (amount_left > 0) */
3820 /*
3821 * There may be a deadlock if both source and destination
3822 * pages are the same. To avoid this deadlock, the copy must
3823 * start by getting the destination page in order to apply
3824 * COW semantics if any.
3825 */
3826
3827 RetryDestinationFault: ;
3828
3829 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3830
3831 vm_object_lock(dst_object);
3832 vm_object_paging_begin(dst_object);
3833
3834 fault_info_dst.cluster_size = amount_left;
3835
3836 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3837 switch (vm_fault_page(dst_object,
3838 vm_object_trunc_page(dst_offset),
3839 VM_PROT_WRITE|VM_PROT_READ,
3840 FALSE,
3841 &dst_prot, &dst_page, &dst_top_page,
3842 (int *)0,
3843 &error,
3844 dst_map->no_zero_fill,
3845 FALSE, &fault_info_dst)) {
3846 case VM_FAULT_SUCCESS:
3847 break;
3848 case VM_FAULT_RETRY:
3849 goto RetryDestinationFault;
3850 case VM_FAULT_MEMORY_SHORTAGE:
3851 if (vm_page_wait(interruptible))
3852 goto RetryDestinationFault;
3853 /* fall thru */
3854 case VM_FAULT_INTERRUPTED:
3855 RETURN(MACH_SEND_INTERRUPTED);
3856 case VM_FAULT_MEMORY_ERROR:
3857 if (error)
3858 return (error);
3859 else
3860 return(KERN_MEMORY_ERROR);
3861 }
3862 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3863
3864 old_copy_object = dst_page->object->copy;
3865
3866 /*
3867 * There exists the possibility that the source and
3868 * destination page are the same. But we can't
3869 * easily determine that now. If they are the
3870 * same, the call to vm_fault_page() for the
3871 * destination page will deadlock. To prevent this we
3872 * wire the page so we can drop busy without having
3873 * the page daemon steal the page. We clean up the
3874 * top page but keep the paging reference on the object
3875 * holding the dest page so it doesn't go away.
3876 */
3877
3878 vm_page_lockspin_queues();
3879 vm_page_wire(dst_page);
3880 vm_page_unlock_queues();
3881 PAGE_WAKEUP_DONE(dst_page);
3882 vm_object_unlock(dst_page->object);
3883
3884 if (dst_top_page != VM_PAGE_NULL) {
3885 vm_object_lock(dst_object);
3886 VM_PAGE_FREE(dst_top_page);
3887 vm_object_paging_end(dst_object);
3888 vm_object_unlock(dst_object);
3889 }
3890
3891 RetrySourceFault: ;
3892
3893 if (src_object == VM_OBJECT_NULL) {
3894 /*
3895 * No source object. We will just
3896 * zero-fill the page in dst_object.
3897 */
3898 src_page = VM_PAGE_NULL;
3899 result_page = VM_PAGE_NULL;
3900 } else {
3901 vm_object_lock(src_object);
3902 src_page = vm_page_lookup(src_object,
3903 vm_object_trunc_page(src_offset));
3904 if (src_page == dst_page) {
3905 src_prot = dst_prot;
3906 result_page = VM_PAGE_NULL;
3907 } else {
3908 src_prot = VM_PROT_READ;
3909 vm_object_paging_begin(src_object);
3910
3911 fault_info_src.cluster_size = amount_left;
3912
3913 XPR(XPR_VM_FAULT,
3914 "vm_fault_copy(2) -> vm_fault_page\n",
3915 0,0,0,0,0);
3916 switch (vm_fault_page(
3917 src_object,
3918 vm_object_trunc_page(src_offset),
3919 VM_PROT_READ, FALSE,
3920 &src_prot,
3921 &result_page, &src_top_page,
3922 (int *)0, &error, FALSE,
3923 FALSE, &fault_info_src)) {
3924
3925 case VM_FAULT_SUCCESS:
3926 break;
3927 case VM_FAULT_RETRY:
3928 goto RetrySourceFault;
3929 case VM_FAULT_MEMORY_SHORTAGE:
3930 if (vm_page_wait(interruptible))
3931 goto RetrySourceFault;
3932 /* fall thru */
3933 case VM_FAULT_INTERRUPTED:
3934 vm_fault_copy_dst_cleanup(dst_page);
3935 RETURN(MACH_SEND_INTERRUPTED);
3936 case VM_FAULT_MEMORY_ERROR:
3937 vm_fault_copy_dst_cleanup(dst_page);
3938 if (error)
3939 return (error);
3940 else
3941 return(KERN_MEMORY_ERROR);
3942 }
3943
3944
3945 assert((src_top_page == VM_PAGE_NULL) ==
3946 (result_page->object == src_object));
3947 }
3948 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3949 vm_object_unlock(result_page->object);
3950 }
3951
3952 if (!vm_map_verify(dst_map, dst_version)) {
3953 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3954 vm_fault_copy_cleanup(result_page, src_top_page);
3955 vm_fault_copy_dst_cleanup(dst_page);
3956 break;
3957 }
3958
3959 vm_object_lock(dst_page->object);
3960
3961 if (dst_page->object->copy != old_copy_object) {
3962 vm_object_unlock(dst_page->object);
3963 vm_map_verify_done(dst_map, dst_version);
3964 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3965 vm_fault_copy_cleanup(result_page, src_top_page);
3966 vm_fault_copy_dst_cleanup(dst_page);
3967 break;
3968 }
3969 vm_object_unlock(dst_page->object);
3970
3971 /*
3972 * Copy the page, and note that it is dirty
3973 * immediately.
3974 */
3975
3976 if (!page_aligned(src_offset) ||
3977 !page_aligned(dst_offset) ||
3978 !page_aligned(amount_left)) {
3979
3980 vm_object_offset_t src_po,
3981 dst_po;
3982
3983 src_po = src_offset - vm_object_trunc_page(src_offset);
3984 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3985
3986 if (dst_po > src_po) {
3987 part_size = PAGE_SIZE - dst_po;
3988 } else {
3989 part_size = PAGE_SIZE - src_po;
3990 }
3991 if (part_size > (amount_left)) {
3992 part_size = amount_left;
3993 }
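
/*
 * Worked example (illustrative numbers, 4K pages): if src_offset ends in
 * 0x200 and dst_offset ends in 0x600, then src_po == 0x200 and
 * dst_po == 0x600, so part_size == PAGE_SIZE - 0x600 == 0xA00 bytes are
 * moved on this pass, keeping both the source and destination transfers
 * within a single page.
 */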
3994
3995 if (result_page == VM_PAGE_NULL) {
3996 vm_page_part_zero_fill(dst_page,
3997 dst_po, part_size);
3998 } else {
3999 vm_page_part_copy(result_page, src_po,
4000 dst_page, dst_po, part_size);
4001 if (!dst_page->dirty) {
4002 vm_object_lock(dst_object);
4003 dst_page->dirty = TRUE;
4004 vm_object_unlock(dst_page->object);
4005 }
4006
4007 }
4008 } else {
4009 part_size = PAGE_SIZE;
4010
4011 if (result_page == VM_PAGE_NULL)
4012 vm_page_zero_fill(dst_page);
4013 else {
4014 vm_page_copy(result_page, dst_page);
4015 if (!dst_page->dirty) {
4016 vm_object_lock(dst_object);
4017 dst_page->dirty = TRUE;
4018 vm_object_unlock(dst_page->object);
4019 }
4020 }
4021
4022 }
4023
4024 /*
4025 * Unlock everything and move on to the next chunk, if any.
4026 */
4027
4028 vm_map_verify_done(dst_map, dst_version);
4029
4030 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4031 vm_fault_copy_cleanup(result_page, src_top_page);
4032 vm_fault_copy_dst_cleanup(dst_page);
4033
4034 amount_left -= part_size;
4035 src_offset += part_size;
4036 dst_offset += part_size;
4037 } while (amount_left > 0);
4038
4039 RETURN(KERN_SUCCESS);
4040 #undef RETURN
4041
4042 /*NOTREACHED*/
4043 }
4044
4045 #if VM_FAULT_CLASSIFY
4046 /*
4047 * Temporary statistics gathering support.
4048 */
4049
4050 /*
4051 * Statistics arrays:
4052 */
4053 #define VM_FAULT_TYPES_MAX 5
4054 #define VM_FAULT_LEVEL_MAX 8
4055
4056 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4057
4058 #define VM_FAULT_TYPE_ZERO_FILL 0
4059 #define VM_FAULT_TYPE_MAP_IN 1
4060 #define VM_FAULT_TYPE_PAGER 2
4061 #define VM_FAULT_TYPE_COPY 3
4062 #define VM_FAULT_TYPE_OTHER 4
4063
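/*
 * vm_fault_stats[type][level] counts faults of a given type by the
 * number of shadow-chain hops it took to resolve them; for example,
 * vm_fault_stats[VM_FAULT_TYPE_PAGER][2] would count faults satisfied
 * by a pager two levels down the shadow chain.
 */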
4064
4065 void
4066 vm_fault_classify(vm_object_t object,
4067 vm_object_offset_t offset,
4068 vm_prot_t fault_type)
4069 {
4070 int type, level = 0;
4071 vm_page_t m;
4072
4073 while (TRUE) {
4074 m = vm_page_lookup(object, offset);
4075 if (m != VM_PAGE_NULL) {
4076 if (m->busy || m->error || m->restart || m->absent) {
4077 type = VM_FAULT_TYPE_OTHER;
4078 break;
4079 }
4080 if (((fault_type & VM_PROT_WRITE) == 0) ||
4081 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4082 type = VM_FAULT_TYPE_MAP_IN;
4083 break;
4084 }
4085 type = VM_FAULT_TYPE_COPY;
4086 break;
4087 }
4088 else {
4089 if (object->pager_created) {
4090 type = VM_FAULT_TYPE_PAGER;
4091 break;
4092 }
4093 if (object->shadow == VM_OBJECT_NULL) {
4094 type = VM_FAULT_TYPE_ZERO_FILL;
4095 break;
4096 }
4097
4098 offset += object->shadow_offset;
4099 object = object->shadow;
4100 level++;
4101 continue;
4102 }
4103 }
4104
4105 if (level >= VM_FAULT_LEVEL_MAX)
4106 level = VM_FAULT_LEVEL_MAX - 1;
4107
4108 vm_fault_stats[type][level] += 1;
4109
4110 return;
4111 }
4112
4113 /* cleanup routine to call from debugger */
4114
4115 void
4116 vm_fault_classify_init(void)
4117 {
4118 int type, level;
4119
4120 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4121 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4122 vm_fault_stats[type][level] = 0;
4123 }
4124 }
4125
4126 return;
4127 }
4128 #endif /* VM_FAULT_CLASSIFY */
4129
4130
4131 extern int cs_validation;
4132
4133 void
4134 vm_page_validate_cs(
4135 vm_page_t page)
4136 {
4137 vm_object_t object;
4138 vm_object_offset_t offset;
4139 vm_map_offset_t koffset;
4140 vm_map_size_t ksize;
4141 vm_offset_t kaddr;
4142 kern_return_t kr;
4143 memory_object_t pager;
4144 void *blobs;
4145 boolean_t validated, tainted;
4146 boolean_t busy_page;
4147
4148 vm_object_lock_assert_held(page->object);
4149
4150 if (!cs_validation) {
4151 return;
4152 }
4153
4154 if (page->cs_validated && !page->cs_tainted && page->wpmapped) {
4155 vm_object_lock_assert_exclusive(page->object);
4156
4157 /*
4158 * This page has already been validated and found to
4159 * be valid. However, it was mapped for "write" access
4160 * sometime in the past, so we have to check if it was
4161 * modified. If so, it needs to be revalidated.
4162 * If the page was already found to be "tainted", no
4163 * need to re-validate.
4164 */
4165 if (!page->dirty) {
4166 vm_cs_query_modified++;
4167 page->dirty = pmap_is_modified(page->phys_page);
4168 }
4169 if (page->dirty) {
4170 /*
4171 * The page is dirty, so let's clear its
4172 * "validated" bit and re-validate it.
4173 */
4174 if (cs_debug) {
4175 printf("CODESIGNING: vm_page_validate_cs: "
4176 "page %p obj %p off 0x%llx "
4177 "was modified\n",
4178 page, page->object, page->offset);
4179 }
4180 page->cs_validated = FALSE;
4181 vm_cs_validated_dirtied++;
4182 }
4183 }
4184
4185 if (page->cs_validated) {
4186 return;
4187 }
4188
4189 vm_object_lock_assert_exclusive(page->object);
4190
4191 vm_cs_validates++;
4192
4193 object = page->object;
4194 assert(object->code_signed);
4195 offset = page->offset;
4196
4197 busy_page = page->busy;
4198 if (!busy_page) {
4199 /* keep page busy while we map (and unlock) the VM object */
4200 page->busy = TRUE;
4201 }
4202
4203 /*
4204 * Take a paging reference on the VM object
4205 * to protect it from collapse or bypass,
4206 * and keep it from disappearing too.
4207 */
4208 vm_object_paging_begin(object);
4209
4210 /* map the page in the kernel address space */
4211 koffset = 0;
4212 ksize = PAGE_SIZE_64;
4213 kr = vm_paging_map_object(&koffset,
4214 page,
4215 object,
4216 offset,
4217 &ksize,
4218 FALSE); /* can't unlock object! */
4219 if (kr != KERN_SUCCESS) {
4220 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4221 }
4222 kaddr = CAST_DOWN(vm_offset_t, koffset);
4223
4224 /*
4225 * Since we get here to validate a page that was brought in by
4226 * the pager, we know that this pager is all set up and ready
4227 * by now.
4228 */
4229 assert(!object->internal);
4230 assert(object->pager != NULL);
4231 assert(object->pager_ready);
4232
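/*
 * If we bail out below, the page is simply left with its
 * cs_validated bit clear; vm_page_validate_cs() reports no
 * error to its caller.
 */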
4233 if (!object->alive || object->terminating || object->pager == NULL) {
4234 /*
4235 * The object is terminating and we don't have its pager
4236 * so we can't validate the data...
4237 */
4238 goto out;
4239 }
4240
4241 pager = object->pager;
4242 assert(pager != NULL);
4243
4244 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4245 if (kr != KERN_SUCCESS) {
4246 blobs = NULL;
4247 }
4248
4249 /* verify the SHA1 hash for this page */
4250 validated = cs_validate_page(blobs,
4251 offset + object->paging_offset,
4252 (const void *)kaddr,
4253 &tainted);
4254
4255 assert(page->busy);
4256 assert(object == page->object);
4257 vm_object_lock_assert_exclusive(object);
4258
4259 page->cs_validated = validated;
4260 if (validated) {
4261 page->cs_tainted = tainted;
4262 }
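/*
 * Note that cs_tainted is only updated when the page validated
 * successfully; for a page that failed validation it is left
 * unchanged.
 */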
4263
4264 out:
4265 if (!busy_page) {
4266 PAGE_WAKEUP_DONE(page);
4267 }
4268 if (koffset != 0) {
4269 /* unmap the map from the kernel address space */
4270 vm_paging_unmap_object(object, koffset, koffset + ksize);
4271 koffset = 0;
4272 ksize = 0;
4273 kaddr = 0;
4274 }
4275 vm_object_paging_end(object);
4276 }