1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152 /*
153 * Routine: vm_fault_init
154 * Purpose:
155 * Initialize our private data structures.
156 */
157 void
158 vm_fault_init(void)
159 {
160 }
161
162 /*
163 * Routine: vm_fault_cleanup
164 * Purpose:
165 * Clean up the result of vm_fault_page.
166 * Results:
167 * The paging reference for "object" is released.
168 * "object" is unlocked.
169 * If "top_page" is not null, "top_page" is
170 * freed and the paging reference for the object
171 * containing it is released.
172 *
173 * In/out conditions:
174 * "object" must be locked.
175 */
176 void
177 vm_fault_cleanup(
178 register vm_object_t object,
179 register vm_page_t top_page)
180 {
181 vm_object_paging_end(object);
182 vm_object_unlock(object);
183
184 if (top_page != VM_PAGE_NULL) {
185 object = top_page->object;
186
187 vm_object_lock(object);
188 VM_PAGE_FREE(top_page);
189 vm_object_paging_end(object);
190 vm_object_unlock(object);
191 }
192 }
193
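/*
 * Optional cluster-fault statistics (MACH_CLUSTER_STATS): each entry counts
 * the pages brought in for a pagein cluster and how many of them sat at
 * offsets above or below the faulting offset.
 */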
194 #if MACH_CLUSTER_STATS
195 #define MAXCLUSTERPAGES 16
196 struct {
197 unsigned long pages_in_cluster;
198 unsigned long pages_at_higher_offsets;
199 unsigned long pages_at_lower_offsets;
200 } cluster_stats_in[MAXCLUSTERPAGES];
201 #define CLUSTER_STAT(clause) clause
202 #define CLUSTER_STAT_HIGHER(x) \
203 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
204 #define CLUSTER_STAT_LOWER(x) \
205 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
206 #define CLUSTER_STAT_CLUSTER(x) \
207 ((cluster_stats_in[(x)].pages_in_cluster)++)
208 #else /* MACH_CLUSTER_STATS */
209 #define CLUSTER_STAT(clause)
210 #endif /* MACH_CLUSTER_STATS */
211
212 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
213
214
215 boolean_t vm_page_deactivate_behind = TRUE;
216 /*
217 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
218 */
219 int vm_default_ahead = 0;
220 int vm_default_behind = MAX_UPL_TRANSFER;
221
222 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
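/*
 * object->sequential accumulates a signed byte count for the current run of
 * sequential faults: it grows positive for forward runs, negative for reverse
 * runs, and is clamped to +/- MAX_SEQUENTIAL_RUN by vm_fault_is_sequential()
 * below.
 */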
223
224 /*
225  * vm_fault_is_sequential
226 *
227 * Determine if sequential access is in progress
228 * in accordance with the behavior specified.
229 * Update state to indicate current access pattern.
230 *
231 * object must have at least the shared lock held
232 */
233 static
234 void
235 vm_fault_is_sequential(
236 vm_object_t object,
237 vm_object_offset_t offset,
238 vm_behavior_t behavior)
239 {
240 vm_object_offset_t last_alloc;
241 int sequential;
242 int orig_sequential;
243
244 last_alloc = object->last_alloc;
245 sequential = object->sequential;
246 orig_sequential = sequential;
247
248 switch (behavior) {
249 case VM_BEHAVIOR_RANDOM:
250 /*
251 * reset indicator of sequential behavior
252 */
253 sequential = 0;
254 break;
255
256 case VM_BEHAVIOR_SEQUENTIAL:
257 if (offset && last_alloc == offset - PAGE_SIZE_64) {
258 /*
259 * advance indicator of sequential behavior
260 */
261 if (sequential < MAX_SEQUENTIAL_RUN)
262 sequential += PAGE_SIZE;
263 } else {
264 /*
265 * reset indicator of sequential behavior
266 */
267 sequential = 0;
268 }
269 break;
270
271 case VM_BEHAVIOR_RSEQNTL:
272 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
273 /*
274 * advance indicator of sequential behavior
275 */
276 if (sequential > -MAX_SEQUENTIAL_RUN)
277 sequential -= PAGE_SIZE;
278 } else {
279 /*
280 * reset indicator of sequential behavior
281 */
282 sequential = 0;
283 }
284 break;
285
286 case VM_BEHAVIOR_DEFAULT:
287 default:
288 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
289 /*
290 * advance indicator of sequential behavior
291 */
292 if (sequential < 0)
293 sequential = 0;
294 if (sequential < MAX_SEQUENTIAL_RUN)
295 sequential += PAGE_SIZE;
296
297 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
298 /*
299 * advance indicator of sequential behavior
300 */
301 if (sequential > 0)
302 sequential = 0;
303 if (sequential > -MAX_SEQUENTIAL_RUN)
304 sequential -= PAGE_SIZE;
305 } else {
306 /*
307 * reset indicator of sequential behavior
308 */
309 sequential = 0;
310 }
311 break;
312 }
313 if (sequential != orig_sequential) {
314 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
315 /*
316 * if someone else has already updated object->sequential
317 * don't bother trying to update it or object->last_alloc
318 */
319 return;
320 }
321 }
322 /*
323  * I'd like to do this with an OSCompareAndSwap64, but that
324 * doesn't exist for PPC... however, it shouldn't matter
325 * that much... last_alloc is maintained so that we can determine
326 * if a sequential access pattern is taking place... if only
327 * one thread is banging on this object, no problem with the unprotected
328 * update... if 2 or more threads are banging away, we run the risk of
329 * someone seeing a mangled update... however, in the face of multiple
330 * accesses, no sequential access pattern can develop anyway, so we
331 * haven't lost any real info.
332 */
333 object->last_alloc = offset;
334 }
335
336
337 /*
338  * vm_fault_deactivate_behind
339 *
340 * Determine if sequential access is in progress
341 * in accordance with the behavior specified. If
342 * so, compute a potential page to deactivate and
343 * deactivate it.
344 *
345 * object must be locked.
346 *
347 * return TRUE if we actually deactivate a page
348 */
349 static
350 boolean_t
351 vm_fault_deactivate_behind(
352 vm_object_t object,
353 vm_object_offset_t offset,
354 vm_behavior_t behavior)
355 {
356 vm_page_t m = NULL;
357 int sequential_run;
358 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
359
360 #if TRACEFAULTPAGE
361 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
362 #endif
363
364 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
365 /*
366  * Do not deactivate pages from the kernel object: they
367  * are not intended to become pageable... or we've
368  * disabled the deactivate-behind mechanism.
369 */
370 return FALSE;
371 }
372 if ((sequential_run = object->sequential)) {
373 if (sequential_run < 0) {
374 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
375 sequential_run = 0 - sequential_run;
376 } else {
377 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
378 }
379 }
380 switch (behavior) {
381 case VM_BEHAVIOR_RANDOM:
382 break;
383 case VM_BEHAVIOR_SEQUENTIAL:
384 if (sequential_run >= (int)PAGE_SIZE)
385 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
386 break;
387 case VM_BEHAVIOR_RSEQNTL:
388 if (sequential_run >= (int)PAGE_SIZE)
389 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
390 break;
391 case VM_BEHAVIOR_DEFAULT:
392 default:
393 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
394
395 /*
396  * determine if the run of sequential access has been
397 * long enough on an object with default access behavior
398 * to consider it for deactivation
399 */
400 if ((uint64_t)sequential_run >= behind) {
401 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
402 if (offset >= behind)
403 m = vm_page_lookup(object, offset - behind);
404 } else {
405 if (offset < -behind)
406 m = vm_page_lookup(object, offset + behind);
407 }
408 }
409 break;
410 }
411 }
412 if (m) {
413 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
414 pmap_clear_reference(m->phys_page);
415 m->deactivated = TRUE;
416 #if TRACEFAULTPAGE
417 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
418 #endif
419 return TRUE;
420 }
421 }
422 return FALSE;
423 }
424
425
426 /*
427 * check for various conditions that would
428 * prevent us from creating a ZF page...
429 * cleanup is based on being called from vm_fault_page
430 *
431 * object must be locked
432 * object == m->object
433 */
434 static vm_fault_return_t
435 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
436 {
437 if (object->shadow_severed) {
438 /*
439 * the shadow chain was severed
440 * just have to return an error at this point
441 */
442 if (m != VM_PAGE_NULL)
443 VM_PAGE_FREE(m);
444 vm_fault_cleanup(object, first_m);
445
446 thread_interrupt_level(interruptible_state);
447
448 return (VM_FAULT_MEMORY_ERROR);
449 }
450 if (vm_backing_store_low) {
451 /*
452  * Are we protecting the system from
453  * backing store exhaustion?  If so,
454  * sleep unless we are privileged.
455 */
456 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
457
458 if (m != VM_PAGE_NULL)
459 VM_PAGE_FREE(m);
460 vm_fault_cleanup(object, first_m);
461
462 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
463
464 thread_block(THREAD_CONTINUE_NULL);
465 thread_interrupt_level(interruptible_state);
466
467 return (VM_FAULT_RETRY);
468 }
469 }
470 if (VM_PAGE_ZFILL_THROTTLED()) {
471 /*
472 * we're throttling zero-fills...
473 * treat this as if we couldn't grab a page
474 */
475 if (m != VM_PAGE_NULL)
476 VM_PAGE_FREE(m);
477 vm_fault_cleanup(object, first_m);
478
479 thread_interrupt_level(interruptible_state);
480
481 return (VM_FAULT_MEMORY_SHORTAGE);
482 }
483 return (VM_FAULT_SUCCESS);
484 }
485
486
487 /*
488 * do the work to zero fill a page and
489 * inject it into the correct paging queue
490 *
491 * m->object must be locked
492 * page queue lock must NOT be held
493 */
494 static int
495 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
496 {
497 int my_fault = DBG_ZERO_FILL_FAULT;
498
499 /*
500  * This is a zero-fill page fault...
501 *
502 * Checking the page lock is a waste of
503 * time; this page was absent, so
504 * it can't be page locked by a pager.
505 *
506 * we also consider it undefined
507 * with respect to instruction
508 * execution. i.e. it is the responsibility
509 * of higher layers to call for an instruction
510 * sync after changing the contents and before
511 * sending a program into this area. We
512 * choose this approach for performance
513 */
514 m->pmapped = TRUE;
515
516 m->cs_validated = FALSE;
517 m->cs_tainted = FALSE;
518
519 if (no_zero_fill == TRUE)
520 my_fault = DBG_NZF_PAGE_FAULT;
521 else {
522 vm_page_zero_fill(m);
523
524 VM_STAT_INCR(zero_fill_count);
525 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
526 }
527 assert(!m->laundry);
528 assert(m->object != kernel_object);
529 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
530
531 if (!IP_VALID(memory_manager_default) &&
532 (m->object->purgable == VM_PURGABLE_DENY ||
533 m->object->purgable == VM_PURGABLE_NONVOLATILE)) {
534 vm_page_lock_queues();
535
536 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
537 m->throttled = TRUE;
538 vm_page_throttled_count++;
539
540 vm_page_unlock_queues();
541 } else {
542 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
543 m->zero_fill = TRUE;
544 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
545 }
546 }
547 return (my_fault);
548 }
549
550
551 /*
552 * Routine: vm_fault_page
553 * Purpose:
554 * Find the resident page for the virtual memory
555 * specified by the given virtual memory object
556 * and offset.
557 * Additional arguments:
558  * The required permissions for the page are given
559 * in "fault_type". Desired permissions are included
560 * in "protection".
561 * fault_info is passed along to determine pagein cluster
562 * limits... it contains the expected reference pattern,
563 * cluster size if available, etc...
564 *
565 * If the desired page is known to be resident (for
566 * example, because it was previously wired down), asserting
567 * the "unwiring" parameter will speed the search.
568 *
569 * If the operation can be interrupted (by thread_abort
570 * or thread_terminate), then the "interruptible"
571 * parameter should be asserted.
572 *
573 * Results:
574 * The page containing the proper data is returned
575 * in "result_page".
576 *
577 * In/out conditions:
578 * The source object must be locked and referenced,
579 * and must donate one paging reference. The reference
580 * is not affected. The paging reference and lock are
581 * consumed.
582 *
583 * If the call succeeds, the object in which "result_page"
584 * resides is left locked and holding a paging reference.
585 * If this is not the original object, a busy page in the
586 * original object is returned in "top_page", to prevent other
587 * callers from pursuing this same data, along with a paging
588 * reference for the original object. The "top_page" should
589 * be destroyed when this guarantee is no longer required.
590 * The "result_page" is also left busy. It is not removed
591 * from the pageout queues.
592 */
593
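/*
 * Illustrative sketch of the caller contract described above (not part of
 * the original source; variable names are placeholders):
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);		(donate the paging reference)
 *
 *	kr = vm_fault_page(object, offset, fault_type, FALSE, &prot,
 *			   &result_page, &top_page, &type_of_fault,
 *			   &error_code, FALSE, FALSE, &fault_info);
 *
 *	if (kr == VM_FAULT_SUCCESS) {
 *		(result_page is busy and its object is locked with a
 *		 paging reference held)
 *		... use result_page ...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *	}
 */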
594 vm_fault_return_t
595 vm_fault_page(
596 /* Arguments: */
597 vm_object_t first_object, /* Object to begin search */
598 vm_object_offset_t first_offset, /* Offset into object */
599 vm_prot_t fault_type, /* What access is requested */
600 boolean_t must_be_resident,/* Must page be resident? */
601 /* Modifies in place: */
602 vm_prot_t *protection, /* Protection for mapping */
603 /* Returns: */
604 vm_page_t *result_page, /* Page found, if successful */
605 vm_page_t *top_page, /* Page in top object, if
606 * not result_page. */
607 int *type_of_fault, /* if non-null, fill in with type of fault
608 * COW, zero-fill, etc... returned in trace point */
609 /* More arguments: */
610 kern_return_t *error_code, /* code if page is in error */
611 boolean_t no_zero_fill, /* don't zero fill absent pages */
612 #if MACH_PAGEMAP
613 boolean_t data_supply, /* treat as data_supply if
614 * it is a write fault and a full
615 * page is provided */
616 #else
617 __unused boolean_t data_supply,
618 #endif
619 vm_object_fault_info_t fault_info)
620 {
621 vm_page_t m;
622 vm_object_t object;
623 vm_object_offset_t offset;
624 vm_page_t first_m;
625 vm_object_t next_object;
626 vm_object_t copy_object;
627 boolean_t look_for_page;
628 vm_prot_t access_required = fault_type;
629 vm_prot_t wants_copy_flag;
630 CLUSTER_STAT(int pages_at_higher_offsets;)
631 CLUSTER_STAT(int pages_at_lower_offsets;)
632 kern_return_t wait_result;
633 boolean_t interruptible_state;
634 vm_fault_return_t error;
635 int my_fault;
636 uint32_t try_failed_count;
637 int interruptible; /* how may fault be interrupted? */
638 memory_object_t pager;
639
640 /*
641 * MACH page map - an optional optimization where a bit map is maintained
642 * by the VM subsystem for internal objects to indicate which pages of
643 * the object currently reside on backing store. This existence map
644 * duplicates information maintained by the vnode pager. It is
645 * created at the time of the first pageout against the object, i.e.
646 * at the same time pager for the object is created. The optimization
647 * is designed to eliminate pager interaction overhead, if it is
648 * 'known' that the page does not exist on backing store.
649 *
650 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
651 * either marked as paged out in the existence map for the object or no
652 * existence map exists for the object. MUST_ASK_PAGER() is one of the
653 * criteria in the decision to invoke the pager. It is also used as one
654 * of the criteria to terminate the scan for adjacent pages in a clustered
655 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
656 * permanent objects. Note also that if the pager for an internal object
657 * has not been created, the pager is not invoked regardless of the value
658 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
659 * for which a pager has been created.
660 *
661 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
662  * is marked as paged out in the existence map for the object.
663  * PAGED_OUT() is used to determine if a page has already been pushed
664 * into a copy object in order to avoid a redundant page out operation.
665 */
666 #if MACH_PAGEMAP
667 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
668 != VM_EXTERNAL_STATE_ABSENT)
669 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
670 == VM_EXTERNAL_STATE_EXISTS)
671 #else
672 #define MUST_ASK_PAGER(o, f) (TRUE)
673 #define PAGED_OUT(o, f) (FALSE)
674 #endif
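/*
 * Without MACH_PAGEMAP there is no existence map, so we can never prove that
 * a page is absent from backing store: MUST_ASK_PAGER() is always TRUE and
 * PAGED_OUT() is always FALSE.
 */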
675
676 /*
677 * Recovery actions
678 */
679 #define PREPARE_RELEASE_PAGE(m) \
680 MACRO_BEGIN \
681 vm_page_lock_queues(); \
682 MACRO_END
683
684 #define DO_RELEASE_PAGE(m) \
685 MACRO_BEGIN \
686 PAGE_WAKEUP_DONE(m); \
687 if (!m->active && !m->inactive && !m->throttled)\
688 vm_page_activate(m); \
689 vm_page_unlock_queues(); \
690 MACRO_END
691
692 #define RELEASE_PAGE(m) \
693 MACRO_BEGIN \
694 PREPARE_RELEASE_PAGE(m); \
695 DO_RELEASE_PAGE(m); \
696 MACRO_END
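/*
 * RELEASE_PAGE() wakes up any threads waiting on the page, clears its busy
 * bit, and reactivates it if it is not already on one of the paging queues.
 */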
697
698 #if TRACEFAULTPAGE
699 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
700 #endif
701
702
703 #if MACH_KDB
704 /*
705 * If there are watchpoints set, then
706 * we don't want to give away write permission
707 * on a read fault. Make the task write fault,
708 * so that the watchpoint code notices the access.
709 */
710 if (db_watchpoint_list) {
711 /*
712 * If we aren't asking for write permission,
713 * then don't give it away. We're using write
714 * faults to set the dirty bit.
715 */
716 if (!(fault_type & VM_PROT_WRITE))
717 *protection &= ~VM_PROT_WRITE;
718 }
719 #endif /* MACH_KDB */
720
721 interruptible = fault_info->interruptible;
722 interruptible_state = thread_interrupt_level(interruptible);
723
724 /*
725 * INVARIANTS (through entire routine):
726 *
727 * 1) At all times, we must either have the object
728 * lock or a busy page in some object to prevent
729 * some other thread from trying to bring in
730 * the same page.
731 *
732 * Note that we cannot hold any locks during the
733 * pager access or when waiting for memory, so
734 * we use a busy page then.
735 *
736 * 2) To prevent another thread from racing us down the
737 * shadow chain and entering a new page in the top
738 * object before we do, we must keep a busy page in
739 * the top object while following the shadow chain.
740 *
741 * 3) We must increment paging_in_progress on any object
742 * for which we have a busy page before dropping
743 * the object lock
744 *
745 * 4) We leave busy pages on the pageout queues.
746 * If the pageout daemon comes across a busy page,
747 * it will remove the page from the pageout queues.
748 */
749
750 object = first_object;
751 offset = first_offset;
752 first_m = VM_PAGE_NULL;
753 access_required = fault_type;
754
755
756 XPR(XPR_VM_FAULT,
757 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
758 (integer_t)object, offset, fault_type, *protection, 0);
759
760 /*
761 * default type of fault
762 */
763 my_fault = DBG_CACHE_HIT_FAULT;
764
765 while (TRUE) {
766 #if TRACEFAULTPAGE
767 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
768 #endif
769 if (!object->alive) {
770 /*
771 * object is no longer valid
772 * clean up and return error
773 */
774 vm_fault_cleanup(object, first_m);
775 thread_interrupt_level(interruptible_state);
776
777 return (VM_FAULT_MEMORY_ERROR);
778 }
779
780 /*
781 * See whether the page at 'offset' is resident
782 */
783 m = vm_page_lookup(object, offset);
784 #if TRACEFAULTPAGE
785 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
786 #endif
787 if (m != VM_PAGE_NULL) {
788
789 if (m->busy) {
790 /*
791 * The page is being brought in,
792 * wait for it and then retry.
793 *
794 * A possible optimization: if the page
795 * is known to be resident, we can ignore
796 * pages that are absent (regardless of
797 * whether they're busy).
798 */
799 #if TRACEFAULTPAGE
800 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
801 #endif
802 wait_result = PAGE_SLEEP(object, m, interruptible);
803 XPR(XPR_VM_FAULT,
804 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
805 (integer_t)object, offset,
806 (integer_t)m, 0, 0);
807 counter(c_vm_fault_page_block_busy_kernel++);
808
809 if (wait_result != THREAD_AWAKENED) {
810 vm_fault_cleanup(object, first_m);
811 thread_interrupt_level(interruptible_state);
812
813 if (wait_result == THREAD_RESTART)
814 return (VM_FAULT_RETRY);
815 else
816 return (VM_FAULT_INTERRUPTED);
817 }
818 continue;
819 }
820
821 if (m->phys_page == vm_page_guard_addr) {
822 /*
823 * Guard page: off limits !
824 */
825 if (fault_type == VM_PROT_NONE) {
826 /*
827 * The fault is not requesting any
828 * access to the guard page, so it must
829 * be just to wire or unwire it.
830 * Let's pretend it succeeded...
831 */
832 m->busy = TRUE;
833 *result_page = m;
834 assert(first_m == VM_PAGE_NULL);
835 *top_page = first_m;
836 if (type_of_fault)
837 *type_of_fault = DBG_GUARD_FAULT;
838 return VM_FAULT_SUCCESS;
839 } else {
840 /*
841 * The fault requests access to the
842 * guard page: let's deny that !
843 */
844 vm_fault_cleanup(object, first_m);
845 thread_interrupt_level(interruptible_state);
846 return VM_FAULT_MEMORY_ERROR;
847 }
848 }
849
850 if (m->error) {
851 /*
852 * The page is in error, give up now.
853 */
854 #if TRACEFAULTPAGE
855 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
856 #endif
857 if (error_code)
858 *error_code = KERN_MEMORY_ERROR;
859 VM_PAGE_FREE(m);
860
861 vm_fault_cleanup(object, first_m);
862 thread_interrupt_level(interruptible_state);
863
864 return (VM_FAULT_MEMORY_ERROR);
865 }
866 if (m->restart) {
867 /*
868 * The pager wants us to restart
869 * at the top of the chain,
870 * typically because it has moved the
871 * page to another pager, then do so.
872 */
873 #if TRACEFAULTPAGE
874 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
875 #endif
876 VM_PAGE_FREE(m);
877
878 vm_fault_cleanup(object, first_m);
879 thread_interrupt_level(interruptible_state);
880
881 return (VM_FAULT_RETRY);
882 }
883 if (m->absent) {
884 /*
885 * The page isn't busy, but is absent,
886 * therefore it's deemed "unavailable".
887 *
888 * Remove the non-existent page (unless it's
889 * in the top object) and move on down to the
890 * next object (if there is one).
891 */
892 #if TRACEFAULTPAGE
893 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
894 #endif
895 next_object = object->shadow;
896
897 if (next_object == VM_OBJECT_NULL) {
898 /*
899 * Absent page at bottom of shadow
900 * chain; zero fill the page we left
901 * busy in the first object, and free
902 * the absent page.
903 */
904 assert(!must_be_resident);
905
906 /*
907 * check for any conditions that prevent
908 * us from creating a new zero-fill page
909 * vm_fault_check will do all of the
910 * fault cleanup in the case of an error condition
911 * including resetting the thread_interrupt_level
912 */
913 error = vm_fault_check(object, m, first_m, interruptible_state);
914
915 if (error != VM_FAULT_SUCCESS)
916 return (error);
917
918 XPR(XPR_VM_FAULT,
919 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
920 (integer_t)object, offset,
921 (integer_t)m,
922 (integer_t)first_object, 0);
923
924 if (object != first_object) {
925 /*
926 * free the absent page we just found
927 */
928 VM_PAGE_FREE(m);
929
930 /*
931 * drop reference and lock on current object
932 */
933 vm_object_paging_end(object);
934 vm_object_unlock(object);
935
936 /*
937 * grab the original page we
938 * 'soldered' in place and
939 * retake lock on 'first_object'
940 */
941 m = first_m;
942 first_m = VM_PAGE_NULL;
943
944 object = first_object;
945 offset = first_offset;
946
947 vm_object_lock(object);
948 } else {
949 /*
950 * we're going to use the absent page we just found
951 * so convert it to a 'busy' page
952 */
953 m->absent = FALSE;
954 m->busy = TRUE;
955 }
956 /*
957 * zero-fill the page and put it on
958 * the correct paging queue
959 */
960 my_fault = vm_fault_zero_page(m, no_zero_fill);
961
962 break;
963 } else {
964 if (must_be_resident)
965 vm_object_paging_end(object);
966 else if (object != first_object) {
967 vm_object_paging_end(object);
968 VM_PAGE_FREE(m);
969 } else {
970 first_m = m;
971 m->absent = FALSE;
972 m->busy = TRUE;
973
974 vm_page_lockspin_queues();
975 VM_PAGE_QUEUES_REMOVE(m);
976 vm_page_unlock_queues();
977 }
978 XPR(XPR_VM_FAULT,
979 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
980 (integer_t)object, offset,
981 (integer_t)next_object,
982 offset+object->shadow_offset,0);
983
984 offset += object->shadow_offset;
985 fault_info->lo_offset += object->shadow_offset;
986 fault_info->hi_offset += object->shadow_offset;
987 access_required = VM_PROT_READ;
988
989 vm_object_lock(next_object);
990 vm_object_unlock(object);
991 object = next_object;
992 vm_object_paging_begin(object);
993
994 /*
995 * reset to default type of fault
996 */
997 my_fault = DBG_CACHE_HIT_FAULT;
998
999 continue;
1000 }
1001 }
1002 if ((m->cleaning)
1003 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1004 && (fault_type & VM_PROT_WRITE)) {
1005 /*
1006 * This is a copy-on-write fault that will
1007 * cause us to revoke access to this page, but
1008 * this page is in the process of being cleaned
1009 * in a clustered pageout. We must wait until
1010 * the cleaning operation completes before
1011 * revoking access to the original page,
1012 * otherwise we might attempt to remove a
1013 * wired mapping.
1014 */
1015 #if TRACEFAULTPAGE
1016 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1017 #endif
1018 XPR(XPR_VM_FAULT,
1019 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1020 (integer_t)object, offset,
1021 (integer_t)m, 0, 0);
1022 /*
1023 * take an extra ref so that object won't die
1024 */
1025 vm_object_reference_locked(object);
1026
1027 vm_fault_cleanup(object, first_m);
1028
1029 counter(c_vm_fault_page_block_backoff_kernel++);
1030 vm_object_lock(object);
1031 assert(object->ref_count > 0);
1032
1033 m = vm_page_lookup(object, offset);
1034
1035 if (m != VM_PAGE_NULL && m->cleaning) {
1036 PAGE_ASSERT_WAIT(m, interruptible);
1037
1038 vm_object_unlock(object);
1039 wait_result = thread_block(THREAD_CONTINUE_NULL);
1040 vm_object_deallocate(object);
1041
1042 goto backoff;
1043 } else {
1044 vm_object_unlock(object);
1045
1046 vm_object_deallocate(object);
1047 thread_interrupt_level(interruptible_state);
1048
1049 return (VM_FAULT_RETRY);
1050 }
1051 }
1052 if (type_of_fault == NULL && m->speculative) {
1053 /*
1054 * If we were passed a non-NULL pointer for
1055 * "type_of_fault", than we came from
1056 * vm_fault... we'll let it deal with
1057 * this condition, since it
1058 * needs to see m->speculative to correctly
1059 * account the pageins, otherwise...
1060 * take it off the speculative queue, we'll
1061 * let the caller of vm_fault_page deal
1062 * with getting it onto the correct queue
1063 */
1064 vm_page_lockspin_queues();
1065 VM_PAGE_QUEUES_REMOVE(m);
1066 vm_page_unlock_queues();
1067 }
1068
1069 if (m->encrypted) {
1070 /*
1071 * ENCRYPTED SWAP:
1072 * the user needs access to a page that we
1073 * encrypted before paging it out.
1074 * Decrypt the page now.
1075 * Keep it busy to prevent anyone from
1076 * accessing it during the decryption.
1077 */
1078 m->busy = TRUE;
1079 vm_page_decrypt(m, 0);
1080 assert(object == m->object);
1081 assert(m->busy);
1082 PAGE_WAKEUP_DONE(m);
1083
1084 /*
1085 * Retry from the top, in case
1086 * something changed while we were
1087 * decrypting.
1088 */
1089 continue;
1090 }
1091 ASSERT_PAGE_DECRYPTED(m);
1092
1093 if (m->object->code_signed) {
1094 /*
1095 * CODE SIGNING:
1096 * We just paged in a page from a signed
1097 * memory object but we don't need to
1098  * validate it now. We'll validate it if and
1099  * when it gets mapped into a user address
1100 * space for the first time or when the page
1101 * gets copied to another object as a result
1102 * of a copy-on-write.
1103 */
1104 }
1105
1106 /*
1107 * We mark the page busy and leave it on
1108 * the pageout queues. If the pageout
1109  * daemon comes across it, then it will
1110 * remove the page from the queue, but not the object
1111 */
1112 #if TRACEFAULTPAGE
1113 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1114 #endif
1115 XPR(XPR_VM_FAULT,
1116 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1117 (integer_t)object, offset, (integer_t)m, 0, 0);
1118 assert(!m->busy);
1119 assert(!m->absent);
1120
1121 m->busy = TRUE;
1122 break;
1123 }
1124
1125
1126 /*
1127 * we get here when there is no page present in the object at
1128 * the offset we're interested in... we'll allocate a page
1129 * at this point if the pager associated with
1130 * this object can provide the data or we're the top object...
1131 * object is locked; m == NULL
1132 */
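	/*
	 * We will ask the pager only if one has already been created for
	 * this object, the existence map does not rule the page out, and
	 * the caller is not supplying the data itself.
	 */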
1133 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1134
1135 #if TRACEFAULTPAGE
1136 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1137 #endif
1138 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1139 /*
1140 * Allocate a new page for this object/offset pair
1141 */
1142 m = vm_page_grab();
1143 #if TRACEFAULTPAGE
1144 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1145 #endif
1146 if (m == VM_PAGE_NULL) {
1147
1148 vm_fault_cleanup(object, first_m);
1149 thread_interrupt_level(interruptible_state);
1150
1151 return (VM_FAULT_MEMORY_SHORTAGE);
1152 }
1153 vm_page_insert(m, object, offset);
1154 }
1155 if (look_for_page && !must_be_resident) {
1156 kern_return_t rc;
1157
1158 /*
1159 * If the memory manager is not ready, we
1160 * cannot make requests.
1161 */
1162 if (!object->pager_ready) {
1163 #if TRACEFAULTPAGE
1164 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1165 #endif
1166 if (m != VM_PAGE_NULL)
1167 VM_PAGE_FREE(m);
1168
1169 XPR(XPR_VM_FAULT,
1170 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1171 (integer_t)object, offset, 0, 0, 0);
1172
1173 /*
1174 * take an extra ref so object won't die
1175 */
1176 vm_object_reference_locked(object);
1177 vm_fault_cleanup(object, first_m);
1178 counter(c_vm_fault_page_block_backoff_kernel++);
1179
1180 vm_object_lock(object);
1181 assert(object->ref_count > 0);
1182
1183 if (!object->pager_ready) {
1184 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1185
1186 vm_object_unlock(object);
1187 if (wait_result == THREAD_WAITING)
1188 wait_result = thread_block(THREAD_CONTINUE_NULL);
1189 vm_object_deallocate(object);
1190
1191 goto backoff;
1192 } else {
1193 vm_object_unlock(object);
1194 vm_object_deallocate(object);
1195 thread_interrupt_level(interruptible_state);
1196
1197 return (VM_FAULT_RETRY);
1198 }
1199 }
1200 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1201 /*
1202 * If there are too many outstanding page
1203 * requests pending on this external object, we
1204 * wait for them to be resolved now.
1205 */
1206 #if TRACEFAULTPAGE
1207 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1208 #endif
1209 if (m != VM_PAGE_NULL)
1210 VM_PAGE_FREE(m);
1211 /*
1212 * take an extra ref so object won't die
1213 */
1214 vm_object_reference_locked(object);
1215
1216 vm_fault_cleanup(object, first_m);
1217
1218 counter(c_vm_fault_page_block_backoff_kernel++);
1219
1220 vm_object_lock(object);
1221 assert(object->ref_count > 0);
1222
1223 if (object->paging_in_progress > vm_object_pagein_throttle) {
1224 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1225
1226 vm_object_unlock(object);
1227 wait_result = thread_block(THREAD_CONTINUE_NULL);
1228 vm_object_deallocate(object);
1229
1230 goto backoff;
1231 } else {
1232 vm_object_unlock(object);
1233 vm_object_deallocate(object);
1234 thread_interrupt_level(interruptible_state);
1235
1236 return (VM_FAULT_RETRY);
1237 }
1238 }
1239 if (m != VM_PAGE_NULL) {
1240 /*
1241 * Indicate that the page is waiting for data
1242 * from the memory manager.
1243 */
1244 m->list_req_pending = TRUE;
1245 m->absent = TRUE;
1246 }
1247
1248 #if TRACEFAULTPAGE
1249 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1250 #endif
1251
1252 /*
1253 * It's possible someone called vm_object_destroy while we weren't
1254 * holding the object lock. If that has happened, then bail out
1255 * here.
1256 */
1257
1258 pager = object->pager;
1259
1260 if (pager == MEMORY_OBJECT_NULL) {
1261 vm_fault_cleanup(object, first_m);
1262 thread_interrupt_level(interruptible_state);
1263 return VM_FAULT_MEMORY_ERROR;
1264 }
1265
1266 /*
1267 * We have an absent page in place for the faulting offset,
1268 * so we can release the object lock.
1269 */
1270
1271 vm_object_unlock(object);
1272
1273 /*
1274 * If this object uses a copy_call strategy,
1275 * and we are interested in a copy of this object
1276 * (having gotten here only by following a
1277 * shadow chain), then tell the memory manager
1278 * via a flag added to the desired_access
1279 * parameter, so that it can detect a race
1280 * between our walking down the shadow chain
1281 * and its pushing pages up into a copy of
1282 * the object that it manages.
1283 */
1284 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1285 wants_copy_flag = VM_PROT_WANTS_COPY;
1286 else
1287 wants_copy_flag = VM_PROT_NONE;
1288
1289 XPR(XPR_VM_FAULT,
1290 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1291 (integer_t)object, offset, (integer_t)m,
1292 access_required | wants_copy_flag, 0);
1293
1294 /*
1295 * Call the memory manager to retrieve the data.
1296 */
1297 rc = memory_object_data_request(
1298 pager,
1299 offset + object->paging_offset,
1300 PAGE_SIZE,
1301 access_required | wants_copy_flag,
1302 (memory_object_fault_info_t)fault_info);
1303
1304 #if TRACEFAULTPAGE
1305 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1306 #endif
1307 vm_object_lock(object);
1308
1309 if (rc != KERN_SUCCESS) {
1310
1311 vm_fault_cleanup(object, first_m);
1312 thread_interrupt_level(interruptible_state);
1313
1314 return ((rc == MACH_SEND_INTERRUPTED) ?
1315 VM_FAULT_INTERRUPTED :
1316 VM_FAULT_MEMORY_ERROR);
1317 }
1318 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1319
1320 vm_fault_cleanup(object, first_m);
1321 thread_interrupt_level(interruptible_state);
1322
1323 return (VM_FAULT_INTERRUPTED);
1324 }
1325 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1326 /*
1327 * No page here means that the object we
1328 * initially looked up was "physically
1329 * contiguous" (i.e. device memory). However,
1330 * with Virtual VRAM, the object might not
1331 * be backed by that device memory anymore,
1332 * so we're done here only if the object is
1333 * still "phys_contiguous".
1334 * Otherwise, if the object is no longer
1335 * "phys_contiguous", we need to retry the
1336 * page fault against the object's new backing
1337 * store (different memory object).
1338 */
1339 break;
1340 }
1341 /*
1342 * potentially a pagein fault
1343 * if we make it through the state checks
1344 	 * above, then we'll count it as such
1345 */
1346 my_fault = DBG_PAGEIN_FAULT;
1347
1348 /*
1349 * Retry with same object/offset, since new data may
1350 * be in a different page (i.e., m is meaningless at
1351 * this point).
1352 */
1353 continue;
1354 }
1355
1356 /*
1357 * We get here if the object has no pager, or an existence map
1358 * exists and indicates the page isn't present on the pager
1359 * or we're unwiring a page. If a pager exists, but there
1360 * is no existence map, then the m->absent case above handles
1361 * the ZF case when the pager can't provide the page
1362 */
1363 #if TRACEFAULTPAGE
1364 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1365 #endif
1366 if (object == first_object)
1367 first_m = m;
1368 else
1369 assert(m == VM_PAGE_NULL);
1370
1371 XPR(XPR_VM_FAULT,
1372 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1373 (integer_t)object, offset, (integer_t)m,
1374 (integer_t)object->shadow, 0);
1375
1376 next_object = object->shadow;
1377
1378 if (next_object == VM_OBJECT_NULL) {
1379 /*
1380 	 * we've hit the bottom of the shadow chain,
1381 * fill the page in the top object with zeros.
1382 */
1383 assert(!must_be_resident);
1384
1385 if (object != first_object) {
1386 vm_object_paging_end(object);
1387 vm_object_unlock(object);
1388
1389 object = first_object;
1390 offset = first_offset;
1391 vm_object_lock(object);
1392 }
1393 m = first_m;
1394 assert(m->object == object);
1395 first_m = VM_PAGE_NULL;
1396
1397 /*
1398 * check for any conditions that prevent
1399 * us from creating a new zero-fill page
1400 * vm_fault_check will do all of the
1401 * fault cleanup in the case of an error condition
1402 * including resetting the thread_interrupt_level
1403 */
1404 error = vm_fault_check(object, m, first_m, interruptible_state);
1405
1406 if (error != VM_FAULT_SUCCESS)
1407 return (error);
1408
1409 if (m == VM_PAGE_NULL) {
1410 m = vm_page_grab();
1411
1412 if (m == VM_PAGE_NULL) {
1413 vm_fault_cleanup(object, VM_PAGE_NULL);
1414 thread_interrupt_level(interruptible_state);
1415
1416 return (VM_FAULT_MEMORY_SHORTAGE);
1417 }
1418 vm_page_insert(m, object, offset);
1419 }
1420 my_fault = vm_fault_zero_page(m, no_zero_fill);
1421
1422 break;
1423
1424 } else {
1425 /*
1426 * Move on to the next object. Lock the next
1427 * object before unlocking the current one.
1428 */
1429 if ((object != first_object) || must_be_resident)
1430 vm_object_paging_end(object);
1431
1432 offset += object->shadow_offset;
1433 fault_info->lo_offset += object->shadow_offset;
1434 fault_info->hi_offset += object->shadow_offset;
1435 access_required = VM_PROT_READ;
1436
1437 vm_object_lock(next_object);
1438 vm_object_unlock(object);
1439
1440 object = next_object;
1441 vm_object_paging_begin(object);
1442 }
1443 }
1444
1445 /*
1446 * PAGE HAS BEEN FOUND.
1447 *
1448 * This page (m) is:
1449 * busy, so that we can play with it;
1450 * not absent, so that nobody else will fill it;
1451 * possibly eligible for pageout;
1452 *
1453 * The top-level page (first_m) is:
1454 * VM_PAGE_NULL if the page was found in the
1455 * top-level object;
1456 * busy, not absent, and ineligible for pageout.
1457 *
1458 * The current object (object) is locked. A paging
1459 * reference is held for the current and top-level
1460 * objects.
1461 */
1462
1463 #if TRACEFAULTPAGE
1464 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1465 #endif
1466 #if EXTRA_ASSERTIONS
1467 if (m != VM_PAGE_NULL) {
1468 assert(m->busy && !m->absent);
1469 assert((first_m == VM_PAGE_NULL) ||
1470 (first_m->busy && !first_m->absent &&
1471 !first_m->active && !first_m->inactive));
1472 }
1473 #endif /* EXTRA_ASSERTIONS */
1474
1475 /*
1476 * ENCRYPTED SWAP:
1477 * If we found a page, we must have decrypted it before we
1478 * get here...
1479 */
1480 if (m != VM_PAGE_NULL) {
1481 ASSERT_PAGE_DECRYPTED(m);
1482 }
1483
1484 XPR(XPR_VM_FAULT,
1485 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1486 (integer_t)object, offset, (integer_t)m,
1487 (integer_t)first_object, (integer_t)first_m);
1488
1489 /*
1490 * If the page is being written, but isn't
1491 * already owned by the top-level object,
1492 * we have to copy it into a new page owned
1493 * by the top-level object.
1494 */
1495 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1496
1497 #if TRACEFAULTPAGE
1498 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1499 #endif
1500 if (fault_type & VM_PROT_WRITE) {
1501 vm_page_t copy_m;
1502
1503 /*
1504 * We only really need to copy if we
1505 * want to write it.
1506 */
1507 assert(!must_be_resident);
1508
1509 /*
1510 	 * Are we protecting the system from
1511 	 * backing store exhaustion?  If so,
1512 	 * sleep unless we are privileged.
1513 */
1514 if (vm_backing_store_low) {
1515 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1516
1517 RELEASE_PAGE(m);
1518 vm_fault_cleanup(object, first_m);
1519
1520 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1521
1522 thread_block(THREAD_CONTINUE_NULL);
1523 thread_interrupt_level(interruptible_state);
1524
1525 return (VM_FAULT_RETRY);
1526 }
1527 }
1528 /*
1529 * If we try to collapse first_object at this
1530 * point, we may deadlock when we try to get
1531 * the lock on an intermediate object (since we
1532 * have the bottom object locked). We can't
1533 * unlock the bottom object, because the page
1534 * we found may move (by collapse) if we do.
1535 *
1536 * Instead, we first copy the page. Then, when
1537 * we have no more use for the bottom object,
1538 * we unlock it and try to collapse.
1539 *
1540 * Note that we copy the page even if we didn't
1541 * need to... that's the breaks.
1542 */
1543
1544 /*
1545 * Allocate a page for the copy
1546 */
1547 copy_m = vm_page_grab();
1548
1549 if (copy_m == VM_PAGE_NULL) {
1550 RELEASE_PAGE(m);
1551
1552 vm_fault_cleanup(object, first_m);
1553 thread_interrupt_level(interruptible_state);
1554
1555 return (VM_FAULT_MEMORY_SHORTAGE);
1556 }
1557 XPR(XPR_VM_FAULT,
1558 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1559 (integer_t)object, offset,
1560 (integer_t)m, (integer_t)copy_m, 0);
1561
1562 vm_page_copy(m, copy_m);
1563
1564 /*
1565 * If another map is truly sharing this
1566 * page with us, we have to flush all
1567 * uses of the original page, since we
1568 * can't distinguish those which want the
1569 * original from those which need the
1570 * new copy.
1571 *
1572 * XXXO If we know that only one map has
1573 * access to this page, then we could
1574 * avoid the pmap_disconnect() call.
1575 */
1576 if (m->pmapped)
1577 pmap_disconnect(m->phys_page);
1578
1579 assert(!m->cleaning);
1580
1581 /*
1582 * We no longer need the old page or object.
1583 */
1584 PAGE_WAKEUP_DONE(m);
1585 vm_object_paging_end(object);
1586 vm_object_unlock(object);
1587
1588 my_fault = DBG_COW_FAULT;
1589 VM_STAT_INCR(cow_faults);
1590 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1591 current_task()->cow_faults++;
1592
1593 object = first_object;
1594 offset = first_offset;
1595
1596 vm_object_lock(object);
1597 /*
1598 * get rid of the place holder
1599 * page that we soldered in earlier
1600 */
1601 VM_PAGE_FREE(first_m);
1602 first_m = VM_PAGE_NULL;
1603
1604 /*
1605 * and replace it with the
1606 * page we just copied into
1607 */
1608 assert(copy_m->busy);
1609 vm_page_insert(copy_m, object, offset);
1610 copy_m->dirty = TRUE;
1611
1612 m = copy_m;
1613 /*
1614 * Now that we've gotten the copy out of the
1615 * way, let's try to collapse the top object.
1616 * But we have to play ugly games with
1617 * paging_in_progress to do that...
1618 */
1619 vm_object_paging_end(object);
1620 vm_object_collapse(object, offset, TRUE);
1621 vm_object_paging_begin(object);
1622
1623 } else
1624 *protection &= (~VM_PROT_WRITE);
1625 }
1626 /*
1627 * Now check whether the page needs to be pushed into the
1628 * copy object. The use of asymmetric copy on write for
1629 * shared temporary objects means that we may do two copies to
1630 * satisfy the fault; one above to get the page from a
1631 * shadowed object, and one here to push it into the copy.
1632 */
1633 try_failed_count = 0;
1634
1635 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1636 vm_object_offset_t copy_offset;
1637 vm_page_t copy_m;
1638
1639 #if TRACEFAULTPAGE
1640 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1641 #endif
1642 /*
1643 * If the page is being written, but hasn't been
1644 * copied to the copy-object, we have to copy it there.
1645 */
1646 if ((fault_type & VM_PROT_WRITE) == 0) {
1647 *protection &= ~VM_PROT_WRITE;
1648 break;
1649 }
1650
1651 /*
1652 * If the page was guaranteed to be resident,
1653 * we must have already performed the copy.
1654 */
1655 if (must_be_resident)
1656 break;
1657
1658 /*
1659 * Try to get the lock on the copy_object.
1660 */
1661 if (!vm_object_lock_try(copy_object)) {
1662
1663 vm_object_unlock(object);
1664 try_failed_count++;
1665
1666 mutex_pause(try_failed_count); /* wait a bit */
1667 vm_object_lock(object);
1668
1669 continue;
1670 }
1671 try_failed_count = 0;
1672
1673 /*
1674 * Make another reference to the copy-object,
1675 * to keep it from disappearing during the
1676 * copy.
1677 */
1678 vm_object_reference_locked(copy_object);
1679
1680 /*
1681 * Does the page exist in the copy?
1682 */
1683 copy_offset = first_offset - copy_object->shadow_offset;
1684
1685 if (copy_object->size <= copy_offset)
1686 /*
1687 * Copy object doesn't cover this page -- do nothing.
1688 */
1689 ;
1690 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1691 /*
1692 * Page currently exists in the copy object
1693 */
1694 if (copy_m->busy) {
1695 /*
1696 * If the page is being brought
1697 * in, wait for it and then retry.
1698 */
1699 RELEASE_PAGE(m);
1700
1701 /*
1702 * take an extra ref so object won't die
1703 */
1704 vm_object_reference_locked(copy_object);
1705 vm_object_unlock(copy_object);
1706 vm_fault_cleanup(object, first_m);
1707 counter(c_vm_fault_page_block_backoff_kernel++);
1708
1709 vm_object_lock(copy_object);
1710 assert(copy_object->ref_count > 0);
1711 VM_OBJ_RES_DECR(copy_object);
1712 vm_object_lock_assert_exclusive(copy_object);
1713 copy_object->ref_count--;
1714 assert(copy_object->ref_count > 0);
1715 copy_m = vm_page_lookup(copy_object, copy_offset);
1716 /*
1717 * ENCRYPTED SWAP:
1718 * it's OK if the "copy_m" page is encrypted,
1719 * because we're not moving it nor handling its
1720 * contents.
1721 */
1722 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1723 PAGE_ASSERT_WAIT(copy_m, interruptible);
1724
1725 vm_object_unlock(copy_object);
1726 wait_result = thread_block(THREAD_CONTINUE_NULL);
1727 vm_object_deallocate(copy_object);
1728
1729 goto backoff;
1730 } else {
1731 vm_object_unlock(copy_object);
1732 vm_object_deallocate(copy_object);
1733 thread_interrupt_level(interruptible_state);
1734
1735 return (VM_FAULT_RETRY);
1736 }
1737 }
1738 }
1739 else if (!PAGED_OUT(copy_object, copy_offset)) {
1740 /*
1741 * If PAGED_OUT is TRUE, then the page used to exist
1742 * in the copy-object, and has already been paged out.
1743 * We don't need to repeat this. If PAGED_OUT is
1744 * FALSE, then either we don't know (!pager_created,
1745 * for example) or it hasn't been paged out.
1746 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1747 * We must copy the page to the copy object.
1748 */
1749
1750 if (vm_backing_store_low) {
1751 /*
1752 	 * We are protecting the system from
1753 	 * backing store exhaustion:
1754 	 * sleep unless we are privileged.
1755 */
1756 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1757 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1758
1759 RELEASE_PAGE(m);
1760 VM_OBJ_RES_DECR(copy_object);
1761 vm_object_lock_assert_exclusive(copy_object);
1762 copy_object->ref_count--;
1763 assert(copy_object->ref_count > 0);
1764
1765 vm_object_unlock(copy_object);
1766 vm_fault_cleanup(object, first_m);
1767 thread_block(THREAD_CONTINUE_NULL);
1768 thread_interrupt_level(interruptible_state);
1769
1770 return (VM_FAULT_RETRY);
1771 }
1772 }
1773 /*
1774 * Allocate a page for the copy
1775 */
1776 copy_m = vm_page_alloc(copy_object, copy_offset);
1777
1778 if (copy_m == VM_PAGE_NULL) {
1779 RELEASE_PAGE(m);
1780
1781 VM_OBJ_RES_DECR(copy_object);
1782 vm_object_lock_assert_exclusive(copy_object);
1783 copy_object->ref_count--;
1784 assert(copy_object->ref_count > 0);
1785
1786 vm_object_unlock(copy_object);
1787 vm_fault_cleanup(object, first_m);
1788 thread_interrupt_level(interruptible_state);
1789
1790 return (VM_FAULT_MEMORY_SHORTAGE);
1791 }
1792 /*
1793 * Must copy page into copy-object.
1794 */
1795 vm_page_copy(m, copy_m);
1796
1797 /*
1798 * If the old page was in use by any users
1799 * of the copy-object, it must be removed
1800 * from all pmaps. (We can't know which
1801 * pmaps use it.)
1802 */
1803 if (m->pmapped)
1804 pmap_disconnect(m->phys_page);
1805
1806 /*
1807 * If there's a pager, then immediately
1808 * page out this page, using the "initialize"
1809 * option. Else, we use the copy.
1810 */
1811 if ((!copy_object->pager_created)
1812 #if MACH_PAGEMAP
1813 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1814 #endif
1815 ) {
1816
1817 vm_page_lockspin_queues();
1818 assert(!m->cleaning);
1819 vm_page_activate(copy_m);
1820 vm_page_unlock_queues();
1821
1822 copy_m->dirty = TRUE;
1823 PAGE_WAKEUP_DONE(copy_m);
1824 }
1825 else {
1826 assert(copy_m->busy == TRUE);
1827 assert(!m->cleaning);
1828
1829 /*
1830 * dirty is protected by the object lock
1831 */
1832 copy_m->dirty = TRUE;
1833
1834 /*
1835 * The page is already ready for pageout:
1836 * not on pageout queues and busy.
1837 * Unlock everything except the
1838 * copy_object itself.
1839 */
1840 vm_object_unlock(object);
1841
1842 /*
1843 * Write the page to the copy-object,
1844 * flushing it from the kernel.
1845 */
1846 vm_pageout_initialize_page(copy_m);
1847
1848 /*
1849 * Since the pageout may have
1850 * temporarily dropped the
1851 * copy_object's lock, we
1852 * check whether we'll have
1853 * to deallocate the hard way.
1854 */
1855 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1856 vm_object_unlock(copy_object);
1857 vm_object_deallocate(copy_object);
1858 vm_object_lock(object);
1859
1860 continue;
1861 }
1862 /*
1863 * Pick back up the old object's
1864 * lock. [It is safe to do so,
1865 * since it must be deeper in the
1866 * object tree.]
1867 */
1868 vm_object_lock(object);
1869 }
1870 /*
1871 * Because we're pushing a page upward
1872 * in the object tree, we must restart
1873 * any faults that are waiting here.
1874 * [Note that this is an expansion of
1875 * PAGE_WAKEUP that uses the THREAD_RESTART
1876 * wait result]. Can't turn off the page's
1877 * busy bit because we're not done with it.
1878 */
1879 if (m->wanted) {
1880 m->wanted = FALSE;
1881 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1882 }
1883 }
1884 /*
1885 * The reference count on copy_object must be
1886 * at least 2: one for our extra reference,
1887 * and at least one from the outside world
1888 * (we checked that when we last locked
1889 * copy_object).
1890 */
1891 vm_object_lock_assert_exclusive(copy_object);
1892 copy_object->ref_count--;
1893 assert(copy_object->ref_count > 0);
1894
1895 VM_OBJ_RES_DECR(copy_object);
1896 vm_object_unlock(copy_object);
1897
1898 break;
1899 }
1900 *result_page = m;
1901 *top_page = first_m;
1902
1903 XPR(XPR_VM_FAULT,
1904 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1905 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1906
1907 if (m != VM_PAGE_NULL) {
1908 if (my_fault == DBG_PAGEIN_FAULT) {
1909
1910 VM_STAT_INCR(pageins);
1911 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1912 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1913 current_task()->pageins++;
1914
1915 if (m->object->internal) {
1916 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1917 } else {
1918 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1919 }
1920
1921 /*
1922 * evaluate access pattern and update state
1923 * vm_fault_deactivate_behind depends on the
1924 * state being up to date
1925 */
1926 vm_fault_is_sequential(object, offset, fault_info->behavior);
1927
1928 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1929 }
1930 if (type_of_fault)
1931 *type_of_fault = my_fault;
1932 } else
1933 vm_object_unlock(object);
1934
1935 thread_interrupt_level(interruptible_state);
1936
1937 #if TRACEFAULTPAGE
1938 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1939 #endif
1940 return (VM_FAULT_SUCCESS);
1941
1942 backoff:
1943 thread_interrupt_level(interruptible_state);
1944
1945 if (wait_result == THREAD_INTERRUPTED)
1946 return (VM_FAULT_INTERRUPTED);
1947 return (VM_FAULT_RETRY);
1948
1949 #undef RELEASE_PAGE
1950 }
1951
1952
1953
1954 /*
1955 * page queue lock must NOT be held
1956 * m->object must be locked
1957 *
1958 * NOTE: m->object could be locked "shared" only if we are called
1959 * from vm_fault() as part of a soft fault. If so, we must be
1960 * careful not to modify the VM object in any way that is not
1961 * legal under a shared lock...
1962 */
1963 unsigned long cs_enter_tainted_rejected = 0;
1964 unsigned long cs_enter_tainted_accepted = 0;
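/*
 * vm_fault_enter:
 *
 * Enter the resolved page "m" into the given pmap at "vaddr".
 * Along the way this validates the page against its code signature
 * if needed, synchronizes the data/instruction caches the first time
 * the page is mapped anywhere, and adjusts the page queues (wire,
 * unwire, activate or mark speculative) according to the fault type.
 */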
1965 kern_return_t
1966 vm_fault_enter(vm_page_t m,
1967 pmap_t pmap,
1968 vm_map_offset_t vaddr,
1969 vm_prot_t prot,
1970 boolean_t wired,
1971 boolean_t change_wiring,
1972 boolean_t no_cache,
1973 int *type_of_fault)
1974 {
1975 unsigned int cache_attr;
1976 kern_return_t kr;
1977 boolean_t previously_pmapped = m->pmapped;
1978
1979 vm_object_lock_assert_held(m->object);
1980 #if DEBUG
1981 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
1982 #endif /* DEBUG */
1983
1984 if (m->phys_page == vm_page_guard_addr) {
1985 assert(m->fictitious);
1986 return KERN_SUCCESS;
1987 }
1988
1989 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
1990
1991 if (m->object->code_signed && !m->cs_validated &&
1992 pmap != kernel_pmap) {
1993 /*
1994 * CODE SIGNING:
1995 * This page comes from a VM object backed by a
1996 * signed memory object and it hasn't been validated yet.
1997 * We're about to enter it into a process address space,
1998 * so we need to validate its signature now.
1999 */
2000 vm_object_lock_assert_exclusive(m->object);
2001
2002 /* VM map still locked, so 1 ref will remain on VM object */
2003
2004 vm_page_validate_cs(m);
2005 }
2006
2007 if (m->pmapped == FALSE) {
2008 /*
2009 * This is the first time this page is being
2010 * mapped in an address space (pmapped == FALSE).
2011 *
2012 * Part of that page may still be in the data cache
2013 * and not flushed to memory. In case we end up
2014 * accessing that page via the instruction cache,
2015 * we need to ensure that the 2 caches are in sync.
2016 */
2017 pmap_sync_page_data_phys(m->phys_page);
2018
2019 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2020 /*
2021 * found it in the cache, but this
2022 * is the first fault-in of the page (m->pmapped == FALSE)
2023 * so it must have come in as part of
2024 * a cluster... account 1 pagein against it
2025 */
2026 VM_STAT_INCR(pageins);
2027 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2028
2029 if (m->object->internal) {
2030 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2031 } else {
2032 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2033 }
2034
2035 current_task()->pageins++;
2036
2037 *type_of_fault = DBG_PAGEIN_FAULT;
2038 }
2039 VM_PAGE_CONSUME_CLUSTERED(m);
2040
2041 } else if (cache_attr != VM_WIMG_DEFAULT)
2042 pmap_sync_page_attributes_phys(m->phys_page);
2043
2044 if (*type_of_fault != DBG_COW_FAULT) {
2045 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2046
2047 if (pmap == kernel_pmap) {
2048 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2049 }
2050 }
2051
2052 if (m->cs_tainted) {
2053 /*
2054 * CODE SIGNING:
2055 * This page has been tainted and can not be trusted.
2056 * Let's notify the current process and let it take any
2057 * necessary precautions before we enter the tainted page
2058 * into its address space.
2059 */
2060 if (cs_invalid_page()) {
2061 /* reject the tainted page: abort the page fault */
2062 kr = KERN_MEMORY_ERROR;
2063 cs_enter_tainted_rejected++;
2064 } else {
2065 /* proceed with the tainted page */
2066 kr = KERN_SUCCESS;
2067 cs_enter_tainted_accepted++;
2068 }
2069 if (cs_debug || kr != KERN_SUCCESS) {
2070 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2071 "page %p obj %p off 0x%llx *** TAINTED ***\n",
2072 (long long)vaddr, m, m->object, m->offset);
2073 }
2074 } else {
2075 /* proceed with the valid page */
2076 kr = KERN_SUCCESS;
2077 }
2078
2079 if (kr == KERN_SUCCESS) {
2080 /*
2081 * NOTE: we may only hold the vm_object lock SHARED
2082 * at this point, but the update of pmapped is ok
2083 * since this is the ONLY bit updated behind the SHARED
2084 * lock... however, we need to figure out how to do an atomic
2085 * update on a bit field to make this less fragile... right
2086 * now I don't know how to coerce 'C' to give me the offset info
2087 * that's needed for an AtomicCompareAndSwap
2088 */
2089 m->pmapped = TRUE;
2090
2091 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2092 }
2093
2094 /*
2095 * Hold queues lock to manipulate
2096 * the page queues. Change wiring
2097 * case is obvious.
2098 */
2099 if (change_wiring) {
2100 vm_page_lockspin_queues();
2101
2102 if (wired) {
2103 if (kr == KERN_SUCCESS) {
2104 vm_page_wire(m);
2105 }
2106 } else {
2107 vm_page_unwire(m);
2108 }
2109 vm_page_unlock_queues();
2110
2111 } else {
2112 if (kr != KERN_SUCCESS) {
2113 vm_page_lock_queues();
2114 vm_page_deactivate(m);
2115 vm_page_unlock_queues();
2116 } else {
2117 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2118 vm_page_lockspin_queues();
2119 /*
2120 * test again now that we hold the page queue lock
2121 */
2122 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2123
2124 /*
2125 * If this is a no_cache mapping and the page has never been
2126 * mapped before or was previously a no_cache page, then we
2127 * want to leave pages in the speculative state so that they
2128 * can be readily recycled if free memory runs low. Otherwise
2129 * the page is activated as normal.
2130 */
2131
2132 if (no_cache && (!previously_pmapped || m->no_cache)) {
2133 m->no_cache = TRUE;
2134
2135 if (m->active || m->inactive)
2136 VM_PAGE_QUEUES_REMOVE(m);
2137
2138 if (!m->speculative)
2139 vm_page_speculate(m, TRUE);
2140
2141 } else if (!m->active && !m->inactive)
2142 vm_page_activate(m);
2143
2144 }
2145
2146 vm_page_unlock_queues();
2147 }
2148 }
2149 }
2150 return kr;
2151 }
2152
2153
2154 /*
2155 * Routine: vm_fault
2156 * Purpose:
2157 * Handle page faults, including pseudo-faults
2158 * used to change the wiring status of pages.
2159 * Returns:
2160 * Explicit continuations have been removed.
2161 * Implementation:
2162 * vm_fault and vm_fault_page save mucho state
2163 * in the moral equivalent of a closure. The state
2164 * structure is allocated when first entering vm_fault
2165 * and deallocated when leaving vm_fault.
2166 */
2167
2168 extern int _map_enter_debug;
2169
2170 unsigned long vm_fault_collapse_total = 0;
2171 unsigned long vm_fault_collapse_skipped = 0;
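/*
 * vm_fault_collapse_total counts every copy-on-write fault taken in the
 * fast path below; vm_fault_collapse_skipped counts the ones where the
 * shadow-chain collapse was deliberately skipped.
 */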
2172
2173 kern_return_t
2174 vm_fault(
2175 vm_map_t map,
2176 vm_map_offset_t vaddr,
2177 vm_prot_t fault_type,
2178 boolean_t change_wiring,
2179 int interruptible,
2180 pmap_t caller_pmap,
2181 vm_map_offset_t caller_pmap_addr)
2182 {
2183 vm_map_version_t version; /* Map version for verification */
2184 boolean_t wired; /* Should mapping be wired down? */
2185 vm_object_t object; /* Top-level object */
2186 vm_object_offset_t offset; /* Top-level offset */
2187 vm_prot_t prot; /* Protection for mapping */
2188 vm_object_t old_copy_object; /* Saved copy object */
2189 vm_page_t result_page; /* Result of vm_fault_page */
2190 vm_page_t top_page; /* Placeholder page */
2191 kern_return_t kr;
2192
2193 vm_page_t m; /* Fast access to result_page */
2194 kern_return_t error_code;
2195 vm_object_t cur_object;
2196 vm_object_offset_t cur_offset;
2197 vm_page_t cur_m;
2198 vm_object_t new_object;
2199 int type_of_fault;
2200 pmap_t pmap;
2201 boolean_t interruptible_state;
2202 vm_map_t real_map = map;
2203 vm_map_t original_map = map;
2204 vm_prot_t original_fault_type;
2205 struct vm_object_fault_info fault_info;
2206 boolean_t need_collapse = FALSE;
2207 int object_lock_type = 0;
2208 int cur_object_lock_type;
2209
2210
2211 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2212 (int)((uint64_t)vaddr >> 32),
2213 (int)vaddr,
2214 0,
2215 0,
2216 0);
2217
2218 if (get_preemption_level() != 0) {
2219 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2220 (int)((uint64_t)vaddr >> 32),
2221 (int)vaddr,
2222 KERN_FAILURE,
2223 0,
2224 0);
2225
2226 return (KERN_FAILURE);
2227 }
2228 interruptible_state = thread_interrupt_level(interruptible);
2229
2230 VM_STAT_INCR(faults);
2231 current_task()->faults++;
2232 original_fault_type = fault_type;
2233
2234 if (fault_type & VM_PROT_WRITE)
2235 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2236 else
2237 object_lock_type = OBJECT_LOCK_SHARED;
2238
2239 cur_object_lock_type = OBJECT_LOCK_SHARED;
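/*
 * The top object is locked exclusively for write faults and shared
 * otherwise; objects further down the shadow chain start out shared
 * and are upgraded only when they actually need to be modified.
 */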
2240
2241 RetryFault:
2242 /*
2243 * assume we will hit a page in the cache;
2244 * otherwise, explicitly override with
2245 * the real fault type once we determine it
2246 */
2247 type_of_fault = DBG_CACHE_HIT_FAULT;
2248
2249 /*
2250 * Find the backing store object and offset into
2251 * it to begin the search.
2252 */
2253 fault_type = original_fault_type;
2254 map = original_map;
2255 vm_map_lock_read(map);
2256
2257 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2258 object_lock_type, &version,
2259 &object, &offset, &prot, &wired,
2260 &fault_info,
2261 &real_map);
2262
2263 if (kr != KERN_SUCCESS) {
2264 vm_map_unlock_read(map);
2265 goto done;
2266 }
2267 pmap = real_map->pmap;
2268 fault_info.interruptible = interruptible;
2269
2270 /*
2271 * If the page is wired, we must fault for the current protection
2272 * value, to avoid further faults.
2273 */
2274 if (wired) {
2275 fault_type = prot | VM_PROT_WRITE;
2276
2277 /*
2278 * since we're treating this fault as a 'write'
2279 * we must hold the top object lock exclusively
2280 */
2281 if (object_lock_type == OBJECT_LOCK_SHARED) {
2282
2283 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2284
2285 if (vm_object_lock_upgrade(object) == FALSE) {
2286 /*
2287 * couldn't upgrade, so explicitly
2288 * take the lock exclusively
2289 */
2290 vm_object_lock(object);
2291 }
2292 }
2293 }
2294
2295 #if VM_FAULT_CLASSIFY
2296 /*
2297 * Temporary data gathering code
2298 */
2299 vm_fault_classify(object, offset, fault_type);
2300 #endif
2301 /*
2302 * Fast fault code. The basic idea is to do as much as
2303 * possible while holding the map lock and object locks.
2304 * Busy pages are not used until the object lock has to
2305 * be dropped to do something (copy, zero fill, pmap enter).
2306 * Similarly, paging references aren't acquired until that
2307 * point, and object references aren't used.
2308 *
2309 * If we can figure out what to do
2310 * (zero fill, copy on write, pmap enter) while holding
2311 * the locks, then it gets done. Otherwise, we give up,
2312 * and use the original fault path (which doesn't hold
2313 * the map lock, and relies on busy pages).
2314 * The give up cases include:
2315 * - Have to talk to pager.
2316 * - Page is busy, absent or in error.
2317 * - Pager has locked out desired access.
2318 * - Fault needs to be restarted.
2319 * - Have to push page into copy object.
2320 *
2321 * The code is an infinite loop that moves one level down
2322 * the shadow chain each time. cur_object and cur_offset
2323 * refer to the current object being examined. object and offset
2324 * are the original object from the map. The loop is at the
2325 * top level if and only if object and cur_object are the same.
2326 *
2327 * Invariants: Map lock is held throughout. Lock is held on
2328 * original object and cur_object (if different) when
2329 * continuing or exiting loop.
2330 *
2331 */
2332
2333
2334 /*
2335 * If this page is to be inserted in a copy delay object
2336 * for writing, and if the object has a copy, then the
2337 * copy delay strategy is handled by the slow path in vm_fault_page.
2338 */
2339 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2340 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2341 goto handle_copy_delay;
2342
2343 cur_object = object;
2344 cur_offset = offset;
2345
2346 while (TRUE) {
2347 m = vm_page_lookup(cur_object, cur_offset);
2348
2349 if (m != VM_PAGE_NULL) {
2350 if (m->busy) {
2351 wait_result_t result;
2352
2353 /*
2354 * in order to do the PAGE_ASSERT_WAIT, we must
2355 * have the object that 'm' belongs to locked exclusively
2356 */
2357 if (object != cur_object) {
2358 vm_object_unlock(object);
2359
2360 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2361
2362 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2363
2364 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2365 /*
2366 * couldn't upgrade so go do a full retry
2367 * immediately since we've already dropped
2368 * the top object lock associated with this page
2369 * and the current one got dropped due to the
2370 * failed upgrade... the state is no longer valid
2371 */
2372 vm_map_unlock_read(map);
2373 if (real_map != map)
2374 vm_map_unlock(real_map);
2375
2376 goto RetryFault;
2377 }
2378 }
2379 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2380
2381 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2382
2383 if (vm_object_lock_upgrade(object) == FALSE) {
2384 /*
2385 * couldn't upgrade, so explicitly take the lock
2386 * exclusively and go relookup the page since we
2387 * will have dropped the object lock and
2388 * a different thread could have inserted
2389 * a page at this offset
2390 * no need for a full retry since we're
2391 * at the top level of the object chain
2392 */
2393 vm_object_lock(object);
2394
2395 continue;
2396 }
2397 }
2398 vm_map_unlock_read(map);
2399 if (real_map != map)
2400 vm_map_unlock(real_map);
2401
2402 result = PAGE_ASSERT_WAIT(m, interruptible);
2403
2404 vm_object_unlock(cur_object);
2405
2406 if (result == THREAD_WAITING) {
2407 result = thread_block(THREAD_CONTINUE_NULL);
2408
2409 counter(c_vm_fault_page_block_busy_kernel++);
2410 }
2411 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2412 goto RetryFault;
2413
2414 kr = KERN_ABORTED;
2415 goto done;
2416 }
2417 if (m->phys_page == vm_page_guard_addr) {
2418 /*
2419 * Guard page: let the slow path deal with it
2420 */
2421 break;
2422 }
2423 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2424 /*
2425 * Unusual case... let the slow path deal with it
2426 */
2427 break;
2428 }
2429 if (m->encrypted) {
2430 /*
2431 * ENCRYPTED SWAP:
2432 * We've soft-faulted (because it's not in the page
2433 * table) on an encrypted page.
2434 * Keep the page "busy" so that no one messes with
2435 * it during the decryption.
2436 * Release the extra locks we're holding, keep only
2437 * the page's VM object lock.
2438 *
2439 * in order to set 'busy' on 'm', we must
2440 * have the object that 'm' belongs to locked exclusively
2441 */
2442 if (object != cur_object) {
2443 vm_object_unlock(object);
2444
2445 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2446
2447 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2448
2449 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2450 /*
2451 * couldn't upgrade so go do a full retry
2452 * immediately since we've already dropped
2453 * the top object lock associated with this page
2454 * and the current one got dropped due to the
2455 * failed upgrade... the state is no longer valid
2456 */
2457 vm_map_unlock_read(map);
2458 if (real_map != map)
2459 vm_map_unlock(real_map);
2460
2461 goto RetryFault;
2462 }
2463 }
2464 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2465
2466 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2467
2468 if (vm_object_lock_upgrade(object) == FALSE) {
2469 /*
2470 * couldn't upgrade, so explicitly take the lock
2471 * exclusively and go relookup the page since we
2472 * will have dropped the object lock and
2473 * a different thread could have inserted
2474 * a page at this offset
2475 * no need for a full retry since we're
2476 * at the top level of the object chain
2477 */
2478 vm_object_lock(object);
2479
2480 continue;
2481 }
2482 }
2483 m->busy = TRUE;
2484
2485 vm_map_unlock_read(map);
2486 if (real_map != map)
2487 vm_map_unlock(real_map);
2488
2489 vm_page_decrypt(m, 0);
2490
2491 assert(m->busy);
2492 PAGE_WAKEUP_DONE(m);
2493
2494 vm_object_unlock(cur_object);
2495 /*
2496 * Retry from the top, in case anything
2497 * changed while we were decrypting...
2498 */
2499 goto RetryFault;
2500 }
2501 ASSERT_PAGE_DECRYPTED(m);
2502
2503 if (m->object->code_signed && !m->cs_validated) {
2504 /*
2505 * We will need to validate this page
2506 * against its code signature, so we
2507 * want to hold the VM object exclusively.
2508 */
2509 if (object != cur_object) {
2510 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2511 vm_object_unlock(object);
2512 vm_object_unlock(cur_object);
2513
2514 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2515
2516 vm_map_unlock_read(map);
2517 if (real_map != map)
2518 vm_map_unlock(real_map);
2519
2520 goto RetryFault;
2521 }
2522
2523 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2524
2525 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2526
2527 if (vm_object_lock_upgrade(object) == FALSE) {
2528 /*
2529 * couldn't upgrade, so explicitly take the lock
2530 * exclusively and go relookup the page since we
2531 * will have dropped the object lock and
2532 * a different thread could have inserted
2533 * a page at this offset
2534 * no need for a full retry since we're
2535 * at the top level of the object chain
2536 */
2537 vm_object_lock(object);
2538
2539 continue;
2540 }
2541 }
2542 }
2543 /*
2544 * Two cases of map in faults:
2545 * - At top level w/o copy object.
2546 * - Read fault anywhere.
2547 * --> must disallow write.
2548 */
2549
2550 if (object == cur_object && object->copy == VM_OBJECT_NULL)
2551 goto FastPmapEnter;
2552
2553 if ((fault_type & VM_PROT_WRITE) == 0) {
2554
2555 prot &= ~VM_PROT_WRITE;
2556
2557 /*
2558 * Set up to map the page...
2559 * mark the page busy, drop
2560 * unneeded object lock
2561 */
2562 if (object != cur_object) {
2563 /*
2564 * don't need the original object anymore
2565 */
2566 vm_object_unlock(object);
2567
2568 /*
2569 * switch to the object that has the new page
2570 */
2571 object = cur_object;
2572 object_lock_type = cur_object_lock_type;
2573 }
2574 FastPmapEnter:
2575 /*
2576 * prepare for the pmap_enter...
2577 * object and map are both locked
2578 * m contains valid data
2579 * object == m->object
2580 * cur_object == NULL or it's been unlocked
2581 * no paging references on either object or cur_object
2582 */
2583 #if MACH_KDB
2584 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2585 prot &= ~VM_PROT_WRITE;
2586 #endif
2587 if (caller_pmap) {
2588 kr = vm_fault_enter(m,
2589 caller_pmap,
2590 caller_pmap_addr,
2591 prot,
2592 wired,
2593 change_wiring,
2594 fault_info.no_cache,
2595 &type_of_fault);
2596 } else {
2597 kr = vm_fault_enter(m,
2598 pmap,
2599 vaddr,
2600 prot,
2601 wired,
2602 change_wiring,
2603 fault_info.no_cache,
2604 &type_of_fault);
2605 }
2606
2607 if (need_collapse == TRUE)
2608 vm_object_collapse(object, offset, TRUE);
2609
2610 if (type_of_fault == DBG_PAGEIN_FAULT) {
2611 /*
2612 * evaluate access pattern and update state
2613 * vm_fault_deactivate_behind depends on the
2614 * state being up to date
2615 */
2616 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2617
2618 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2619 }
2620 /*
2621 * That's it, clean up and return.
2622 */
2623 if (m->busy)
2624 PAGE_WAKEUP_DONE(m);
2625
2626 vm_object_unlock(object);
2627
2628 vm_map_unlock_read(map);
2629 if (real_map != map)
2630 vm_map_unlock(real_map);
2631
2632 goto done;
2633 }
2634 /*
2635 * COPY ON WRITE FAULT
2636 *
2637 * If objects match, then
2638 * object->copy must not be NULL (else control
2639 * would be in the previous code block), and we
2640 * have a potential push into the copy object
2641 * with which we can't cope here.
2642 */
2643 if (cur_object == object) {
2644 /*
2645 * must take the slow path to
2646 * deal with the copy push
2647 */
2648 break;
2649 }
2650 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2651
2652 /*
2653 * This is now a shadow based copy on write
2654 * fault -- it requires a copy up the shadow
2655 * chain.
2656 *
2657 * Allocate a page in the original top level
2658 * object. Give up if allocate fails. Also
2659 * need to remember current page, as it's the
2660 * source of the copy.
2661 *
2662 * at this point we hold locks on both
2663 * object and cur_object... no need to take
2664 * paging refs or mark pages BUSY since
2665 * we don't drop either object lock until
2666 * the page has been copied and inserted
2667 */
2668 cur_m = m;
2669 m = vm_page_grab();
2670
2671 if (m == VM_PAGE_NULL) {
2672 /*
2673 * no free page currently available...
2674 * must take the slow path
2675 */
2676 break;
2677 }
2678 /*
2679 * Now do the copy. Mark the source page busy...
2680 *
2681 * NOTE: This code holds the map lock across
2682 * the page copy.
2683 */
2684 vm_page_copy(cur_m, m);
2685 vm_page_insert(m, object, offset);
2686 m->dirty = TRUE;
2687
2688 /*
2689 * Now cope with the source page and object
2690 */
2691 if (object->ref_count > 1 && cur_m->pmapped)
2692 pmap_disconnect(cur_m->phys_page);
2693
2694 need_collapse = TRUE;
2695
2696 if (!cur_object->internal &&
2697 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2698 /*
2699 * The object from which we've just
2700 * copied a page is most probably backed
2701 * by a vnode. We don't want to waste too
2702 * much time trying to collapse the VM objects
2703 * and create a bottleneck when several tasks
2704 * map the same file.
2705 */
2706 if (cur_object->copy == object) {
2707 /*
2708 * Shared mapping or no COW yet.
2709 * We can never collapse a copy
2710 * object into its backing object.
2711 */
2712 need_collapse = FALSE;
2713 } else if (cur_object->copy == object->shadow &&
2714 object->shadow->resident_page_count == 0) {
2715 /*
2716 * Shared mapping after a COW occurred.
2717 */
2718 need_collapse = FALSE;
2719 }
2720 }
2721 vm_object_unlock(cur_object);
2722
2723 if (need_collapse == FALSE)
2724 vm_fault_collapse_skipped++;
2725 vm_fault_collapse_total++;
2726
2727 type_of_fault = DBG_COW_FAULT;
2728 VM_STAT_INCR(cow_faults);
2729 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2730 current_task()->cow_faults++;
2731
2732 goto FastPmapEnter;
2733
2734 } else {
2735 /*
2736 * No page at cur_object, cur_offset... m == NULL
2737 */
2738 if (cur_object->pager_created) {
2739 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2740 /*
2741 * May have to talk to a pager...
2742 * take the slow path.
2743 */
2744 break;
2745 }
2746 /*
2747 * existence map present and indicates
2748 * that the pager doesn't have this page
2749 */
2750 }
2751 if (cur_object->shadow == VM_OBJECT_NULL) {
2752 /*
2753 * Zero fill fault. Page gets
2754 * inserted into the original object.
2755 */
2756 if (cur_object->shadow_severed) {
2757
2758 if (object != cur_object)
2759 vm_object_unlock(cur_object);
2760 vm_object_unlock(object);
2761
2762 vm_map_unlock_read(map);
2763 if (real_map != map)
2764 vm_map_unlock(real_map);
2765
2766 kr = KERN_MEMORY_ERROR;
2767 goto done;
2768 }
2769 if (VM_PAGE_ZFILL_THROTTLED()) {
2770 /*
2771 * drop all of our locks...
2772 * wait until the free queue is
2773 * pumped back up and then
2774 * redrive the fault
2775 */
2776 if (object != cur_object)
2777 vm_object_unlock(cur_object);
2778 vm_object_unlock(object);
2779 vm_map_unlock_read(map);
2780 if (real_map != map)
2781 vm_map_unlock(real_map);
2782
2783 if (vm_page_wait((change_wiring) ?
2784 THREAD_UNINT :
2785 THREAD_ABORTSAFE))
2786 goto RetryFault;
2787
2788 kr = KERN_ABORTED;
2789 goto done;
2790 }
2791 if (vm_backing_store_low) {
2792 /*
2793 * we are protecting the system from
2794 * backing store exhaustion...
2795 * must take the slow path if we're
2796 * not privileged
2797 */
2798 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2799 break;
2800 }
2801 if (cur_object != object) {
2802 vm_object_unlock(cur_object);
2803
2804 cur_object = object;
2805 }
2806 if (object_lock_type == OBJECT_LOCK_SHARED) {
2807
2808 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2809
2810 if (vm_object_lock_upgrade(object) == FALSE) {
2811 /*
2812 * couldn't upgrade so do a full retry on the fault
2813 * since we dropped the object lock which
2814 * could allow another thread to insert
2815 * a page at this offset
2816 */
2817 vm_map_unlock_read(map);
2818 if (real_map != map)
2819 vm_map_unlock(real_map);
2820
2821 goto RetryFault;
2822 }
2823 }
2824 m = vm_page_alloc(object, offset);
2825
2826 if (m == VM_PAGE_NULL) {
2827 /*
2828 * no free page currently available...
2829 * must take the slow path
2830 */
2831 break;
2832 }
2833
2834 /*
2835 * Now zero fill page...
2836 * the page is probably going to
2837 * be written soon, so don't bother
2838 * to clear the modified bit
2839 *
2840 * NOTE: This code holds the map
2841 * lock across the zero fill.
2842 */
2843 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2844
2845 goto FastPmapEnter;
2846 }
2847 /*
2848 * On to the next level in the shadow chain
2849 */
2850 cur_offset += cur_object->shadow_offset;
2851 new_object = cur_object->shadow;
2852
2853 /*
2854 * take the new_object's lock with the indicated state
2855 */
2856 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2857 vm_object_lock_shared(new_object);
2858 else
2859 vm_object_lock(new_object);
2860
2861 if (cur_object != object)
2862 vm_object_unlock(cur_object);
2863
2864 cur_object = new_object;
2865
2866 continue;
2867 }
2868 }
2869 /*
2870 * Cleanup from fast fault failure. Drop any object
2871 * lock other than original and drop map lock.
2872 */
2873 if (object != cur_object)
2874 vm_object_unlock(cur_object);
2875
2876 /*
2877 * must own the object lock exclusively at this point
2878 */
2879 if (object_lock_type == OBJECT_LOCK_SHARED) {
2880 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2881
2882 if (vm_object_lock_upgrade(object) == FALSE) {
2883 /*
2884 * couldn't upgrade, so explicitly
2885 * take the lock exclusively
2886 * no need to retry the fault at this
2887 * point since "vm_fault_page" will
2888 * completely re-evaluate the state
2889 */
2890 vm_object_lock(object);
2891 }
2892 }
2893
2894 handle_copy_delay:
2895 vm_map_unlock_read(map);
2896 if (real_map != map)
2897 vm_map_unlock(real_map);
2898
2899 /*
2900 * Make a reference to this object to
2901 * prevent its disposal while we are messing with
2902 * it. Once we have the reference, the map is free
2903 * to be diddled. Since objects reference their
2904 * shadows (and copies), they will stay around as well.
2905 */
2906 vm_object_reference_locked(object);
2907 vm_object_paging_begin(object);
2908
2909 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2910
2911 error_code = 0;
2912
2913 kr = vm_fault_page(object, offset, fault_type,
2914 (change_wiring && !wired),
2915 &prot, &result_page, &top_page,
2916 &type_of_fault,
2917 &error_code, map->no_zero_fill,
2918 FALSE, &fault_info);
2919
2920 /*
2921 * if kr != VM_FAULT_SUCCESS, then the paging reference
2922 * has been dropped and the object unlocked... the ref_count
2923 * is still held
2924 *
2925 * if kr == VM_FAULT_SUCCESS, then the paging reference
2926 * is still held along with the ref_count on the original object
2927 *
2928 * if m != NULL, then the object it belongs to
2929 * is returned locked with a paging reference
2930 *
2931 * if top_page != NULL, then it's BUSY and the
2932 * object it belongs to has a paging reference
2933 * but is returned unlocked
2934 */
2935 if (kr != VM_FAULT_SUCCESS) {
2936 /*
2937 * we didn't succeed, lose the object reference immediately.
2938 */
2939 vm_object_deallocate(object);
2940
2941 /*
2942 * See why we failed, and take corrective action.
2943 */
2944 switch (kr) {
2945 case VM_FAULT_MEMORY_SHORTAGE:
2946 if (vm_page_wait((change_wiring) ?
2947 THREAD_UNINT :
2948 THREAD_ABORTSAFE))
2949 goto RetryFault;
2950 /*
2951 * fall thru
2952 */
2953 case VM_FAULT_INTERRUPTED:
2954 kr = KERN_ABORTED;
2955 goto done;
2956 case VM_FAULT_RETRY:
2957 goto RetryFault;
2958 case VM_FAULT_MEMORY_ERROR:
2959 if (error_code)
2960 kr = error_code;
2961 else
2962 kr = KERN_MEMORY_ERROR;
2963 goto done;
2964 }
2965 }
2966 m = result_page;
2967
2968 if (m != VM_PAGE_NULL) {
2969 assert((change_wiring && !wired) ?
2970 (top_page == VM_PAGE_NULL) :
2971 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2972 }
2973
2974 /*
2975 * What to do with the resulting page from vm_fault_page
2976 * if it doesn't get entered into the physical map:
2977 */
2978 #define RELEASE_PAGE(m) \
2979 MACRO_BEGIN \
2980 PAGE_WAKEUP_DONE(m); \
2981 vm_page_lockspin_queues(); \
2982 if (!m->active && !m->inactive && !m->throttled)\
2983 vm_page_activate(m); \
2984 vm_page_unlock_queues(); \
2985 MACRO_END
2986
2987 /*
2988 * We must verify that the maps have not changed
2989 * since our last lookup.
2990 */
2991 if (m != VM_PAGE_NULL) {
2992 old_copy_object = m->object->copy;
2993 vm_object_unlock(m->object);
2994 } else
2995 old_copy_object = VM_OBJECT_NULL;
2996
2997 /*
2998 * no object locks are held at this point
2999 */
3000 if ((map != original_map) || !vm_map_verify(map, &version)) {
3001 vm_object_t retry_object;
3002 vm_object_offset_t retry_offset;
3003 vm_prot_t retry_prot;
3004
3005 /*
3006 * To avoid trying to write_lock the map while another
3007 * thread has it read_locked (in vm_map_pageable), we
3008 * do not try for write permission. If the page is
3009 * still writable, we will get write permission. If it
3010 * is not, or has been marked needs_copy, we enter the
3011 * mapping without write permission, and will merely
3012 * take another fault.
3013 */
3014 map = original_map;
3015 vm_map_lock_read(map);
3016
3017 kr = vm_map_lookup_locked(&map, vaddr,
3018 fault_type & ~VM_PROT_WRITE,
3019 OBJECT_LOCK_EXCLUSIVE, &version,
3020 &retry_object, &retry_offset, &retry_prot,
3021 &wired,
3022 &fault_info,
3023 &real_map);
3024 pmap = real_map->pmap;
3025
3026 if (kr != KERN_SUCCESS) {
3027 vm_map_unlock_read(map);
3028
3029 if (m != VM_PAGE_NULL) {
3030 /*
3031 * retake the lock so that
3032 * we can drop the paging reference
3033 * in vm_fault_cleanup and do the
3034 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3035 */
3036 vm_object_lock(m->object);
3037
3038 RELEASE_PAGE(m);
3039
3040 vm_fault_cleanup(m->object, top_page);
3041 } else {
3042 /*
3043 * retake the lock so that
3044 * we can drop the paging reference
3045 * in vm_fault_cleanup
3046 */
3047 vm_object_lock(object);
3048
3049 vm_fault_cleanup(object, top_page);
3050 }
3051 vm_object_deallocate(object);
3052
3053 goto done;
3054 }
3055 vm_object_unlock(retry_object);
3056
3057 if ((retry_object != object) || (retry_offset != offset)) {
3058
3059 vm_map_unlock_read(map);
3060 if (real_map != map)
3061 vm_map_unlock(real_map);
3062
3063 if (m != VM_PAGE_NULL) {
3064 /*
3065 * retake the lock so that
3066 * we can drop the paging reference
3067 * in vm_fault_cleanup and do the
3068 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3069 */
3070 vm_object_lock(m->object);
3071
3072 RELEASE_PAGE(m);
3073
3074 vm_fault_cleanup(m->object, top_page);
3075 } else {
3076 /*
3077 * retake the lock so that
3078 * we can drop the paging reference
3079 * in vm_fault_cleanup
3080 */
3081 vm_object_lock(object);
3082
3083 vm_fault_cleanup(object, top_page);
3084 }
3085 vm_object_deallocate(object);
3086
3087 goto RetryFault;
3088 }
3089 /*
3090 * Check whether the protection has changed or the object
3091 * has been copied while we left the map unlocked.
3092 */
3093 prot &= retry_prot;
3094 }
3095 if (m != VM_PAGE_NULL) {
3096 vm_object_lock(m->object);
3097
3098 if (m->object->copy != old_copy_object) {
3099 /*
3100 * The copy object changed while the top-level object
3101 * was unlocked, so take away write permission.
3102 */
3103 prot &= ~VM_PROT_WRITE;
3104 }
3105 } else
3106 vm_object_lock(object);
3107
3108 /*
3109 * If we want to wire down this page, but no longer have
3110 * adequate permissions, we must start all over.
3111 */
3112 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3113
3114 vm_map_verify_done(map, &version);
3115 if (real_map != map)
3116 vm_map_unlock(real_map);
3117
3118 if (m != VM_PAGE_NULL) {
3119 RELEASE_PAGE(m);
3120
3121 vm_fault_cleanup(m->object, top_page);
3122 } else
3123 vm_fault_cleanup(object, top_page);
3124
3125 vm_object_deallocate(object);
3126
3127 goto RetryFault;
3128 }
3129 if (m != VM_PAGE_NULL) {
3130 /*
3131 * Put this page into the physical map.
3132 * We had to do the unlock above because pmap_enter
3133 * may cause other faults. The page may be on
3134 * the pageout queues. If the pageout daemon comes
3135 * across the page, it will remove it from the queues.
3136 */
3137 if (caller_pmap) {
3138 kr = vm_fault_enter(m,
3139 caller_pmap,
3140 caller_pmap_addr,
3141 prot,
3142 wired,
3143 change_wiring,
3144 fault_info.no_cache,
3145 &type_of_fault);
3146 } else {
3147 kr = vm_fault_enter(m,
3148 pmap,
3149 vaddr,
3150 prot,
3151 wired,
3152 change_wiring,
3153 fault_info.no_cache,
3154 &type_of_fault);
3155 }
3156 if (kr != KERN_SUCCESS) {
3157 /* abort this page fault */
3158 vm_map_verify_done(map, &version);
3159 if (real_map != map)
3160 vm_map_unlock(real_map);
3161 PAGE_WAKEUP_DONE(m);
3162 vm_fault_cleanup(m->object, top_page);
3163 vm_object_deallocate(object);
3164 goto done;
3165 }
3166 } else {
3167
3168 vm_map_entry_t entry;
3169 vm_map_offset_t laddr;
3170 vm_map_offset_t ldelta, hdelta;
3171
3172 /*
3173 * do a pmap block mapping from the physical address
3174 * in the object
3175 */
3176
3177 #ifdef ppc
3178 /* While we do not worry about execution protection in */
3179 /* general, certain pages may have instruction execution */
3180 /* disallowed. We will check here, and if not allowed */
3181 /* to execute, we return with a protection failure. */
3182
3183 if ((fault_type & VM_PROT_EXECUTE) &&
3184 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3185
3186 vm_map_verify_done(map, &version);
3187
3188 if (real_map != map)
3189 vm_map_unlock(real_map);
3190
3191 vm_fault_cleanup(object, top_page);
3192 vm_object_deallocate(object);
3193
3194 kr = KERN_PROTECTION_FAILURE;
3195 goto done;
3196 }
3197 #endif /* ppc */
3198
3199 if (real_map != map)
3200 vm_map_unlock(real_map);
3201
3202 if (original_map != map) {
3203 vm_map_unlock_read(map);
3204 vm_map_lock_read(original_map);
3205 map = original_map;
3206 }
3207 real_map = map;
3208
3209 laddr = vaddr;
3210 hdelta = 0xFFFFF000;
3211 ldelta = 0xFFFFF000;
3212
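/*
 * The loop below walks down through any submaps covering the faulting
 * address, clipping ldelta and hdelta (which start out as very large
 * page-aligned bounds) to the distance from the address to the start
 * and end of each enclosing map entry.
 */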
3213 while (vm_map_lookup_entry(map, laddr, &entry)) {
3214 if (ldelta > (laddr - entry->vme_start))
3215 ldelta = laddr - entry->vme_start;
3216 if (hdelta > (entry->vme_end - laddr))
3217 hdelta = entry->vme_end - laddr;
3218 if (entry->is_sub_map) {
3219
3220 laddr = (laddr - entry->vme_start)
3221 + entry->offset;
3222 vm_map_lock_read(entry->object.sub_map);
3223
3224 if (map != real_map)
3225 vm_map_unlock_read(map);
3226 if (entry->use_pmap) {
3227 vm_map_unlock_read(real_map);
3228 real_map = entry->object.sub_map;
3229 }
3230 map = entry->object.sub_map;
3231
3232 } else {
3233 break;
3234 }
3235 }
3236
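/*
 * If the bottom-level entry maps the object we faulted on, enter the
 * whole physically contiguous run [vaddr - ldelta, vaddr + hdelta) with
 * a single pmap_map_block() call; the >> 12 shifts convert byte
 * offsets into 4K page numbers.
 */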
3237 if (vm_map_lookup_entry(map, laddr, &entry) &&
3238 (entry->object.vm_object != NULL) &&
3239 (entry->object.vm_object == object)) {
3240
3241 if (caller_pmap) {
3242 /*
3243 * Set up a block mapped area
3244 */
3245 pmap_map_block(caller_pmap,
3246 (addr64_t)(caller_pmap_addr - ldelta),
3247 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3248 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3249 ((ldelta + hdelta) >> 12), prot,
3250 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3251 } else {
3252 /*
3253 * Set up a block mapped area
3254 */
3255 pmap_map_block(real_map->pmap,
3256 (addr64_t)(vaddr - ldelta),
3257 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3258 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3259 ((ldelta + hdelta) >> 12), prot,
3260 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3261 }
3262 }
3263 }
3264
3265 /*
3266 * Unlock everything, and return
3267 */
3268 vm_map_verify_done(map, &version);
3269 if (real_map != map)
3270 vm_map_unlock(real_map);
3271
3272 if (m != VM_PAGE_NULL) {
3273 PAGE_WAKEUP_DONE(m);
3274
3275 vm_fault_cleanup(m->object, top_page);
3276 } else
3277 vm_fault_cleanup(object, top_page);
3278
3279 vm_object_deallocate(object);
3280
3281 #undef RELEASE_PAGE
3282
3283 kr = KERN_SUCCESS;
3284 done:
3285 thread_interrupt_level(interruptible_state);
3286
3287 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3288 (int)((uint64_t)vaddr >> 32),
3289 (int)vaddr,
3290 kr,
3291 type_of_fault,
3292 0);
3293
3294 return (kr);
3295 }
3296
3297 /*
3298 * vm_fault_wire:
3299 *
3300 * Wire down a range of virtual addresses in a map.
3301 */
3302 kern_return_t
3303 vm_fault_wire(
3304 vm_map_t map,
3305 vm_map_entry_t entry,
3306 pmap_t pmap,
3307 vm_map_offset_t pmap_addr)
3308 {
3309
3310 register vm_map_offset_t va;
3311 register vm_map_offset_t end_addr = entry->vme_end;
3312 register kern_return_t rc;
3313
3314 assert(entry->in_transition);
3315
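/*
 * Physically contiguous objects are wired by their nature, so there
 * is nothing to do here (vm_fault_unwire makes the matching check).
 */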
3316 if ((entry->object.vm_object != NULL) &&
3317 !entry->is_sub_map &&
3318 entry->object.vm_object->phys_contiguous) {
3319 return KERN_SUCCESS;
3320 }
3321
3322 /*
3323 * Inform the physical mapping system that the
3324 * range of addresses may not fault, so that
3325 * page tables and such can be locked down as well.
3326 */
3327
3328 pmap_pageable(pmap, pmap_addr,
3329 pmap_addr + (end_addr - entry->vme_start), FALSE);
3330
3331 /*
3332 * We simulate a fault to get the page and enter it
3333 * in the physical map.
3334 */
3335
3336 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3337 if ((rc = vm_fault_wire_fast(
3338 map, va, entry, pmap,
3339 pmap_addr + (va - entry->vme_start)
3340 )) != KERN_SUCCESS) {
3341 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3342 (pmap == kernel_pmap) ?
3343 THREAD_UNINT : THREAD_ABORTSAFE,
3344 pmap, pmap_addr + (va - entry->vme_start));
3345 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3346 }
3347
3348 if (rc != KERN_SUCCESS) {
3349 struct vm_map_entry tmp_entry = *entry;
3350
3351 /* unwire wired pages */
3352 tmp_entry.vme_end = va;
3353 vm_fault_unwire(map,
3354 &tmp_entry, FALSE, pmap, pmap_addr);
3355
3356 return rc;
3357 }
3358 }
3359 return KERN_SUCCESS;
3360 }
3361
3362 /*
3363 * vm_fault_unwire:
3364 *
3365 * Unwire a range of virtual addresses in a map.
3366 */
3367 void
3368 vm_fault_unwire(
3369 vm_map_t map,
3370 vm_map_entry_t entry,
3371 boolean_t deallocate,
3372 pmap_t pmap,
3373 vm_map_offset_t pmap_addr)
3374 {
3375 register vm_map_offset_t va;
3376 register vm_map_offset_t end_addr = entry->vme_end;
3377 vm_object_t object;
3378 struct vm_object_fault_info fault_info;
3379
3380 object = (entry->is_sub_map)
3381 ? VM_OBJECT_NULL : entry->object.vm_object;
3382
3383 /*
3384 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3385 * do anything since such memory is wired by default. So we don't have
3386 * anything to undo here.
3387 */
3388
3389 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3390 return;
3391
3392 fault_info.interruptible = THREAD_UNINT;
3393 fault_info.behavior = entry->behavior;
3394 fault_info.user_tag = entry->alias;
3395 fault_info.lo_offset = entry->offset;
3396 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3397 fault_info.no_cache = entry->no_cache;
3398
3399 /*
3400 * Since the pages are wired down, we must be able to
3401 * get their mappings from the physical map system.
3402 */
3403
3404 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3405
3406 if (pmap) {
3407 pmap_change_wiring(pmap,
3408 pmap_addr + (va - entry->vme_start), FALSE);
3409 }
3410 if (object == VM_OBJECT_NULL) {
3411 (void) vm_fault(map, va, VM_PROT_NONE,
3412 TRUE, THREAD_UNINT, pmap, pmap_addr);
3413 } else {
3414 vm_prot_t prot;
3415 vm_page_t result_page;
3416 vm_page_t top_page;
3417 vm_object_t result_object;
3418 vm_fault_return_t result;
3419
3420 fault_info.cluster_size = end_addr - va;
3421
3422 do {
3423 prot = VM_PROT_NONE;
3424
3425 vm_object_lock(object);
3426 vm_object_paging_begin(object);
3427 XPR(XPR_VM_FAULT,
3428 "vm_fault_unwire -> vm_fault_page\n",
3429 0,0,0,0,0);
3430 result = vm_fault_page(
3431 object,
3432 entry->offset + (va - entry->vme_start),
3433 VM_PROT_NONE, TRUE,
3434 &prot, &result_page, &top_page,
3435 (int *)0,
3436 NULL, map->no_zero_fill,
3437 FALSE, &fault_info);
3438 } while (result == VM_FAULT_RETRY);
3439
3440 /*
3441 * If this was a mapping to a file on a device that has been forcibly
3442 * unmounted, then we won't get a page back from vm_fault_page(). Just
3443 * move on to the next one in case the remaining pages are mapped from
3444 * different objects. During a forced unmount, the object is terminated
3445 * so the alive flag will be false if this happens. A forced unmount
3446 * will occur when an external disk is unplugged before the user does an
3447 * eject, so we don't want to panic in that situation.
3448 */
3449
3450 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3451 continue;
3452
3453 if (result != VM_FAULT_SUCCESS)
3454 panic("vm_fault_unwire: failure");
3455
3456 result_object = result_page->object;
3457
3458 if (deallocate) {
3459 assert(result_page->phys_page !=
3460 vm_page_fictitious_addr);
3461 pmap_disconnect(result_page->phys_page);
3462 VM_PAGE_FREE(result_page);
3463 } else {
3464 vm_page_lockspin_queues();
3465 vm_page_unwire(result_page);
3466 vm_page_unlock_queues();
3467 PAGE_WAKEUP_DONE(result_page);
3468 }
3469 vm_fault_cleanup(result_object, top_page);
3470 }
3471 }
3472
3473 /*
3474 * Inform the physical mapping system that the range
3475 * of addresses may fault, so that page tables and
3476 * such may be unwired themselves.
3477 */
3478
3479 pmap_pageable(pmap, pmap_addr,
3480 pmap_addr + (end_addr - entry->vme_start), TRUE);
3481
3482 }
3483
3484 /*
3485 * vm_fault_wire_fast:
3486 *
3487 * Handle common case of a wire down page fault at the given address.
3488 * If successful, the page is inserted into the associated physical map.
3489 * The map entry is passed in to avoid the overhead of a map lookup.
3490 *
3491 * NOTE: the given address should be truncated to the
3492 * proper page address.
3493 *
3494 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3495 * a standard error specifying why the fault is fatal is returned.
3496 *
3497 * The map in question must be referenced, and remains so.
3498 * Caller has a read lock on the map.
3499 *
3500 * This is a stripped version of vm_fault() for wiring pages. Anything
3501 * other than the common case will return KERN_FAILURE, and the caller
3502 * is expected to call vm_fault().
3503 */
3504 kern_return_t
3505 vm_fault_wire_fast(
3506 __unused vm_map_t map,
3507 vm_map_offset_t va,
3508 vm_map_entry_t entry,
3509 pmap_t pmap,
3510 vm_map_offset_t pmap_addr)
3511 {
3512 vm_object_t object;
3513 vm_object_offset_t offset;
3514 register vm_page_t m;
3515 vm_prot_t prot;
3516 thread_t thread = current_thread();
3517 int type_of_fault;
3518 kern_return_t kr;
3519
3520 VM_STAT_INCR(faults);
3521
3522 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3523 thread->task->faults++;
3524
3525 /*
3526 * Recovery actions
3527 */
3528
3529 #undef RELEASE_PAGE
3530 #define RELEASE_PAGE(m) { \
3531 PAGE_WAKEUP_DONE(m); \
3532 vm_page_lockspin_queues(); \
3533 vm_page_unwire(m); \
3534 vm_page_unlock_queues(); \
3535 }
3536
3537
3538 #undef UNLOCK_THINGS
3539 #define UNLOCK_THINGS { \
3540 vm_object_paging_end(object); \
3541 vm_object_unlock(object); \
3542 }
3543
3544 #undef UNLOCK_AND_DEALLOCATE
3545 #define UNLOCK_AND_DEALLOCATE { \
3546 UNLOCK_THINGS; \
3547 vm_object_deallocate(object); \
3548 }
3549 /*
3550 * Give up and have caller do things the hard way.
3551 */
3552
3553 #define GIVE_UP { \
3554 UNLOCK_AND_DEALLOCATE; \
3555 return(KERN_FAILURE); \
3556 }
3557
3558
3559 /*
3560 * If this entry is not directly to a vm_object, bail out.
3561 */
3562 if (entry->is_sub_map)
3563 return(KERN_FAILURE);
3564
3565 /*
3566 * Find the backing store object and offset into it.
3567 */
3568
3569 object = entry->object.vm_object;
3570 offset = (va - entry->vme_start) + entry->offset;
3571 prot = entry->protection;
3572
3573 /*
3574 * Make a reference to this object to prevent its
3575 * disposal while we are messing with it.
3576 */
3577
3578 vm_object_lock(object);
3579 vm_object_reference_locked(object);
3580 vm_object_paging_begin(object);
3581
3582 /*
3583 * INVARIANTS (through entire routine):
3584 *
3585 * 1) At all times, we must either have the object
3586 * lock or a busy page in some object to prevent
3587 * some other thread from trying to bring in
3588 * the same page.
3589 *
3590 * 2) Once we have a busy page, we must remove it from
3591 * the pageout queues, so that the pageout daemon
3592 * will not grab it away.
3593 *
3594 */
3595
3596 /*
3597 * Look for page in top-level object. If it's not there or
3598 * there's something going on, give up.
3599 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3600 * decrypt the page before wiring it down.
3601 */
3602 m = vm_page_lookup(object, offset);
3603 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3604 (m->unusual && ( m->error || m->restart || m->absent))) {
3605
3606 GIVE_UP;
3607 }
3608 ASSERT_PAGE_DECRYPTED(m);
3609
3610 if (m->fictitious &&
3611 m->phys_page == vm_page_guard_addr) {
3612 /*
3613 * Guard pages are fictitious pages and are never
3614 * entered into a pmap, so let's say it's been wired...
3615 */
3616 kr = KERN_SUCCESS;
3617 goto done;
3618 }
3619
3620 /*
3621 * Wire the page down now. All bail outs beyond this
3622 * point must unwire the page.
3623 */
3624
3625 vm_page_lockspin_queues();
3626 vm_page_wire(m);
3627 vm_page_unlock_queues();
3628
3629 /*
3630 * Mark page busy for other threads.
3631 */
3632 assert(!m->busy);
3633 m->busy = TRUE;
3634 assert(!m->absent);
3635
3636 /*
3637 * Give up if the page is being written and there's a copy object
3638 */
3639 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3640 RELEASE_PAGE(m);
3641 GIVE_UP;
3642 }
3643
3644 /*
3645 * Put this page into the physical map.
3646 */
3647 type_of_fault = DBG_CACHE_HIT_FAULT;
3648 kr = vm_fault_enter(m,
3649 pmap,
3650 pmap_addr,
3651 prot,
3652 TRUE,
3653 FALSE,
3654 FALSE,
3655 &type_of_fault);
3656
3657 done:
3658 /*
3659 * Unlock everything, and return
3660 */
3661
3662 PAGE_WAKEUP_DONE(m);
3663 UNLOCK_AND_DEALLOCATE;
3664
3665 return kr;
3666
3667 }
3668
3669 /*
3670 * Routine: vm_fault_copy_cleanup
3671 * Purpose:
3672 * Release a page used by vm_fault_copy.
3673 */
3674
3675 void
3676 vm_fault_copy_cleanup(
3677 vm_page_t page,
3678 vm_page_t top_page)
3679 {
3680 vm_object_t object = page->object;
3681
3682 vm_object_lock(object);
3683 PAGE_WAKEUP_DONE(page);
3684 vm_page_lockspin_queues();
3685 if (!page->active && !page->inactive && !page->throttled)
3686 vm_page_activate(page);
3687 vm_page_unlock_queues();
3688 vm_fault_cleanup(object, top_page);
3689 }
3690
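/*
 * Routine: vm_fault_copy_dst_cleanup
 * Purpose:
 * Release a destination page wired by vm_fault_copy:
 * unwire it and drop the paging reference on its object.
 */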
3691 void
3692 vm_fault_copy_dst_cleanup(
3693 vm_page_t page)
3694 {
3695 vm_object_t object;
3696
3697 if (page != VM_PAGE_NULL) {
3698 object = page->object;
3699 vm_object_lock(object);
3700 vm_page_lockspin_queues();
3701 vm_page_unwire(page);
3702 vm_page_unlock_queues();
3703 vm_object_paging_end(object);
3704 vm_object_unlock(object);
3705 }
3706 }
3707
3708 /*
3709 * Routine: vm_fault_copy
3710 *
3711 * Purpose:
3712 * Copy pages from one virtual memory object to another --
3713 * neither the source nor destination pages need be resident.
3714 *
3715 * Before actually copying a page, the version associated with
3716 * the destination address map will be verified.
3717 *
3718 * In/out conditions:
3719 * The caller must hold a reference, but not a lock, to
3720 * each of the source and destination objects and to the
3721 * destination map.
3722 *
3723 * Results:
3724 * Returns KERN_SUCCESS if no errors were encountered in
3725 * reading or writing the data. Returns KERN_INTERRUPTED if
3726 * the operation was interrupted (only possible if the
3727 * "interruptible" argument is asserted). Other return values
3728 * indicate a permanent error in copying the data.
3729 *
3730 * The actual amount of data copied will be returned in the
3731 * "copy_size" argument. In the event that the destination map
3732 * verification failed, this amount may be less than the amount
3733 * requested.
3734 */
3735 kern_return_t
3736 vm_fault_copy(
3737 vm_object_t src_object,
3738 vm_object_offset_t src_offset,
3739 vm_map_size_t *copy_size, /* INOUT */
3740 vm_object_t dst_object,
3741 vm_object_offset_t dst_offset,
3742 vm_map_t dst_map,
3743 vm_map_version_t *dst_version,
3744 int interruptible)
3745 {
3746 vm_page_t result_page;
3747
3748 vm_page_t src_page;
3749 vm_page_t src_top_page;
3750 vm_prot_t src_prot;
3751
3752 vm_page_t dst_page;
3753 vm_page_t dst_top_page;
3754 vm_prot_t dst_prot;
3755
3756 vm_map_size_t amount_left;
3757 vm_object_t old_copy_object;
3758 kern_return_t error = 0;
3759
3760 vm_map_size_t part_size;
3761 struct vm_object_fault_info fault_info_src;
3762 struct vm_object_fault_info fault_info_dst;
3763
3764 /*
3765 * In order not to confuse the clustered pageins, align
3766 * the different offsets on a page boundary.
3767 */
3768
3769 #define RETURN(x) \
3770 MACRO_BEGIN \
3771 *copy_size -= amount_left; \
3772 MACRO_RETURN(x); \
3773 MACRO_END
3774
3775 amount_left = *copy_size;
3776
3777 fault_info_src.interruptible = interruptible;
3778 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3779 fault_info_src.user_tag = 0;
3780 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3781 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3782 fault_info_src.no_cache = FALSE;
3783
3784 fault_info_dst.interruptible = interruptible;
3785 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3786 fault_info_dst.user_tag = 0;
3787 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3788 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3789 fault_info_dst.no_cache = FALSE;
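/*
 * Both fault_info structures advertise sequential behavior since the
 * copy loop below walks the source and destination ranges in ascending
 * order, one page (or partial page) at a time.
 */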
3790
3791 do { /* while (amount_left > 0) */
3792 /*
3793 * There may be a deadlock if both source and destination
3794 * pages are the same. To avoid this deadlock, the copy must
3795 * start by getting the destination page in order to apply
3796 * COW semantics if any.
3797 */
3798
3799 RetryDestinationFault: ;
3800
3801 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3802
3803 vm_object_lock(dst_object);
3804 vm_object_paging_begin(dst_object);
3805
3806 fault_info_dst.cluster_size = amount_left;
3807
3808 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3809 switch (vm_fault_page(dst_object,
3810 vm_object_trunc_page(dst_offset),
3811 VM_PROT_WRITE|VM_PROT_READ,
3812 FALSE,
3813 &dst_prot, &dst_page, &dst_top_page,
3814 (int *)0,
3815 &error,
3816 dst_map->no_zero_fill,
3817 FALSE, &fault_info_dst)) {
3818 case VM_FAULT_SUCCESS:
3819 break;
3820 case VM_FAULT_RETRY:
3821 goto RetryDestinationFault;
3822 case VM_FAULT_MEMORY_SHORTAGE:
3823 if (vm_page_wait(interruptible))
3824 goto RetryDestinationFault;
3825 /* fall thru */
3826 case VM_FAULT_INTERRUPTED:
3827 RETURN(MACH_SEND_INTERRUPTED);
3828 case VM_FAULT_MEMORY_ERROR:
3829 if (error)
3830 return (error);
3831 else
3832 return(KERN_MEMORY_ERROR);
3833 }
3834 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3835
3836 old_copy_object = dst_page->object->copy;
3837
3838 /*
3839 * There exists the possibility that the source and
3840 * destination page are the same. But we can't
3841 * easily determine that now. If they are the
3842 * same, the call to vm_fault_page() for the
3843 * source page will deadlock on the busy destination. To prevent this we
3844 * wire the page so we can drop busy without having
3845 * the page daemon steal the page. We clean up the
3846 * top page but keep the paging reference on the object
3847 * holding the dest page so it doesn't go away.
3848 */
3849
3850 vm_page_lockspin_queues();
3851 vm_page_wire(dst_page);
3852 vm_page_unlock_queues();
3853 PAGE_WAKEUP_DONE(dst_page);
3854 vm_object_unlock(dst_page->object);
3855
3856 if (dst_top_page != VM_PAGE_NULL) {
3857 vm_object_lock(dst_object);
3858 VM_PAGE_FREE(dst_top_page);
3859 vm_object_paging_end(dst_object);
3860 vm_object_unlock(dst_object);
3861 }
3862
3863 RetrySourceFault: ;
3864
3865 if (src_object == VM_OBJECT_NULL) {
3866 /*
3867 * No source object. We will just
3868 * zero-fill the page in dst_object.
3869 */
3870 src_page = VM_PAGE_NULL;
3871 result_page = VM_PAGE_NULL;
3872 } else {
3873 vm_object_lock(src_object);
3874 src_page = vm_page_lookup(src_object,
3875 vm_object_trunc_page(src_offset));
3876 if (src_page == dst_page) {
3877 src_prot = dst_prot;
3878 result_page = VM_PAGE_NULL;
3879 } else {
3880 src_prot = VM_PROT_READ;
3881 vm_object_paging_begin(src_object);
3882
3883 fault_info_src.cluster_size = amount_left;
3884
3885 XPR(XPR_VM_FAULT,
3886 "vm_fault_copy(2) -> vm_fault_page\n",
3887 0,0,0,0,0);
3888 switch (vm_fault_page(
3889 src_object,
3890 vm_object_trunc_page(src_offset),
3891 VM_PROT_READ, FALSE,
3892 &src_prot,
3893 &result_page, &src_top_page,
3894 (int *)0, &error, FALSE,
3895 FALSE, &fault_info_src)) {
3896
3897 case VM_FAULT_SUCCESS:
3898 break;
3899 case VM_FAULT_RETRY:
3900 goto RetrySourceFault;
3901 case VM_FAULT_MEMORY_SHORTAGE:
3902 if (vm_page_wait(interruptible))
3903 goto RetrySourceFault;
3904 /* fall thru */
3905 case VM_FAULT_INTERRUPTED:
3906 vm_fault_copy_dst_cleanup(dst_page);
3907 RETURN(MACH_SEND_INTERRUPTED);
3908 case VM_FAULT_MEMORY_ERROR:
3909 vm_fault_copy_dst_cleanup(dst_page);
3910 if (error)
3911 return (error);
3912 else
3913 return(KERN_MEMORY_ERROR);
3914 }
3915
3916
3917 assert((src_top_page == VM_PAGE_NULL) ==
3918 (result_page->object == src_object));
3919 }
3920 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3921 vm_object_unlock(result_page->object);
3922 }
3923
3924 if (!vm_map_verify(dst_map, dst_version)) {
3925 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3926 vm_fault_copy_cleanup(result_page, src_top_page);
3927 vm_fault_copy_dst_cleanup(dst_page);
3928 break;
3929 }
3930
3931 vm_object_lock(dst_page->object);
3932
3933 if (dst_page->object->copy != old_copy_object) {
3934 vm_object_unlock(dst_page->object);
3935 vm_map_verify_done(dst_map, dst_version);
3936 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3937 vm_fault_copy_cleanup(result_page, src_top_page);
3938 vm_fault_copy_dst_cleanup(dst_page);
3939 break;
3940 }
3941 vm_object_unlock(dst_page->object);
3942
3943 /*
3944 * Copy the page, and note that it is dirty
3945 * immediately.
3946 */
3947
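/*
 * If either offset or the remaining length is not page aligned, copy
 * (or zero) only the part of the page that both the source and
 * destination offsets can reach, capped at the amount left to copy;
 * otherwise move a whole page.
 */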
3948 if (!page_aligned(src_offset) ||
3949 !page_aligned(dst_offset) ||
3950 !page_aligned(amount_left)) {
3951
3952 vm_object_offset_t src_po,
3953 dst_po;
3954
3955 src_po = src_offset - vm_object_trunc_page(src_offset);
3956 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3957
3958 if (dst_po > src_po) {
3959 part_size = PAGE_SIZE - dst_po;
3960 } else {
3961 part_size = PAGE_SIZE - src_po;
3962 }
3963 if (part_size > (amount_left)){
3964 part_size = amount_left;
3965 }
3966
3967 if (result_page == VM_PAGE_NULL) {
3968 vm_page_part_zero_fill(dst_page,
3969 dst_po, part_size);
3970 } else {
3971 vm_page_part_copy(result_page, src_po,
3972 dst_page, dst_po, part_size);
3973 if(!dst_page->dirty){
3974 vm_object_lock(dst_object);
3975 dst_page->dirty = TRUE;
3976 vm_object_unlock(dst_page->object);
3977 }
3978
3979 }
3980 } else {
3981 part_size = PAGE_SIZE;
3982
3983 if (result_page == VM_PAGE_NULL)
3984 vm_page_zero_fill(dst_page);
3985 else{
3986 vm_page_copy(result_page, dst_page);
3987 if(!dst_page->dirty){
3988 vm_object_lock(dst_object);
3989 dst_page->dirty = TRUE;
3990 vm_object_unlock(dst_page->object);
3991 }
3992 }
3993
3994 }
3995
3996 /*
3997 * Unlock everything, and return
3998 */
3999
4000 vm_map_verify_done(dst_map, dst_version);
4001
4002 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4003 vm_fault_copy_cleanup(result_page, src_top_page);
4004 vm_fault_copy_dst_cleanup(dst_page);
4005
4006 amount_left -= part_size;
4007 src_offset += part_size;
4008 dst_offset += part_size;
4009 } while (amount_left > 0);
4010
4011 RETURN(KERN_SUCCESS);
4012 #undef RETURN
4013
4014 /*NOTREACHED*/
4015 }
4016
4017 #if VM_FAULT_CLASSIFY
4018 /*
4019 * Temporary statistics gathering support.
4020 */
4021
4022 /*
4023 * Statistics arrays:
4024 */
4025 #define VM_FAULT_TYPES_MAX 5
4026 #define VM_FAULT_LEVEL_MAX 8
4027
4028 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4029
4030 #define VM_FAULT_TYPE_ZERO_FILL 0
4031 #define VM_FAULT_TYPE_MAP_IN 1
4032 #define VM_FAULT_TYPE_PAGER 2
4033 #define VM_FAULT_TYPE_COPY 3
4034 #define VM_FAULT_TYPE_OTHER 4
4035
4036
4037 void
4038 vm_fault_classify(vm_object_t object,
4039 vm_object_offset_t offset,
4040 vm_prot_t fault_type)
4041 {
4042 int type, level = 0;
4043 vm_page_t m;
4044
4045 while (TRUE) {
4046 m = vm_page_lookup(object, offset);
4047 if (m != VM_PAGE_NULL) {
4048 if (m->busy || m->error || m->restart || m->absent) {
4049 type = VM_FAULT_TYPE_OTHER;
4050 break;
4051 }
4052 if (((fault_type & VM_PROT_WRITE) == 0) ||
4053 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4054 type = VM_FAULT_TYPE_MAP_IN;
4055 break;
4056 }
4057 type = VM_FAULT_TYPE_COPY;
4058 break;
4059 }
4060 else {
4061 if (object->pager_created) {
4062 type = VM_FAULT_TYPE_PAGER;
4063 break;
4064 }
4065 if (object->shadow == VM_OBJECT_NULL) {
4066 type = VM_FAULT_TYPE_ZERO_FILL;
4067 break;
4068 }
4069
4070 offset += object->shadow_offset;
4071 object = object->shadow;
4072 level++;
4073 continue;
4074 }
4075 }
4076
4077 if (level >= VM_FAULT_LEVEL_MAX) /* clamp: last valid index is MAX - 1 */
4078 level = VM_FAULT_LEVEL_MAX - 1;
4079
4080 vm_fault_stats[type][level] += 1;
4081
4082 return;
4083 }
4084
4085 /* cleanup routine to call from the debugger */
4086
4087 void
4088 vm_fault_classify_init(void)
4089 {
4090 int type, level;
4091
4092 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4093 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4094 vm_fault_stats[type][level] = 0;
4095 }
4096 }
4097
4098 return;
4099 }
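/*
 * Illustrative sketch (editorial addition, not in the original
 * source): a companion routine that could be called from the debugger
 * to report the counters gathered by vm_fault_classify().  The
 * routine name is hypothetical; the block is for exposition only.
 */
#if 0 /* example only -- not compiled */
void
vm_fault_classify_report(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			if (vm_fault_stats[type][level] != 0) {
				printf("vm_fault_stats[%d][%d] = %d\n",
				       type, level,
				       vm_fault_stats[type][level]);
			}
		}
	}
}
#endif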
4100 #endif /* VM_FAULT_CLASSIFY */
4101
4102
4103 extern int cs_validation;
4104
4105 void
4106 vm_page_validate_cs(
4107 vm_page_t page)
4108 {
4109 vm_object_t object;
4110 vm_object_offset_t offset;
4111 vm_map_offset_t koffset;
4112 vm_map_size_t ksize;
4113 vm_offset_t kaddr;
4114 kern_return_t kr;
4115 memory_object_t pager;
4116 void *blobs;
4117 boolean_t validated, tainted;
4118 boolean_t busy_page;
4119
4120 vm_object_lock_assert_exclusive(page->object);
4121 assert(!page->cs_validated);
4122
4123 if (!cs_validation) {
4124 return;
4125 }
4126
4127 object = page->object;
4128 assert(object->code_signed);
4129 offset = page->offset;
4130
4131 busy_page = page->busy;
4132 if (!busy_page) {
4133 /* keep page busy while we map (and unlock) the VM object */
4134 page->busy = TRUE;
4135 }
4136
4137 /*
4138 * Take a paging reference on the VM object
4139 * to protect it from collapse or bypass,
4140 * and keep it from disappearing too.
4141 */
4142 vm_object_paging_begin(object);
4143
4144 /* map the page in the kernel address space */
4145 koffset = 0;
4146 ksize = PAGE_SIZE_64;
4147 kr = vm_paging_map_object(&koffset,
4148 page,
4149 object,
4150 offset,
4151 &ksize,
4152 FALSE); /* can't unlock the object! */
4153 if (kr != KERN_SUCCESS) {
4154 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4155 }
4156 kaddr = CAST_DOWN(vm_offset_t, koffset);
4157
4158 /*
4159 * Since we get here to validate a page that was brought in by
4160 * the pager, we know that this pager is all set up and ready
4161 * by now.
4162 */
4163 assert(!object->internal);
4164 assert(object->pager != NULL);
4165 assert(object->pager_ready);
4166
4167 if (!object->alive || object->terminating || object->pager == NULL) {
4168 /*
4169 * The object is terminating and we don't have its pager
4170 * so we can't validate the data...
4171 */
4172 goto out;
4173 }
4174
4175 pager = object->pager;
4176 assert(pager != NULL);
4177
4178 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4179 if (kr != KERN_SUCCESS) {
4180 blobs = NULL;
4181 }
4182
4183 /* verify the SHA1 hash for this page */
4184 validated = cs_validate_page(blobs,
4185 offset + object->paging_offset,
4186 (const void *)kaddr,
4187 &tainted);
4188
4189 assert(page->busy);
4190 assert(object == page->object);
4191 vm_object_lock_assert_exclusive(object);
4192
4193 page->cs_validated = validated;
4194 if (validated) {
4195 page->cs_tainted = tainted;
4196 }
4197
4198 out:
4199 if (!busy_page) {
4200 PAGE_WAKEUP_DONE(page);
4201 }
4202 if (koffset != 0) {
4203 /* unmap the page from the kernel address space */
4204 vm_paging_unmap_object(object, koffset, koffset + ksize);
4205 koffset = 0;
4206 ksize = 0;
4207 kaddr = 0;
4208 }
4209 vm_object_paging_end(object);
4210 }
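/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the calling pattern expected by vm_page_validate_cs().
 * The caller holds the page's VM object lock exclusively, validates
 * the page only if it has not been validated yet, and then consults
 * the cs_validated/cs_tainted bits.  The helper name is hypothetical
 * and the block is for exposition only.
 */
#if 0 /* example only -- not compiled */
static void
vm_page_check_cs_sketch(
	vm_page_t	page)
{
	vm_object_lock_assert_exclusive(page->object);

	if (page->object->code_signed && !page->cs_validated) {
		vm_page_validate_cs(page);
	}
	if (page->cs_validated && page->cs_tainted) {
		/* the page's contents do not match its code signature */
		printf("CODE SIGNING: tainted page at offset 0x%llx\n",
		       page->offset);
	}
}
#endif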