/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm_fault.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Page fault handling module.
 */

#include <mach_cluster_stats.h>
#include <mach_pagemap.h>
#include <mach_kdb.h>
#include <libkern/OSAtomic.h>

#include <mach/mach_types.h>
#include <mach/kern_return.h>
#include <mach/message.h>	/* for error codes */
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/memory_object.h>
				/* For memory_object_data_{request,unlock} */
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/host_statistics.h>
#include <kern/counters.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/host.h>
#include <kern/xpr.h>
#include <kern/mach_param.h>
#include <kern/macro_help.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>

#include <ppc/proc_reg.h>

#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>
#include <vm/vm_external.h>
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>	/* Needed by some vm_page.h macros */

#include <sys/kdebug.h>

#define VM_FAULT_CLASSIFY 0

/* Zero-filled pages are marked "m->zero_fill" and put on the
 * special zero-fill inactive queue only if they belong to
 * an object at least this big.
 */
#define VM_ZF_OBJECT_SIZE_THRESHOLD	(0x200000)
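/* 0x200000 == 2 MB; compared against m->object->size in vm_fault_zero_page() below */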

#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */

int	vm_object_pagein_throttle = 16;

extern int cs_debug;

#if	MACH_KDB
extern struct db_watchpoint *db_watchpoint_list;
#endif	/* MACH_KDB */


/* Forward declarations of internal routines. */
extern kern_return_t vm_fault_wire_fast(
				vm_map_t	map,
				vm_map_offset_t	va,
				vm_map_entry_t	entry,
				pmap_t		pmap,
				vm_map_offset_t	pmap_addr);

extern void vm_fault_continue(void);

extern void vm_fault_copy_cleanup(
				vm_page_t	page,
				vm_page_t	top_page);

extern void vm_fault_copy_dst_cleanup(
				vm_page_t	page);

#if	VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t	object,
			      vm_object_offset_t	offset,
			      vm_prot_t		fault_type);

extern void vm_fault_classify_init(void);
#endif


unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;

#if CONFIG_ENFORCE_SIGNED_CODE
#if SECURE_KERNEL
const int cs_enforcement_disable = 0;
#else
int cs_enforcement_disable = 1;
#endif
#endif

/*
 *	Routine:	vm_fault_init
 *	Purpose:
 *		Initialize our private data structures.
 */
void
vm_fault_init(void)
{
#if !SECURE_KERNEL
#if CONFIG_ENFORCE_SIGNED_CODE
	PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, sizeof (cs_enforcement_disable));
#endif
	PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
#endif
}

/*
 *	Routine:	vm_fault_cleanup
 *	Purpose:
 *		Clean up the result of vm_fault_page.
 *	Results:
 *		The paging reference for "object" is released.
 *		"object" is unlocked.
 *		If "top_page" is not null, "top_page" is
 *		freed and the paging reference for the object
 *		containing it is released.
 *
 *	In/out conditions:
 *		"object" must be locked.
 */
void
vm_fault_cleanup(
	register vm_object_t	object,
	register vm_page_t	top_page)
{
	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		object = top_page->object;

		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}

#if	MACH_CLUSTER_STATS
#define MAXCLUSTERPAGES 16
struct {
	unsigned long pages_in_cluster;
	unsigned long pages_at_higher_offsets;
	unsigned long pages_at_lower_offsets;
} cluster_stats_in[MAXCLUSTERPAGES];
#define CLUSTER_STAT(clause)	clause
#define CLUSTER_STAT_HIGHER(x)	\
	((cluster_stats_in[(x)].pages_at_higher_offsets)++)
#define CLUSTER_STAT_LOWER(x)	\
	((cluster_stats_in[(x)].pages_at_lower_offsets)++)
#define CLUSTER_STAT_CLUSTER(x)	\
	((cluster_stats_in[(x)].pages_in_cluster)++)
#else	/* MACH_CLUSTER_STATS */
#define CLUSTER_STAT(clause)
#endif	/* MACH_CLUSTER_STATS */

#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)


boolean_t	vm_page_deactivate_behind = TRUE;
/*
 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 */
int vm_default_ahead = 0;
int vm_default_behind = MAX_UPL_TRANSFER;

#define MAX_SEQUENTIAL_RUN	(1024 * 1024 * 1024)
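/*
 * Note: the sequential-run indicator kept in object->sequential is a signed
 * byte count (positive for forward access, negative for reverse), clamped to
 * +/- MAX_SEQUENTIAL_RUN by vm_fault_is_sequential() below.
 */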

/*
 * vm_fault_is_sequential
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.
 * Update state to indicate current access pattern.
 *
 * object must have at least the shared lock held
 */
static
void
vm_fault_is_sequential(
	vm_object_t		object,
	vm_object_offset_t	offset,
	vm_behavior_t		behavior)
{
	vm_object_offset_t	last_alloc;
	int			sequential;
	int			orig_sequential;

	last_alloc = object->last_alloc;
	sequential = object->sequential;
	orig_sequential = sequential;

	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		/*
		 * reset indicator of sequential behavior
		 */
		sequential = 0;
		break;

	case VM_BEHAVIOR_SEQUENTIAL:
		if (offset && last_alloc == offset - PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < MAX_SEQUENTIAL_RUN)
				sequential += PAGE_SIZE;
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_RSEQNTL:
		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > -MAX_SEQUENTIAL_RUN)
				sequential -= PAGE_SIZE;
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_DEFAULT:
	default:
		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < 0)
				sequential = 0;
			if (sequential < MAX_SEQUENTIAL_RUN)
				sequential += PAGE_SIZE;

		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > 0)
				sequential = 0;
			if (sequential > -MAX_SEQUENTIAL_RUN)
				sequential -= PAGE_SIZE;
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;
	}
	if (sequential != orig_sequential) {
		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
			/*
			 * if someone else has already updated object->sequential
			 * don't bother trying to update it or object->last_alloc
			 */
			return;
		}
	}
	/*
	 * I'd like to do this with a OSCompareAndSwap64, but that
	 * doesn't exist for PPC...  however, it shouldn't matter
	 * that much... last_alloc is maintained so that we can determine
	 * if a sequential access pattern is taking place... if only
	 * one thread is banging on this object, no problem with the unprotected
	 * update... if 2 or more threads are banging away, we run the risk of
	 * someone seeing a mangled update... however, in the face of multiple
	 * accesses, no sequential access pattern can develop anyway, so we
	 * haven't lost any real info.
	 */
	object->last_alloc = offset;
}
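
/*
 * Note: the run length recorded above is consumed by
 * vm_fault_deactivate_behind() below to retire a page trailing a
 * sufficiently long sequential run.
 */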


/*
 * vm_fault_deactivate_behind
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.  If
 * so, compute a potential page to deactivate and
 * deactivate it.
 *
 * object must be locked.
 *
 * return TRUE if we actually deactivate a page
 */
static
boolean_t
vm_fault_deactivate_behind(
	vm_object_t		object,
	vm_object_offset_t	offset,
	vm_behavior_t		behavior)
{
	vm_page_t	m = NULL;
	int		sequential_run;
	int		sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;

#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind);	/* (TEST/DEBUG) */
#endif

	if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable,
		 * or we've disabled the deactivate behind mechanism
		 */
		return FALSE;
	}
	if ((sequential_run = object->sequential)) {
		if (sequential_run < 0) {
			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
			sequential_run = 0 - sequential_run;
		} else {
			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
		}
	}
	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		break;
	case VM_BEHAVIOR_SEQUENTIAL:
		if (sequential_run >= (int)PAGE_SIZE)
			m = vm_page_lookup(object, offset - PAGE_SIZE_64);
		break;
	case VM_BEHAVIOR_RSEQNTL:
		if (sequential_run >= (int)PAGE_SIZE)
			m = vm_page_lookup(object, offset + PAGE_SIZE_64);
		break;
	case VM_BEHAVIOR_DEFAULT:
	default:
	{	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

		/*
		 * determine if the run of sequential accesses has been
		 * long enough on an object with default access behavior
		 * to consider it for deactivation
		 */
		if ((uint64_t)sequential_run >= behind) {
			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
				if (offset >= behind)
					m = vm_page_lookup(object, offset - behind);
			} else {
				if (offset < -behind)
					m = vm_page_lookup(object, offset + behind);
			}
		}
		break;
	}
	}
	if (m) {
		if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
			pmap_clear_reference(m->phys_page);
			m->deactivated = TRUE;
#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
#endif
			return TRUE;
		}
	}
	return FALSE;
}


/*
 * check for various conditions that would
 * prevent us from creating a ZF page...
 * cleanup is based on being called from vm_fault_page
 *
 * object must be locked
 * object == m->object
 */
static vm_fault_return_t
vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
{
	if (object->shadow_severed) {
		/*
		 * the shadow chain was severed
		 * just have to return an error at this point
		 */
		if (m != VM_PAGE_NULL)
			VM_PAGE_FREE(m);
		vm_fault_cleanup(object, first_m);

		thread_interrupt_level(interruptible_state);

		return (VM_FAULT_MEMORY_ERROR);
	}
	if (vm_backing_store_low) {
		/*
		 * are we protecting the system from
		 * backing store exhaustion.  If so
		 * sleep unless we are privileged.
		 */
		if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {

			if (m != VM_PAGE_NULL)
				VM_PAGE_FREE(m);
			vm_fault_cleanup(object, first_m);

			assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);

			thread_block(THREAD_CONTINUE_NULL);
			thread_interrupt_level(interruptible_state);

			return (VM_FAULT_RETRY);
		}
	}
	if (VM_PAGE_ZFILL_THROTTLED()) {
		/*
		 * we're throttling zero-fills...
		 * treat this as if we couldn't grab a page
		 */
		if (m != VM_PAGE_NULL)
			VM_PAGE_FREE(m);
		vm_fault_cleanup(object, first_m);

		thread_interrupt_level(interruptible_state);

		return (VM_FAULT_MEMORY_SHORTAGE);
	}
	return (VM_FAULT_SUCCESS);
}


/*
 * do the work to zero fill a page and
 * inject it into the correct paging queue
 *
 * m->object must be locked
 * page queue lock must NOT be held
 */
static int
vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
{
	int my_fault = DBG_ZERO_FILL_FAULT;

	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time;  this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * we also consider it undefined
	 * with respect to instruction
	 * execution.  i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance
	 */
	m->pmapped = TRUE;

	m->cs_validated = FALSE;
	m->cs_tainted = FALSE;

	if (no_zero_fill == TRUE)
		my_fault = DBG_NZF_PAGE_FAULT;
	else {
		vm_page_zero_fill(m);

		VM_STAT_INCR(zero_fill_count);
		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
	}
	assert(!m->laundry);
	assert(m->object != kernel_object);
	//assert(m->pageq.next == NULL && m->pageq.prev == NULL);

	if (!IP_VALID(memory_manager_default) &&
	    (m->object->purgable == VM_PURGABLE_DENY ||
	     m->object->purgable == VM_PURGABLE_NONVOLATILE ||
	     m->object->purgable == VM_PURGABLE_VOLATILE )) {
		vm_page_lock_queues();

		queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
		m->throttled = TRUE;
		vm_page_throttled_count++;

		vm_page_unlock_queues();
	} else {
		if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
			m->zero_fill = TRUE;
			OSAddAtomic(1, (SInt32 *)&vm_zf_count);
		}
	}
	return (my_fault);
}


/*
 *	Routine:	vm_fault_page
 *	Purpose:
 *		Find the resident page for the virtual memory
 *		specified by the given virtual memory object
 *		and offset.
 *	Additional arguments:
 *		The required permissions for the page are given
 *		in "fault_type".  Desired permissions are included
 *		in "protection".
 *		fault_info is passed along to determine pagein cluster
 *		limits... it contains the expected reference pattern,
 *		cluster size if available, etc...
 *
 *		If the desired page is known to be resident (for
 *		example, because it was previously wired down), asserting
 *		the "unwiring" parameter will speed the search.
 *
 *		If the operation can be interrupted (by thread_abort
 *		or thread_terminate), then the "interruptible"
 *		parameter should be asserted.
 *
 *	Results:
 *		The page containing the proper data is returned
 *		in "result_page".
 *
 *	In/out conditions:
 *		The source object must be locked and referenced,
 *		and must donate one paging reference.  The reference
 *		is not affected.  The paging reference and lock are
 *		consumed.
 *
 *		If the call succeeds, the object in which "result_page"
 *		resides is left locked and holding a paging reference.
 *		If this is not the original object, a busy page in the
 *		original object is returned in "top_page", to prevent other
 *		callers from pursuing this same data, along with a paging
 *		reference for the original object.  The "top_page" should
 *		be destroyed when this guarantee is no longer required.
 *		The "result_page" is also left busy.  It is not removed
 *		from the pageout queues.
 */

vm_fault_return_t
vm_fault_page(
	/* Arguments: */
	vm_object_t	first_object,	/* Object to begin search */
	vm_object_offset_t first_offset,	/* Offset into object */
	vm_prot_t	fault_type,	/* What access is requested */
	boolean_t	must_be_resident,	/* Must page be resident? */
	/* Modifies in place: */
	vm_prot_t	*protection,	/* Protection for mapping */
	/* Returns: */
	vm_page_t	*result_page,	/* Page found, if successful */
	vm_page_t	*top_page,	/* Page in top object, if
					 * not result_page.  */
	int		*type_of_fault,	/* if non-null, fill in with type of fault
					 * COW, zero-fill, etc... returned in trace point */
	/* More arguments: */
	kern_return_t	*error_code,	/* code if page is in error */
	boolean_t	no_zero_fill,	/* don't zero fill absent pages */
#if MACH_PAGEMAP
	boolean_t	data_supply,	/* treat as data_supply if
					 * it is a write fault and a full
					 * page is provided */
#else
	__unused boolean_t data_supply,
#endif
	vm_object_fault_info_t fault_info)
{
	vm_page_t		m;
	vm_object_t		object;
	vm_object_offset_t	offset;
	vm_page_t		first_m;
	vm_object_t		next_object;
	vm_object_t		copy_object;
	boolean_t		look_for_page;
	vm_prot_t		access_required = fault_type;
	vm_prot_t		wants_copy_flag;
	CLUSTER_STAT(int pages_at_higher_offsets;)
	CLUSTER_STAT(int pages_at_lower_offsets;)
	kern_return_t		wait_result;
	boolean_t		interruptible_state;
	vm_fault_return_t	error;
	int			my_fault;
	uint32_t		try_failed_count;
	int			interruptible;	/* how may the fault be interrupted? */
	memory_object_t		pager;

/*
 * MACH page map - an optional optimization where a bit map is maintained
 * by the VM subsystem for internal objects to indicate which pages of
 * the object currently reside on backing store.  This existence map
 * duplicates information maintained by the vnode pager.  It is
 * created at the time of the first pageout against the object, i.e.
 * at the same time the pager for the object is created.  The optimization
 * is designed to eliminate pager interaction overhead, if it is
 * 'known' that the page does not exist on backing store.
 *
 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 * either marked as paged out in the existence map for the object or no
 * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 * criteria in the decision to invoke the pager.  It is also used as one
 * of the criteria to terminate the scan for adjacent pages in a clustered
 * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 * permanent objects.  Note also that if the pager for an internal object
 * has not been created, the pager is not invoked regardless of the value
 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 * for which a pager has been created.
 *
 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 * is marked as paged out in the existence map for the object.
 * PAGED_OUT() is used to determine if a page has already been pushed
 * into a copy object in order to avoid a redundant page out operation.
 */
#if MACH_PAGEMAP
#define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
			!= VM_EXTERNAL_STATE_ABSENT)
#define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
			== VM_EXTERNAL_STATE_EXISTS)
#else
#define MUST_ASK_PAGER(o, f) (TRUE)
#define PAGED_OUT(o, f) (FALSE)
#endif
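/*
 * Note: without MACH_PAGEMAP there is no existence map, so the pager (when
 * one exists) is always consulted and no page is ever assumed to have been
 * paged out already.
 */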

/*
 * Recovery actions
 */
#define PREPARE_RELEASE_PAGE(m)				\
	MACRO_BEGIN					\
	vm_page_lock_queues();				\
	MACRO_END

#define DO_RELEASE_PAGE(m)				\
	MACRO_BEGIN					\
	PAGE_WAKEUP_DONE(m);				\
	if (!m->active && !m->inactive && !m->throttled)\
		vm_page_activate(m);			\
	vm_page_unlock_queues();			\
	MACRO_END

#define RELEASE_PAGE(m)					\
	MACRO_BEGIN					\
	PREPARE_RELEASE_PAGE(m);			\
	DO_RELEASE_PAGE(m);				\
	MACRO_END

#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset);	/* (TEST/DEBUG) */
#endif


#if	MACH_KDB
	/*
	 * If there are watchpoints set, then
	 * we don't want to give away write permission
	 * on a read fault.  Make the task write fault,
	 * so that the watchpoint code notices the access.
	 */
	if (db_watchpoint_list) {
		/*
		 * If we aren't asking for write permission,
		 * then don't give it away.  We're using write
		 * faults to set the dirty bit.
		 */
		if (!(fault_type & VM_PROT_WRITE))
			*protection &= ~VM_PROT_WRITE;
	}
#endif	/* MACH_KDB */

	interruptible = fault_info->interruptible;
	interruptible_state = thread_interrupt_level(interruptible);

	/*
	 * INVARIANTS (through entire routine):
	 *
	 * 1)	At all times, we must either have the object
	 *	lock or a busy page in some object to prevent
	 *	some other thread from trying to bring in
	 *	the same page.
	 *
	 *	Note that we cannot hold any locks during the
	 *	pager access or when waiting for memory, so
	 *	we use a busy page then.
	 *
	 * 2)	To prevent another thread from racing us down the
	 *	shadow chain and entering a new page in the top
	 *	object before we do, we must keep a busy page in
	 *	the top object while following the shadow chain.
	 *
	 * 3)	We must increment paging_in_progress on any object
	 *	for which we have a busy page before dropping
	 *	the object lock
	 *
	 * 4)	We leave busy pages on the pageout queues.
	 *	If the pageout daemon comes across a busy page,
	 *	it will remove the page from the pageout queues.
	 */

	object = first_object;
	offset = first_offset;
	first_m = VM_PAGE_NULL;
	access_required = fault_type;


	XPR(XPR_VM_FAULT,
	    "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
	    (integer_t)object, offset, fault_type, *protection, 0);

	/*
	 * default type of fault
	 */
	my_fault = DBG_CACHE_HIT_FAULT;

	while (TRUE) {
#if TRACEFAULTPAGE
		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
		if (!object->alive) {
			/*
			 * object is no longer valid
			 * clean up and return error
			 */
			vm_fault_cleanup(object, first_m);
			thread_interrupt_level(interruptible_state);

			return (VM_FAULT_MEMORY_ERROR);
		}

		/*
		 * See whether the page at 'offset' is resident
		 */
		m = vm_page_lookup(object, offset);
#if TRACEFAULTPAGE
		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
#endif
		if (m != VM_PAGE_NULL) {

			if (m->busy) {
				/*
				 * The page is being brought in,
				 * wait for it and then retry.
				 *
				 * A possible optimization: if the page
				 * is known to be resident, we can ignore
				 * pages that are absent (regardless of
				 * whether they're busy).
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				wait_result = PAGE_SLEEP(object, m, interruptible);
				XPR(XPR_VM_FAULT,
				    "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
				    (integer_t)object, offset,
				    (integer_t)m, 0, 0);
				counter(c_vm_fault_page_block_busy_kernel++);

				if (wait_result != THREAD_AWAKENED) {
					vm_fault_cleanup(object, first_m);
					thread_interrupt_level(interruptible_state);

					if (wait_result == THREAD_RESTART)
						return (VM_FAULT_RETRY);
					else
						return (VM_FAULT_INTERRUPTED);
				}
				continue;
			}

			if (m->phys_page == vm_page_guard_addr) {
				/*
				 * Guard page: off limits !
				 */
				if (fault_type == VM_PROT_NONE) {
					/*
					 * The fault is not requesting any
					 * access to the guard page, so it must
					 * be just to wire or unwire it.
					 * Let's pretend it succeeded...
					 */
					m->busy = TRUE;
					*result_page = m;
					assert(first_m == VM_PAGE_NULL);
					*top_page = first_m;
					if (type_of_fault)
						*type_of_fault = DBG_GUARD_FAULT;
					return VM_FAULT_SUCCESS;
				} else {
					/*
					 * The fault requests access to the
					 * guard page: let's deny that !
					 */
					vm_fault_cleanup(object, first_m);
					thread_interrupt_level(interruptible_state);
					return VM_FAULT_MEMORY_ERROR;
				}
			}

			if (m->error) {
				/*
				 * The page is in error, give up now.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);	/* (TEST/DEBUG) */
#endif
				if (error_code)
					*error_code = KERN_MEMORY_ERROR;
				VM_PAGE_FREE(m);

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_MEMORY_ERROR);
			}
			if (m->restart) {
				/*
				 * The pager wants us to restart
				 * at the top of the chain,
				 * typically because it has moved the
				 * page to another pager, then do so.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				VM_PAGE_FREE(m);

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_RETRY);
			}
			if (m->absent) {
				/*
				 * The page isn't busy, but is absent,
				 * therefore it's deemed "unavailable".
				 *
				 * Remove the non-existent page (unless it's
				 * in the top object) and move on down to the
				 * next object (if there is one).
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);	/* (TEST/DEBUG) */
#endif
				next_object = object->shadow;

				if (next_object == VM_OBJECT_NULL) {
					/*
					 * Absent page at bottom of shadow
					 * chain; zero fill the page we left
					 * busy in the first object, and free
					 * the absent page.
					 */
					assert(!must_be_resident);

					/*
					 * check for any conditions that prevent
					 * us from creating a new zero-fill page
					 * vm_fault_check will do all of the
					 * fault cleanup in the case of an error condition
					 * including resetting the thread_interrupt_level
					 */
					error = vm_fault_check(object, m, first_m, interruptible_state);

					if (error != VM_FAULT_SUCCESS)
						return (error);

					XPR(XPR_VM_FAULT,
					    "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
					    (integer_t)object, offset,
					    (integer_t)m,
					    (integer_t)first_object, 0);

					if (object != first_object) {
						/*
						 * free the absent page we just found
						 */
						VM_PAGE_FREE(m);

						/*
						 * drop reference and lock on current object
						 */
						vm_object_paging_end(object);
						vm_object_unlock(object);

						/*
						 * grab the original page we
						 * 'soldered' in place and
						 * retake lock on 'first_object'
						 */
						m = first_m;
						first_m = VM_PAGE_NULL;

						object = first_object;
						offset = first_offset;

						vm_object_lock(object);
					} else {
						/*
						 * we're going to use the absent page we just found
						 * so convert it to a 'busy' page
						 */
						m->absent = FALSE;
						m->busy = TRUE;
					}
					/*
					 * zero-fill the page and put it on
					 * the correct paging queue
					 */
					my_fault = vm_fault_zero_page(m, no_zero_fill);

					break;
				} else {
					if (must_be_resident)
						vm_object_paging_end(object);
					else if (object != first_object) {
						vm_object_paging_end(object);
						VM_PAGE_FREE(m);
					} else {
						first_m = m;
						m->absent = FALSE;
						m->busy = TRUE;

						vm_page_lockspin_queues();
						VM_PAGE_QUEUES_REMOVE(m);
						vm_page_unlock_queues();
					}
					XPR(XPR_VM_FAULT,
					    "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
					    (integer_t)object, offset,
					    (integer_t)next_object,
					    offset+object->shadow_offset,0);

					offset += object->shadow_offset;
					fault_info->lo_offset += object->shadow_offset;
					fault_info->hi_offset += object->shadow_offset;
					access_required = VM_PROT_READ;

					vm_object_lock(next_object);
					vm_object_unlock(object);
					object = next_object;
					vm_object_paging_begin(object);

					/*
					 * reset to default type of fault
					 */
					my_fault = DBG_CACHE_HIT_FAULT;

					continue;
				}
			}
			if ((m->cleaning)
			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
			    && (fault_type & VM_PROT_WRITE)) {
				/*
				 * This is a copy-on-write fault that will
				 * cause us to revoke access to this page, but
				 * this page is in the process of being cleaned
				 * in a clustered pageout.  We must wait until
				 * the cleaning operation completes before
				 * revoking access to the original page,
				 * otherwise we might attempt to remove a
				 * wired mapping.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);	/* (TEST/DEBUG) */
#endif
				XPR(XPR_VM_FAULT,
				    "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
				    (integer_t)object, offset,
				    (integer_t)m, 0, 0);
				/*
				 * take an extra ref so that object won't die
				 */
				vm_object_reference_locked(object);

				vm_fault_cleanup(object, first_m);

				counter(c_vm_fault_page_block_backoff_kernel++);
				vm_object_lock(object);
				assert(object->ref_count > 0);

				m = vm_page_lookup(object, offset);

				if (m != VM_PAGE_NULL && m->cleaning) {
					PAGE_ASSERT_WAIT(m, interruptible);

					vm_object_unlock(object);
					wait_result = thread_block(THREAD_CONTINUE_NULL);
					vm_object_deallocate(object);

					goto backoff;
				} else {
					vm_object_unlock(object);

					vm_object_deallocate(object);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_RETRY);
				}
			}
			if (type_of_fault == NULL && m->speculative) {
				/*
				 * If we were passed a non-NULL pointer for
				 * "type_of_fault", then we came from
				 * vm_fault... we'll let it deal with
				 * this condition, since it
				 * needs to see m->speculative to correctly
				 * account the pageins, otherwise...
				 * take it off the speculative queue, we'll
				 * let the caller of vm_fault_page deal
				 * with getting it onto the correct queue
				 */
				vm_page_lockspin_queues();
				VM_PAGE_QUEUES_REMOVE(m);
				vm_page_unlock_queues();
			}

			if (m->encrypted) {
				/*
				 * ENCRYPTED SWAP:
				 * the user needs access to a page that we
				 * encrypted before paging it out.
				 * Decrypt the page now.
				 * Keep it busy to prevent anyone from
				 * accessing it during the decryption.
				 */
				m->busy = TRUE;
				vm_page_decrypt(m, 0);
				assert(object == m->object);
				assert(m->busy);
				PAGE_WAKEUP_DONE(m);

				/*
				 * Retry from the top, in case
				 * something changed while we were
				 * decrypting.
				 */
				continue;
			}
			ASSERT_PAGE_DECRYPTED(m);

			if (m->object->code_signed) {
				/*
				 * CODE SIGNING:
				 * We just paged in a page from a signed
				 * memory object but we don't need to
				 * validate it now.  We'll validate it
				 * when it gets mapped into a user address
				 * space for the first time or when the page
				 * gets copied to another object as a result
				 * of a copy-on-write.
				 */
			}

			/*
			 * We mark the page busy and leave it on
			 * the pageout queues.  If the pageout
			 * daemon comes across it, then it will
			 * remove the page from the queue, but not the object
			 */
#if TRACEFAULTPAGE
			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
			XPR(XPR_VM_FAULT,
			    "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
			    (integer_t)object, offset, (integer_t)m, 0, 0);
			assert(!m->busy);
			assert(!m->absent);

			m->busy = TRUE;
			break;
		}


		/*
		 * we get here when there is no page present in the object at
		 * the offset we're interested in... we'll allocate a page
		 * at this point if the pager associated with
		 * this object can provide the data or we're the top object...
		 * object is locked; m == NULL
		 */
		look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);

#if TRACEFAULTPAGE
		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);	/* (TEST/DEBUG) */
#endif
		if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
			/*
			 * Allocate a new page for this object/offset pair
			 */
			m = vm_page_grab();
#if TRACEFAULTPAGE
			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
#endif
			if (m == VM_PAGE_NULL) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_MEMORY_SHORTAGE);
			}
			vm_page_insert(m, object, offset);
		}
		if (look_for_page && !must_be_resident) {
			kern_return_t	rc;

			/*
			 * If the memory manager is not ready, we
			 * cannot make requests.
			 */
			if (!object->pager_ready) {
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				if (m != VM_PAGE_NULL)
					VM_PAGE_FREE(m);

				XPR(XPR_VM_FAULT,
				    "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
				    (integer_t)object, offset, 0, 0, 0);

				/*
				 * take an extra ref so object won't die
				 */
				vm_object_reference_locked(object);
				vm_fault_cleanup(object, first_m);
				counter(c_vm_fault_page_block_backoff_kernel++);

				vm_object_lock(object);
				assert(object->ref_count > 0);

				if (!object->pager_ready) {
					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);

					vm_object_unlock(object);
					if (wait_result == THREAD_WAITING)
						wait_result = thread_block(THREAD_CONTINUE_NULL);
					vm_object_deallocate(object);

					goto backoff;
				} else {
					vm_object_unlock(object);
					vm_object_deallocate(object);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_RETRY);
				}
			}
			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
				/*
				 * If there are too many outstanding page
				 * requests pending on this external object, we
				 * wait for them to be resolved now.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				if (m != VM_PAGE_NULL)
					VM_PAGE_FREE(m);
				/*
				 * take an extra ref so object won't die
				 */
				vm_object_reference_locked(object);

				vm_fault_cleanup(object, first_m);

				counter(c_vm_fault_page_block_backoff_kernel++);

				vm_object_lock(object);
				assert(object->ref_count > 0);

				if (object->paging_in_progress > vm_object_pagein_throttle) {
					vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);

					vm_object_unlock(object);
					wait_result = thread_block(THREAD_CONTINUE_NULL);
					vm_object_deallocate(object);

					goto backoff;
				} else {
					vm_object_unlock(object);
					vm_object_deallocate(object);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_RETRY);
				}
			}
			if (m != VM_PAGE_NULL) {
				/*
				 * Indicate that the page is waiting for data
				 * from the memory manager.
				 */
				m->list_req_pending = TRUE;
				m->absent = TRUE;
			}

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif

			/*
			 * It's possible someone called vm_object_destroy while we weren't
			 * holding the object lock.  If that has happened, then bail out
			 * here.
			 */

			pager = object->pager;

			if (pager == MEMORY_OBJECT_NULL) {
				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);
				return VM_FAULT_MEMORY_ERROR;
			}

			/*
			 * We have an absent page in place for the faulting offset,
			 * so we can release the object lock.
			 */

			vm_object_unlock(object);

			/*
			 * If this object uses a copy_call strategy,
			 * and we are interested in a copy of this object
			 * (having gotten here only by following a
			 * shadow chain), then tell the memory manager
			 * via a flag added to the desired_access
			 * parameter, so that it can detect a race
			 * between our walking down the shadow chain
			 * and its pushing pages up into a copy of
			 * the object that it manages.
			 */
			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
				wants_copy_flag = VM_PROT_WANTS_COPY;
			else
				wants_copy_flag = VM_PROT_NONE;

			XPR(XPR_VM_FAULT,
			    "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
			    (integer_t)object, offset, (integer_t)m,
			    access_required | wants_copy_flag, 0);

			/*
			 * Call the memory manager to retrieve the data.
			 */
			rc = memory_object_data_request(
				pager,
				offset + object->paging_offset,
				PAGE_SIZE,
				access_required | wants_copy_flag,
				(memory_object_fault_info_t)fault_info);

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc);	/* (TEST/DEBUG) */
#endif
			vm_object_lock(object);

			if (rc != KERN_SUCCESS) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return ((rc == MACH_SEND_INTERRUPTED) ?
					VM_FAULT_INTERRUPTED :
					VM_FAULT_MEMORY_ERROR);
			}
			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_INTERRUPTED);
			}
			if (m == VM_PAGE_NULL && object->phys_contiguous) {
				/*
				 * No page here means that the object we
				 * initially looked up was "physically
				 * contiguous" (i.e. device memory).  However,
				 * with Virtual VRAM, the object might not
				 * be backed by that device memory anymore,
				 * so we're done here only if the object is
				 * still "phys_contiguous".
				 * Otherwise, if the object is no longer
				 * "phys_contiguous", we need to retry the
				 * page fault against the object's new backing
				 * store (different memory object).
				 */
				break;
			}
			/*
			 * potentially a pagein fault
			 * if we make it through the state checks
			 * above, then we'll count it as such
			 */
			my_fault = DBG_PAGEIN_FAULT;

			/*
			 * Retry with same object/offset, since new data may
			 * be in a different page (i.e., m is meaningless at
			 * this point).
			 */
			continue;
		}

		/*
		 * We get here if the object has no pager, or an existence map
		 * exists and indicates the page isn't present on the pager
		 * or we're unwiring a page.  If a pager exists, but there
		 * is no existence map, then the m->absent case above handles
		 * the ZF case when the pager can't provide the page
		 */
#if TRACEFAULTPAGE
		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
#endif
		if (object == first_object)
			first_m = m;
		else
			assert(m == VM_PAGE_NULL);

		XPR(XPR_VM_FAULT,
		    "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
		    (integer_t)object, offset, (integer_t)m,
		    (integer_t)object->shadow, 0);

		next_object = object->shadow;

		if (next_object == VM_OBJECT_NULL) {
			/*
			 * we've hit the bottom of the shadow chain,
			 * fill the page in the top object with zeros.
			 */
			assert(!must_be_resident);

			if (object != first_object) {
				vm_object_paging_end(object);
				vm_object_unlock(object);

				object = first_object;
				offset = first_offset;
				vm_object_lock(object);
			}
			m = first_m;
			assert(m->object == object);
			first_m = VM_PAGE_NULL;

			/*
			 * check for any conditions that prevent
			 * us from creating a new zero-fill page
			 * vm_fault_check will do all of the
			 * fault cleanup in the case of an error condition
			 * including resetting the thread_interrupt_level
			 */
			error = vm_fault_check(object, m, first_m, interruptible_state);

			if (error != VM_FAULT_SUCCESS)
				return (error);

			if (m == VM_PAGE_NULL) {
				m = vm_page_grab();

				if (m == VM_PAGE_NULL) {
					vm_fault_cleanup(object, VM_PAGE_NULL);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_MEMORY_SHORTAGE);
				}
				vm_page_insert(m, object, offset);
			}
			my_fault = vm_fault_zero_page(m, no_zero_fill);

			break;

		} else {
			/*
			 * Move on to the next object.  Lock the next
			 * object before unlocking the current one.
			 */
			if ((object != first_object) || must_be_resident)
				vm_object_paging_end(object);

			offset += object->shadow_offset;
			fault_info->lo_offset += object->shadow_offset;
			fault_info->hi_offset += object->shadow_offset;
			access_required = VM_PROT_READ;

			vm_object_lock(next_object);
			vm_object_unlock(object);

			object = next_object;
			vm_object_paging_begin(object);
		}
	}
1466 /*
1467 * PAGE HAS BEEN FOUND.
1468 *
1469 * This page (m) is:
1470 * busy, so that we can play with it;
1471 * not absent, so that nobody else will fill it;
1472 * possibly eligible for pageout;
1473 *
1474 * The top-level page (first_m) is:
1475 * VM_PAGE_NULL if the page was found in the
1476 * top-level object;
1477 * busy, not absent, and ineligible for pageout.
1478 *
1479 * The current object (object) is locked. A paging
1480 * reference is held for the current and top-level
1481 * objects.
1482 */
1483
1484#if TRACEFAULTPAGE
1485 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1486#endif
1487#if EXTRA_ASSERTIONS
2d21ac55 1488 if (m != VM_PAGE_NULL) {
0b4e3aa0
A
1489 assert(m->busy && !m->absent);
1490 assert((first_m == VM_PAGE_NULL) ||
1491 (first_m->busy && !first_m->absent &&
1492 !first_m->active && !first_m->inactive));
1493 }
1c79356b
A
1494#endif /* EXTRA_ASSERTIONS */
1495
91447636
A
1496 /*
1497 * ENCRYPTED SWAP:
1498 * If we found a page, we must have decrypted it before we
1499 * get here...
1500 */
1501 if (m != VM_PAGE_NULL) {
1502 ASSERT_PAGE_DECRYPTED(m);
1503 }
1504
1c79356b 1505 XPR(XPR_VM_FAULT,
2d21ac55 1506 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1c79356b
A
1507 (integer_t)object, offset, (integer_t)m,
1508 (integer_t)first_object, (integer_t)first_m);
2d21ac55 1509
1c79356b 1510 /*
2d21ac55
A
1511 * If the page is being written, but isn't
1512 * already owned by the top-level object,
1513 * we have to copy it into a new page owned
1514 * by the top-level object.
1c79356b 1515 */
0b4e3aa0 1516 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1c79356b
A
1517
1518#if TRACEFAULTPAGE
2d21ac55 1519 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1c79356b
A
1520#endif
1521 if (fault_type & VM_PROT_WRITE) {
1522 vm_page_t copy_m;
1523
2d21ac55
A
1524 /*
1525 * We only really need to copy if we
1526 * want to write it.
1527 */
1c79356b
A
1528 assert(!must_be_resident);
1529
55e303ae
A
1530 /*
1531 * are we protecting the system from
1532 * backing store exhaustion. If so
1533 * sleep unless we are privileged.
1534 */
2d21ac55
A
1535 if (vm_backing_store_low) {
1536 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
55e303ae 1537
55e303ae
A
1538 RELEASE_PAGE(m);
1539 vm_fault_cleanup(object, first_m);
2d21ac55
A
1540
1541 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1542
91447636 1543 thread_block(THREAD_CONTINUE_NULL);
2d21ac55
A
1544 thread_interrupt_level(interruptible_state);
1545
1546 return (VM_FAULT_RETRY);
55e303ae
A
1547 }
1548 }
1c79356b 1549 /*
2d21ac55
A
1550 * If we try to collapse first_object at this
1551 * point, we may deadlock when we try to get
1552 * the lock on an intermediate object (since we
1553 * have the bottom object locked). We can't
1554 * unlock the bottom object, because the page
1555 * we found may move (by collapse) if we do.
1c79356b 1556 *
2d21ac55
A
1557 * Instead, we first copy the page. Then, when
1558 * we have no more use for the bottom object,
1559 * we unlock it and try to collapse.
1c79356b 1560 *
2d21ac55
A
1561 * Note that we copy the page even if we didn't
1562 * need to... that's the breaks.
1c79356b
A
1563 */
1564
1565 /*
2d21ac55 1566 * Allocate a page for the copy
1c79356b
A
1567 */
1568 copy_m = vm_page_grab();
2d21ac55 1569
1c79356b
A
1570 if (copy_m == VM_PAGE_NULL) {
1571 RELEASE_PAGE(m);
2d21ac55 1572
1c79356b 1573 vm_fault_cleanup(object, first_m);
9bccf70c 1574 thread_interrupt_level(interruptible_state);
1c79356b 1575
2d21ac55
A
1576 return (VM_FAULT_MEMORY_SHORTAGE);
1577 }
1c79356b
A
1578 XPR(XPR_VM_FAULT,
1579 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1580 (integer_t)object, offset,
1581 (integer_t)m, (integer_t)copy_m, 0);
2d21ac55 1582
1c79356b
A
1583 vm_page_copy(m, copy_m);
1584
1585 /*
2d21ac55
A
1586 * If another map is truly sharing this
1587 * page with us, we have to flush all
1588 * uses of the original page, since we
1589 * can't distinguish those which want the
1590 * original from those which need the
1591 * new copy.
1c79356b 1592 *
2d21ac55
A
1593 * XXXO If we know that only one map has
1594 * access to this page, then we could
1595 * avoid the pmap_disconnect() call.
1c79356b 1596 */
2d21ac55
A
1597 if (m->pmapped)
1598 pmap_disconnect(m->phys_page);
1c79356b 1599
1c79356b 1600 assert(!m->cleaning);
1c79356b
A
1601
1602 /*
2d21ac55 1603 * We no longer need the old page or object.
1c79356b 1604 */
1c79356b
A
1605 PAGE_WAKEUP_DONE(m);
1606 vm_object_paging_end(object);
1607 vm_object_unlock(object);
1608
2d21ac55
A
1609 my_fault = DBG_COW_FAULT;
1610 VM_STAT_INCR(cow_faults);
1611 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1c79356b 1612 current_task()->cow_faults++;
2d21ac55 1613
1c79356b
A
1614 object = first_object;
1615 offset = first_offset;
1616
1617 vm_object_lock(object);
2d21ac55
A
1618 /*
1619 * get rid of the place holder
1620 * page that we soldered in earlier
1621 */
1c79356b
A
1622 VM_PAGE_FREE(first_m);
1623 first_m = VM_PAGE_NULL;
2d21ac55
A
1624
1625 /*
1626 * and replace it with the
1627 * page we just copied into
1628 */
1c79356b
A
1629 assert(copy_m->busy);
1630 vm_page_insert(copy_m, object, offset);
2d21ac55 1631 copy_m->dirty = TRUE;
1c79356b 1632
2d21ac55 1633 m = copy_m;
1c79356b 1634 /*
2d21ac55
A
1635 * Now that we've gotten the copy out of the
1636 * way, let's try to collapse the top object.
1637 * But we have to play ugly games with
1638 * paging_in_progress to do that...
1c79356b 1639 */
1c79356b 1640 vm_object_paging_end(object);
0c530ab8 1641 vm_object_collapse(object, offset, TRUE);
1c79356b
A
1642 vm_object_paging_begin(object);
1643
2d21ac55 1644 } else
1c79356b 1645 *protection &= (~VM_PROT_WRITE);
1c79356b 1646 }
1c79356b 1647 /*
2d21ac55
A
1648 * Now check whether the page needs to be pushed into the
1649 * copy object. The use of asymmetric copy on write for
1650 * shared temporary objects means that we may do two copies to
1651 * satisfy the fault; one above to get the page from a
1652 * shadowed object, and one here to push it into the copy.
1c79356b 1653 */
2d21ac55 1654 try_failed_count = 0;
1c79356b 1655
2d21ac55 1656 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1c79356b
A
1657 vm_object_offset_t copy_offset;
1658 vm_page_t copy_m;
1659
1660#if TRACEFAULTPAGE
1661 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1662#endif
1663 /*
2d21ac55
A
1664 * If the page is being written, but hasn't been
1665 * copied to the copy-object, we have to copy it there.
1c79356b 1666 */
1c79356b
A
1667 if ((fault_type & VM_PROT_WRITE) == 0) {
1668 *protection &= ~VM_PROT_WRITE;
1669 break;
1670 }
1671
1672 /*
2d21ac55
A
1673 * If the page was guaranteed to be resident,
1674 * we must have already performed the copy.
1c79356b 1675 */
1c79356b
A
1676 if (must_be_resident)
1677 break;
1678
1679 /*
2d21ac55 1680 * Try to get the lock on the copy_object.
1c79356b
A
1681 */
1682 if (!vm_object_lock_try(copy_object)) {
1c79356b 1683
2d21ac55
A
1684 vm_object_unlock(object);
1685 try_failed_count++;
1c79356b 1686
2d21ac55 1687 mutex_pause(try_failed_count); /* wait a bit */
1c79356b 1688 vm_object_lock(object);
2d21ac55 1689
1c79356b
A
1690 continue;
1691 }
2d21ac55 1692 try_failed_count = 0;
1c79356b
A
1693
1694 /*
2d21ac55
A
1695 * Make another reference to the copy-object,
1696 * to keep it from disappearing during the
1697 * copy.
1c79356b 1698 */
2d21ac55 1699 vm_object_reference_locked(copy_object);
1c79356b
A
1700
1701 /*
2d21ac55 1702 * Does the page exist in the copy?
1c79356b
A
1703 */
1704 copy_offset = first_offset - copy_object->shadow_offset;
2d21ac55 1705
1c79356b
A
1706 if (copy_object->size <= copy_offset)
1707 /*
1708 * Copy object doesn't cover this page -- do nothing.
1709 */
1710 ;
2d21ac55
A
1711 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1712 /*
1713 * Page currently exists in the copy object
1714 */
1c79356b
A
1715 if (copy_m->busy) {
1716 /*
2d21ac55
A
1717 * If the page is being brought
1718 * in, wait for it and then retry.
1c79356b
A
1719 */
1720 RELEASE_PAGE(m);
2d21ac55
A
1721
1722 /*
1723 * take an extra ref so object won't die
1724 */
1725 vm_object_reference_locked(copy_object);
1c79356b
A
1726 vm_object_unlock(copy_object);
1727 vm_fault_cleanup(object, first_m);
1728 counter(c_vm_fault_page_block_backoff_kernel++);
2d21ac55 1729
1c79356b
A
1730 vm_object_lock(copy_object);
1731 assert(copy_object->ref_count > 0);
1732 VM_OBJ_RES_DECR(copy_object);
2d21ac55 1733 vm_object_lock_assert_exclusive(copy_object);
1c79356b
A
1734 copy_object->ref_count--;
1735 assert(copy_object->ref_count > 0);
1736 copy_m = vm_page_lookup(copy_object, copy_offset);
91447636
A
1737 /*
1738 * ENCRYPTED SWAP:
1739 * it's OK if the "copy_m" page is encrypted,
1740 * because we're not moving it nor handling its
1741 * contents.
1742 */
1c79356b
A
1743 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1744 PAGE_ASSERT_WAIT(copy_m, interruptible);
2d21ac55 1745
1c79356b 1746 vm_object_unlock(copy_object);
9bccf70c 1747 wait_result = thread_block(THREAD_CONTINUE_NULL);
1c79356b 1748 vm_object_deallocate(copy_object);
2d21ac55 1749
1c79356b
A
1750 goto backoff;
1751 } else {
1752 vm_object_unlock(copy_object);
1753 vm_object_deallocate(copy_object);
9bccf70c 1754 thread_interrupt_level(interruptible_state);
2d21ac55
A
1755
1756 return (VM_FAULT_RETRY);
1c79356b
A
1757 }
1758 }
1759 }
1760 else if (!PAGED_OUT(copy_object, copy_offset)) {
1761 /*
1762 * If PAGED_OUT is TRUE, then the page used to exist
1763 * in the copy-object, and has already been paged out.
1764 * We don't need to repeat this. If PAGED_OUT is
1765 * FALSE, then either we don't know (!pager_created,
1766 * for example) or it hasn't been paged out.
1767 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1768 * We must copy the page to the copy object.
1769 */
1770
2d21ac55
A
1771 if (vm_backing_store_low) {
1772 /*
1773 * we are protecting the system from
 1774 * backing store exhaustion; if so,
 1775 * sleep unless we are privileged.
1776 */
1777 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1778 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
55e303ae 1779
55e303ae
A
1780 RELEASE_PAGE(m);
1781 VM_OBJ_RES_DECR(copy_object);
2d21ac55 1782 vm_object_lock_assert_exclusive(copy_object);
55e303ae
A
1783 copy_object->ref_count--;
1784 assert(copy_object->ref_count > 0);
2d21ac55 1785
55e303ae
A
1786 vm_object_unlock(copy_object);
1787 vm_fault_cleanup(object, first_m);
91447636 1788 thread_block(THREAD_CONTINUE_NULL);
2d21ac55
A
1789 thread_interrupt_level(interruptible_state);
1790
1791 return (VM_FAULT_RETRY);
55e303ae
A
1792 }
1793 }
1c79356b 1794 /*
2d21ac55 1795 * Allocate a page for the copy
1796 */
1797 copy_m = vm_page_alloc(copy_object, copy_offset);
2d21ac55 1798
1c79356b
A
1799 if (copy_m == VM_PAGE_NULL) {
1800 RELEASE_PAGE(m);
2d21ac55 1801
1c79356b 1802 VM_OBJ_RES_DECR(copy_object);
2d21ac55 1803 vm_object_lock_assert_exclusive(copy_object);
1c79356b
A
1804 copy_object->ref_count--;
1805 assert(copy_object->ref_count > 0);
2d21ac55 1806
1c79356b
A
1807 vm_object_unlock(copy_object);
1808 vm_fault_cleanup(object, first_m);
9bccf70c 1809 thread_interrupt_level(interruptible_state);
1c79356b 1810
2d21ac55
A
1811 return (VM_FAULT_MEMORY_SHORTAGE);
1812 }
1c79356b 1813 /*
2d21ac55 1814 * Must copy page into copy-object.
1c79356b 1815 */
1c79356b
A
1816 vm_page_copy(m, copy_m);
1817
1818 /*
1819 * If the old page was in use by any users
1820 * of the copy-object, it must be removed
1821 * from all pmaps. (We can't know which
1822 * pmaps use it.)
1c79356b 1823 */
2d21ac55
A
1824 if (m->pmapped)
1825 pmap_disconnect(m->phys_page);
1c79356b
A
1826
1827 /*
1828 * If there's a pager, then immediately
1829 * page out this page, using the "initialize"
1830 * option. Else, we use the copy.
1c79356b 1831 */
2d21ac55
A
1832 if ((!copy_object->pager_created)
1833#if MACH_PAGEMAP
1834 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1c79356b 1835#endif
2d21ac55
A
1836 ) {
1837
1838 vm_page_lockspin_queues();
1839 assert(!m->cleaning);
1c79356b
A
1840 vm_page_activate(copy_m);
1841 vm_page_unlock_queues();
2d21ac55
A
1842
1843 copy_m->dirty = TRUE;
1c79356b
A
1844 PAGE_WAKEUP_DONE(copy_m);
1845 }
1846 else {
1847 assert(copy_m->busy == TRUE);
2d21ac55 1848 assert(!m->cleaning);
1c79356b
A
1849
1850 /*
2d21ac55 1851 * dirty is protected by the object lock
1c79356b 1852 */
2d21ac55 1853 copy_m->dirty = TRUE;
1c79356b 1854
2d21ac55
A
1855 /*
1856 * The page is already ready for pageout:
1857 * not on pageout queues and busy.
1858 * Unlock everything except the
1859 * copy_object itself.
1860 */
1c79356b
A
1861 vm_object_unlock(object);
1862
1863 /*
1864 * Write the page to the copy-object,
1865 * flushing it from the kernel.
1c79356b 1866 */
1c79356b
A
1867 vm_pageout_initialize_page(copy_m);
1868
1869 /*
1870 * Since the pageout may have
1871 * temporarily dropped the
1872 * copy_object's lock, we
1873 * check whether we'll have
1874 * to deallocate the hard way.
1c79356b 1875 */
2d21ac55 1876 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1c79356b
A
1877 vm_object_unlock(copy_object);
1878 vm_object_deallocate(copy_object);
1879 vm_object_lock(object);
2d21ac55 1880
1c79356b
A
1881 continue;
1882 }
1c79356b 1883 /*
1884 * Pick back up the old object's
1885 * lock. [It is safe to do so,
1886 * since it must be deeper in the
1887 * object tree.]
1c79356b 1888 */
1c79356b
A
1889 vm_object_lock(object);
1890 }
1c79356b 1891 /*
1892 * Because we're pushing a page upward
1893 * in the object tree, we must restart
1894 * any faults that are waiting here.
1895 * [Note that this is an expansion of
1896 * PAGE_WAKEUP that uses the THREAD_RESTART
1897 * wait result]. Can't turn off the page's
1898 * busy bit because we're not done with it.
1c79356b 1899 */
1c79356b
A
1900 if (m->wanted) {
1901 m->wanted = FALSE;
2d21ac55 1902 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1c79356b
A
1903 }
1904 }
1c79356b 1905 /*
1906 * The reference count on copy_object must be
1907 * at least 2: one for our extra reference,
1908 * and at least one from the outside world
1909 * (we checked that when we last locked
1910 * copy_object).
1c79356b 1911 */
2d21ac55 1912 vm_object_lock_assert_exclusive(copy_object);
1c79356b
A
1913 copy_object->ref_count--;
1914 assert(copy_object->ref_count > 0);
2d21ac55 1915
1c79356b
A
1916 VM_OBJ_RES_DECR(copy_object);
1917 vm_object_unlock(copy_object);
1918
1919 break;
1920 }
1c79356b
A
1921 *result_page = m;
1922 *top_page = first_m;
1923
1924 XPR(XPR_VM_FAULT,
1925 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1926 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1c79356b 1927
2d21ac55
A
1928 if (m != VM_PAGE_NULL) {
1929 if (my_fault == DBG_PAGEIN_FAULT) {
55e303ae 1930
2d21ac55
A
1931 VM_STAT_INCR(pageins);
1932 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1933 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1934 current_task()->pageins++;
1935
1936 if (m->object->internal) {
1937 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1938 } else {
1939 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1940 }
1941
1942 /*
1943 * evaluate access pattern and update state
1944 * vm_fault_deactivate_behind depends on the
1945 * state being up to date
1946 */
1947 vm_fault_is_sequential(object, offset, fault_info->behavior);
1948
1949 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1950 }
1951 if (type_of_fault)
1952 *type_of_fault = my_fault;
1953 } else
55e303ae 1954 vm_object_unlock(object);
2d21ac55 1955
55e303ae
A
1956 thread_interrupt_level(interruptible_state);
1957
1c79356b
A
1958#if TRACEFAULTPAGE
1959 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1960#endif
2d21ac55 1961 return (VM_FAULT_SUCCESS);
1c79356b 1962
2d21ac55 1963backoff:
9bccf70c 1964 thread_interrupt_level(interruptible_state);
2d21ac55 1965
1c79356b 1966 if (wait_result == THREAD_INTERRUPTED)
2d21ac55
A
1967 return (VM_FAULT_INTERRUPTED);
1968 return (VM_FAULT_RETRY);
1c79356b
A
1969
1970#undef RELEASE_PAGE
1971}
1972
2d21ac55
A
1973
1974
593a1d5f
A
1975/*
1976 * CODE SIGNING:
1977 * When soft faulting a page, we have to validate the page if:
1978 * 1. the page is being mapped in user space
1979 * 2. the page hasn't already been found to be "tainted"
1980 * 3. the page belongs to a code-signed object
1981 * 4. the page has not been validated yet or has been mapped for write.
1982 */
1983#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
1984 ((pmap) != kernel_pmap /*1*/ && \
1985 !(page)->cs_tainted /*2*/ && \
1986 (page)->object->code_signed /*3*/ && \
1987 (!(page)->cs_validated || (page)->wpmapped /*4*/))
1988
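/*
 * For example: a page of a code-signed object that was mapped writable at
 * some point (wpmapped) and is now being entered into a user pmap gets
 * (re)validated, while kernel_pmap entries, pages already marked
 * cs_tainted, and already-validated pages that were never writably mapped
 * are skipped by the check above.
 */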
1989
55e303ae 1990/*
1991 * page queue lock must NOT be held
1992 * m->object must be locked
1993 *
1994 * NOTE: m->object could be locked "shared" only if we are called
1995 * from vm_fault() as part of a soft fault. If so, we must be
1996 * careful not to modify the VM object in any way that is not
1997 * legal under a shared lock...
55e303ae 1998 */
2d21ac55
A
1999unsigned long cs_enter_tainted_rejected = 0;
2000unsigned long cs_enter_tainted_accepted = 0;
2001kern_return_t
2002vm_fault_enter(vm_page_t m,
2003 pmap_t pmap,
2004 vm_map_offset_t vaddr,
2005 vm_prot_t prot,
2006 boolean_t wired,
2007 boolean_t change_wiring,
2008 boolean_t no_cache,
2009 int *type_of_fault)
55e303ae 2010{
2d21ac55 2011 unsigned int cache_attr;
55e303ae 2012 kern_return_t kr;
2d21ac55
A
2013 boolean_t previously_pmapped = m->pmapped;
2014
2015 vm_object_lock_assert_held(m->object);
2016#if DEBUG
2017 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
2018#endif /* DEBUG */
2019
2020 if (m->phys_page == vm_page_guard_addr) {
2021 assert(m->fictitious);
2022 return KERN_SUCCESS;
2023 }
2024
2025 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2026
2d21ac55
A
2027 if (m->pmapped == FALSE) {
2028 /*
2029 * This is the first time this page is being
2030 * mapped in an address space (pmapped == FALSE).
2031 *
2032 * Part of that page may still be in the data cache
2033 * and not flushed to memory. In case we end up
2034 * accessing that page via the instruction cache,
2035 * we need to ensure that the 2 caches are in sync.
2036 */
2037 pmap_sync_page_data_phys(m->phys_page);
2038
2039 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2040 /*
2041 * found it in the cache, but this
2042 * is the first fault-in of the page (m->pmapped == FALSE)
2043 * so it must have come in as part of
2044 * a cluster... account 1 pagein against it
2045 */
2046 VM_STAT_INCR(pageins);
2047 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2048
2049 if (m->object->internal) {
2050 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2051 } else {
2052 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
55e303ae 2053 }
2d21ac55
A
2054
2055 current_task()->pageins++;
2056
2057 *type_of_fault = DBG_PAGEIN_FAULT;
2058 }
2059 VM_PAGE_CONSUME_CLUSTERED(m);
2060
2061 } else if (cache_attr != VM_WIMG_DEFAULT)
2062 pmap_sync_page_attributes_phys(m->phys_page);
2063
2064 if (*type_of_fault != DBG_COW_FAULT) {
2065 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2066
2067 if (pmap == kernel_pmap) {
2068 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2069 }
2070 }
2071
593a1d5f
A
2072 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2073 vm_object_lock_assert_exclusive(m->object);
2074
2075 if (m->cs_validated) {
2076 vm_cs_revalidates++;
2077 }
2078
2079 /* VM map is locked, so 1 ref will remain on VM object */
2080 vm_page_validate_cs(m);
2081 }
2082
2083 if (m->cs_tainted /* always invalidate a tainted page */
2084#if CONFIG_ENFORCE_SIGNED_CODE
2085 /*
2086 * Code Signing enforcement invalidates an executable page that
2087 * has no code directory, and thus could not be validated.
2088 */
2089 || ((prot & VM_PROT_EXECUTE) && !m->cs_validated )
2090#endif
2091 ) {
2d21ac55
A
2092 /*
2093 * CODE SIGNING:
2094 * This page has been tainted and can not be trusted.
2095 * Let's notify the current process and let it take any
2096 * necessary precautions before we enter the tainted page
2097 * into its address space.
2098 */
593a1d5f
A
2099 kr = KERN_SUCCESS;
2100#if CONFIG_ENFORCE_SIGNED_CODE
2101 if (!cs_enforcement_disable) {
2102#endif
2103 if (cs_invalid_page((addr64_t) vaddr)) {
2104 /* reject the tainted page: abort the page fault */
2105 kr = KERN_MEMORY_ERROR;
2106 cs_enter_tainted_rejected++;
2107 } else {
2108 /* proceed with the tainted page */
2109 kr = KERN_SUCCESS;
2110 cs_enter_tainted_accepted++;
2111 }
2112#if CONFIG_ENFORCE_SIGNED_CODE
2d21ac55 2113 }
593a1d5f 2114#endif
2d21ac55
A
2115 if (cs_debug || kr != KERN_SUCCESS) {
2116 printf("CODESIGNING: vm_fault_enter(0x%llx): "
593a1d5f 2117 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2d21ac55
A
2118 (long long)vaddr, m, m->object, m->offset);
2119 }
2120 } else {
2121 /* proceed with the valid page */
2122 kr = KERN_SUCCESS;
2123 }
2124
2125 if (kr == KERN_SUCCESS) {
2126 /*
2127 * NOTE: we may only hold the vm_object lock SHARED
2128 * at this point, but the update of pmapped is ok
2129 * since this is the ONLY bit updated behind the SHARED
2130 * lock... however, we need to figure out how to do an atomic
2131 * update on a bit field to make this less fragile... right
593a1d5f 2132 * now I don't know how to coerce 'C' to give me the offset info
2133 * that's needed for an AtomicCompareAndSwap
2134 */
2135 m->pmapped = TRUE;
4a3eedf9
A
2136 if (prot & VM_PROT_WRITE) {
2137 vm_object_lock_assert_exclusive(m->object);
2138 m->wpmapped = TRUE;
2139 }
2d21ac55
A
2140
2141 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2142 }
2143
2144 /*
2145 * Hold queues lock to manipulate
2146 * the page queues. Change wiring
2147 * case is obvious.
2148 */
2149 if (change_wiring) {
2150 vm_page_lockspin_queues();
2151
2152 if (wired) {
2153 if (kr == KERN_SUCCESS) {
2154 vm_page_wire(m);
55e303ae 2155 }
2d21ac55
A
2156 } else {
2157 vm_page_unwire(m);
2158 }
2159 vm_page_unlock_queues();
2160
2161 } else {
2162 if (kr != KERN_SUCCESS) {
2163 vm_page_lock_queues();
2164 vm_page_deactivate(m);
2165 vm_page_unlock_queues();
2166 } else {
2167 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2168 vm_page_lockspin_queues();
2169 /*
2170 * test again now that we hold the page queue lock
2171 */
2172 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2173
2174 /*
2175 * If this is a no_cache mapping and the page has never been
2176 * mapped before or was previously a no_cache page, then we
2177 * want to leave pages in the speculative state so that they
2178 * can be readily recycled if free memory runs low. Otherwise
2179 * the page is activated as normal.
2180 */
2181
2182 if (no_cache && (!previously_pmapped || m->no_cache)) {
2183 m->no_cache = TRUE;
2184
2185 if (m->active || m->inactive)
2186 VM_PAGE_QUEUES_REMOVE(m);
2187
2188 if (!m->speculative)
2189 vm_page_speculate(m, TRUE);
2190
2191 } else if (!m->active && !m->inactive)
2192 vm_page_activate(m);
2193
2194 }
2195
2196 vm_page_unlock_queues();
55e303ae 2197 }
55e303ae
A
2198 }
2199 }
2d21ac55 2200 return kr;
55e303ae
A
2201}
2202
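/*
 * Typical call pattern from the fault paths in vm_fault() below, shown
 * here only as a rough sketch (mirroring the real call sites):
 *
 *	kr = vm_fault_enter(m, pmap, vaddr, prot,
 *			    wired, change_wiring,
 *			    fault_info.no_cache, &type_of_fault);
 *	if (kr != KERN_SUCCESS) {
 *		... back out the page and abort or retry the fault ...
 *	}
 */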
2d21ac55 2203
1c79356b
A
2204/*
2205 * Routine: vm_fault
2206 * Purpose:
2207 * Handle page faults, including pseudo-faults
2208 * used to change the wiring status of pages.
2209 * Returns:
2210 * Explicit continuations have been removed.
2211 * Implementation:
2212 * vm_fault and vm_fault_page save mucho state
2213 * in the moral equivalent of a closure. The state
2214 * structure is allocated when first entering vm_fault
2215 * and deallocated when leaving vm_fault.
2216 */
2217
91447636
A
2218extern int _map_enter_debug;
2219
2d21ac55
A
2220unsigned long vm_fault_collapse_total = 0;
2221unsigned long vm_fault_collapse_skipped = 0;
2222
1c79356b
A
2223kern_return_t
2224vm_fault(
2225 vm_map_t map,
91447636 2226 vm_map_offset_t vaddr,
1c79356b
A
2227 vm_prot_t fault_type,
2228 boolean_t change_wiring,
9bccf70c
A
2229 int interruptible,
2230 pmap_t caller_pmap,
91447636 2231 vm_map_offset_t caller_pmap_addr)
1c79356b
A
2232{
 2233 vm_map_version_t version; /* Map version for verification */
2234 boolean_t wired; /* Should mapping be wired down? */
2235 vm_object_t object; /* Top-level object */
2236 vm_object_offset_t offset; /* Top-level offset */
2237 vm_prot_t prot; /* Protection for mapping */
1c79356b
A
2238 vm_object_t old_copy_object; /* Saved copy object */
2239 vm_page_t result_page; /* Result of vm_fault_page */
2240 vm_page_t top_page; /* Placeholder page */
2241 kern_return_t kr;
2242
1c79356b 2243 vm_page_t m; /* Fast access to result_page */
2d21ac55 2244 kern_return_t error_code;
1c79356b 2245 vm_object_t cur_object;
1c79356b
A
2246 vm_object_offset_t cur_offset;
2247 vm_page_t cur_m;
2248 vm_object_t new_object;
2249 int type_of_fault;
2d21ac55
A
2250 pmap_t pmap;
2251 boolean_t interruptible_state;
91447636 2252 vm_map_t real_map = map;
1c79356b 2253 vm_map_t original_map = map;
0c530ab8 2254 vm_prot_t original_fault_type;
2d21ac55
A
2255 struct vm_object_fault_info fault_info;
2256 boolean_t need_collapse = FALSE;
2257 int object_lock_type = 0;
2258 int cur_object_lock_type;
c910b4d9 2259 vm_object_t top_object = VM_OBJECT_NULL;
1c79356b 2260
de355530 2261
2d21ac55
A
2262 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2263 (int)((uint64_t)vaddr >> 32),
2264 (int)vaddr,
1c79356b
A
2265 0,
2266 0,
2267 0);
2268
0c530ab8 2269 if (get_preemption_level() != 0) {
2d21ac55
A
2270 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2271 (int)((uint64_t)vaddr >> 32),
2272 (int)vaddr,
0c530ab8
A
2273 KERN_FAILURE,
2274 0,
2275 0);
2276
2277 return (KERN_FAILURE);
9bccf70c 2278 }
9bccf70c 2279 interruptible_state = thread_interrupt_level(interruptible);
1c79356b 2280
2d21ac55
A
2281 VM_STAT_INCR(faults);
2282 current_task()->faults++;
2283 original_fault_type = fault_type;
2284
2285 if (fault_type & VM_PROT_WRITE)
2286 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2287 else
2288 object_lock_type = OBJECT_LOCK_SHARED;
2289
2290 cur_object_lock_type = OBJECT_LOCK_SHARED;
2291
2292RetryFault:
1c79356b
A
2293 /*
2294 * assume we will hit a page in the cache
2295 * otherwise, explicitly override with
2296 * the real fault type once we determine it
2297 */
2298 type_of_fault = DBG_CACHE_HIT_FAULT;
2299
1c79356b
A
2300 /*
2301 * Find the backing store object and offset into
2302 * it to begin the search.
2303 */
0c530ab8 2304 fault_type = original_fault_type;
1c79356b
A
2305 map = original_map;
2306 vm_map_lock_read(map);
1c79356b 2307
2d21ac55
A
2308 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2309 object_lock_type, &version,
2310 &object, &offset, &prot, &wired,
2311 &fault_info,
2312 &real_map);
1c79356b
A
2313
2314 if (kr != KERN_SUCCESS) {
2315 vm_map_unlock_read(map);
2316 goto done;
2317 }
2d21ac55
A
2318 pmap = real_map->pmap;
2319 fault_info.interruptible = interruptible;
1c79356b
A
2320
2321 /*
2322 * If the page is wired, we must fault for the current protection
2323 * value, to avoid further faults.
1c79356b 2324 */
2d21ac55 2325 if (wired) {
1c79356b 2326 fault_type = prot | VM_PROT_WRITE;
2d21ac55
A
2327 /*
2328 * since we're treating this fault as a 'write'
2329 * we must hold the top object lock exclusively
2330 */
2331 if (object_lock_type == OBJECT_LOCK_SHARED) {
2332
2333 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2334
2335 if (vm_object_lock_upgrade(object) == FALSE) {
2336 /*
 2337 * couldn't upgrade, so explicitly
2338 * take the lock exclusively
2339 */
2340 vm_object_lock(object);
2341 }
2342 }
2343 }
1c79356b
A
2344
2345#if VM_FAULT_CLASSIFY
2346 /*
2347 * Temporary data gathering code
2348 */
2349 vm_fault_classify(object, offset, fault_type);
2350#endif
2351 /*
2352 * Fast fault code. The basic idea is to do as much as
2353 * possible while holding the map lock and object locks.
2354 * Busy pages are not used until the object lock has to
2355 * be dropped to do something (copy, zero fill, pmap enter).
2356 * Similarly, paging references aren't acquired until that
2357 * point, and object references aren't used.
2358 *
2359 * If we can figure out what to do
2360 * (zero fill, copy on write, pmap enter) while holding
2361 * the locks, then it gets done. Otherwise, we give up,
2362 * and use the original fault path (which doesn't hold
2363 * the map lock, and relies on busy pages).
2364 * The give up cases include:
2365 * - Have to talk to pager.
2366 * - Page is busy, absent or in error.
2367 * - Pager has locked out desired access.
2368 * - Fault needs to be restarted.
2369 * - Have to push page into copy object.
2370 *
2371 * The code is an infinite loop that moves one level down
2372 * the shadow chain each time. cur_object and cur_offset
2373 * refer to the current object being examined. object and offset
2374 * are the original object from the map. The loop is at the
2375 * top level if and only if object and cur_object are the same.
2376 *
2377 * Invariants: Map lock is held throughout. Lock is held on
2378 * original object and cur_object (if different) when
2379 * continuing or exiting loop.
2380 *
2381 */
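	/*
	 * In outline, the loop below (a sketch only, not a separate path):
	 *
	 *	cur_object = object; cur_offset = offset;
	 *	for (;;) {
	 *		m = vm_page_lookup(cur_object, cur_offset);
	 *		if (m != VM_PAGE_NULL)
	 *			wait/decrypt/validate as needed, then either
	 *			pmap-enter it, do the in-line COW copy, or
	 *			break to the slow path;
	 *		else if (no pager data here and no shadow)
	 *			zero-fill into the top object and pmap-enter;
	 *		else
	 *			descend: cur_offset += shadow_offset,
	 *			cur_object = cur_object->shadow;
	 *	}
	 */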
2382
2383
2384 /*
2d21ac55
A
2385 * If this page is to be inserted in a copy delay object
2386 * for writing, and if the object has a copy, then the
2387 * copy delay strategy is implemented in the slow fault page.
1c79356b 2388 */
2d21ac55
A
2389 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2390 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2391 goto handle_copy_delay;
2392
1c79356b
A
2393 cur_object = object;
2394 cur_offset = offset;
2395
2396 while (TRUE) {
2397 m = vm_page_lookup(cur_object, cur_offset);
2d21ac55 2398
1c79356b 2399 if (m != VM_PAGE_NULL) {
55e303ae 2400 if (m->busy) {
143cc14e
A
2401 wait_result_t result;
2402
2d21ac55
A
2403 /*
2404 * in order to do the PAGE_ASSERT_WAIT, we must
 2405 * have the object that 'm' belongs to locked exclusively
2406 */
2407 if (object != cur_object) {
143cc14e
A
2408 vm_object_unlock(object);
2409
2d21ac55
A
2410 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2411
2412 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2413
2414 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2415 /*
2416 * couldn't upgrade so go do a full retry
2417 * immediately since we've already dropped
2418 * the top object lock associated with this page
2419 * and the current one got dropped due to the
2420 * failed upgrade... the state is no longer valid
2421 */
2422 vm_map_unlock_read(map);
2423 if (real_map != map)
2424 vm_map_unlock(real_map);
2425
2426 goto RetryFault;
2427 }
2428 }
2429 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2430
2431 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2432
2433 if (vm_object_lock_upgrade(object) == FALSE) {
2434 /*
 2435 * couldn't upgrade, so explicitly take the lock
2436 * exclusively and go relookup the page since we
2437 * will have dropped the object lock and
2438 * a different thread could have inserted
2439 * a page at this offset
2440 * no need for a full retry since we're
2441 * at the top level of the object chain
2442 */
2443 vm_object_lock(object);
2444
2445 continue;
2446 }
2447 }
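				/*
				 * Note on the upgrade idiom used above (and at
				 * the similar sites later in this function): a
				 * failed vm_object_lock_upgrade() leaves the
				 * object unlocked, so for the top-level object
				 * we simply re-take the lock exclusively and
				 * re-check the page, whereas for a lower object
				 * we must restart the whole fault because both
				 * locks we depended on have been dropped.
				 */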
143cc14e 2448 vm_map_unlock_read(map);
91447636
A
2449 if (real_map != map)
2450 vm_map_unlock(real_map);
143cc14e 2451
143cc14e 2452 result = PAGE_ASSERT_WAIT(m, interruptible);
1c79356b 2453
143cc14e
A
2454 vm_object_unlock(cur_object);
2455
2456 if (result == THREAD_WAITING) {
2457 result = thread_block(THREAD_CONTINUE_NULL);
2458
2459 counter(c_vm_fault_page_block_busy_kernel++);
2460 }
2461 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2462 goto RetryFault;
2463
2464 kr = KERN_ABORTED;
2465 goto done;
2466 }
2d21ac55
A
2467 if (m->phys_page == vm_page_guard_addr) {
2468 /*
2469 * Guard page: let the slow path deal with it
2470 */
2471 break;
2472 }
2473 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
143cc14e 2474 /*
2d21ac55 2475 * Unusual case... let the slow path deal with it
2476 */
2477 break;
2478 }
91447636
A
2479 if (m->encrypted) {
2480 /*
2481 * ENCRYPTED SWAP:
2482 * We've soft-faulted (because it's not in the page
2483 * table) on an encrypted page.
2d21ac55 2484 * Keep the page "busy" so that no one messes with
2485 * it during the decryption.
2486 * Release the extra locks we're holding, keep only
2487 * the page's VM object lock.
2488 *
2489 * in order to set 'busy' on 'm', we must
 2490 * have the object that 'm' belongs to locked exclusively
91447636 2491 */
2d21ac55 2492 if (object != cur_object) {
91447636 2493 vm_object_unlock(object);
2d21ac55
A
2494
2495 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2496
2497 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2498
2499 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2500 /*
2501 * couldn't upgrade so go do a full retry
2502 * immediately since we've already dropped
2503 * the top object lock associated with this page
2504 * and the current one got dropped due to the
2505 * failed upgrade... the state is no longer valid
2506 */
2507 vm_map_unlock_read(map);
2508 if (real_map != map)
2509 vm_map_unlock(real_map);
2510
2511 goto RetryFault;
2512 }
2513 }
2514 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2515
2516 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2517
2518 if (vm_object_lock_upgrade(object) == FALSE) {
2519 /*
 2520 * couldn't upgrade, so explicitly take the lock
2521 * exclusively and go relookup the page since we
2522 * will have dropped the object lock and
2523 * a different thread could have inserted
2524 * a page at this offset
2525 * no need for a full retry since we're
2526 * at the top level of the object chain
2527 */
2528 vm_object_lock(object);
2529
2530 continue;
2531 }
91447636 2532 }
2d21ac55
A
2533 m->busy = TRUE;
2534
91447636
A
2535 vm_map_unlock_read(map);
2536 if (real_map != map)
2537 vm_map_unlock(real_map);
2538
2539 vm_page_decrypt(m, 0);
2540
2541 assert(m->busy);
2542 PAGE_WAKEUP_DONE(m);
91447636 2543
2d21ac55 2544 vm_object_unlock(cur_object);
91447636
A
2545 /*
2546 * Retry from the top, in case anything
2547 * changed while we were decrypting...
2548 */
2549 goto RetryFault;
2550 }
2551 ASSERT_PAGE_DECRYPTED(m);
2552
593a1d5f 2553 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
2d21ac55 2554 /*
4a3eedf9 2555 * We might need to validate this page
2556 * against its code signature, so we
2557 * want to hold the VM object exclusively.
2558 */
2559 if (object != cur_object) {
2560 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2561 vm_object_unlock(object);
2562 vm_object_unlock(cur_object);
2563
2564 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2565
2566 vm_map_unlock_read(map);
2567 if (real_map != map)
2568 vm_map_unlock(real_map);
2569
2570 goto RetryFault;
2571 }
2572
2573 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2574
2575 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2576
2577 if (vm_object_lock_upgrade(object) == FALSE) {
2578 /*
 2579 * couldn't upgrade, so explicitly take the lock
2580 * exclusively and go relookup the page since we
2581 * will have dropped the object lock and
2582 * a different thread could have inserted
2583 * a page at this offset
2584 * no need for a full retry since we're
2585 * at the top level of the object chain
2586 */
2587 vm_object_lock(object);
2588
2589 continue;
2590 }
2591 }
2592 }
1c79356b
A
2593 /*
2594 * Two cases of map in faults:
2595 * - At top level w/o copy object.
2596 * - Read fault anywhere.
2597 * --> must disallow write.
2598 */
2599
4a3eedf9
A
2600 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2601 if ((fault_type & VM_PROT_WRITE) == 0) {
2602 /*
2603 * This is not a "write" fault, so we
2604 * might not have taken the object lock
2605 * exclusively and we might not be able
2606 * to update the "wpmapped" bit in
2607 * vm_fault_enter().
2608 * Let's just grant read access to
2609 * the page for now and we'll
2610 * soft-fault again if we need write
2611 * access later...
2612 */
2613 prot &= ~VM_PROT_WRITE;
2614 }
2d21ac55 2615 goto FastPmapEnter;
4a3eedf9 2616 }
1c79356b
A
2617
2618 if ((fault_type & VM_PROT_WRITE) == 0) {
2619
2620 prot &= ~VM_PROT_WRITE;
2621
1c79356b 2622 if (object != cur_object) {
c910b4d9
A
2623 /*
2624 * We still need to hold the top object
2625 * lock here to prevent a race between
2626 * a read fault (taking only "shared"
2627 * locks) and a write fault (taking
2628 * an "exclusive" lock on the top
2629 * object.
2630 * Otherwise, as soon as we release the
2631 * top lock, the write fault could
2632 * proceed and actually complete before
2633 * the read fault, and the copied page's
2634 * translation could then be overwritten
2635 * by the read fault's translation for
2636 * the original page.
2637 *
2638 * Let's just record what the top object
2639 * is and we'll release it later.
2d21ac55 2640 */
c910b4d9 2641 top_object = object;
2d21ac55
A
2642
2643 /*
2644 * switch to the object that has the new page
2645 */
1c79356b 2646 object = cur_object;
2d21ac55 2647 object_lock_type = cur_object_lock_type;
1c79356b 2648 }
1c79356b
A
2649FastPmapEnter:
2650 /*
2651 * prepare for the pmap_enter...
2652 * object and map are both locked
2653 * m contains valid data
2654 * object == m->object
2655 * cur_object == NULL or it's been unlocked
2656 * no paging references on either object or cur_object
1c79356b 2657 */
1c79356b 2658#if MACH_KDB
2d21ac55 2659 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
1c79356b 2660 prot &= ~VM_PROT_WRITE;
2d21ac55
A
2661#endif
2662 if (caller_pmap) {
2663 kr = vm_fault_enter(m,
2664 caller_pmap,
2665 caller_pmap_addr,
2666 prot,
2667 wired,
2668 change_wiring,
2669 fault_info.no_cache,
2670 &type_of_fault);
9bccf70c 2671 } else {
2d21ac55
A
2672 kr = vm_fault_enter(m,
2673 pmap,
2674 vaddr,
2675 prot,
2676 wired,
2677 change_wiring,
2678 fault_info.no_cache,
2679 &type_of_fault);
9bccf70c 2680 }
0b4e3aa0 2681
c910b4d9
A
2682 if (top_object != VM_OBJECT_NULL) {
2683 /*
2684 * It's safe to drop the top object
2685 * now that we've done our
2686 * vm_fault_enter(). Any other fault
2687 * in progress for that virtual
2688 * address will either find our page
2689 * and translation or put in a new page
2690 * and translation.
2691 */
2692 vm_object_unlock(top_object);
2693 top_object = VM_OBJECT_NULL;
2694 }
2695
2d21ac55
A
2696 if (need_collapse == TRUE)
2697 vm_object_collapse(object, offset, TRUE);
0c530ab8 2698
2d21ac55
A
2699 if (type_of_fault == DBG_PAGEIN_FAULT) {
2700 /*
2701 * evaluate access pattern and update state
2702 * vm_fault_deactivate_behind depends on the
2703 * state being up to date
2704 */
2705 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
0c530ab8 2706
2d21ac55 2707 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
1c79356b 2708 }
1c79356b 2709 /*
2d21ac55 2710 * That's it, clean up and return.
1c79356b 2711 */
2d21ac55
A
2712 if (m->busy)
2713 PAGE_WAKEUP_DONE(m);
6601e61a 2714
1c79356b 2715 vm_object_unlock(object);
143cc14e 2716
1c79356b 2717 vm_map_unlock_read(map);
2d21ac55 2718 if (real_map != map)
91447636 2719 vm_map_unlock(real_map);
1c79356b 2720
2d21ac55 2721 goto done;
1c79356b 2722 }
1c79356b 2723 /*
2724 * COPY ON WRITE FAULT
2725 *
2726 * If objects match, then
2727 * object->copy must not be NULL (else control
2728 * would be in previous code block), and we
2729 * have a potential push into the copy object
2730 * with which we can't cope with here.
1c79356b 2731 */
2d21ac55
A
2732 if (cur_object == object) {
2733 /*
2734 * must take the slow path to
2735 * deal with the copy push
2736 */
1c79356b 2737 break;
2d21ac55
A
2738 }
2739 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2740
1c79356b 2741 /*
2742 * This is now a shadow based copy on write
2743 * fault -- it requires a copy up the shadow
2744 * chain.
2745 *
2746 * Allocate a page in the original top level
2747 * object. Give up if allocate fails. Also
2748 * need to remember current page, as it's the
2749 * source of the copy.
1c79356b 2750 *
2751 * at this point we hold locks on both
2752 * object and cur_object... no need to take
2753 * paging refs or mark pages BUSY since
2754 * we don't drop either object lock until
2755 * the page has been copied and inserted
2756 */
2757 cur_m = m;
2758 m = vm_page_grab();
2d21ac55 2759
1c79356b 2760 if (m == VM_PAGE_NULL) {
2d21ac55
A
2761 /*
2762 * no free page currently available...
2763 * must take the slow path
2764 */
1c79356b
A
2765 break;
2766 }
1c79356b 2767 /*
2d21ac55 2768 * Now do the copy. Mark the source page busy...
2769 *
2770 * NOTE: This code holds the map lock across
2771 * the page copy.
2772 */
1c79356b
A
2773 vm_page_copy(cur_m, m);
2774 vm_page_insert(m, object, offset);
2d21ac55 2775 m->dirty = TRUE;
1c79356b
A
2776
2777 /*
2d21ac55 2778 * Now cope with the source page and object
1c79356b 2779 */
2d21ac55
A
2780 if (object->ref_count > 1 && cur_m->pmapped)
2781 pmap_disconnect(cur_m->phys_page);
1c79356b 2782
2d21ac55 2783 need_collapse = TRUE;
1c79356b 2784
2d21ac55
A
2785 if (!cur_object->internal &&
2786 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2787 /*
2788 * The object from which we've just
2789 * copied a page is most probably backed
2790 * by a vnode. We don't want to waste too
2791 * much time trying to collapse the VM objects
2792 * and create a bottleneck when several tasks
2793 * map the same file.
2794 */
2795 if (cur_object->copy == object) {
2796 /*
2797 * Shared mapping or no COW yet.
2798 * We can never collapse a copy
2799 * object into its backing object.
2800 */
2801 need_collapse = FALSE;
2802 } else if (cur_object->copy == object->shadow &&
2803 object->shadow->resident_page_count == 0) {
2804 /*
2805 * Shared mapping after a COW occurred.
2806 */
2807 need_collapse = FALSE;
2808 }
2809 }
1c79356b
A
2810 vm_object_unlock(cur_object);
2811
2d21ac55
A
2812 if (need_collapse == FALSE)
2813 vm_fault_collapse_skipped++;
2814 vm_fault_collapse_total++;
2815
2816 type_of_fault = DBG_COW_FAULT;
2817 VM_STAT_INCR(cow_faults);
2818 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2819 current_task()->cow_faults++;
1c79356b
A
2820
2821 goto FastPmapEnter;
1c79356b 2822
2d21ac55 2823 } else {
1c79356b 2824 /*
2d21ac55 2825 * No page at cur_object, cur_offset... m == NULL
1c79356b 2826 */
1c79356b 2827 if (cur_object->pager_created) {
2d21ac55
A
2828 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2829 /*
2830 * May have to talk to a pager...
2831 * take the slow path.
2832 */
2833 break;
2834 }
1c79356b 2835 /*
2836 * existence map present and indicates
2837 * that the pager doesn't have this page
1c79356b 2838 */
1c79356b 2839 }
1c79356b 2840 if (cur_object->shadow == VM_OBJECT_NULL) {
2d21ac55
A
2841 /*
2842 * Zero fill fault. Page gets
2843 * inserted into the original object.
2844 */
1c79356b 2845 if (cur_object->shadow_severed) {
2d21ac55
A
2846
2847 if (object != cur_object)
2848 vm_object_unlock(cur_object);
1c79356b 2849 vm_object_unlock(object);
2d21ac55 2850
1c79356b 2851 vm_map_unlock_read(map);
2d21ac55 2852 if (real_map != map)
91447636 2853 vm_map_unlock(real_map);
1c79356b 2854
2d21ac55
A
2855 kr = KERN_MEMORY_ERROR;
2856 goto done;
2857 }
2858 if (VM_PAGE_ZFILL_THROTTLED()) {
2859 /*
2860 * drop all of our locks...
2861 * wait until the free queue is
2862 * pumped back up and then
2863 * redrive the fault
2864 */
2865 if (object != cur_object)
2866 vm_object_unlock(cur_object);
2867 vm_object_unlock(object);
2868 vm_map_unlock_read(map);
2869 if (real_map != map)
2870 vm_map_unlock(real_map);
9bccf70c 2871
2d21ac55
A
2872 if (vm_page_wait((change_wiring) ?
2873 THREAD_UNINT :
2874 THREAD_ABORTSAFE))
2875 goto RetryFault;
1c79356b 2876
2d21ac55
A
2877 kr = KERN_ABORTED;
2878 goto done;
2879 }
2880 if (vm_backing_store_low) {
2881 /*
2882 * we are protecting the system from
2883 * backing store exhaustion...
2884 * must take the slow path if we're
2885 * not privileged
2886 */
2887 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2888 break;
1c79356b 2889 }
2d21ac55
A
2890 if (cur_object != object) {
2891 vm_object_unlock(cur_object);
1c79356b 2892
2d21ac55 2893 cur_object = object;
55e303ae 2894 }
2d21ac55 2895 if (object_lock_type == OBJECT_LOCK_SHARED) {
55e303ae 2896
2d21ac55
A
2897 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2898
2899 if (vm_object_lock_upgrade(object) == FALSE) {
2900 /*
2901 * couldn't upgrade so do a full retry on the fault
2902 * since we dropped the object lock which
2903 * could allow another thread to insert
2904 * a page at this offset
2905 */
2906 vm_map_unlock_read(map);
2907 if (real_map != map)
2908 vm_map_unlock(real_map);
2909
2910 goto RetryFault;
2911 }
1c79356b
A
2912 }
2913 m = vm_page_alloc(object, offset);
2d21ac55 2914
1c79356b 2915 if (m == VM_PAGE_NULL) {
2d21ac55
A
2916 /*
2917 * no free page currently available...
2918 * must take the slow path
2919 */
1c79356b
A
2920 break;
2921 }
1c79356b 2922
1c79356b 2923 /*
2924 * Now zero fill page...
2925 * the page is probably going to
2926 * be written soon, so don't bother
2927 * to clear the modified bit
1c79356b 2928 *
2929 * NOTE: This code holds the map
2930 * lock across the zero fill.
1c79356b 2931 */
2d21ac55 2932 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
143cc14e 2933
1c79356b
A
2934 goto FastPmapEnter;
2935 }
1c79356b 2936 /*
2d21ac55 2937 * On to the next level in the shadow chain
1c79356b 2938 */
1c79356b
A
2939 cur_offset += cur_object->shadow_offset;
2940 new_object = cur_object->shadow;
2d21ac55
A
2941
2942 /*
2943 * take the new_object's lock with the indicated state
2944 */
2945 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2946 vm_object_lock_shared(new_object);
2947 else
2948 vm_object_lock(new_object);
2949
1c79356b
A
2950 if (cur_object != object)
2951 vm_object_unlock(cur_object);
2d21ac55 2952
1c79356b
A
2953 cur_object = new_object;
2954
2955 continue;
2956 }
2957 }
1c79356b 2958 /*
2959 * Cleanup from fast fault failure. Drop any object
2960 * lock other than original and drop map lock.
1c79356b 2961 */
1c79356b
A
2962 if (object != cur_object)
2963 vm_object_unlock(cur_object);
2d21ac55
A
2964
2965 /*
2966 * must own the object lock exclusively at this point
2967 */
2968 if (object_lock_type == OBJECT_LOCK_SHARED) {
2969 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2970
2971 if (vm_object_lock_upgrade(object) == FALSE) {
2972 /*
 2973 * couldn't upgrade, so explicitly
2974 * take the lock exclusively
2975 * no need to retry the fault at this
2976 * point since "vm_fault_page" will
2977 * completely re-evaluate the state
2978 */
2979 vm_object_lock(object);
2980 }
1c79356b 2981 }
143cc14e 2982
2d21ac55
A
2983handle_copy_delay:
2984 vm_map_unlock_read(map);
2985 if (real_map != map)
91447636 2986 vm_map_unlock(real_map);
1c79356b
A
2987
2988 /*
2989 * Make a reference to this object to
2990 * prevent its disposal while we are messing with
2991 * it. Once we have the reference, the map is free
2992 * to be diddled. Since objects reference their
2993 * shadows (and copies), they will stay around as well.
1c79356b 2994 */
2d21ac55 2995 vm_object_reference_locked(object);
1c79356b
A
2996 vm_object_paging_begin(object);
2997
2998 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
55e303ae 2999
2d21ac55 3000 error_code = 0;
55e303ae 3001
1c79356b
A
3002 kr = vm_fault_page(object, offset, fault_type,
3003 (change_wiring && !wired),
1c79356b
A
3004 &prot, &result_page, &top_page,
3005 &type_of_fault,
2d21ac55
A
3006 &error_code, map->no_zero_fill,
3007 FALSE, &fault_info);
1c79356b
A
3008
3009 /*
3010 * if kr != VM_FAULT_SUCCESS, then the paging reference
3011 * has been dropped and the object unlocked... the ref_count
3012 * is still held
3013 *
3014 * if kr == VM_FAULT_SUCCESS, then the paging reference
3015 * is still held along with the ref_count on the original object
3016 *
3017 * if m != NULL, then the object it belongs to
3018 * is returned locked with a paging reference
3019 *
3020 * if top_page != NULL, then it's BUSY and the
3021 * object it belongs to has a paging reference
3022 * but is returned unlocked
1c79356b 3023 */
2d21ac55
A
3024 if (kr != VM_FAULT_SUCCESS) {
3025 /*
3026 * we didn't succeed, lose the object reference immediately.
3027 */
1c79356b
A
3028 vm_object_deallocate(object);
3029
2d21ac55
A
3030 /*
3031 * See why we failed, and take corrective action.
3032 */
3033 switch (kr) {
1c79356b
A
3034 case VM_FAULT_MEMORY_SHORTAGE:
3035 if (vm_page_wait((change_wiring) ?
3036 THREAD_UNINT :
3037 THREAD_ABORTSAFE))
3038 goto RetryFault;
2d21ac55
A
3039 /*
3040 * fall thru
3041 */
1c79356b
A
3042 case VM_FAULT_INTERRUPTED:
3043 kr = KERN_ABORTED;
3044 goto done;
3045 case VM_FAULT_RETRY:
3046 goto RetryFault;
1c79356b
A
3047 case VM_FAULT_MEMORY_ERROR:
3048 if (error_code)
3049 kr = error_code;
3050 else
3051 kr = KERN_MEMORY_ERROR;
3052 goto done;
2d21ac55 3053 }
1c79356b 3054 }
1c79356b
A
3055 m = result_page;
3056
2d21ac55 3057 if (m != VM_PAGE_NULL) {
0b4e3aa0
A
3058 assert((change_wiring && !wired) ?
3059 (top_page == VM_PAGE_NULL) :
3060 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3061 }
1c79356b
A
3062
3063 /*
3064 * What to do with the resulting page from vm_fault_page
3065 * if it doesn't get entered into the physical map:
1c79356b 3066 */
1c79356b
A
3067#define RELEASE_PAGE(m) \
3068 MACRO_BEGIN \
3069 PAGE_WAKEUP_DONE(m); \
2d21ac55
A
3070 vm_page_lockspin_queues(); \
3071 if (!m->active && !m->inactive && !m->throttled)\
1c79356b
A
3072 vm_page_activate(m); \
3073 vm_page_unlock_queues(); \
3074 MACRO_END
3075
3076 /*
3077 * We must verify that the maps have not changed
3078 * since our last lookup.
1c79356b 3079 */
2d21ac55 3080 if (m != VM_PAGE_NULL) {
0b4e3aa0 3081 old_copy_object = m->object->copy;
0b4e3aa0 3082 vm_object_unlock(m->object);
2d21ac55 3083 } else
0b4e3aa0 3084 old_copy_object = VM_OBJECT_NULL;
2d21ac55
A
3085
3086 /*
3087 * no object locks are held at this point
3088 */
1c79356b
A
3089 if ((map != original_map) || !vm_map_verify(map, &version)) {
3090 vm_object_t retry_object;
3091 vm_object_offset_t retry_offset;
3092 vm_prot_t retry_prot;
3093
3094 /*
3095 * To avoid trying to write_lock the map while another
3096 * thread has it read_locked (in vm_map_pageable), we
3097 * do not try for write permission. If the page is
3098 * still writable, we will get write permission. If it
3099 * is not, or has been marked needs_copy, we enter the
3100 * mapping without write permission, and will merely
3101 * take another fault.
3102 */
3103 map = original_map;
3104 vm_map_lock_read(map);
2d21ac55 3105
1c79356b 3106 kr = vm_map_lookup_locked(&map, vaddr,
2d21ac55
A
3107 fault_type & ~VM_PROT_WRITE,
3108 OBJECT_LOCK_EXCLUSIVE, &version,
3109 &retry_object, &retry_offset, &retry_prot,
3110 &wired,
3111 &fault_info,
3112 &real_map);
91447636 3113 pmap = real_map->pmap;
1c79356b
A
3114
3115 if (kr != KERN_SUCCESS) {
3116 vm_map_unlock_read(map);
2d21ac55
A
3117
3118 if (m != VM_PAGE_NULL) {
3119 /*
3120 * retake the lock so that
3121 * we can drop the paging reference
3122 * in vm_fault_cleanup and do the
3123 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3124 */
0b4e3aa0 3125 vm_object_lock(m->object);
2d21ac55 3126
0b4e3aa0 3127 RELEASE_PAGE(m);
2d21ac55
A
3128
3129 vm_fault_cleanup(m->object, top_page);
0b4e3aa0 3130 } else {
2d21ac55
A
3131 /*
3132 * retake the lock so that
3133 * we can drop the paging reference
3134 * in vm_fault_cleanup
3135 */
3136 vm_object_lock(object);
3137
3138 vm_fault_cleanup(object, top_page);
0b4e3aa0 3139 }
2d21ac55
A
3140 vm_object_deallocate(object);
3141
1c79356b
A
3142 goto done;
3143 }
1c79356b 3144 vm_object_unlock(retry_object);
1c79356b 3145
2d21ac55
A
3146 if ((retry_object != object) || (retry_offset != offset)) {
3147
1c79356b 3148 vm_map_unlock_read(map);
2d21ac55 3149 if (real_map != map)
91447636 3150 vm_map_unlock(real_map);
2d21ac55
A
3151
3152 if (m != VM_PAGE_NULL) {
3153 /*
3154 * retake the lock so that
3155 * we can drop the paging reference
3156 * in vm_fault_cleanup and do the
3157 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3158 */
3159 vm_object_lock(m->object);
3160
0b4e3aa0 3161 RELEASE_PAGE(m);
2d21ac55
A
3162
3163 vm_fault_cleanup(m->object, top_page);
0b4e3aa0 3164 } else {
2d21ac55
A
3165 /*
3166 * retake the lock so that
3167 * we can drop the paging reference
3168 * in vm_fault_cleanup
3169 */
3170 vm_object_lock(object);
3171
3172 vm_fault_cleanup(object, top_page);
0b4e3aa0 3173 }
2d21ac55
A
3174 vm_object_deallocate(object);
3175
1c79356b
A
3176 goto RetryFault;
3177 }
1c79356b 3178 /*
3179 * Check whether the protection has changed or the object
3180 * has been copied while we left the map unlocked.
3181 */
3182 prot &= retry_prot;
0b4e3aa0 3183 }
2d21ac55 3184 if (m != VM_PAGE_NULL) {
0b4e3aa0 3185 vm_object_lock(m->object);
1c79356b 3186
2d21ac55
A
3187 if (m->object->copy != old_copy_object) {
3188 /*
3189 * The copy object changed while the top-level object
3190 * was unlocked, so take away write permission.
3191 */
0b4e3aa0 3192 prot &= ~VM_PROT_WRITE;
2d21ac55
A
3193 }
3194 } else
3195 vm_object_lock(object);
1c79356b
A
3196
3197 /*
3198 * If we want to wire down this page, but no longer have
3199 * adequate permissions, we must start all over.
1c79356b 3200 */
2d21ac55 3201 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
1c79356b 3202
1c79356b 3203 vm_map_verify_done(map, &version);
2d21ac55 3204 if (real_map != map)
91447636 3205 vm_map_unlock(real_map);
1c79356b 3206
2d21ac55
A
3207 if (m != VM_PAGE_NULL) {
3208 RELEASE_PAGE(m);
91447636 3209
2d21ac55
A
3210 vm_fault_cleanup(m->object, top_page);
3211 } else
3212 vm_fault_cleanup(object, top_page);
0b4e3aa0 3213
2d21ac55 3214 vm_object_deallocate(object);
55e303ae 3215
2d21ac55
A
3216 goto RetryFault;
3217 }
3218 if (m != VM_PAGE_NULL) {
55e303ae 3219 /*
3220 * Put this page into the physical map.
3221 * We had to do the unlock above because pmap_enter
3222 * may cause other faults. The page may be on
3223 * the pageout queues. If the pageout daemon comes
3224 * across the page, it will remove it from the queues.
55e303ae 3225 */
2d21ac55
A
3226 if (caller_pmap) {
3227 kr = vm_fault_enter(m,
3228 caller_pmap,
3229 caller_pmap_addr,
3230 prot,
3231 wired,
3232 change_wiring,
3233 fault_info.no_cache,
3234 &type_of_fault);
3235 } else {
3236 kr = vm_fault_enter(m,
3237 pmap,
3238 vaddr,
3239 prot,
3240 wired,
3241 change_wiring,
3242 fault_info.no_cache,
3243 &type_of_fault);
3244 }
3245 if (kr != KERN_SUCCESS) {
3246 /* abort this page fault */
3247 vm_map_verify_done(map, &version);
3248 if (real_map != map)
3249 vm_map_unlock(real_map);
3250 PAGE_WAKEUP_DONE(m);
3251 vm_fault_cleanup(m->object, top_page);
3252 vm_object_deallocate(object);
3253 goto done;
0b4e3aa0
A
3254 }
3255 } else {
3256
9bccf70c 3257 vm_map_entry_t entry;
91447636
A
3258 vm_map_offset_t laddr;
3259 vm_map_offset_t ldelta, hdelta;
143cc14e 3260
0b4e3aa0
A
3261 /*
3262 * do a pmap block mapping from the physical address
3263 * in the object
3264 */
9bccf70c 3265
2d21ac55 3266#ifdef ppc
55e303ae
A
3267 /* While we do not worry about execution protection in */
 3268 /* general, certain pages may have instruction execution */
3269 /* disallowed. We will check here, and if not allowed */
3270 /* to execute, we return with a protection failure. */
9bccf70c 3271
2d21ac55
A
3272 if ((fault_type & VM_PROT_EXECUTE) &&
3273 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
9bccf70c 3274
9bccf70c 3275 vm_map_verify_done(map, &version);
2d21ac55
A
3276
3277 if (real_map != map)
91447636 3278 vm_map_unlock(real_map);
2d21ac55 3279
9bccf70c
A
3280 vm_fault_cleanup(object, top_page);
3281 vm_object_deallocate(object);
2d21ac55 3282
9bccf70c
A
3283 kr = KERN_PROTECTION_FAILURE;
3284 goto done;
0b4e3aa0 3285 }
2d21ac55 3286#endif /* ppc */
1c79356b 3287
2d21ac55 3288 if (real_map != map)
91447636 3289 vm_map_unlock(real_map);
2d21ac55 3290
9bccf70c
A
3291 if (original_map != map) {
3292 vm_map_unlock_read(map);
3293 vm_map_lock_read(original_map);
3294 map = original_map;
3295 }
91447636 3296 real_map = map;
9bccf70c
A
3297
3298 laddr = vaddr;
3299 hdelta = 0xFFFFF000;
3300 ldelta = 0xFFFFF000;
3301
2d21ac55
A
3302 while (vm_map_lookup_entry(map, laddr, &entry)) {
3303 if (ldelta > (laddr - entry->vme_start))
9bccf70c 3304 ldelta = laddr - entry->vme_start;
2d21ac55 3305 if (hdelta > (entry->vme_end - laddr))
9bccf70c 3306 hdelta = entry->vme_end - laddr;
2d21ac55 3307 if (entry->is_sub_map) {
9bccf70c
A
3308
3309 laddr = (laddr - entry->vme_start)
3310 + entry->offset;
3311 vm_map_lock_read(entry->object.sub_map);
2d21ac55
A
3312
3313 if (map != real_map)
9bccf70c 3314 vm_map_unlock_read(map);
2d21ac55 3315 if (entry->use_pmap) {
91447636
A
3316 vm_map_unlock_read(real_map);
3317 real_map = entry->object.sub_map;
9bccf70c
A
3318 }
3319 map = entry->object.sub_map;
3320
3321 } else {
3322 break;
3323 }
3324 }
3325
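			/*
			 * At this point laddr has been translated down through any
			 * submaps, and ldelta/hdelta bound how far the block mapping
			 * may extend below/above laddr while staying within every map
			 * entry walked (both start out at 0xFFFFF000, i.e. effectively
			 * unbounded).  The ">> 12" in the pmap_map_block() calls below
			 * converts byte quantities into 4KB page numbers.
			 */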
2d21ac55
A
3326 if (vm_map_lookup_entry(map, laddr, &entry) &&
3327 (entry->object.vm_object != NULL) &&
3328 (entry->object.vm_object == object)) {
3329
3330 if (caller_pmap) {
3331 /*
3332 * Set up a block mapped area
3333 */
3334 pmap_map_block(caller_pmap,
3335 (addr64_t)(caller_pmap_addr - ldelta),
3336 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3337 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3338 ((ldelta + hdelta) >> 12), prot,
3339 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
55e303ae 3340 } else {
2d21ac55
A
3341 /*
3342 * Set up a block mapped area
3343 */
3344 pmap_map_block(real_map->pmap,
3345 (addr64_t)(vaddr - ldelta),
3346 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3347 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3348 ((ldelta + hdelta) >> 12), prot,
3349 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
9bccf70c
A
3350 }
3351 }
0b4e3aa0 3352 }
1c79356b
A
3353
3354 /*
2d21ac55 3355 * Unlock everything, and return
1c79356b 3356 */
1c79356b 3357 vm_map_verify_done(map, &version);
2d21ac55 3358 if (real_map != map)
91447636 3359 vm_map_unlock(real_map);
2d21ac55
A
3360
3361 if (m != VM_PAGE_NULL) {
0b4e3aa0 3362 PAGE_WAKEUP_DONE(m);
1c79356b 3363
2d21ac55
A
3364 vm_fault_cleanup(m->object, top_page);
3365 } else
3366 vm_fault_cleanup(object, top_page);
1c79356b 3367
2d21ac55
A
3368 vm_object_deallocate(object);
3369
3370#undef RELEASE_PAGE
91447636 3371
2d21ac55
A
3372 kr = KERN_SUCCESS;
3373done:
9bccf70c 3374 thread_interrupt_level(interruptible_state);
1c79356b 3375
2d21ac55
A
3376 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3377 (int)((uint64_t)vaddr >> 32),
3378 (int)vaddr,
1c79356b 3379 kr,
2d21ac55 3380 type_of_fault,
1c79356b 3381 0);
143cc14e 3382
2d21ac55 3383 return (kr);
1c79356b
A
3384}
3385
3386/*
3387 * vm_fault_wire:
3388 *
3389 * Wire down a range of virtual addresses in a map.
3390 */
3391kern_return_t
3392vm_fault_wire(
3393 vm_map_t map,
3394 vm_map_entry_t entry,
9bccf70c 3395 pmap_t pmap,
91447636 3396 vm_map_offset_t pmap_addr)
1c79356b
A
3397{
3398
91447636
A
3399 register vm_map_offset_t va;
3400 register vm_map_offset_t end_addr = entry->vme_end;
1c79356b
A
3401 register kern_return_t rc;
3402
3403 assert(entry->in_transition);
3404
9bccf70c
A
3405 if ((entry->object.vm_object != NULL) &&
3406 !entry->is_sub_map &&
3407 entry->object.vm_object->phys_contiguous) {
3408 return KERN_SUCCESS;
3409 }
3410
1c79356b
A
3411 /*
3412 * Inform the physical mapping system that the
3413 * range of addresses may not fault, so that
3414 * page tables and such can be locked down as well.
3415 */
3416
9bccf70c
A
3417 pmap_pageable(pmap, pmap_addr,
3418 pmap_addr + (end_addr - entry->vme_start), FALSE);
1c79356b
A
3419
3420 /*
3421 * We simulate a fault to get the page and enter it
3422 * in the physical map.
3423 */
3424
3425 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3426 if ((rc = vm_fault_wire_fast(
9bccf70c
A
3427 map, va, entry, pmap,
3428 pmap_addr + (va - entry->vme_start)
3429 )) != KERN_SUCCESS) {
1c79356b 3430 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
9bccf70c
A
3431 (pmap == kernel_pmap) ?
3432 THREAD_UNINT : THREAD_ABORTSAFE,
3433 pmap, pmap_addr + (va - entry->vme_start));
2d21ac55 3434 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
1c79356b
A
3435 }
3436
3437 if (rc != KERN_SUCCESS) {
3438 struct vm_map_entry tmp_entry = *entry;
3439
3440 /* unwire wired pages */
3441 tmp_entry.vme_end = va;
9bccf70c
A
3442 vm_fault_unwire(map,
3443 &tmp_entry, FALSE, pmap, pmap_addr);
1c79356b
A
3444
3445 return rc;
3446 }
3447 }
3448 return KERN_SUCCESS;
3449}
3450
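/*
 * Note that on failure vm_fault_wire() has already unwired whatever portion
 * of "entry" it managed to wire (via the tmp_entry/vm_fault_unwire step
 * above), so a caller only needs to undo any *other* entries it wired
 * earlier.  A rough sketch of that caller-side pattern (illustrative only,
 * not a quote of the real vm_map code):
 *
 *	for (entry = first_entry; entry != end; entry = entry->vme_next) {
 *		rc = vm_fault_wire(map, entry, map->pmap, entry->vme_start);
 *		if (rc != KERN_SUCCESS) {
 *			... unwire the entries wired so far ...
 *			return rc;
 *		}
 *	}
 */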
3451/*
3452 * vm_fault_unwire:
3453 *
3454 * Unwire a range of virtual addresses in a map.
3455 */
3456void
3457vm_fault_unwire(
3458 vm_map_t map,
3459 vm_map_entry_t entry,
3460 boolean_t deallocate,
9bccf70c 3461 pmap_t pmap,
91447636 3462 vm_map_offset_t pmap_addr)
1c79356b 3463{
91447636
A
3464 register vm_map_offset_t va;
3465 register vm_map_offset_t end_addr = entry->vme_end;
1c79356b 3466 vm_object_t object;
2d21ac55 3467 struct vm_object_fault_info fault_info;
1c79356b
A
3468
3469 object = (entry->is_sub_map)
3470 ? VM_OBJECT_NULL : entry->object.vm_object;
3471
2d21ac55
A
3472 /*
3473 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3474 * do anything since such memory is wired by default. So we don't have
3475 * anything to undo here.
3476 */
3477
3478 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3479 return;
3480
3481 fault_info.interruptible = THREAD_UNINT;
3482 fault_info.behavior = entry->behavior;
3483 fault_info.user_tag = entry->alias;
3484 fault_info.lo_offset = entry->offset;
3485 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3486 fault_info.no_cache = entry->no_cache;
3487
1c79356b
A
3488 /*
3489 * Since the pages are wired down, we must be able to
3490 * get their mappings from the physical map system.
3491 */
3492
3493 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
1c79356b
A
3494
3495 if (object == VM_OBJECT_NULL) {
593a1d5f
A
3496 if (pmap) {
3497 pmap_change_wiring(pmap,
3498 pmap_addr + (va - entry->vme_start), FALSE);
3499 }
9bccf70c
A
3500 (void) vm_fault(map, va, VM_PROT_NONE,
3501 TRUE, THREAD_UNINT, pmap, pmap_addr);
1c79356b
A
3502 } else {
3503 vm_prot_t prot;
3504 vm_page_t result_page;
3505 vm_page_t top_page;
3506 vm_object_t result_object;
3507 vm_fault_return_t result;
3508
2d21ac55
A
3509 fault_info.cluster_size = end_addr - va;
3510
1c79356b
A
3511 do {
3512 prot = VM_PROT_NONE;
3513
3514 vm_object_lock(object);
3515 vm_object_paging_begin(object);
3516 XPR(XPR_VM_FAULT,
3517 "vm_fault_unwire -> vm_fault_page\n",
3518 0,0,0,0,0);
2d21ac55
A
3519 result = vm_fault_page(
3520 object,
3521 entry->offset + (va - entry->vme_start),
3522 VM_PROT_NONE, TRUE,
3523 &prot, &result_page, &top_page,
3524 (int *)0,
3525 NULL, map->no_zero_fill,
3526 FALSE, &fault_info);
1c79356b
A
3527 } while (result == VM_FAULT_RETRY);
3528
2d21ac55
A
3529 /*
3530 * If this was a mapping to a file on a device that has been forcibly
3531 * unmounted, then we won't get a page back from vm_fault_page(). Just
3532 * move on to the next one in case the remaining pages are mapped from
3533 * different objects. During a forced unmount, the object is terminated
3534 * so the alive flag will be false if this happens. A forced unmount will
 3535 * occur when an external disk is unplugged before the user does an
3536 * eject, so we don't want to panic in that situation.
3537 */
3538
3539 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3540 continue;
3541
1c79356b
A
3542 if (result != VM_FAULT_SUCCESS)
3543 panic("vm_fault_unwire: failure");
3544
3545 result_object = result_page->object;
2d21ac55 3546
593a1d5f
A
3547 if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) {
3548 pmap_change_wiring(pmap,
3549 pmap_addr + (va - entry->vme_start), FALSE);
3550 }
1c79356b 3551 if (deallocate) {
2d21ac55
A
3552 assert(result_page->phys_page !=
3553 vm_page_fictitious_addr);
91447636 3554 pmap_disconnect(result_page->phys_page);
1c79356b
A
3555 VM_PAGE_FREE(result_page);
3556 } else {
2d21ac55 3557 vm_page_lockspin_queues();
1c79356b
A
3558 vm_page_unwire(result_page);
3559 vm_page_unlock_queues();
3560 PAGE_WAKEUP_DONE(result_page);
3561 }
1c79356b
A
3562 vm_fault_cleanup(result_object, top_page);
3563 }
3564 }
3565
3566 /*
3567 * Inform the physical mapping system that the range
3568 * of addresses may fault, so that page tables and
3569 * such may be unwired themselves.
3570 */
3571
9bccf70c
A
3572 pmap_pageable(pmap, pmap_addr,
3573 pmap_addr + (end_addr - entry->vme_start), TRUE);
1c79356b
A
3574
3575}
3576
3577/*
3578 * vm_fault_wire_fast:
3579 *
3580 * Handle common case of a wire down page fault at the given address.
3581 * If successful, the page is inserted into the associated physical map.
3582 * The map entry is passed in to avoid the overhead of a map lookup.
3583 *
3584 * NOTE: the given address should be truncated to the
3585 * proper page address.
3586 *
3587 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3588 * a standard error specifying why the fault is fatal is returned.
3589 *
3590 * The map in question must be referenced, and remains so.
3591 * Caller has a read lock on the map.
3592 *
3593 * This is a stripped version of vm_fault() for wiring pages. Anything
3594 * other than the common case will return KERN_FAILURE, and the caller
3595 * is expected to call vm_fault().
3596 */
3597kern_return_t
3598vm_fault_wire_fast(
91447636
A
3599 __unused vm_map_t map,
3600 vm_map_offset_t va,
1c79356b 3601 vm_map_entry_t entry,
91447636
A
3602 pmap_t pmap,
3603 vm_map_offset_t pmap_addr)
1c79356b
A
3604{
3605 vm_object_t object;
3606 vm_object_offset_t offset;
3607 register vm_page_t m;
3608 vm_prot_t prot;
91447636 3609 thread_t thread = current_thread();
2d21ac55
A
3610 int type_of_fault;
3611 kern_return_t kr;
1c79356b 3612
2d21ac55 3613 VM_STAT_INCR(faults);
1c79356b 3614
91447636
A
3615 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3616 thread->task->faults++;
1c79356b
A
3617
3618/*
3619 * Recovery actions
3620 */
3621
3622#undef RELEASE_PAGE
3623#define RELEASE_PAGE(m) { \
3624 PAGE_WAKEUP_DONE(m); \
2d21ac55 3625 vm_page_lockspin_queues(); \
1c79356b
A
3626 vm_page_unwire(m); \
3627 vm_page_unlock_queues(); \
3628}
3629
3630
3631#undef UNLOCK_THINGS
3632#define UNLOCK_THINGS { \
ff6e181a
A
3633 vm_object_paging_end(object); \
3634 vm_object_unlock(object); \
1c79356b
A
3635}
3636
3637#undef UNLOCK_AND_DEALLOCATE
3638#define UNLOCK_AND_DEALLOCATE { \
3639 UNLOCK_THINGS; \
3640 vm_object_deallocate(object); \
3641}
3642/*
3643 * Give up and have caller do things the hard way.
3644 */
3645
3646#define GIVE_UP { \
3647 UNLOCK_AND_DEALLOCATE; \
3648 return(KERN_FAILURE); \
3649}
3650
3651
3652 /*
3653 * If this entry is not directly to a vm_object, bail out.
3654 */
3655 if (entry->is_sub_map)
3656 return(KERN_FAILURE);
3657
3658 /*
3659 * Find the backing store object and offset into it.
3660 */
3661
3662 object = entry->object.vm_object;
3663 offset = (va - entry->vme_start) + entry->offset;
3664 prot = entry->protection;
3665
3666 /*
3667 * Make a reference to this object to prevent its
3668 * disposal while we are messing with it.
3669 */
3670
3671 vm_object_lock(object);
2d21ac55 3672 vm_object_reference_locked(object);
ff6e181a 3673 vm_object_paging_begin(object);
1c79356b
A
3674
3675 /*
3676 * INVARIANTS (through entire routine):
3677 *
3678 * 1) At all times, we must either have the object
3679 * lock or a busy page in some object to prevent
3680 * some other thread from trying to bring in
3681 * the same page.
3682 *
3683 * 2) Once we have a busy page, we must remove it from
3684 * the pageout queues, so that the pageout daemon
3685 * will not grab it away.
3686 *
3687 */
3688
3689 /*
3690 * Look for page in top-level object. If it's not there or
3691 * there's something going on, give up.
91447636
A
3692 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3693 * decrypt the page before wiring it down.
1c79356b
A
3694 */
3695 m = vm_page_lookup(object, offset);
91447636 3696 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
2d21ac55 3697 (m->unusual && ( m->error || m->restart || m->absent))) {
1c79356b
A
3698
3699 GIVE_UP;
3700 }
91447636 3701 ASSERT_PAGE_DECRYPTED(m);
1c79356b 3702
2d21ac55
A
3703 if (m->fictitious &&
3704 m->phys_page == vm_page_guard_addr) {
3705 /*
3706 * Guard pages are fictitious pages and are never
3707 * entered into a pmap, so let's say it's been wired...
3708 */
3709 kr = KERN_SUCCESS;
3710 goto done;
3711 }
3712
1c79356b
A
3713 /*
3714 * Wire the page down now. All bail outs beyond this
3715 * point must unwire the page.
3716 */
3717
2d21ac55 3718 vm_page_lockspin_queues();
1c79356b
A
3719 vm_page_wire(m);
3720 vm_page_unlock_queues();
3721
3722 /*
3723 * Mark page busy for other threads.
3724 */
3725 assert(!m->busy);
3726 m->busy = TRUE;
3727 assert(!m->absent);
3728
3729 /*
3730 * Give up if the page is being written and there's a copy object
3731 */
3732 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3733 RELEASE_PAGE(m);
3734 GIVE_UP;
3735 }
3736
3737 /*
3738 * Put this page into the physical map.
1c79356b 3739 */
2d21ac55
A
3740 type_of_fault = DBG_CACHE_HIT_FAULT;
3741 kr = vm_fault_enter(m,
3742 pmap,
3743 pmap_addr,
3744 prot,
3745 TRUE,
3746 FALSE,
3747 FALSE,
3748 &type_of_fault);
3749
3750done:
1c79356b
A
3751 /*
3752 * Unlock everything, and return
3753 */
3754
3755 PAGE_WAKEUP_DONE(m);
3756 UNLOCK_AND_DEALLOCATE;
3757
2d21ac55 3758 return kr;
1c79356b
A
3759
3760}
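/*
 * A minimal sketch of how a wiring loop can consume the contract above:
 * try the stripped-down fast path first and fall back to the full
 * vm_fault() path whenever vm_fault_wire_fast() punts with KERN_FAILURE.
 * "map", "entry", "pmap" and "pmap_addr" stand for the caller's own
 * state; this is illustrative only, not part of the build.
 */
#if 0
	vm_map_offset_t	va;
	kern_return_t	rc;

	for (va = entry->vme_start; va < entry->vme_end; va += PAGE_SIZE) {
		rc = vm_fault_wire_fast(map, va, entry, pmap,
					pmap_addr + (va - entry->vme_start));
		if (rc != KERN_SUCCESS) {
			/* anything but the common case: take the slow path */
			rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
				      THREAD_UNINT, pmap,
				      pmap_addr + (va - entry->vme_start));
		}
		if (rc != KERN_SUCCESS)
			break;	/* caller is responsible for unwiring what was wired */
	}
#endif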
3761
3762/*
3763 * Routine: vm_fault_copy_cleanup
3764 * Purpose:
3765 * Release a page used by vm_fault_copy.
3766 */
3767
3768void
3769vm_fault_copy_cleanup(
3770 vm_page_t page,
3771 vm_page_t top_page)
3772{
3773 vm_object_t object = page->object;
3774
3775 vm_object_lock(object);
3776 PAGE_WAKEUP_DONE(page);
2d21ac55
A
3777 vm_page_lockspin_queues();
3778 if (!page->active && !page->inactive && !page->throttled)
1c79356b
A
3779 vm_page_activate(page);
3780 vm_page_unlock_queues();
3781 vm_fault_cleanup(object, top_page);
3782}
3783
3784void
3785vm_fault_copy_dst_cleanup(
3786 vm_page_t page)
3787{
3788 vm_object_t object;
3789
3790 if (page != VM_PAGE_NULL) {
3791 object = page->object;
3792 vm_object_lock(object);
2d21ac55 3793 vm_page_lockspin_queues();
1c79356b
A
3794 vm_page_unwire(page);
3795 vm_page_unlock_queues();
3796 vm_object_paging_end(object);
3797 vm_object_unlock(object);
3798 }
3799}
3800
3801/*
3802 * Routine: vm_fault_copy
3803 *
3804 * Purpose:
3805 * Copy pages from one virtual memory object to another --
3806 * neither the source nor destination pages need be resident.
3807 *
3808 * Before actually copying a page, the version associated with
3809 * the destination address map will be verified.
3810 *
3811 * In/out conditions:
3812 * The caller must hold a reference, but not a lock, to
3813 * each of the source and destination objects and to the
3814 * destination map.
3815 *
3816 * Results:
3817 * Returns KERN_SUCCESS if no errors were encountered in
3818 * reading or writing the data. Returns KERN_INTERRUPTED if
3819 * the operation was interrupted (only possible if the
3820 * "interruptible" argument is asserted). Other return values
3821 * indicate a permanent error in copying the data.
3822 *
3823 * The actual amount of data copied will be returned in the
3824 * "copy_size" argument. In the event that the destination map
3825 * verification failed, this amount may be less than the amount
3826 * requested.
3827 */
3828kern_return_t
3829vm_fault_copy(
3830 vm_object_t src_object,
3831 vm_object_offset_t src_offset,
91447636 3832 vm_map_size_t *copy_size, /* INOUT */
1c79356b
A
3833 vm_object_t dst_object,
3834 vm_object_offset_t dst_offset,
3835 vm_map_t dst_map,
3836 vm_map_version_t *dst_version,
3837 int interruptible)
3838{
3839 vm_page_t result_page;
3840
3841 vm_page_t src_page;
3842 vm_page_t src_top_page;
3843 vm_prot_t src_prot;
3844
3845 vm_page_t dst_page;
3846 vm_page_t dst_top_page;
3847 vm_prot_t dst_prot;
3848
91447636 3849 vm_map_size_t amount_left;
1c79356b
A
3850 vm_object_t old_copy_object;
3851 kern_return_t error = 0;
3852
91447636 3853 vm_map_size_t part_size;
2d21ac55
A
3854 struct vm_object_fault_info fault_info_src;
3855 struct vm_object_fault_info fault_info_dst;
1c79356b
A
3856
3857 /*
3858 * In order not to confuse the clustered pageins, align
3859 * the different offsets on a page boundary.
3860 */
1c79356b
A
3861
3862#define RETURN(x) \
3863 MACRO_BEGIN \
91447636 3864 *copy_size -= amount_left; \
1c79356b
A
3865 MACRO_RETURN(x); \
3866 MACRO_END
3867
91447636 3868 amount_left = *copy_size;
2d21ac55
A
3869
3870 fault_info_src.interruptible = interruptible;
3871 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3872 fault_info_src.user_tag = 0;
3873 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3874 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3875 fault_info_src.no_cache = FALSE;
3876
3877 fault_info_dst.interruptible = interruptible;
3878 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3879 fault_info_dst.user_tag = 0;
3880 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3881 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3882 fault_info_dst.no_cache = FALSE;
3883
1c79356b
A
3884 do { /* while (amount_left > 0) */
3885 /*
3886 * There may be a deadlock if both source and destination
3887 * pages are the same. To avoid this deadlock, the copy must
3888 * start by getting the destination page in order to apply
3889 * COW semantics if any.
3890 */
3891
3892 RetryDestinationFault: ;
3893
3894 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3895
3896 vm_object_lock(dst_object);
3897 vm_object_paging_begin(dst_object);
3898
2d21ac55
A
3899 fault_info_dst.cluster_size = amount_left;
3900
1c79356b
A
3901 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3902 switch (vm_fault_page(dst_object,
91447636 3903 vm_object_trunc_page(dst_offset),
1c79356b
A
3904 VM_PROT_WRITE|VM_PROT_READ,
3905 FALSE,
2d21ac55 3906 &dst_prot, &dst_page, &dst_top_page,
1c79356b
A
3907 (int *)0,
3908 &error,
3909 dst_map->no_zero_fill,
2d21ac55 3910 FALSE, &fault_info_dst)) {
1c79356b
A
3911 case VM_FAULT_SUCCESS:
3912 break;
3913 case VM_FAULT_RETRY:
3914 goto RetryDestinationFault;
3915 case VM_FAULT_MEMORY_SHORTAGE:
3916 if (vm_page_wait(interruptible))
3917 goto RetryDestinationFault;
3918 /* fall thru */
3919 case VM_FAULT_INTERRUPTED:
3920 RETURN(MACH_SEND_INTERRUPTED);
1c79356b
A
3921 case VM_FAULT_MEMORY_ERROR:
3922 if (error)
3923 return (error);
3924 else
3925 return(KERN_MEMORY_ERROR);
3926 }
3927 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3928
3929 old_copy_object = dst_page->object->copy;
3930
3931 /*
3932 * There exists the possibility that the source and
3933 * destination page are the same. But we can't
3934 * easily determine that now. If they are the
3935 * same, the upcoming call to vm_fault_page() for the
3936 * source page would deadlock on the busy destination page. To prevent this we
3937 * wire the page so we can drop busy without having
3938 * the page daemon steal the page. We clean up the
3939 * top page but keep the paging reference on the object
3940 * holding the dest page so it doesn't go away.
3941 */
3942
2d21ac55 3943 vm_page_lockspin_queues();
1c79356b
A
3944 vm_page_wire(dst_page);
3945 vm_page_unlock_queues();
3946 PAGE_WAKEUP_DONE(dst_page);
3947 vm_object_unlock(dst_page->object);
3948
3949 if (dst_top_page != VM_PAGE_NULL) {
3950 vm_object_lock(dst_object);
3951 VM_PAGE_FREE(dst_top_page);
3952 vm_object_paging_end(dst_object);
3953 vm_object_unlock(dst_object);
3954 }
3955
3956 RetrySourceFault: ;
3957
3958 if (src_object == VM_OBJECT_NULL) {
3959 /*
3960 * No source object. We will just
3961 * zero-fill the page in dst_object.
3962 */
3963 src_page = VM_PAGE_NULL;
e3027f41 3964 result_page = VM_PAGE_NULL;
1c79356b
A
3965 } else {
3966 vm_object_lock(src_object);
3967 src_page = vm_page_lookup(src_object,
91447636 3968 vm_object_trunc_page(src_offset));
e3027f41 3969 if (src_page == dst_page) {
1c79356b 3970 src_prot = dst_prot;
e3027f41
A
3971 result_page = VM_PAGE_NULL;
3972 } else {
1c79356b
A
3973 src_prot = VM_PROT_READ;
3974 vm_object_paging_begin(src_object);
3975
2d21ac55
A
3976 fault_info_src.cluster_size = amount_left;
3977
1c79356b
A
3978 XPR(XPR_VM_FAULT,
3979 "vm_fault_copy(2) -> vm_fault_page\n",
3980 0,0,0,0,0);
2d21ac55
A
3981 switch (vm_fault_page(
3982 src_object,
3983 vm_object_trunc_page(src_offset),
3984 VM_PROT_READ, FALSE,
3985 &src_prot,
3986 &result_page, &src_top_page,
3987 (int *)0, &error, FALSE,
3988 FALSE, &fault_info_src)) {
1c79356b
A
3989
3990 case VM_FAULT_SUCCESS:
3991 break;
3992 case VM_FAULT_RETRY:
3993 goto RetrySourceFault;
3994 case VM_FAULT_MEMORY_SHORTAGE:
3995 if (vm_page_wait(interruptible))
3996 goto RetrySourceFault;
3997 /* fall thru */
3998 case VM_FAULT_INTERRUPTED:
3999 vm_fault_copy_dst_cleanup(dst_page);
4000 RETURN(MACH_SEND_INTERRUPTED);
1c79356b
A
4001 case VM_FAULT_MEMORY_ERROR:
4002 vm_fault_copy_dst_cleanup(dst_page);
4003 if (error)
4004 return (error);
4005 else
4006 return(KERN_MEMORY_ERROR);
4007 }
4008
1c79356b
A
4009
4010 assert((src_top_page == VM_PAGE_NULL) ==
e3027f41 4011 (result_page->object == src_object));
1c79356b
A
4012 }
4013 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
e3027f41 4014 vm_object_unlock(result_page->object);
1c79356b
A
4015 }
4016
4017 if (!vm_map_verify(dst_map, dst_version)) {
e3027f41
A
4018 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4019 vm_fault_copy_cleanup(result_page, src_top_page);
1c79356b
A
4020 vm_fault_copy_dst_cleanup(dst_page);
4021 break;
4022 }
4023
4024 vm_object_lock(dst_page->object);
4025
4026 if (dst_page->object->copy != old_copy_object) {
4027 vm_object_unlock(dst_page->object);
4028 vm_map_verify_done(dst_map, dst_version);
e3027f41
A
4029 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4030 vm_fault_copy_cleanup(result_page, src_top_page);
1c79356b
A
4031 vm_fault_copy_dst_cleanup(dst_page);
4032 break;
4033 }
4034 vm_object_unlock(dst_page->object);
4035
4036 /*
4037 * Copy the page, and note that it is dirty
4038 * immediately.
4039 */
4040
4041 if (!page_aligned(src_offset) ||
4042 !page_aligned(dst_offset) ||
4043 !page_aligned(amount_left)) {
4044
4045 vm_object_offset_t src_po,
4046 dst_po;
4047
91447636
A
4048 src_po = src_offset - vm_object_trunc_page(src_offset);
4049 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
1c79356b
A
4050
4051 if (dst_po > src_po) {
4052 part_size = PAGE_SIZE - dst_po;
4053 } else {
4054 part_size = PAGE_SIZE - src_po;
4055 }
4056 if (part_size > (amount_left)){
4057 part_size = amount_left;
4058 }
4059
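		/*
		 * Worked example of the arithmetic above (illustrative
		 * values, assuming 4K pages): with src_offset = 0x1280 and
		 * dst_offset = 0x3400, src_po = 0x280 and dst_po = 0x400.
		 * Since dst_po > src_po, part_size = PAGE_SIZE - dst_po =
		 * 0xC00 (then clamped to amount_left if fewer bytes remain),
		 * so this pass stops at whichever page boundary comes first,
		 * here the destination's.
		 */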
e3027f41 4060 if (result_page == VM_PAGE_NULL) {
1c79356b
A
4061 vm_page_part_zero_fill(dst_page,
4062 dst_po, part_size);
4063 } else {
e3027f41 4064 vm_page_part_copy(result_page, src_po,
1c79356b
A
4065 dst_page, dst_po, part_size);
4066 if(!dst_page->dirty){
4067 vm_object_lock(dst_object);
4068 dst_page->dirty = TRUE;
4069 vm_object_unlock(dst_page->object);
4070 }
4071
4072 }
4073 } else {
4074 part_size = PAGE_SIZE;
4075
e3027f41 4076 if (result_page == VM_PAGE_NULL)
1c79356b
A
4077 vm_page_zero_fill(dst_page);
4078 else{
e3027f41 4079 vm_page_copy(result_page, dst_page);
1c79356b
A
4080 if(!dst_page->dirty){
4081 vm_object_lock(dst_object);
4082 dst_page->dirty = TRUE;
4083 vm_object_unlock(dst_page->object);
4084 }
4085 }
4086
4087 }
4088
4089 /*
4090 * Unlock everything, and return
4091 */
4092
4093 vm_map_verify_done(dst_map, dst_version);
4094
e3027f41
A
4095 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4096 vm_fault_copy_cleanup(result_page, src_top_page);
1c79356b
A
4097 vm_fault_copy_dst_cleanup(dst_page);
4098
4099 amount_left -= part_size;
4100 src_offset += part_size;
4101 dst_offset += part_size;
4102 } while (amount_left > 0);
4103
4104 RETURN(KERN_SUCCESS);
4105#undef RETURN
4106
4107 /*NOTREACHED*/
4108}
4109
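/*
 * A minimal sketch of the calling convention documented above.  The caller
 * supplies referenced (but unlocked) source and destination objects plus a
 * map version captured from the destination map (see vm_map_verify()).
 * "copy_object", "dst_object", "dst_map", "version", "len" and the offsets
 * are placeholder names; this is illustrative only, not part of the build.
 */
#if 0
	vm_map_size_t	copy_size = len;	/* bytes requested */
	kern_return_t	kr;

	kr = vm_fault_copy(copy_object, src_offset,
			   &copy_size,			/* INOUT */
			   dst_object, dst_offset,
			   dst_map, &version,
			   THREAD_UNINT);
	if (kr == KERN_SUCCESS && copy_size < len) {
		/*
		 * The destination map changed underneath us: only copy_size
		 * bytes were copied, so re-lookup and retry the remainder.
		 */
	}
#endif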
1c79356b
A
4110#if VM_FAULT_CLASSIFY
4111/*
4112 * Temporary statistics gathering support.
4113 */
4114
4115/*
4116 * Statistics arrays:
4117 */
4118#define VM_FAULT_TYPES_MAX 5
4119#define VM_FAULT_LEVEL_MAX 8
4120
4121int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4122
4123#define VM_FAULT_TYPE_ZERO_FILL 0
4124#define VM_FAULT_TYPE_MAP_IN 1
4125#define VM_FAULT_TYPE_PAGER 2
4126#define VM_FAULT_TYPE_COPY 3
4127#define VM_FAULT_TYPE_OTHER 4
4128
4129
4130void
4131vm_fault_classify(vm_object_t object,
4132 vm_object_offset_t offset,
4133 vm_prot_t fault_type)
4134{
4135 int type, level = 0;
4136 vm_page_t m;
4137
4138 while (TRUE) {
4139 m = vm_page_lookup(object, offset);
4140 if (m != VM_PAGE_NULL) {
2d21ac55 4141 if (m->busy || m->error || m->restart || m->absent) {
1c79356b
A
4142 type = VM_FAULT_TYPE_OTHER;
4143 break;
4144 }
4145 if (((fault_type & VM_PROT_WRITE) == 0) ||
4146 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4147 type = VM_FAULT_TYPE_MAP_IN;
4148 break;
4149 }
4150 type = VM_FAULT_TYPE_COPY;
4151 break;
4152 }
4153 else {
4154 if (object->pager_created) {
4155 type = VM_FAULT_TYPE_PAGER;
4156 break;
4157 }
4158 if (object->shadow == VM_OBJECT_NULL) {
4159 type = VM_FAULT_TYPE_ZERO_FILL;
4160 break;
4161 }
4162
4163 offset += object->shadow_offset;
4164 object = object->shadow;
4165 level++;
4166 continue;
4167 }
4168 }
4169
4170 if (level > VM_FAULT_LEVEL_MAX)
4171 level = VM_FAULT_LEVEL_MAX;
4172
4173 vm_fault_stats[type][level] += 1;
4174
4175 return;
4176}
4177
4178/* cleanup routine to call from debugger */
4179
4180void
4181vm_fault_classify_init(void)
4182{
4183 int type, level;
4184
4185 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4186 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4187 vm_fault_stats[type][level] = 0;
4188 }
4189 }
4190
4191 return;
4192}
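/*
 * A small, hypothetical helper that could be called from the debugger to
 * dump the matrix gathered above; it only walks the vm_fault_stats array
 * declared in this block and is not part of the build.
 */
#if 0
void
vm_fault_classify_dump(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			if (vm_fault_stats[type][level] != 0)
				printf("vm_fault_stats[%d][%d] = %d\n",
				       type, level,
				       vm_fault_stats[type][level]);
		}
	}
}
#endif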
4193#endif /* VM_FAULT_CLASSIFY */
2d21ac55
A
4194
4195
4196extern int cs_validation;
4197
593a1d5f
A
4198void
4199vm_page_validate_cs_mapped(
4200 vm_page_t page,
4201 const void *kaddr)
4202{
4203 vm_object_t object;
4204 vm_object_offset_t offset;
4205 kern_return_t kr;
4206 memory_object_t pager;
4207 void *blobs;
4208 boolean_t validated, tainted;
4209
4210 assert(page->busy);
4211 vm_object_lock_assert_exclusive(page->object);
4212
4213 if (!cs_validation) {
4214 return;
4215 }
4216
4217 if (page->wpmapped && !page->cs_tainted) {
4218 /*
4219 * This page was mapped for "write" access sometime in the
4220 * past and could still be modifiable in the future.
4221 * Consider it tainted.
4222 * [ If the page was already found to be "tainted", no
4223 * need to re-validate. ]
4224 */
4225 page->cs_validated = TRUE;
4226 page->cs_tainted = TRUE;
4227 if (cs_debug) {
4228 printf("CODESIGNING: vm_page_validate_cs: "
4229 "page %p obj %p off 0x%llx "
4230 "was modified\n",
4231 page, page->object, page->offset);
4232 }
4233 vm_cs_validated_dirtied++;
4234 }
4235
4236 if (page->cs_validated) {
4237 return;
4238 }
4239
4240 vm_cs_validates++;
4241
4242 object = page->object;
4243 assert(object->code_signed);
4244 offset = page->offset;
4245
4246 if (!object->alive || object->terminating || object->pager == NULL) {
4247 /*
4248 * The object is terminating and we don't have its pager
4249 * so we can't validate the data...
4250 */
4251 return;
4252 }
4253 /*
4254 * Since we get here to validate a page that was brought in by
4255 * the pager, we know that this pager is all set up and ready
4256 * by now.
4257 */
4258 assert(!object->internal);
4259 assert(object->pager != NULL);
4260 assert(object->pager_ready);
4261
4262 pager = object->pager;
4263
4264 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4265 if (kr != KERN_SUCCESS) {
4266 blobs = NULL;
4267 }
4268
4269 /* verify the SHA1 hash for this page */
4270 validated = cs_validate_page(blobs,
4271 offset + object->paging_offset,
4272 (const void *)kaddr,
4273 &tainted);
4274
4275 page->cs_validated = validated;
4276 if (validated) {
4277 page->cs_tainted = tainted;
4278 }
4279}
4280
2d21ac55
A
4281void
4282vm_page_validate_cs(
4283 vm_page_t page)
4284{
4285 vm_object_t object;
4286 vm_object_offset_t offset;
4287 vm_map_offset_t koffset;
4288 vm_map_size_t ksize;
4289 vm_offset_t kaddr;
4290 kern_return_t kr;
2d21ac55
A
4291 boolean_t busy_page;
4292
4a3eedf9 4293 vm_object_lock_assert_held(page->object);
2d21ac55
A
4294
4295 if (!cs_validation) {
4296 return;
4297 }
4298
593a1d5f 4299 if (page->wpmapped && !page->cs_tainted) {
4a3eedf9
A
4300 vm_object_lock_assert_exclusive(page->object);
4301
4302 /*
593a1d5f
A
4303 * This page was mapped for "write" access sometime in the
4304 * past and could still be modifiable in the future.
4305 * Consider it tainted.
4306 * [ If the page was already found to be "tainted", no
4307 * need to re-validate. ]
4a3eedf9 4308 */
593a1d5f
A
4309 page->cs_validated = TRUE;
4310 page->cs_tainted = TRUE;
4311 if (cs_debug) {
4312 printf("CODESIGNING: vm_page_validate_cs: "
4313 "page %p obj %p off 0x%llx "
4314 "was modified\n",
4315 page, page->object, page->offset);
4a3eedf9 4316 }
593a1d5f 4317 vm_cs_validated_dirtied++;
4a3eedf9
A
4318 }
4319
4320 if (page->cs_validated) {
4321 return;
4322 }
4323
4324 vm_object_lock_assert_exclusive(page->object);
4325
2d21ac55
A
4326 object = page->object;
4327 assert(object->code_signed);
4328 offset = page->offset;
4329
4330 busy_page = page->busy;
4331 if (!busy_page) {
4332 /* keep page busy while we map (and unlock) the VM object */
4333 page->busy = TRUE;
4334 }
4335
4336 /*
4337 * Take a paging reference on the VM object
4338 * to protect it from collapse or bypass,
4339 * and keep it from disappearing too.
4340 */
4341 vm_object_paging_begin(object);
4342
4343 /* map the page in the kernel address space */
4344 koffset = 0;
4345 ksize = PAGE_SIZE_64;
4346 kr = vm_paging_map_object(&koffset,
4347 page,
4348 object,
4349 offset,
4350 &ksize,
593a1d5f 4351 VM_PROT_READ,
2d21ac55
A
4352 FALSE); /* can't unlock object ! */
4353 if (kr != KERN_SUCCESS) {
4354 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4355 }
4356 kaddr = CAST_DOWN(vm_offset_t, koffset);
4357
593a1d5f
A
4358 /* validate the mapped page */
4359 vm_page_validate_cs_mapped(page, (const void *) kaddr);
2d21ac55
A
4360
4361 assert(page->busy);
4362 assert(object == page->object);
4363 vm_object_lock_assert_exclusive(object);
4364
2d21ac55
A
4365 if (!busy_page) {
4366 PAGE_WAKEUP_DONE(page);
4367 }
4368 if (koffset != 0) {
4369 /* unmap the page from the kernel address space */
4370 vm_paging_unmap_object(object, koffset, koffset + ksize);
4371 koffset = 0;
4372 ksize = 0;
4373 kaddr = 0;
4374 }
4375 vm_object_paging_end(object);
4376}
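/*
 * A minimal sketch of how the two code-signing entry points above are meant
 * to be used: call vm_page_validate_cs_mapped() when the caller already has
 * a kernel mapping of the busy page, and vm_page_validate_cs() when it does
 * not, letting the routine map the page itself.  "page" and "kaddr" are
 * placeholders and the locking shown in the asserts above is assumed; this
 * is illustrative only, not part of the build.
 */
#if 0
	/* caller holds the object lock exclusively and the page is busy */
	vm_page_validate_cs_mapped(page, (const void *) kaddr);

	/* caller holds the object lock but has no mapping of the page */
	if (page->object->code_signed && !page->cs_validated)
		vm_page_validate_cs(page);
#endif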