/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm_fault.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Page fault handling module.
 */

#include <mach_cluster_stats.h>
#include <mach_pagemap.h>
#include <libkern/OSAtomic.h>

#include <mach/mach_types.h>
#include <mach/kern_return.h>
#include <mach/message.h>	/* for error codes */
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/memory_object.h>
				/* For memory_object_data_{request,unlock} */
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/host_statistics.h>
#include <kern/counters.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/host.h>
#include <kern/xpr.h>
#include <kern/mach_param.h>
#include <kern/macro_help.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>

#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>
#include <vm/vm_external.h>
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>	/* Needed by some vm_page.h macros */
#include <vm/vm_shared_region.h>

#define VM_FAULT_CLASSIFY	0

#define TRACEFAULTPAGE	0	/* (TEST/DEBUG) */

int	vm_object_pagein_throttle = 16;

/*
 * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 * keep the UI active so that the user has a chance to kill the offending task before the system
 * completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 */

boolean_t thread_is_io_throttled(void);

uint64_t vm_hard_throttle_threshold;

extern unsigned int dp_pages_free, dp_pages_reserve;

#define NEED_TO_HARD_THROTTLE_THIS_TASK()	(((dp_pages_free + dp_pages_reserve < 2000) && \
						 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
						 (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \
						 (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
						  (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))


#define HARD_THROTTLE_DELAY	20000	/* 20000 us == 20 ms */
#define SOFT_THROTTLE_DELAY	2000	/* 2000 us == 2 ms */

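/*
 * NEED_TO_HARD_THROTTLE_THIS_TASK() drives the hard-throttle decision in
 * vm_page_throttled() below: a fault is hard-throttled either when the
 * default pager is nearly out of backing store (dp_pages_free +
 * dp_pages_reserve < 2000) and the current task is a non-kernel task whose
 * resident size exceeds vm_hard_throttle_threshold, or when free pages have
 * dropped below vm_page_throttle_limit while the thread is already being
 * I/O throttled and the task is over the same threshold.  A throttled fault
 * is delayed by HARD_THROTTLE_DELAY (or SOFT_THROTTLE_DELAY for the milder
 * per-thread rate limit) before it is retried.
 */
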
extern int cs_debug;

boolean_t current_thread_aborted(void);

/* Forward declarations of internal routines. */
extern kern_return_t vm_fault_wire_fast(
				vm_map_t	map,
				vm_map_offset_t	va,
				vm_map_entry_t	entry,
				pmap_t		pmap,
				vm_map_offset_t	pmap_addr);

extern void vm_fault_continue(void);

extern void vm_fault_copy_cleanup(
				vm_page_t	page,
				vm_page_t	top_page);

extern void vm_fault_copy_dst_cleanup(
				vm_page_t	page);

#if	VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t	object,
			  vm_object_offset_t	offset,
			  vm_prot_t		fault_type);

extern void vm_fault_classify_init(void);
#endif

unsigned long vm_pmap_enter_blocked = 0;
unsigned long vm_pmap_enter_retried = 0;

unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;
unsigned long vm_cs_bitmap_validated = 0;
#if CONFIG_ENFORCE_SIGNED_CODE
int cs_enforcement_disable = 0;
#else
static const int cs_enforcement_disable = 1;
#endif

/*
 * Routine:	vm_fault_init
 * Purpose:
 *	Initialize our private data structures.
 */
void
vm_fault_init(void)
{
#if !SECURE_KERNEL
#if CONFIG_ENFORCE_SIGNED_CODE
	PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
			   sizeof (cs_enforcement_disable));
#endif
	PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
#endif

	/*
	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
	 */

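	/*
	 * Example: with 1GB of ram the scale factor is 35 - 1 = 34, so the
	 * threshold is 34% of sane_size; with 8GB it is 35 - 8 = 27%; at
	 * 25GB and beyond the MIN() clamp holds the percentage at its 10% floor.
	 */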
	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
}

/*
 * Routine:	vm_fault_cleanup
 * Purpose:
 *	Clean up the result of vm_fault_page.
 * Results:
 *	The paging reference for "object" is released.
 *	"object" is unlocked.
 *	If "top_page" is not null, "top_page" is
 *	freed and the paging reference for the object
 *	containing it is released.
 *
 * In/out conditions:
 *	"object" must be locked.
 */
void
vm_fault_cleanup(
	register vm_object_t	object,
	register vm_page_t	top_page)
{
	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		object = top_page->object;

		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}

#if	MACH_CLUSTER_STATS
#define MAXCLUSTERPAGES	16
struct {
	unsigned long pages_in_cluster;
	unsigned long pages_at_higher_offsets;
	unsigned long pages_at_lower_offsets;
} cluster_stats_in[MAXCLUSTERPAGES];
#define CLUSTER_STAT(clause)	clause
#define CLUSTER_STAT_HIGHER(x)	\
	((cluster_stats_in[(x)].pages_at_higher_offsets)++)
#define CLUSTER_STAT_LOWER(x)	\
	((cluster_stats_in[(x)].pages_at_lower_offsets)++)
#define CLUSTER_STAT_CLUSTER(x)	\
	((cluster_stats_in[(x)].pages_in_cluster)++)
#else	/* MACH_CLUSTER_STATS */
#define CLUSTER_STAT(clause)
#endif	/* MACH_CLUSTER_STATS */

#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)


boolean_t	vm_page_deactivate_behind = TRUE;
/*
 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 */
#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW	128
#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER	16	/* don't make this too big... */
							/* we use it to size an array on the stack */

int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;

#define MAX_SEQUENTIAL_RUN	(1024 * 1024 * 1024)

/*
 * vm_fault_is_sequential
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.
 * Update state to indicate current access pattern.
 *
 * object must have at least the shared lock held
 */
static
void
vm_fault_is_sequential(
	vm_object_t		object,
	vm_object_offset_t	offset,
	vm_behavior_t		behavior)
{
	vm_object_offset_t	last_alloc;
	int			sequential;
	int			orig_sequential;

	last_alloc = object->last_alloc;
	sequential = object->sequential;
	orig_sequential = sequential;

	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		/*
		 * reset indicator of sequential behavior
		 */
		sequential = 0;
		break;

	case VM_BEHAVIOR_SEQUENTIAL:
		if (offset && last_alloc == offset - PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < MAX_SEQUENTIAL_RUN)
				sequential += PAGE_SIZE;
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_RSEQNTL:
		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > -MAX_SEQUENTIAL_RUN)
				sequential -= PAGE_SIZE;
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_DEFAULT:
	default:
		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < 0)
				sequential = 0;
			if (sequential < MAX_SEQUENTIAL_RUN)
				sequential += PAGE_SIZE;

		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > 0)
				sequential = 0;
			if (sequential > -MAX_SEQUENTIAL_RUN)
				sequential -= PAGE_SIZE;
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;
	}
	if (sequential != orig_sequential) {
		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
			/*
			 * if someone else has already updated object->sequential
			 * don't bother trying to update it or object->last_alloc
			 */
			return;
		}
	}
	/*
	 * I'd like to do this with a OSCompareAndSwap64, but that
	 * doesn't exist for PPC...  however, it shouldn't matter
	 * that much... last_alloc is maintained so that we can determine
	 * if a sequential access pattern is taking place... if only
	 * one thread is banging on this object, no problem with the unprotected
	 * update... if 2 or more threads are banging away, we run the risk of
	 * someone seeing a mangled update... however, in the face of multiple
	 * accesses, no sequential access pattern can develop anyway, so we
	 * haven't lost any real info.
	 */
	object->last_alloc = offset;
}
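/*
 * object->sequential, updated above, encodes the current run as a signed
 * byte count: positive values grow by PAGE_SIZE for each forward sequential
 * fault, negative values for each backward one, and any non-sequential
 * access resets it to 0.  The magnitude is capped at MAX_SEQUENTIAL_RUN.
 * For example, three consecutive forward faults under VM_BEHAVIOR_DEFAULT
 * leave object->sequential == 3 * PAGE_SIZE.
 */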

int vm_page_deactivate_behind_count = 0;

/*
 * vm_fault_deactivate_behind
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.  If
 * so, compute a potential page to deactivate and
 * deactivate it.
 *
 * object must be locked.
 *
 * return TRUE if we actually deactivate a page
 */
static
boolean_t
vm_fault_deactivate_behind(
	vm_object_t		object,
	vm_object_offset_t	offset,
	vm_behavior_t		behavior)
{
	int		n;
	int		pages_in_run = 0;
	int		max_pages_in_run = 0;
	int		sequential_run;
	int		sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
	vm_object_offset_t	run_offset = 0;
	vm_object_offset_t	pg_offset = 0;
	vm_page_t	m;
	vm_page_t	page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];

	pages_in_run = 0;
#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind);	/* (TEST/DEBUG) */
#endif

	if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable.
		 * Also bail out if the deactivate-behind mechanism is disabled.
		 */
		return FALSE;
	}
	if ((sequential_run = object->sequential)) {
		if (sequential_run < 0) {
			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
			sequential_run = 0 - sequential_run;
		} else {
			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
		}
	}
	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		break;
	case VM_BEHAVIOR_SEQUENTIAL:
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = 0 - PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_RSEQNTL:
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_DEFAULT:
	default:
	{	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

		/*
		 * determine if the run of sequential access has been
		 * long enough on an object with default access behavior
		 * to consider it for deactivation
		 */
		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
			/*
			 * the comparisons between offset and behind are done
			 * in this kind of odd fashion in order to prevent wrap around
			 * at the end points
			 */
			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
				if (offset >= behind) {
					run_offset = 0 - behind;
					pg_offset = PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			} else {
				if (offset < -behind) {
					run_offset = behind;
					pg_offset = 0 - PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			}
		}
		break;
	}
	}
	for (n = 0; n < max_pages_in_run; n++) {
		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));

		if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
			page_run[pages_in_run++] = m;
			pmap_clear_reference(m->phys_page);
		}
	}
	if (pages_in_run) {
		vm_page_lockspin_queues();

		for (n = 0; n < pages_in_run; n++) {

			m = page_run[n];

			vm_page_deactivate_internal(m, FALSE);

			vm_page_deactivate_behind_count++;
#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
#endif
		}
		vm_page_unlock_queues();

		return TRUE;
	}
	return FALSE;
}
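/*
 * With the default window of 128 pages and a cluster of 16, the code above
 * waits until the forward run reaches 128 pages and then, every 16 pages
 * thereafter, deactivates the 16-page cluster that starts 128 pages behind
 * the faulting offset (mirrored for reverse-sequential runs).  Explicit
 * VM_BEHAVIOR_SEQUENTIAL/RSEQNTL mappings deactivate just the single page
 * immediately behind each fault, relative to the direction of the run.
 */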

static int
vm_page_throttled(void)
{
	clock_sec_t	elapsed_sec;
	clock_sec_t	tv_sec;
	clock_usec_t	tv_usec;

	thread_t thread = current_thread();

	if (thread->options & TH_OPT_VMPRIV)
		return (0);

	thread->t_page_creation_count++;

	if (NEED_TO_HARD_THROTTLE_THIS_TASK())
		return (HARD_THROTTLE_DELAY);

	if (vm_page_free_count < vm_page_throttle_limit &&
	    thread->t_page_creation_count > vm_page_creation_throttle) {

		clock_get_system_microtime(&tv_sec, &tv_usec);

		elapsed_sec = tv_sec - thread->t_page_creation_time;

		if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {

			if (elapsed_sec >= 60) {
				/*
				 * we'll reset our stats to give a well behaved app
				 * that was unlucky enough to accumulate a bunch of pages
				 * over a long period of time a chance to get out of
				 * the throttled state... we reset the counter and timestamp
				 * so that if it stays under the rate limit for the next second
				 * it will be back in our good graces... if it exceeds it, it
				 * will remain in the throttled state
				 */
				thread->t_page_creation_time = tv_sec;
				thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
			}
			++vm_page_throttle_count;

			return (SOFT_THROTTLE_DELAY);
		}
		thread->t_page_creation_time = tv_sec;
		thread->t_page_creation_count = 0;
	}
	return (0);
}
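/*
 * The value returned above is a delay in microseconds (0 means no
 * throttling).  The soft throttle engages once a thread has created more
 * than vm_page_creation_throttle pages while free memory is below
 * vm_page_throttle_limit, and it stays engaged while the thread's creation
 * rate exceeds roughly vm_page_creation_throttle/6 pages per second; after
 * 60 seconds the counters are reset to give the thread a chance to drop
 * back under the rate limit.
 */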

/*
 * check for various conditions that would
 * prevent us from creating a ZF page...
 * cleanup is based on being called from vm_fault_page
 *
 * object must be locked
 * object == m->object
 */
static vm_fault_return_t
vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
{
	int throttle_delay;

	if (object->shadow_severed ||
	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
		/*
		 * Either:
		 * 1. the shadow chain was severed,
		 * 2. the purgeable object is volatile or empty and is marked
		 *    to fault on access while volatile.
		 * Just have to return an error at this point
		 */
		if (m != VM_PAGE_NULL)
			VM_PAGE_FREE(m);
		vm_fault_cleanup(object, first_m);

		thread_interrupt_level(interruptible_state);

		return (VM_FAULT_MEMORY_ERROR);
	}
	if (vm_backing_store_low) {
		/*
		 * Are we protecting the system from
		 * backing store exhaustion?  If so,
		 * sleep unless we are privileged.
		 */
		if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {

			if (m != VM_PAGE_NULL)
				VM_PAGE_FREE(m);
			vm_fault_cleanup(object, first_m);

			assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);

			thread_block(THREAD_CONTINUE_NULL);
			thread_interrupt_level(interruptible_state);

			return (VM_FAULT_RETRY);
		}
	}
	if ((throttle_delay = vm_page_throttled())) {
		/*
		 * we're throttling zero-fills...
		 * treat this as if we couldn't grab a page
		 */
		if (m != VM_PAGE_NULL)
			VM_PAGE_FREE(m);
		vm_fault_cleanup(object, first_m);

		VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);

		delay(throttle_delay);

		if (current_thread_aborted()) {
			thread_interrupt_level(interruptible_state);
			return VM_FAULT_INTERRUPTED;
		}
		thread_interrupt_level(interruptible_state);

		return (VM_FAULT_MEMORY_SHORTAGE);
	}
	return (VM_FAULT_SUCCESS);
}
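/*
 * vm_fault_check() returns VM_FAULT_SUCCESS when it is OK to go ahead and
 * zero-fill.  On any other return (VM_FAULT_MEMORY_ERROR, VM_FAULT_RETRY,
 * VM_FAULT_INTERRUPTED or VM_FAULT_MEMORY_SHORTAGE) it has already freed
 * "m" (when non-NULL), run vm_fault_cleanup() and restored the interrupt
 * level, so callers simply propagate the error.
 */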

/*
 * do the work to zero fill a page and
 * inject it into the correct paging queue
 *
 * m->object must be locked
 * page queue lock must NOT be held
 */
static int
vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
{
	int my_fault = DBG_ZERO_FILL_FAULT;

	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time;  this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * we also consider it undefined
	 * with respect to instruction
	 * execution.  i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance
	 */
	m->pmapped = TRUE;

	m->cs_validated = FALSE;
	m->cs_tainted = FALSE;

	if (no_zero_fill == TRUE) {
		my_fault = DBG_NZF_PAGE_FAULT;
	} else {
		vm_page_zero_fill(m);

		VM_STAT_INCR(zero_fill_count);
		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
	}
	assert(!m->laundry);
	assert(m->object != kernel_object);
	//assert(m->pageq.next == NULL && m->pageq.prev == NULL);

	if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
	    (m->object->purgable == VM_PURGABLE_DENY ||
	     m->object->purgable == VM_PURGABLE_NONVOLATILE ||
	     m->object->purgable == VM_PURGABLE_VOLATILE )) {

		vm_page_lockspin_queues();

		assert(!VM_PAGE_WIRED(m));

		/*
		 * can't be on the pageout queue since we don't
		 * have a pager to try and clean to
		 */
		assert(!m->pageout_queue);

		VM_PAGE_QUEUES_REMOVE(m);

		queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
		m->throttled = TRUE;
		vm_page_throttled_count++;

		vm_page_unlock_queues();
	}
	return (my_fault);
}
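/*
 * The value returned by vm_fault_zero_page() is the fault type used for
 * tracing: DBG_NZF_PAGE_FAULT when the zero-fill was skipped at the
 * caller's request, DBG_ZERO_FILL_FAULT otherwise.  Note that when there is
 * no dynamic pager and the object's purgeable state is DENY, NONVOLATILE or
 * VOLATILE, the freshly created page is parked on the throttled queue,
 * since there is no pager to clean it to.
 */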

/*
 *	Routine:	vm_fault_page
 *	Purpose:
 *		Find the resident page for the virtual memory
 *		specified by the given virtual memory object
 *		and offset.
 *	Additional arguments:
 *		The required permissions for the page are given
 *		in "fault_type".  Desired permissions are included
 *		in "protection".
 *		fault_info is passed along to determine pagein cluster
 *		limits... it contains the expected reference pattern,
 *		cluster size if available, etc...
 *
 *		If the desired page is known to be resident (for
 *		example, because it was previously wired down), asserting
 *		the "unwiring" parameter will speed the search.
 *
 *		If the operation can be interrupted (by thread_abort
 *		or thread_terminate), then the "interruptible"
 *		parameter should be asserted.
 *
 *	Results:
 *		The page containing the proper data is returned
 *		in "result_page".
 *
 *	In/out conditions:
 *		The source object must be locked and referenced,
 *		and must donate one paging reference.  The reference
 *		is not affected.  The paging reference and lock are
 *		consumed.
 *
 *		If the call succeeds, the object in which "result_page"
 *		resides is left locked and holding a paging reference.
 *		If this is not the original object, a busy page in the
 *		original object is returned in "top_page", to prevent other
 *		callers from pursuing this same data, along with a paging
 *		reference for the original object.  The "top_page" should
 *		be destroyed when this guarantee is no longer required.
 *		The "result_page" is also left busy.  It is not removed
 *		from the pageout queues.
 *	Special Case:
 *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 *		fault succeeded but there's no VM page (i.e. the VM object
 *		does not actually hold VM pages, but device memory or
 *		large pages).  The object is still locked and we still hold a
 *		paging_in_progress reference.
 */
unsigned int vm_fault_page_blocked_access = 0;
unsigned int vm_fault_page_forced_retry = 0;
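/*
 * vm_fault_page_blocked_access counts faults that had to wait because the
 * object had access temporarily blocked; vm_fault_page_forced_retry counts
 * faults that threw away their placeholder page and retried in order to
 * avoid the copy-object deadlock described below.
 */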

vm_fault_return_t
vm_fault_page(
	/* Arguments: */
	vm_object_t	first_object,	/* Object to begin search */
	vm_object_offset_t first_offset,	/* Offset into object */
	vm_prot_t	fault_type,	/* What access is requested */
	boolean_t	must_be_resident,/* Must page be resident? */
	/* Modifies in place: */
	vm_prot_t	*protection,	/* Protection for mapping */
	/* Returns: */
	vm_page_t	*result_page,	/* Page found, if successful */
	vm_page_t	*top_page,	/* Page in top object, if
					 * not result_page.  */
	int		*type_of_fault, /* if non-null, fill in with type of fault
					 * COW, zero-fill, etc... returned in trace point */
	/* More arguments: */
	kern_return_t	*error_code,	/* code if page is in error */
	boolean_t	no_zero_fill,	/* don't zero fill absent pages */
#if MACH_PAGEMAP
	boolean_t	data_supply,	/* treat as data_supply if
					 * it is a write fault and a full
					 * page is provided */
#else
	__unused boolean_t data_supply,
#endif
	vm_object_fault_info_t fault_info)
{
	vm_page_t		m;
	vm_object_t		object;
	vm_object_offset_t	offset;
	vm_page_t		first_m;
	vm_object_t		next_object;
	vm_object_t		copy_object;
	boolean_t		look_for_page;
	boolean_t		force_fault_retry = FALSE;
	vm_prot_t		access_required = fault_type;
	vm_prot_t		wants_copy_flag;
	CLUSTER_STAT(int pages_at_higher_offsets;)
	CLUSTER_STAT(int pages_at_lower_offsets;)
	kern_return_t		wait_result;
	boolean_t		interruptible_state;
	boolean_t		data_already_requested = FALSE;
	vm_behavior_t		orig_behavior;
	vm_size_t		orig_cluster_size;
	vm_fault_return_t	error;
	int			my_fault;
	uint32_t		try_failed_count;
	int			interruptible; /* how may fault be interrupted? */
	memory_object_t		pager;
	vm_fault_return_t	retval;

/*
 * MACH page map - an optional optimization where a bit map is maintained
 * by the VM subsystem for internal objects to indicate which pages of
 * the object currently reside on backing store.  This existence map
 * duplicates information maintained by the vnode pager.  It is
 * created at the time of the first pageout against the object, i.e.
 * at the same time the pager for the object is created.  The optimization
 * is designed to eliminate pager interaction overhead, if it is
 * 'known' that the page does not exist on backing store.
 *
 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 * either marked as paged out in the existence map for the object or no
 * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 * criteria in the decision to invoke the pager.  It is also used as one
 * of the criteria to terminate the scan for adjacent pages in a clustered
 * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 * permanent objects.  Note also that if the pager for an internal object
 * has not been created, the pager is not invoked regardless of the value
 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 * for which a pager has been created.
 *
 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 * is marked as paged out in the existence map for the object.
 * PAGED_OUT() is used to determine if a page has already been pushed
 * into a copy object in order to avoid a redundant page out operation.
 */
#if MACH_PAGEMAP
#define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
			!= VM_EXTERNAL_STATE_ABSENT)
#define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
			== VM_EXTERNAL_STATE_EXISTS)
#else
#define MUST_ASK_PAGER(o, f) (TRUE)
#define PAGED_OUT(o, f) (FALSE)
#endif
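/*
 * Without MACH_PAGEMAP there is no existence map to consult, so
 * MUST_ASK_PAGER() is always TRUE (we always go to the pager once one has
 * been created) and PAGED_OUT() is always FALSE (we never assume a page has
 * already been pushed to a copy object).
 */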

/*
 *	Recovery actions
 */
#define RELEASE_PAGE(m)					\
	MACRO_BEGIN					\
	PAGE_WAKEUP_DONE(m);				\
	if (!m->active && !m->inactive && !m->throttled) {		\
		vm_page_lockspin_queues();				\
		if (!m->active && !m->inactive && !m->throttled)	\
			vm_page_activate(m);				\
		vm_page_unlock_queues();				\
	}							\
	MACRO_END

#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset);	/* (TEST/DEBUG) */
#endif

	interruptible = fault_info->interruptible;
	interruptible_state = thread_interrupt_level(interruptible);

	/*
	 * INVARIANTS (through entire routine):
	 *
	 *	1)	At all times, we must either have the object
	 *		lock or a busy page in some object to prevent
	 *		some other thread from trying to bring in
	 *		the same page.
	 *
	 *		Note that we cannot hold any locks during the
	 *		pager access or when waiting for memory, so
	 *		we use a busy page then.
	 *
	 *	2)	To prevent another thread from racing us down the
	 *		shadow chain and entering a new page in the top
	 *		object before we do, we must keep a busy page in
	 *		the top object while following the shadow chain.
	 *
	 *	3)	We must increment paging_in_progress on any object
	 *		for which we have a busy page before dropping
	 *		the object lock
	 *
	 *	4)	We leave busy pages on the pageout queues.
	 *		If the pageout daemon comes across a busy page,
	 *		it will remove the page from the pageout queues.
	 */

	object = first_object;
	offset = first_offset;
	first_m = VM_PAGE_NULL;
	access_required = fault_type;

	XPR(XPR_VM_FAULT,
	    "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
	    object, offset, fault_type, *protection, 0);

	/*
	 * default type of fault
	 */
	my_fault = DBG_CACHE_HIT_FAULT;

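	/*
	 * Main lookup loop: walk the shadow chain starting at first_object.
	 * Each pass either finds a usable resident page and breaks out with
	 * it busy, returns an error/retry indication after cleaning up, asks
	 * the pager for the data and loops, or steps down to object->shadow
	 * after adjusting "offset" by vo_shadow_offset.
	 */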
	while (TRUE) {
#if TRACEFAULTPAGE
		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
		if (!object->alive) {
			/*
			 * object is no longer valid
			 * clean up and return error
			 */
			vm_fault_cleanup(object, first_m);
			thread_interrupt_level(interruptible_state);

			return (VM_FAULT_MEMORY_ERROR);
		}

		if (!object->pager_created && object->phys_contiguous) {
			/*
			 * A physically-contiguous object without a pager:
			 * must be a "large page" object.  We do not deal
			 * with VM pages for this object.
			 */
			m = VM_PAGE_NULL;
			goto phys_contig_object;
		}

		if (object->blocked_access) {
			/*
			 * Access to this VM object has been blocked.
			 * Replace our "paging_in_progress" reference with
			 * a "activity_in_progress" reference and wait for
			 * access to be unblocked.
			 */
			vm_object_activity_begin(object);
			vm_object_paging_end(object);
			while (object->blocked_access) {
				vm_object_sleep(object,
						VM_OBJECT_EVENT_UNBLOCKED,
						THREAD_UNINT);
			}
			vm_fault_page_blocked_access++;
			vm_object_paging_begin(object);
			vm_object_activity_end(object);
		}

		/*
		 * See whether the page at 'offset' is resident
		 */
		m = vm_page_lookup(object, offset);
#if TRACEFAULTPAGE
		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
#endif
		if (m != VM_PAGE_NULL) {

			if (m->busy) {
				/*
				 * The page is being brought in,
				 * wait for it and then retry.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				wait_result = PAGE_SLEEP(object, m, interruptible);

				XPR(XPR_VM_FAULT,
				    "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
				    object, offset,
				    m, 0, 0);
				counter(c_vm_fault_page_block_busy_kernel++);

				if (wait_result != THREAD_AWAKENED) {
					vm_fault_cleanup(object, first_m);
					thread_interrupt_level(interruptible_state);

					if (wait_result == THREAD_RESTART)
						return (VM_FAULT_RETRY);
					else
						return (VM_FAULT_INTERRUPTED);
				}
				continue;
			}
			if (m->laundry) {
				m->pageout = FALSE;

				if (!m->cleaning)
					vm_pageout_steal_laundry(m, FALSE);
			}
			if (m->phys_page == vm_page_guard_addr) {
				/*
				 * Guard page: off limits !
				 */
				if (fault_type == VM_PROT_NONE) {
					/*
					 * The fault is not requesting any
					 * access to the guard page, so it must
					 * be just to wire or unwire it.
					 * Let's pretend it succeeded...
					 */
					m->busy = TRUE;
					*result_page = m;
					assert(first_m == VM_PAGE_NULL);
					*top_page = first_m;
					if (type_of_fault)
						*type_of_fault = DBG_GUARD_FAULT;
					return VM_FAULT_SUCCESS;
				} else {
					/*
					 * The fault requests access to the
					 * guard page: let's deny that !
					 */
					vm_fault_cleanup(object, first_m);
					thread_interrupt_level(interruptible_state);
					return VM_FAULT_MEMORY_ERROR;
				}
			}

			if (m->error) {
				/*
				 * The page is in error, give up now.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);	/* (TEST/DEBUG) */
#endif
				if (error_code)
					*error_code = KERN_MEMORY_ERROR;
				VM_PAGE_FREE(m);

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_MEMORY_ERROR);
			}
			if (m->restart) {
				/*
				 * The pager wants us to restart
				 * at the top of the chain,
				 * typically because it has moved the
				 * page to another pager, then do so.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				VM_PAGE_FREE(m);

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_RETRY);
			}
			if (m->absent) {
				/*
				 * The page isn't busy, but is absent,
				 * therefore it's deemed "unavailable".
				 *
				 * Remove the non-existent page (unless it's
				 * in the top object) and move on down to the
				 * next object (if there is one).
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);	/* (TEST/DEBUG) */
#endif
				next_object = object->shadow;

				if (next_object == VM_OBJECT_NULL) {
					/*
					 * Absent page at bottom of shadow
					 * chain; zero fill the page we left
					 * busy in the first object, and free
					 * the absent page.
					 */
					assert(!must_be_resident);

					/*
					 * check for any conditions that prevent
					 * us from creating a new zero-fill page
					 * vm_fault_check will do all of the
					 * fault cleanup in the case of an error condition
					 * including resetting the thread_interrupt_level
					 */
					error = vm_fault_check(object, m, first_m, interruptible_state);

					if (error != VM_FAULT_SUCCESS)
						return (error);

					XPR(XPR_VM_FAULT,
					    "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
					    object, offset,
					    m,
					    first_object, 0);

					if (object != first_object) {
						/*
						 * free the absent page we just found
						 */
						VM_PAGE_FREE(m);

						/*
						 * drop reference and lock on current object
						 */
						vm_object_paging_end(object);
						vm_object_unlock(object);

						/*
						 * grab the original page we
						 * 'soldered' in place and
						 * retake lock on 'first_object'
						 */
						m = first_m;
						first_m = VM_PAGE_NULL;

						object = first_object;
						offset = first_offset;

						vm_object_lock(object);
					} else {
						/*
						 * we're going to use the absent page we just found
						 * so convert it to a 'busy' page
						 */
						m->absent = FALSE;
						m->busy = TRUE;
					}
					/*
					 * zero-fill the page and put it on
					 * the correct paging queue
					 */
					my_fault = vm_fault_zero_page(m, no_zero_fill);

					if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
						m->absent = TRUE;

					break;
				} else {
					if (must_be_resident)
						vm_object_paging_end(object);
					else if (object != first_object) {
						vm_object_paging_end(object);
						VM_PAGE_FREE(m);
					} else {
						first_m = m;
						m->absent = FALSE;
						m->busy = TRUE;

						vm_page_lockspin_queues();

						assert(!m->pageout_queue);
						VM_PAGE_QUEUES_REMOVE(m);

						vm_page_unlock_queues();
					}
					XPR(XPR_VM_FAULT,
					    "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
					    object, offset,
					    next_object,
					    offset+object->vo_shadow_offset,0);

					offset += object->vo_shadow_offset;
					fault_info->lo_offset += object->vo_shadow_offset;
					fault_info->hi_offset += object->vo_shadow_offset;
					access_required = VM_PROT_READ;

					vm_object_lock(next_object);
					vm_object_unlock(object);
					object = next_object;
					vm_object_paging_begin(object);

					/*
					 * reset to default type of fault
					 */
					my_fault = DBG_CACHE_HIT_FAULT;

					continue;
				}
			}
			if ((m->cleaning)
			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
			    && (fault_type & VM_PROT_WRITE)) {
				/*
				 * This is a copy-on-write fault that will
				 * cause us to revoke access to this page, but
				 * this page is in the process of being cleaned
				 * in a clustered pageout. We must wait until
				 * the cleaning operation completes before
				 * revoking access to the original page,
				 * otherwise we might attempt to remove a
				 * wired mapping.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);	/* (TEST/DEBUG) */
#endif
				XPR(XPR_VM_FAULT,
				    "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
				    object, offset,
				    m, 0, 0);
				/*
				 * take an extra ref so that object won't die
				 */
				vm_object_reference_locked(object);

				vm_fault_cleanup(object, first_m);

				counter(c_vm_fault_page_block_backoff_kernel++);
				vm_object_lock(object);
				assert(object->ref_count > 0);

				m = vm_page_lookup(object, offset);

				if (m != VM_PAGE_NULL && m->cleaning) {
					PAGE_ASSERT_WAIT(m, interruptible);

					vm_object_unlock(object);
					wait_result = thread_block(THREAD_CONTINUE_NULL);
					vm_object_deallocate(object);

					goto backoff;
				} else {
					vm_object_unlock(object);

					vm_object_deallocate(object);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_RETRY);
				}
			}
			if (type_of_fault == NULL && m->speculative &&
			    !(fault_info != NULL && fault_info->stealth)) {
				/*
				 * If we were passed a non-NULL pointer for
				 * "type_of_fault", then we came from
				 * vm_fault... we'll let it deal with
				 * this condition, since it
				 * needs to see m->speculative to correctly
				 * account the pageins, otherwise...
				 * take it off the speculative queue, we'll
				 * let the caller of vm_fault_page deal
				 * with getting it onto the correct queue
				 *
				 * If the caller specified in fault_info that
				 * it wants a "stealth" fault, we also leave
				 * the page in the speculative queue.
				 */
				vm_page_lockspin_queues();
				if (m->speculative)
					VM_PAGE_QUEUES_REMOVE(m);
				vm_page_unlock_queues();
			}

			if (m->encrypted) {
				/*
				 * ENCRYPTED SWAP:
				 * the user needs access to a page that we
				 * encrypted before paging it out.
				 * Decrypt the page now.
				 * Keep it busy to prevent anyone from
				 * accessing it during the decryption.
				 */
				m->busy = TRUE;
				vm_page_decrypt(m, 0);
				assert(object == m->object);
				assert(m->busy);
				PAGE_WAKEUP_DONE(m);

				/*
				 * Retry from the top, in case
				 * something changed while we were
				 * decrypting.
				 */
				continue;
			}
			ASSERT_PAGE_DECRYPTED(m);

			if (m->object->code_signed) {
				/*
				 * CODE SIGNING:
				 * We just paged in a page from a signed
				 * memory object but we don't need to
				 * validate it now.  We'll validate it
				 * when it gets mapped into a user address
				 * space for the first time or when the page
				 * gets copied to another object as a result
				 * of a copy-on-write.
				 */
			}

			/*
			 * We mark the page busy and leave it on
			 * the pageout queues.  If the pageout
			 * daemon comes across it, then it will
			 * remove the page from the queue, but not the object
			 */
#if TRACEFAULTPAGE
			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
			XPR(XPR_VM_FAULT,
			    "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
			    object, offset, m, 0, 0);
			assert(!m->busy);
			assert(!m->absent);

			m->busy = TRUE;
			break;
		}

		/*
		 * we get here when there is no page present in the object at
		 * the offset we're interested in... we'll allocate a page
		 * at this point if the pager associated with
		 * this object can provide the data or we're the top object...
		 * object is locked; m == NULL
		 */
		if (must_be_resident)
			goto dont_look_for_page;

		look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);

#if TRACEFAULTPAGE
		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);	/* (TEST/DEBUG) */
#endif
		if (!look_for_page && object == first_object && !object->phys_contiguous) {
			/*
			 * Allocate a new page for this object/offset pair as a placeholder
			 */
			m = vm_page_grab();
#if TRACEFAULTPAGE
			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
#endif
			if (m == VM_PAGE_NULL) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_MEMORY_SHORTAGE);
			}

			if (fault_info && fault_info->batch_pmap_op == TRUE) {
				vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
			} else {
				vm_page_insert(m, object, offset);
			}
		}
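		/*
		 * A pager exists and may have the data: ask it.  The waits in
		 * this block follow a common pattern: take an extra reference
		 * so the object can't be destroyed, drop our paging reference
		 * via vm_fault_cleanup(), re-lock and re-check, then either
		 * "goto backoff" to retry the fault or bail out with
		 * VM_FAULT_RETRY.
		 */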
		if (look_for_page) {
			kern_return_t	rc;

			/*
			 * If the memory manager is not ready, we
			 * cannot make requests.
			 */
			if (!object->pager_ready) {
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				if (m != VM_PAGE_NULL)
					VM_PAGE_FREE(m);

				XPR(XPR_VM_FAULT,
				    "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
				    object, offset, 0, 0, 0);

				/*
				 * take an extra ref so object won't die
				 */
				vm_object_reference_locked(object);
				vm_fault_cleanup(object, first_m);
				counter(c_vm_fault_page_block_backoff_kernel++);

				vm_object_lock(object);
				assert(object->ref_count > 0);

				if (!object->pager_ready) {
					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);

					vm_object_unlock(object);
					if (wait_result == THREAD_WAITING)
						wait_result = thread_block(THREAD_CONTINUE_NULL);
					vm_object_deallocate(object);

					goto backoff;
				} else {
					vm_object_unlock(object);
					vm_object_deallocate(object);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_RETRY);
				}
			}
			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
				/*
				 * If there are too many outstanding page
				 * requests pending on this external object, we
				 * wait for them to be resolved now.
				 */
#if TRACEFAULTPAGE
				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif
				if (m != VM_PAGE_NULL)
					VM_PAGE_FREE(m);
				/*
				 * take an extra ref so object won't die
				 */
				vm_object_reference_locked(object);

				vm_fault_cleanup(object, first_m);

				counter(c_vm_fault_page_block_backoff_kernel++);

				vm_object_lock(object);
				assert(object->ref_count > 0);

				if (object->paging_in_progress >= vm_object_pagein_throttle) {
					vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);

					vm_object_unlock(object);
					wait_result = thread_block(THREAD_CONTINUE_NULL);
					vm_object_deallocate(object);

					goto backoff;
				} else {
					vm_object_unlock(object);
					vm_object_deallocate(object);
					thread_interrupt_level(interruptible_state);

					return (VM_FAULT_RETRY);
				}
			}
			if (m != VM_PAGE_NULL) {
				VM_PAGE_FREE(m);
				m = VM_PAGE_NULL;
			}

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);	/* (TEST/DEBUG) */
#endif

			/*
			 * It's possible someone called vm_object_destroy while we weren't
			 * holding the object lock.  If that has happened, then bail out
			 * here.
			 */

			pager = object->pager;

			if (pager == MEMORY_OBJECT_NULL) {
				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);
				return VM_FAULT_MEMORY_ERROR;
			}

			/*
			 * We have an absent page in place for the faulting offset,
			 * so we can release the object lock.
			 */

			vm_object_unlock(object);

			/*
			 * If this object uses a copy_call strategy,
			 * and we are interested in a copy of this object
			 * (having gotten here only by following a
			 * shadow chain), then tell the memory manager
			 * via a flag added to the desired_access
			 * parameter, so that it can detect a race
			 * between our walking down the shadow chain
			 * and its pushing pages up into a copy of
			 * the object that it manages.
			 */
			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
				wants_copy_flag = VM_PROT_WANTS_COPY;
			else
				wants_copy_flag = VM_PROT_NONE;

			XPR(XPR_VM_FAULT,
			    "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
			    object, offset, m,
			    access_required | wants_copy_flag, 0);

			if (object->copy == first_object) {
				/*
				 * if we issue the memory_object_data_request in
				 * this state, we are subject to a deadlock with
				 * the underlying filesystem if it is trying to
				 * shrink the file resulting in a push of pages
				 * into the copy object...  that push will stall
				 * on the placeholder page, and if the pushing thread
				 * is holding a lock that is required on the pagein
				 * path (such as a truncate lock), we'll deadlock...
				 * to avoid this potential deadlock, we throw away
				 * our placeholder page before calling memory_object_data_request
				 * and force this thread to retry the vm_fault_page after
				 * we have issued the I/O.  the second time through this path
				 * we will find the page already in the cache (presumably still
				 * busy waiting for the I/O to complete) and then complete
				 * the fault w/o having to go through memory_object_data_request again
				 */
				assert(first_m != VM_PAGE_NULL);
				assert(first_m->object == first_object);

				vm_object_lock(first_object);
				VM_PAGE_FREE(first_m);
				vm_object_paging_end(first_object);
				vm_object_unlock(first_object);

				first_m = VM_PAGE_NULL;
				force_fault_retry = TRUE;

				vm_fault_page_forced_retry++;
			}

			if (data_already_requested == TRUE) {
				orig_behavior = fault_info->behavior;
				orig_cluster_size = fault_info->cluster_size;

				fault_info->behavior = VM_BEHAVIOR_RANDOM;
				fault_info->cluster_size = PAGE_SIZE;
			}
			/*
			 * Call the memory manager to retrieve the data.
			 */
			rc = memory_object_data_request(
				pager,
				offset + object->paging_offset,
				PAGE_SIZE,
				access_required | wants_copy_flag,
				(memory_object_fault_info_t)fault_info);

			if (data_already_requested == TRUE) {
				fault_info->behavior = orig_behavior;
				fault_info->cluster_size = orig_cluster_size;
			} else
				data_already_requested = TRUE;

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc);	/* (TEST/DEBUG) */
#endif
			vm_object_lock(object);

			if (rc != KERN_SUCCESS) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return ((rc == MACH_SEND_INTERRUPTED) ?
					VM_FAULT_INTERRUPTED :
					VM_FAULT_MEMORY_ERROR);
			} else {
				clock_sec_t	tv_sec;
				clock_usec_t	tv_usec;

				clock_get_system_microtime(&tv_sec, &tv_usec);
				current_thread()->t_page_creation_time = tv_sec;
				current_thread()->t_page_creation_count = 0;
			}
			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_INTERRUPTED);
			}
			if (force_fault_retry == TRUE) {

				vm_fault_cleanup(object, first_m);
				thread_interrupt_level(interruptible_state);

				return (VM_FAULT_RETRY);
			}
			if (m == VM_PAGE_NULL && object->phys_contiguous) {
				/*
				 * No page here means that the object we
				 * initially looked up was "physically
				 * contiguous" (i.e. device memory).  However,
				 * with Virtual VRAM, the object might not
				 * be backed by that device memory anymore,
				 * so we're done here only if the object is
				 * still "phys_contiguous".
				 * Otherwise, if the object is no longer
				 * "phys_contiguous", we need to retry the
				 * page fault against the object's new backing
				 * store (different memory object).
				 */
			phys_contig_object:
				goto done;
			}
			/*
			 * potentially a pagein fault
			 * if we make it through the state checks
			 * above, then we'll count it as such
			 */
			my_fault = DBG_PAGEIN_FAULT;

			/*
			 * Retry with same object/offset, since new data may
			 * be in a different page (i.e., m is meaningless at
			 * this point).
			 */
			continue;
		}
316670eb 1611dont_look_for_page:
1c79356b 1612 /*
2d21ac55
A
1613 * We get here if the object has no pager, or an existence map
1614 * exists and indicates the page isn't present on the pager
1615 * or we're unwiring a page. If a pager exists, but there
1616 * is no existence map, then the m->absent case above handles
1617 * the ZF case when the pager can't provide the page
1c79356b
A
1618 */
1619#if TRACEFAULTPAGE
1620 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1621#endif
1622 if (object == first_object)
1623 first_m = m;
1624 else
1625 assert(m == VM_PAGE_NULL);
1626
1627 XPR(XPR_VM_FAULT,
1628 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
b0d623f7
A
1629 object, offset, m,
1630 object->shadow, 0);
2d21ac55 1631
1c79356b 1632 next_object = object->shadow;
2d21ac55 1633
1c79356b 1634 if (next_object == VM_OBJECT_NULL) {
1c79356b 1635 /*
2d21ac55
A
1636 * we've hit the bottom of the shadown chain,
1637 * fill the page in the top object with zeros.
1c79356b 1638 */
2d21ac55 1639 assert(!must_be_resident);
1c79356b
A
1640
1641 if (object != first_object) {
1642 vm_object_paging_end(object);
1643 vm_object_unlock(object);
1644
1645 object = first_object;
1646 offset = first_offset;
1647 vm_object_lock(object);
1648 }
1c79356b
A
1649 m = first_m;
1650 assert(m->object == object);
1651 first_m = VM_PAGE_NULL;
1652
55e303ae 1653 /*
2d21ac55
A
1654 * check for any conditions that prevent
1655 * us from creating a new zero-fill page
1656 * vm_fault_check will do all of the
1657 * fault cleanup in the case of an error condition
1658 * including resetting the thread_interrupt_level
55e303ae 1659 */
2d21ac55 1660 error = vm_fault_check(object, m, first_m, interruptible_state);
55e303ae 1661
2d21ac55
A
1662 if (error != VM_FAULT_SUCCESS)
1663 return (error);
55e303ae 1664
2d21ac55
A
1665 if (m == VM_PAGE_NULL) {
1666 m = vm_page_grab();
1c79356b 1667
2d21ac55
A
1668 if (m == VM_PAGE_NULL) {
1669 vm_fault_cleanup(object, VM_PAGE_NULL);
1670 thread_interrupt_level(interruptible_state);
55e303ae 1671
2d21ac55
A
1672 return (VM_FAULT_MEMORY_SHORTAGE);
1673 }
1674 vm_page_insert(m, object, offset);
0b4e3aa0 1675 }
2d21ac55
A
1676 my_fault = vm_fault_zero_page(m, no_zero_fill);
1677
0b4c1975
A
1678 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1679 m->absent = TRUE;
1c79356b 1680 break;
2d21ac55
A
1681
1682 } else {
1683 /*
1684 * Move on to the next object. Lock the next
1685 * object before unlocking the current one.
1686 */
1c79356b
A
1687 if ((object != first_object) || must_be_resident)
1688 vm_object_paging_end(object);
2d21ac55 1689
6d2010ae
A
1690 offset += object->vo_shadow_offset;
1691 fault_info->lo_offset += object->vo_shadow_offset;
1692 fault_info->hi_offset += object->vo_shadow_offset;
1c79356b 1693 access_required = VM_PROT_READ;
2d21ac55 1694
1c79356b
A
1695 vm_object_lock(next_object);
1696 vm_object_unlock(object);
2d21ac55 1697
1c79356b
A
1698 object = next_object;
1699 vm_object_paging_begin(object);
1700 }
1701 }
1702
1703 /*
1704 * PAGE HAS BEEN FOUND.
1705 *
1706 * This page (m) is:
1707 * busy, so that we can play with it;
1708 * not absent, so that nobody else will fill it;
1709 * possibly eligible for pageout;
1710 *
1711 * The top-level page (first_m) is:
1712 * VM_PAGE_NULL if the page was found in the
1713 * top-level object;
1714 * busy, not absent, and ineligible for pageout.
1715 *
1716 * The current object (object) is locked. A paging
1717 * reference is held for the current and top-level
1718 * objects.
1719 */
1720
1721#if TRACEFAULTPAGE
1722 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1723#endif
1724#if EXTRA_ASSERTIONS
b0d623f7
A
1725 assert(m->busy && !m->absent);
1726 assert((first_m == VM_PAGE_NULL) ||
1727 (first_m->busy && !first_m->absent &&
1728 !first_m->active && !first_m->inactive));
1c79356b
A
1729#endif /* EXTRA_ASSERTIONS */
1730
91447636
A
1731 /*
1732 * ENCRYPTED SWAP:
1733 * If we found a page, we must have decrypted it before we
1734 * get here...
1735 */
b0d623f7 1736 ASSERT_PAGE_DECRYPTED(m);
91447636 1737
1c79356b 1738 XPR(XPR_VM_FAULT,
2d21ac55 1739 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
b0d623f7
A
1740 object, offset, m,
1741 first_object, first_m);
2d21ac55 1742
1c79356b 1743 /*
2d21ac55
A
1744 * If the page is being written, but isn't
1745 * already owned by the top-level object,
1746 * we have to copy it into a new page owned
1747 * by the top-level object.
1c79356b 1748 */
b0d623f7 1749 if (object != first_object) {
1c79356b
A
1750
1751#if TRACEFAULTPAGE
2d21ac55 1752 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1c79356b
A
1753#endif
1754 if (fault_type & VM_PROT_WRITE) {
1755 vm_page_t copy_m;
1756
2d21ac55
A
1757 /*
1758 * We only really need to copy if we
1759 * want to write it.
1760 */
1c79356b
A
1761 assert(!must_be_resident);
1762
55e303ae
A
1763 /*
1764 * are we protecting the system from
1765 * backing store exhaustion? If so,
1766 * sleep unless we are privileged.
1767 */
2d21ac55
A
1768 if (vm_backing_store_low) {
1769 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
55e303ae 1770
55e303ae
A
1771 RELEASE_PAGE(m);
1772 vm_fault_cleanup(object, first_m);
2d21ac55
A
1773
1774 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1775
91447636 1776 thread_block(THREAD_CONTINUE_NULL);
2d21ac55
A
1777 thread_interrupt_level(interruptible_state);
1778
1779 return (VM_FAULT_RETRY);
55e303ae
A
1780 }
1781 }
1c79356b 1782 /*
2d21ac55
A
1783 * If we try to collapse first_object at this
1784 * point, we may deadlock when we try to get
1785 * the lock on an intermediate object (since we
1786 * have the bottom object locked). We can't
1787 * unlock the bottom object, because the page
1788 * we found may move (by collapse) if we do.
1c79356b 1789 *
2d21ac55
A
1790 * Instead, we first copy the page. Then, when
1791 * we have no more use for the bottom object,
1792 * we unlock it and try to collapse.
1c79356b 1793 *
2d21ac55
A
1794 * Note that we copy the page even if we didn't
1795 * need to... that's the breaks.
1c79356b
A
1796 */
1797
1798 /*
2d21ac55 1799 * Allocate a page for the copy
1c79356b
A
1800 */
1801 copy_m = vm_page_grab();
2d21ac55 1802
1c79356b
A
1803 if (copy_m == VM_PAGE_NULL) {
1804 RELEASE_PAGE(m);
2d21ac55 1805
1c79356b 1806 vm_fault_cleanup(object, first_m);
9bccf70c 1807 thread_interrupt_level(interruptible_state);
1c79356b 1808
2d21ac55
A
1809 return (VM_FAULT_MEMORY_SHORTAGE);
1810 }
1c79356b
A
1811 XPR(XPR_VM_FAULT,
1812 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
b0d623f7
A
1813 object, offset,
1814 m, copy_m, 0);
2d21ac55 1815
1c79356b
A
1816 vm_page_copy(m, copy_m);
1817
1818 /*
2d21ac55
A
1819 * If another map is truly sharing this
1820 * page with us, we have to flush all
1821 * uses of the original page, since we
1822 * can't distinguish those which want the
1823 * original from those which need the
1824 * new copy.
1c79356b 1825 *
2d21ac55
A
1826 * XXXO If we know that only one map has
1827 * access to this page, then we could
1828 * avoid the pmap_disconnect() call.
1c79356b 1829 */
2d21ac55
A
1830 if (m->pmapped)
1831 pmap_disconnect(m->phys_page);
1c79356b 1832
1c79356b 1833 assert(!m->cleaning);
1c79356b
A
1834
1835 /*
2d21ac55 1836 * We no longer need the old page or object.
1c79356b 1837 */
1c79356b
A
1838 PAGE_WAKEUP_DONE(m);
1839 vm_object_paging_end(object);
1840 vm_object_unlock(object);
1841
2d21ac55
A
1842 my_fault = DBG_COW_FAULT;
1843 VM_STAT_INCR(cow_faults);
1844 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1c79356b 1845 current_task()->cow_faults++;
2d21ac55 1846
1c79356b
A
1847 object = first_object;
1848 offset = first_offset;
1849
1850 vm_object_lock(object);
2d21ac55
A
1851 /*
1852 * get rid of the place holder
1853 * page that we soldered in earlier
1854 */
1c79356b
A
1855 VM_PAGE_FREE(first_m);
1856 first_m = VM_PAGE_NULL;
2d21ac55
A
1857
1858 /*
1859 * and replace it with the
1860 * page we just copied into
1861 */
1c79356b
A
1862 assert(copy_m->busy);
1863 vm_page_insert(copy_m, object, offset);
316670eb 1864 SET_PAGE_DIRTY(copy_m, TRUE);
1c79356b 1865
2d21ac55 1866 m = copy_m;
1c79356b 1867 /*
2d21ac55
A
1868 * Now that we've gotten the copy out of the
1869 * way, let's try to collapse the top object.
1870 * But we have to play ugly games with
1871 * paging_in_progress to do that...
1c79356b 1872 */
1c79356b 1873 vm_object_paging_end(object);
0c530ab8 1874 vm_object_collapse(object, offset, TRUE);
1c79356b
A
1875 vm_object_paging_begin(object);
1876
2d21ac55 1877 } else
1c79356b 1878 *protection &= (~VM_PROT_WRITE);
1c79356b 1879 }
1c79356b 1880 /*
2d21ac55
A
1881 * Now check whether the page needs to be pushed into the
1882 * copy object. The use of asymmetric copy on write for
1883 * shared temporary objects means that we may do two copies to
1884 * satisfy the fault; one above to get the page from a
1885 * shadowed object, and one here to push it into the copy.
1c79356b 1886 */
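	/*
	 * Illustrative example, added for exposition (not part of the
	 * original source): suppose the region backed by first_object has
	 * been snapshotted with a symmetric copy strategy, so that
	 * first_object->copy points at the snapshot's copy object.  A write
	 * fault on a page living further down the shadow chain first copies
	 * it up into first_object (the COW copy above), and the loop below
	 * then pushes the unmodified contents into the copy object so the
	 * snapshot still sees the old data.
	 */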
2d21ac55 1887 try_failed_count = 0;
1c79356b 1888
b0d623f7 1889 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1c79356b
A
1890 vm_object_offset_t copy_offset;
1891 vm_page_t copy_m;
1892
1893#if TRACEFAULTPAGE
1894 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1895#endif
1896 /*
2d21ac55
A
1897 * If the page is being written, but hasn't been
1898 * copied to the copy-object, we have to copy it there.
1c79356b 1899 */
1c79356b
A
1900 if ((fault_type & VM_PROT_WRITE) == 0) {
1901 *protection &= ~VM_PROT_WRITE;
1902 break;
1903 }
1904
1905 /*
2d21ac55
A
1906 * If the page was guaranteed to be resident,
1907 * we must have already performed the copy.
1c79356b 1908 */
1c79356b
A
1909 if (must_be_resident)
1910 break;
1911
1912 /*
2d21ac55 1913 * Try to get the lock on the copy_object.
1c79356b
A
1914 */
1915 if (!vm_object_lock_try(copy_object)) {
1c79356b 1916
2d21ac55
A
1917 vm_object_unlock(object);
1918 try_failed_count++;
1c79356b 1919
2d21ac55 1920 mutex_pause(try_failed_count); /* wait a bit */
1c79356b 1921 vm_object_lock(object);
2d21ac55 1922
1c79356b
A
1923 continue;
1924 }
2d21ac55 1925 try_failed_count = 0;
1c79356b
A
1926
1927 /*
2d21ac55
A
1928 * Make another reference to the copy-object,
1929 * to keep it from disappearing during the
1930 * copy.
1c79356b 1931 */
2d21ac55 1932 vm_object_reference_locked(copy_object);
1c79356b
A
1933
1934 /*
2d21ac55 1935 * Does the page exist in the copy?
1c79356b 1936 */
6d2010ae 1937 copy_offset = first_offset - copy_object->vo_shadow_offset;
2d21ac55 1938
6d2010ae 1939 if (copy_object->vo_size <= copy_offset)
1c79356b
A
1940 /*
1941 * Copy object doesn't cover this page -- do nothing.
1942 */
1943 ;
2d21ac55
A
1944 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1945 /*
1946 * Page currently exists in the copy object
1947 */
1c79356b
A
1948 if (copy_m->busy) {
1949 /*
2d21ac55
A
1950 * If the page is being brought
1951 * in, wait for it and then retry.
1c79356b
A
1952 */
1953 RELEASE_PAGE(m);
2d21ac55
A
1954
1955 /*
1956 * take an extra ref so object won't die
1957 */
1958 vm_object_reference_locked(copy_object);
1c79356b
A
1959 vm_object_unlock(copy_object);
1960 vm_fault_cleanup(object, first_m);
1961 counter(c_vm_fault_page_block_backoff_kernel++);
2d21ac55 1962
1c79356b
A
1963 vm_object_lock(copy_object);
1964 assert(copy_object->ref_count > 0);
1965 VM_OBJ_RES_DECR(copy_object);
2d21ac55 1966 vm_object_lock_assert_exclusive(copy_object);
1c79356b
A
1967 copy_object->ref_count--;
1968 assert(copy_object->ref_count > 0);
1969 copy_m = vm_page_lookup(copy_object, copy_offset);
91447636
A
1970 /*
1971 * ENCRYPTED SWAP:
1972 * it's OK if the "copy_m" page is encrypted,
1973 * because we're not moving it nor handling its
1974 * contents.
1975 */
1c79356b
A
1976 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1977 PAGE_ASSERT_WAIT(copy_m, interruptible);
2d21ac55 1978
1c79356b 1979 vm_object_unlock(copy_object);
9bccf70c 1980 wait_result = thread_block(THREAD_CONTINUE_NULL);
1c79356b 1981 vm_object_deallocate(copy_object);
2d21ac55 1982
1c79356b
A
1983 goto backoff;
1984 } else {
1985 vm_object_unlock(copy_object);
1986 vm_object_deallocate(copy_object);
9bccf70c 1987 thread_interrupt_level(interruptible_state);
2d21ac55
A
1988
1989 return (VM_FAULT_RETRY);
1c79356b
A
1990 }
1991 }
1992 }
1993 else if (!PAGED_OUT(copy_object, copy_offset)) {
1994 /*
1995 * If PAGED_OUT is TRUE, then the page used to exist
1996 * in the copy-object, and has already been paged out.
1997 * We don't need to repeat this. If PAGED_OUT is
1998 * FALSE, then either we don't know (!pager_created,
1999 * for example) or it hasn't been paged out.
2000 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2001 * We must copy the page to the copy object.
2002 */
2003
2d21ac55
A
2004 if (vm_backing_store_low) {
2005 /*
2006 * we are protecting the system from
2007 * backing store exhaustion, so
2008 * sleep unless we are privileged.
2009 */
2010 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2011 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
55e303ae 2012
55e303ae
A
2013 RELEASE_PAGE(m);
2014 VM_OBJ_RES_DECR(copy_object);
2d21ac55 2015 vm_object_lock_assert_exclusive(copy_object);
55e303ae
A
2016 copy_object->ref_count--;
2017 assert(copy_object->ref_count > 0);
2d21ac55 2018
55e303ae
A
2019 vm_object_unlock(copy_object);
2020 vm_fault_cleanup(object, first_m);
91447636 2021 thread_block(THREAD_CONTINUE_NULL);
2d21ac55
A
2022 thread_interrupt_level(interruptible_state);
2023
2024 return (VM_FAULT_RETRY);
55e303ae
A
2025 }
2026 }
1c79356b 2027 /*
2d21ac55 2028 * Allocate a page for the copy
1c79356b
A
2029 */
2030 copy_m = vm_page_alloc(copy_object, copy_offset);
2d21ac55 2031
1c79356b
A
2032 if (copy_m == VM_PAGE_NULL) {
2033 RELEASE_PAGE(m);
2d21ac55 2034
1c79356b 2035 VM_OBJ_RES_DECR(copy_object);
2d21ac55 2036 vm_object_lock_assert_exclusive(copy_object);
1c79356b
A
2037 copy_object->ref_count--;
2038 assert(copy_object->ref_count > 0);
2d21ac55 2039
1c79356b
A
2040 vm_object_unlock(copy_object);
2041 vm_fault_cleanup(object, first_m);
9bccf70c 2042 thread_interrupt_level(interruptible_state);
1c79356b 2043
2d21ac55
A
2044 return (VM_FAULT_MEMORY_SHORTAGE);
2045 }
1c79356b 2046 /*
2d21ac55 2047 * Must copy page into copy-object.
1c79356b 2048 */
1c79356b
A
2049 vm_page_copy(m, copy_m);
2050
2051 /*
2d21ac55
A
2052 * If the old page was in use by any users
2053 * of the copy-object, it must be removed
2054 * from all pmaps. (We can't know which
2055 * pmaps use it.)
1c79356b 2056 */
2d21ac55
A
2057 if (m->pmapped)
2058 pmap_disconnect(m->phys_page);
1c79356b
A
2059
2060 /*
2d21ac55
A
2061 * If there's a pager, then immediately
2062 * page out this page, using the "initialize"
2063 * option. Else, we use the copy.
1c79356b 2064 */
2d21ac55
A
2065 if ((!copy_object->pager_created)
2066#if MACH_PAGEMAP
2067 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1c79356b 2068#endif
2d21ac55
A
2069 ) {
2070
2071 vm_page_lockspin_queues();
2072 assert(!m->cleaning);
1c79356b
A
2073 vm_page_activate(copy_m);
2074 vm_page_unlock_queues();
2d21ac55 2075
316670eb 2076 SET_PAGE_DIRTY(copy_m, TRUE);
1c79356b 2077 PAGE_WAKEUP_DONE(copy_m);
316670eb
A
2078
2079 } else if (copy_object->internal) {
2080 /*
2081 * For internal objects check with the pager to see
2082 * if the page already exists in the backing store.
2083 * If yes, then we can drop the copy page. If not,
2084 * then we'll activate it, mark it dirty and keep it
2085 * around.
2086 */
2087
2088 kern_return_t kr = KERN_SUCCESS;
2089
2090 memory_object_t copy_pager = copy_object->pager;
2091 assert(copy_pager != MEMORY_OBJECT_NULL);
2092 vm_object_paging_begin(copy_object);
2093
2094 vm_object_unlock(copy_object);
2095
2096 kr = memory_object_data_request(
2097 copy_pager,
2098 copy_offset + copy_object->paging_offset,
2099 0, /* Only query the pager. */
2100 VM_PROT_READ,
2101 NULL);
2102
2103 vm_object_lock(copy_object);
2104
2105 vm_object_paging_end(copy_object);
2106
2107 /*
2108 * Since we dropped the copy_object's lock,
2109 * check whether we'll have to deallocate
2110 * the hard way.
2111 */
2112 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2113 vm_object_unlock(copy_object);
2114 vm_object_deallocate(copy_object);
2115 vm_object_lock(object);
2116
2117 continue;
2118 }
2119 if (kr == KERN_SUCCESS) {
2120 /*
2121 * The pager has the page. We don't want to overwrite
2122 * that page by sending this one out to the backing store.
2123 * So we drop the copy page.
2124 */
2125 VM_PAGE_FREE(copy_m);
2126
2127 } else {
2128 /*
2129 * The pager doesn't have the page. We'll keep this one
2130 * around in the copy object. It might get sent out to
2131 * the backing store under memory pressure.
2132 */
2133 vm_page_lockspin_queues();
2134 assert(!m->cleaning);
2135 vm_page_activate(copy_m);
2136 vm_page_unlock_queues();
2137
2138 SET_PAGE_DIRTY(copy_m, TRUE);
2139 PAGE_WAKEUP_DONE(copy_m);
2140 }
2141 } else {
2142
1c79356b 2143 assert(copy_m->busy == TRUE);
2d21ac55 2144 assert(!m->cleaning);
1c79356b
A
2145
2146 /*
2d21ac55 2147 * dirty is protected by the object lock
1c79356b 2148 */
316670eb 2149 SET_PAGE_DIRTY(copy_m, TRUE);
1c79356b 2150
2d21ac55
A
2151 /*
2152 * The page is already ready for pageout:
2153 * not on pageout queues and busy.
2154 * Unlock everything except the
2155 * copy_object itself.
2156 */
1c79356b
A
2157 vm_object_unlock(object);
2158
2159 /*
2d21ac55
A
2160 * Write the page to the copy-object,
2161 * flushing it from the kernel.
1c79356b 2162 */
1c79356b
A
2163 vm_pageout_initialize_page(copy_m);
2164
2165 /*
2d21ac55
A
2166 * Since the pageout may have
2167 * temporarily dropped the
2168 * copy_object's lock, we
2169 * check whether we'll have
2170 * to deallocate the hard way.
1c79356b 2171 */
2d21ac55 2172 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1c79356b
A
2173 vm_object_unlock(copy_object);
2174 vm_object_deallocate(copy_object);
2175 vm_object_lock(object);
2d21ac55 2176
1c79356b
A
2177 continue;
2178 }
1c79356b 2179 /*
2d21ac55
A
2180 * Pick back up the old object's
2181 * lock. [It is safe to do so,
2182 * since it must be deeper in the
2183 * object tree.]
1c79356b 2184 */
1c79356b
A
2185 vm_object_lock(object);
2186 }
316670eb 2187
1c79356b 2188 /*
2d21ac55
A
2189 * Because we're pushing a page upward
2190 * in the object tree, we must restart
2191 * any faults that are waiting here.
2192 * [Note that this is an expansion of
2193 * PAGE_WAKEUP that uses the THREAD_RESTART
2194 * wait result]. Can't turn off the page's
2195 * busy bit because we're not done with it.
1c79356b 2196 */
1c79356b
A
2197 if (m->wanted) {
2198 m->wanted = FALSE;
2d21ac55 2199 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1c79356b
A
2200 }
2201 }
1c79356b 2202 /*
2d21ac55
A
2203 * The reference count on copy_object must be
2204 * at least 2: one for our extra reference,
2205 * and at least one from the outside world
2206 * (we checked that when we last locked
2207 * copy_object).
1c79356b 2208 */
2d21ac55 2209 vm_object_lock_assert_exclusive(copy_object);
1c79356b
A
2210 copy_object->ref_count--;
2211 assert(copy_object->ref_count > 0);
2d21ac55 2212
1c79356b
A
2213 VM_OBJ_RES_DECR(copy_object);
2214 vm_object_unlock(copy_object);
2215
2216 break;
2217 }
b0d623f7
A
2218
2219done:
1c79356b
A
2220 *result_page = m;
2221 *top_page = first_m;
2222
2223 XPR(XPR_VM_FAULT,
2224 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
b0d623f7 2225 object, offset, m, first_m, 0);
1c79356b 2226
2d21ac55 2227 if (m != VM_PAGE_NULL) {
b0d623f7 2228 retval = VM_FAULT_SUCCESS;
2d21ac55 2229 if (my_fault == DBG_PAGEIN_FAULT) {
55e303ae 2230
2d21ac55
A
2231 VM_STAT_INCR(pageins);
2232 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2233 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2234 current_task()->pageins++;
2235
2236 if (m->object->internal) {
2237 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
b0d623f7 2238 my_fault = DBG_PAGEIND_FAULT;
2d21ac55
A
2239 } else {
2240 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
b0d623f7 2241 my_fault = DBG_PAGEINV_FAULT;
2d21ac55
A
2242 }
2243
2244 /*
2245 * evaluate access pattern and update state
2246 * vm_fault_deactivate_behind depends on the
2247 * state being up to date
2248 */
2249 vm_fault_is_sequential(object, offset, fault_info->behavior);
2250
2251 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2252 }
2253 if (type_of_fault)
2254 *type_of_fault = my_fault;
b0d623f7
A
2255 } else {
2256 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2257 assert(first_m == VM_PAGE_NULL);
2258 assert(object == first_object);
2259 }
2d21ac55 2260
55e303ae
A
2261 thread_interrupt_level(interruptible_state);
2262
1c79356b
A
2263#if TRACEFAULTPAGE
2264 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2265#endif
b0d623f7 2266 return retval;
1c79356b 2267
2d21ac55 2268backoff:
9bccf70c 2269 thread_interrupt_level(interruptible_state);
2d21ac55 2270
1c79356b 2271 if (wait_result == THREAD_INTERRUPTED)
2d21ac55
A
2272 return (VM_FAULT_INTERRUPTED);
2273 return (VM_FAULT_RETRY);
1c79356b
A
2274
2275#undef RELEASE_PAGE
2276}
2277
2d21ac55
A
2278
2279
593a1d5f
A
2280/*
2281 * CODE SIGNING:
2282 * When soft faulting a page, we have to validate the page if:
2283 * 1. the page is being mapped in user space
2284 * 2. the page hasn't already been found to be "tainted"
2285 * 3. the page belongs to a code-signed object
2286 * 4. the page has not been validated yet or has been mapped for write.
2287 */
2288#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
2289 ((pmap) != kernel_pmap /*1*/ && \
2290 !(page)->cs_tainted /*2*/ && \
2291 (page)->object->code_signed /*3*/ && \
2292 (!(page)->cs_validated || (page)->wpmapped /*4*/))
2293
2294
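/*
 * Illustrative sketch, added for exposition (not part of the original
 * source): the four numbered conditions above restated as a standalone
 * predicate over a simplified stand-in structure.  "example_page" and
 * "example_need_cs_validation" are hypothetical names; the real check is
 * the macro above, operating on a vm_page_t.
 */
#if 0	/* example only */
#include <stdbool.h>

struct example_page {
	bool	cs_tainted;		/* (2) already known to be tainted   */
	bool	cs_validated;		/* (4) signature already checked     */
	bool	wpmapped;		/* (4) has been mapped for write     */
	bool	object_code_signed;	/* (3) backing object is code-signed */
};

static bool
example_need_cs_validation(bool user_pmap, const struct example_page *p)
{
	return (user_pmap &&				/* (1) user-space mapping  */
	    !p->cs_tainted &&				/* (2) not already tainted */
	    p->object_code_signed &&			/* (3) code-signed object  */
	    (!p->cs_validated || p->wpmapped));		/* (4) unvalidated, or was mapped for write */
}
#endif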
55e303ae 2295/*
2d21ac55
A
2296 * page queue lock must NOT be held
2297 * m->object must be locked
2298 *
2299 * NOTE: m->object could be locked "shared" only if we are called
2300 * from vm_fault() as part of a soft fault. If so, we must be
2301 * careful not to modify the VM object in any way that is not
2302 * legal under a shared lock...
55e303ae 2303 */
2d21ac55
A
2304unsigned long cs_enter_tainted_rejected = 0;
2305unsigned long cs_enter_tainted_accepted = 0;
2306kern_return_t
2307vm_fault_enter(vm_page_t m,
2308 pmap_t pmap,
2309 vm_map_offset_t vaddr,
2310 vm_prot_t prot,
6d2010ae 2311 vm_prot_t fault_type,
2d21ac55
A
2312 boolean_t wired,
2313 boolean_t change_wiring,
2314 boolean_t no_cache,
6d2010ae 2315 boolean_t cs_bypass,
316670eb 2316 boolean_t *need_retry,
2d21ac55 2317 int *type_of_fault)
55e303ae 2318{
d1ecb069 2319 kern_return_t kr, pe_result;
2d21ac55 2320 boolean_t previously_pmapped = m->pmapped;
b0d623f7
A
2321 boolean_t must_disconnect = 0;
2322 boolean_t map_is_switched, map_is_switch_protected;
2323
2d21ac55
A
2324 vm_object_lock_assert_held(m->object);
2325#if DEBUG
b0d623f7 2326 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2d21ac55
A
2327#endif /* DEBUG */
2328
2329 if (m->phys_page == vm_page_guard_addr) {
2330 assert(m->fictitious);
2331 return KERN_SUCCESS;
2332 }
2333
6d2010ae 2334 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2d21ac55 2335
6d2010ae
A
2336 vm_object_lock_assert_exclusive(m->object);
2337
2338 } else if ((fault_type & VM_PROT_WRITE) == 0) {
2d21ac55 2339 /*
6d2010ae
A
2340 * This is not a "write" fault, so we
2341 * might not have taken the object lock
2342 * exclusively and we might not be able
2343 * to update the "wpmapped" bit in
2344 * vm_fault_enter().
2345 * Let's just grant read access to
2346 * the page for now and we'll
2347 * soft-fault again if we need write
2348 * access later...
2d21ac55 2349 */
6d2010ae
A
2350 prot &= ~VM_PROT_WRITE;
2351 }
2352 if (m->pmapped == FALSE) {
2d21ac55
A
2353
2354 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2355 /*
2356 * found it in the cache, but this
2357 * is the first fault-in of the page (m->pmapped == FALSE)
2358 * so it must have come in as part of
2359 * a cluster... account 1 pagein against it
2360 */
2361 VM_STAT_INCR(pageins);
2362 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2363
2364 if (m->object->internal) {
2365 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
b0d623f7 2366 *type_of_fault = DBG_PAGEIND_FAULT;
2d21ac55
A
2367 } else {
2368 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
b0d623f7 2369 *type_of_fault = DBG_PAGEINV_FAULT;
55e303ae 2370 }
2d21ac55
A
2371
2372 current_task()->pageins++;
2d21ac55
A
2373 }
2374 VM_PAGE_CONSUME_CLUSTERED(m);
2375
6d2010ae 2376 }
2d21ac55
A
2377
2378 if (*type_of_fault != DBG_COW_FAULT) {
2379 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2380
2381 if (pmap == kernel_pmap) {
2382 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2383 }
2384 }
2385
b0d623f7 2386 /* Validate code signature if necessary. */
593a1d5f
A
2387 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2388 vm_object_lock_assert_exclusive(m->object);
2389
2390 if (m->cs_validated) {
2391 vm_cs_revalidates++;
2392 }
2393
b0d623f7
A
2394 /* VM map is locked, so 1 ref will remain on VM object -
2395 * so no harm if vm_page_validate_cs drops the object lock */
593a1d5f
A
2396 vm_page_validate_cs(m);
2397 }
2398
b0d623f7
A
2399#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2400
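	/*
	 * Note added for exposition (not part of the original source): with
	 * the VM_PROT_EXECUTE clause commented out above, page_immutable()
	 * treats every cs_validated page as immutable, regardless of the
	 * protection actually being requested.
	 */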
2401 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2402 (pmap == vm_map_pmap(current_thread()->map)));
2403 map_is_switch_protected = current_thread()->map->switch_protect;
2404
2405 /* If the map is switched, and is switch-protected, we must protect
2406 * some pages from being write-faulted: immutable pages because by
2407 * definition they may not be written, and executable pages because that
2408 * would provide a way to inject unsigned code.
2409 * If the page is immutable, we can simply return. However, we can't
2410 * immediately determine whether a page is executable anywhere. But,
2411 * we can disconnect it everywhere and remove the executable protection
2412 * from the current map. We do that below right before we do the
2413 * PMAP_ENTER.
2414 */
2415 if(!cs_enforcement_disable && map_is_switched &&
2416 map_is_switch_protected && page_immutable(m, prot) &&
2417 (prot & VM_PROT_WRITE))
2418 {
2419 return KERN_CODESIGN_ERROR;
2420 }
2421
2422 /* A page could be tainted, or pose a risk of being tainted later.
2423 * Check whether the receiving process wants it, and make it feel
2424 * the consequences (that happens in cs_invalid_page()).
2425 * For CS Enforcement, two other conditions will
2426 * cause that page to be tainted as well:
2427 * - pmapping an unsigned page executable - this means unsigned code;
2428 * - writeable mapping of a validated page - the content of that page
2429 * can be changed without the kernel noticing, therefore unsigned
2430 * code can be created
2431 */
2432 if (m->cs_tainted ||
6d2010ae 2433 (( !cs_enforcement_disable && !cs_bypass ) &&
b0d623f7
A
2434 (/* The page is unsigned and wants to be executable */
2435 (!m->cs_validated && (prot & VM_PROT_EXECUTE)) ||
2436 /* The page should be immutable, but is in danger of being modified
2437 * This is the case where we want policy from the code directory -
2438 * is the page immutable or not? For now we have to assume that
2439 * code pages will be immutable, data pages not.
2440 * We'll assume a page is a code page if it has a code directory
2441 * and we fault for execution.
2442 * That is good enough since if we faulted the code page for
2443 * writing in another map before, it is wpmapped; if we fault
2444 * it for writing in this map later it will also be faulted for executing
2445 * at the same time; and if we fault for writing in another map
2446 * later, we will disconnect it from this pmap so we'll notice
2447 * the change.
2448 */
2449 (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2450 ))
2451 )
2452 {
2453 /* We will have a tainted page. Have to handle the special case
2454 * of a switched map now. If the map is not switched, standard
2455 * procedure applies - call cs_invalid_page().
2456 * If the map is switched, the real owner is invalid already.
2457 * There is no point in invalidating the switching process since
2458 * it will not be executing from the map. So we don't call
2459 * cs_invalid_page() in that case. */
2460 boolean_t reject_page;
2461 if(map_is_switched) {
2462 assert(pmap==vm_map_pmap(current_thread()->map));
2463 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2464 reject_page = FALSE;
2465 } else {
2466 reject_page = cs_invalid_page((addr64_t) vaddr);
2467 }
2468
2469 if (reject_page) {
2470 /* reject the tainted page: abort the page fault */
2471 kr = KERN_CODESIGN_ERROR;
2472 cs_enter_tainted_rejected++;
2473 } else {
2474 /* proceed with the tainted page */
2475 kr = KERN_SUCCESS;
2476 /* Page might have been tainted before or not; now it
2477 * definitively is. If the page wasn't tainted, we must
2478 * disconnect it from all pmaps later. */
b7266188 2479 must_disconnect = !m->cs_tainted;
b0d623f7
A
2480 m->cs_tainted = TRUE;
2481 cs_enter_tainted_accepted++;
2d21ac55
A
2482 }
2483 if (cs_debug || kr != KERN_SUCCESS) {
2484 printf("CODESIGNING: vm_fault_enter(0x%llx): "
593a1d5f 2485 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2d21ac55
A
2486 (long long)vaddr, m, m->object, m->offset);
2487 }
b0d623f7 2488
2d21ac55
A
2489 } else {
2490 /* proceed with the valid page */
2491 kr = KERN_SUCCESS;
2492 }
2493
b0d623f7
A
2494 /* If we have a KERN_SUCCESS from the previous checks, we either have
2495 * a good page, or a tainted page that has been accepted by the process.
2496 * In both cases the page will be entered into the pmap.
2497 * If the page is writeable, we need to disconnect it from other pmaps
2498 * now so those processes can take note.
2499 */
2d21ac55
A
2500 if (kr == KERN_SUCCESS) {
2501 /*
2502 * NOTE: we may only hold the vm_object lock SHARED
2503 * at this point, but the update of pmapped is ok
2504 * since this is the ONLY bit updated behind the SHARED
2505 * lock... however, we need to figure out how to do an atomic
2506 * update on a bit field to make this less fragile... right
593a1d5f 2507 * now I don't know how to coerce 'C' to give me the offset info
2d21ac55
A
2508 * that's needed for an AtomicCompareAndSwap
2509 */
2510 m->pmapped = TRUE;
6d2010ae
A
2511 if(vm_page_is_slideable(m)) {
2512 boolean_t was_busy = m->busy;
2513 m->busy = TRUE;
2514 kr = vm_page_slide(m, 0);
2515 assert(m->busy);
2516 if(!was_busy) {
2517 PAGE_WAKEUP_DONE(m);
2518 }
2519 if (kr != KERN_SUCCESS) {
2520 /*
2521 * This page has not been slid correctly,
2522 * do not do the pmap_enter() !
2523 * Let vm_fault_enter() return the error
2524 * so the caller can fail the fault.
2525 */
2526 goto after_the_pmap_enter;
2527 }
2528 }
2529
2530 if (fault_type & VM_PROT_WRITE) {
2531
2532 if (m->wpmapped == FALSE) {
2533 vm_object_lock_assert_exclusive(m->object);
2534
2535 m->wpmapped = TRUE;
2536 }
2537 if (must_disconnect) {
2538 /*
2539 * We can only get here
2540 * because of the CSE logic
2541 */
b0d623f7
A
2542 assert(cs_enforcement_disable == FALSE);
2543 pmap_disconnect(m->phys_page);
6d2010ae
A
2544 /*
2545 * If we are faulting for a write, we can clear
b0d623f7
A
2546 * the execute bit - that will ensure the page is
2547 * checked again before being executable, which
2548 * protects against a map switch.
2549 * This only happens the first time the page
2550 * gets tainted, so we won't get stuck here
6d2010ae
A
2551 * to make an already writeable page executable.
2552 */
2553 if (!cs_bypass){
2554 prot &= ~VM_PROT_EXECUTE;
2555 }
b0d623f7 2556 }
4a3eedf9 2557 }
d1ecb069
A
2558
2559 /* Prevent a deadlock by not
2560 * holding the object lock if we need to wait for a page in
2561 * pmap_enter() - <rdar://problem/7138958> */
316670eb 2562 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
d1ecb069
A
2563 wired, PMAP_OPTIONS_NOWAIT, pe_result);
2564
2565 if(pe_result == KERN_RESOURCE_SHORTAGE) {
316670eb
A
2566
2567 if (need_retry) {
2568 /*
2569 * this will be non-null in the case where we hold the lock
2570 * on the top-object in this chain... we can't just drop
2571 * the lock on the object we're inserting the page into
2572 * and recall the PMAP_ENTER since we can still cause
2573 * a deadlock if one of the critical paths tries to
2574 * acquire the lock on the top-object and we're blocked
2575 * in PMAP_ENTER waiting for memory... our only recourse
2576 * is to deal with it at a higher level where we can
2577 * drop both locks.
2578 */
2579 *need_retry = TRUE;
2580 vm_pmap_enter_retried++;
2581 goto after_the_pmap_enter;
2582 }
d1ecb069 2583 /* The nonblocking version of pmap_enter did not succeed,
316670eb
A
2584 * and we don't need to drop other locks and retry
2585 * at the level above us, so
2586 * use the blocking version instead. Requires marking
d1ecb069
A
2587 * the page busy and unlocking the object */
2588 boolean_t was_busy = m->busy;
2589 m->busy = TRUE;
2590 vm_object_unlock(m->object);
2591
316670eb
A
2592 PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
2593
d1ecb069
A
2594 /* Take the object lock again. */
2595 vm_object_lock(m->object);
2596
2597 /* If the page was busy, someone else will wake it up.
2598 * Otherwise, we have to do it now. */
2599 assert(m->busy);
2600 if(!was_busy) {
2601 PAGE_WAKEUP_DONE(m);
2602 }
2603 vm_pmap_enter_blocked++;
2604 }
2d21ac55
A
2605 }
2606
6d2010ae 2607after_the_pmap_enter:
2d21ac55
A
2608 /*
2609 * Hold queues lock to manipulate
2610 * the page queues. Change wiring
2611 * case is obvious.
2612 */
2613 if (change_wiring) {
2614 vm_page_lockspin_queues();
2615
2616 if (wired) {
2617 if (kr == KERN_SUCCESS) {
2618 vm_page_wire(m);
55e303ae 2619 }
2d21ac55 2620 } else {
0b4c1975 2621 vm_page_unwire(m, TRUE);
2d21ac55
A
2622 }
2623 vm_page_unlock_queues();
2624
2625 } else {
2626 if (kr != KERN_SUCCESS) {
b0d623f7 2627 vm_page_lockspin_queues();
2d21ac55
A
2628 vm_page_deactivate(m);
2629 vm_page_unlock_queues();
2630 } else {
316670eb 2631 if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
b0d623f7
A
2632
2633 if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2634 struct vpl *lq;
2635 uint32_t lid;
2636
2637 /*
2638 * we got a local queue to stuff this new page on...
2639 * its safe to manipulate local and local_id at this point
2640 * since we're behind an exclusive object lock and the
2641 * page is not on any global queue.
2642 *
2643 * we'll use the current cpu number to select the queue
2644 * note that we don't need to disable preemption... we're
2645 * going to behind the local queue's lock to do the real
2646 * work
2647 */
2648 lid = cpu_number();
2649
2650 lq = &vm_page_local_q[lid].vpl_un.vpl;
2651
2652 VPL_LOCK(&lq->vpl_lock);
2653
2654 queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2655 m->local = TRUE;
2656 m->local_id = lid;
2657 lq->vpl_count++;
2658
2659 VPL_UNLOCK(&lq->vpl_lock);
2660
2661 if (lq->vpl_count > vm_page_local_q_soft_limit) {
2662 /*
2663 * we're beyond the soft limit for the local queue
2664 * vm_page_reactivate_local will 'try' to take
2665 * the global page queue lock... if it can't that's
2666 * ok... we'll let the queue continue to grow up
2667 * to the hard limit... at that point we'll wait
2668 * for the lock... once we've got the lock, we'll
2669 * transfer all of the pages from the local queue
2670 * to the global active queue
2671 */
2672 vm_page_reactivate_local(lid, FALSE, FALSE);
2673 }
2674 return kr;
2675 }
2676
2d21ac55
A
2677 vm_page_lockspin_queues();
2678 /*
2679 * test again now that we hold the page queue lock
2680 */
316670eb
A
2681 if (!VM_PAGE_WIRED(m)) {
2682 if (m->clean_queue) {
2683 VM_PAGE_QUEUES_REMOVE(m);
2d21ac55 2684
316670eb
A
2685 vm_pageout_cleaned_reactivated++;
2686 vm_pageout_cleaned_fault_reactivated++;
2687 }
2d21ac55 2688
316670eb
A
2689 if ((!m->active && !m->inactive) || no_cache) {
2690 /*
2691 * If this is a no_cache mapping and the page has never been
2692 * mapped before or was previously a no_cache page, then we
2693 * want to leave pages in the speculative state so that they
2694 * can be readily recycled if free memory runs low. Otherwise
2695 * the page is activated as normal.
2696 */
2d21ac55 2697
316670eb
A
2698 if (no_cache && (!previously_pmapped || m->no_cache)) {
2699 m->no_cache = TRUE;
2d21ac55 2700
316670eb
A
2701 if (!m->speculative)
2702 vm_page_speculate(m, FALSE);
2d21ac55 2703
316670eb 2704 } else if (!m->active && !m->inactive) {
2d21ac55 2705
316670eb
A
2706 vm_page_activate(m);
2707 }
2708 }
2709 }
2d21ac55 2710 vm_page_unlock_queues();
55e303ae 2711 }
55e303ae
A
2712 }
2713 }
2d21ac55 2714 return kr;
55e303ae
A
2715}
2716
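/*
 * Summary added for exposition (not part of the original source):
 * vm_fault_enter() above (1) validates the page's code signature when
 * VM_FAULT_NEED_CS_VALIDATION() says so, (2) decides whether a tainted
 * page is rejected or accepted, (3) attempts a non-blocking
 * PMAP_ENTER_OPTIONS(..., PMAP_OPTIONS_NOWAIT, ...) and on resource
 * shortage either hands back *need_retry or falls back to the blocking
 * PMAP_ENTER, and (4) finally places the page on the appropriate queue:
 * wired/unwired, a per-cpu local queue, speculative, or active.
 */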
2d21ac55 2717
1c79356b
A
2718/*
2719 * Routine: vm_fault
2720 * Purpose:
2721 * Handle page faults, including pseudo-faults
2722 * used to change the wiring status of pages.
2723 * Returns:
2724 * Explicit continuations have been removed.
2725 * Implementation:
2726 * vm_fault and vm_fault_page save mucho state
2727 * in the moral equivalent of a closure. The state
2728 * structure is allocated when first entering vm_fault
2729 * and deallocated when leaving vm_fault.
2730 */
2731
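/*
 * Illustrative usage sketch, added for exposition (not part of the
 * original source): a hypothetical caller resolving a user-space read
 * fault from a trap handler.  "example_handle_user_read_fault" and its
 * parameters are stand-in names; the argument order follows the
 * definition of vm_fault() below.
 */
#if 0	/* example only */
static kern_return_t
example_handle_user_read_fault(vm_map_t faulting_map, vm_map_offset_t fault_addr)
{
	return vm_fault(faulting_map,
			vm_map_trunc_page(fault_addr),	/* page-aligned fault address  */
			VM_PROT_READ,			/* access that caused the trap */
			FALSE,				/* not a wiring change	       */
			THREAD_ABORTSAFE,		/* interruptible fault	       */
			NULL, 0);			/* no caller-supplied pmap     */
}
#endif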
91447636
A
2732extern int _map_enter_debug;
2733
2d21ac55
A
2734unsigned long vm_fault_collapse_total = 0;
2735unsigned long vm_fault_collapse_skipped = 0;
2736
1c79356b
A
2737kern_return_t
2738vm_fault(
2739 vm_map_t map,
91447636 2740 vm_map_offset_t vaddr,
1c79356b
A
2741 vm_prot_t fault_type,
2742 boolean_t change_wiring,
9bccf70c
A
2743 int interruptible,
2744 pmap_t caller_pmap,
91447636 2745 vm_map_offset_t caller_pmap_addr)
1c79356b
A
2746{
2747 vm_map_version_t version; /* Map version for verification */
2748 boolean_t wired; /* Should mapping be wired down? */
2749 vm_object_t object; /* Top-level object */
2750 vm_object_offset_t offset; /* Top-level offset */
2751 vm_prot_t prot; /* Protection for mapping */
1c79356b
A
2752 vm_object_t old_copy_object; /* Saved copy object */
2753 vm_page_t result_page; /* Result of vm_fault_page */
2754 vm_page_t top_page; /* Placeholder page */
2755 kern_return_t kr;
2756
1c79356b 2757 vm_page_t m; /* Fast access to result_page */
2d21ac55 2758 kern_return_t error_code;
1c79356b 2759 vm_object_t cur_object;
1c79356b
A
2760 vm_object_offset_t cur_offset;
2761 vm_page_t cur_m;
2762 vm_object_t new_object;
2763 int type_of_fault;
2d21ac55
A
2764 pmap_t pmap;
2765 boolean_t interruptible_state;
91447636 2766 vm_map_t real_map = map;
1c79356b 2767 vm_map_t original_map = map;
0c530ab8 2768 vm_prot_t original_fault_type;
2d21ac55
A
2769 struct vm_object_fault_info fault_info;
2770 boolean_t need_collapse = FALSE;
316670eb 2771 boolean_t need_retry = FALSE;
2d21ac55
A
2772 int object_lock_type = 0;
2773 int cur_object_lock_type;
c910b4d9 2774 vm_object_t top_object = VM_OBJECT_NULL;
6d2010ae 2775 int throttle_delay;
1c79356b 2776
de355530 2777
316670eb
A
2778 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2779 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2d21ac55
A
2780 (int)((uint64_t)vaddr >> 32),
2781 (int)vaddr,
6d2010ae 2782 (map == kernel_map),
1c79356b
A
2783 0,
2784 0);
2785
0c530ab8 2786 if (get_preemption_level() != 0) {
316670eb
A
2787 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2788 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2d21ac55
A
2789 (int)((uint64_t)vaddr >> 32),
2790 (int)vaddr,
0c530ab8
A
2791 KERN_FAILURE,
2792 0,
2793 0);
2794
2795 return (KERN_FAILURE);
9bccf70c 2796 }
b0d623f7 2797
9bccf70c 2798 interruptible_state = thread_interrupt_level(interruptible);
1c79356b 2799
2d21ac55
A
2800 VM_STAT_INCR(faults);
2801 current_task()->faults++;
2802 original_fault_type = fault_type;
2803
2804 if (fault_type & VM_PROT_WRITE)
2805 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2806 else
2807 object_lock_type = OBJECT_LOCK_SHARED;
2808
2809 cur_object_lock_type = OBJECT_LOCK_SHARED;
2810
2811RetryFault:
1c79356b
A
2812 /*
2813 * assume we will hit a page in the cache
2814 * otherwise, explicitly override with
2815 * the real fault type once we determine it
2816 */
2817 type_of_fault = DBG_CACHE_HIT_FAULT;
2818
1c79356b
A
2819 /*
2820 * Find the backing store object and offset into
2821 * it to begin the search.
2822 */
0c530ab8 2823 fault_type = original_fault_type;
1c79356b
A
2824 map = original_map;
2825 vm_map_lock_read(map);
1c79356b 2826
2d21ac55
A
2827 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2828 object_lock_type, &version,
2829 &object, &offset, &prot, &wired,
2830 &fault_info,
2831 &real_map);
1c79356b
A
2832
2833 if (kr != KERN_SUCCESS) {
2834 vm_map_unlock_read(map);
2835 goto done;
2836 }
2d21ac55
A
2837 pmap = real_map->pmap;
2838 fault_info.interruptible = interruptible;
b0d623f7 2839 fault_info.stealth = FALSE;
6d2010ae 2840 fault_info.io_sync = FALSE;
0b4c1975 2841 fault_info.mark_zf_absent = FALSE;
316670eb 2842 fault_info.batch_pmap_op = FALSE;
1c79356b
A
2843
2844 /*
2d21ac55
A
2845 * If the page is wired, we must fault for the current protection
2846 * value, to avoid further faults.
1c79356b 2847 */
2d21ac55 2848 if (wired) {
1c79356b 2849 fault_type = prot | VM_PROT_WRITE;
2d21ac55
A
2850 /*
2851 * since we're treating this fault as a 'write'
2852 * we must hold the top object lock exclusively
2853 */
2854 if (object_lock_type == OBJECT_LOCK_SHARED) {
2855
2856 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2857
2858 if (vm_object_lock_upgrade(object) == FALSE) {
2859 /*
2860 * couldn't upgrade, so explicitly
2861 * take the lock exclusively
2862 */
2863 vm_object_lock(object);
2864 }
2865 }
2866 }
1c79356b
A
2867
2868#if VM_FAULT_CLASSIFY
2869 /*
2870 * Temporary data gathering code
2871 */
2872 vm_fault_classify(object, offset, fault_type);
2873#endif
2874 /*
2875 * Fast fault code. The basic idea is to do as much as
2876 * possible while holding the map lock and object locks.
2877 * Busy pages are not used until the object lock has to
2878 * be dropped to do something (copy, zero fill, pmap enter).
2879 * Similarly, paging references aren't acquired until that
2880 * point, and object references aren't used.
2881 *
2882 * If we can figure out what to do
2883 * (zero fill, copy on write, pmap enter) while holding
2884 * the locks, then it gets done. Otherwise, we give up,
2885 * and use the original fault path (which doesn't hold
2886 * the map lock, and relies on busy pages).
2887 * The give up cases include:
2888 * - Have to talk to pager.
2889 * - Page is busy, absent or in error.
2890 * - Pager has locked out desired access.
2891 * - Fault needs to be restarted.
2892 * - Have to push page into copy object.
2893 *
2894 * The code is an infinite loop that moves one level down
2895 * the shadow chain each time. cur_object and cur_offset
2896 * refer to the current object being examined. object and offset
2897 * are the original object from the map. The loop is at the
2898 * top level if and only if object and cur_object are the same.
2899 *
2900 * Invariants: Map lock is held throughout. Lock is held on
2901 * original object and cur_object (if different) when
2902 * continuing or exiting loop.
2903 *
2904 */
2905
2906
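	/*
	 * Shape of the fast path described above, sketched for exposition
	 * (not part of the original source).  "needs_slow_path" is a
	 * hypothetical predicate standing in for the individual give-up
	 * checks in the real loop below; locking, throttling and the COW
	 * copy are omitted.
	 */
#if 0	/* example only */
	cur_object = object;
	cur_offset = offset;

	for (;;) {
		m = vm_page_lookup(cur_object, cur_offset);

		if (m != VM_PAGE_NULL) {
			if (needs_slow_path(m))		/* busy, error, absent, guard, ... */
				break;			/* fall back to vm_fault_page()    */
			/* read fault, COW copy or zero fill finished inline */
			goto FastPmapEnter;
		}
		if (cur_object->pager_created &&
		    MUST_ASK_PAGER(cur_object, cur_offset) == TRUE)
			break;				/* pager I/O: take the slow path   */

		if (cur_object->shadow == VM_OBJECT_NULL) {
			/* zero-fill: grab a page, insert it into the
			 * original object, then enter it in the pmap */
			goto FastPmapEnter;
		}
		/* otherwise descend one level down the shadow chain */
		cur_offset += cur_object->vo_shadow_offset;
		cur_object = cur_object->shadow;
	}
#endif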
2907 /*
2d21ac55
A
2908 * If this page is to be inserted in a copy delay object
2909 * for writing, and if the object has a copy, then the
2910 * copy delay strategy is implemented in the slow fault path.
1c79356b 2911 */
2d21ac55
A
2912 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2913 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2914 goto handle_copy_delay;
2915
1c79356b
A
2916 cur_object = object;
2917 cur_offset = offset;
2918
2919 while (TRUE) {
b0d623f7
A
2920 if (!cur_object->pager_created &&
2921 cur_object->phys_contiguous) /* superpage */
2922 break;
2923
2924 if (cur_object->blocked_access) {
2925 /*
2926 * Access to this VM object has been blocked.
2927 * Let the slow path handle it.
2928 */
2929 break;
2930 }
2931
1c79356b 2932 m = vm_page_lookup(cur_object, cur_offset);
2d21ac55 2933
1c79356b 2934 if (m != VM_PAGE_NULL) {
55e303ae 2935 if (m->busy) {
143cc14e
A
2936 wait_result_t result;
2937
2d21ac55
A
2938 /*
2939 * in order to do the PAGE_ASSERT_WAIT, we must
2940 * have object that 'm' belongs to locked exclusively
2941 */
2942 if (object != cur_object) {
143cc14e
A
2943 vm_object_unlock(object);
2944
2d21ac55
A
2945 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2946
2947 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2948
2949 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2950 /*
2951 * couldn't upgrade so go do a full retry
2952 * immediately since we've already dropped
2953 * the top object lock associated with this page
2954 * and the current one got dropped due to the
2955 * failed upgrade... the state is no longer valid
2956 */
2957 vm_map_unlock_read(map);
2958 if (real_map != map)
2959 vm_map_unlock(real_map);
2960
2961 goto RetryFault;
2962 }
2963 }
2964 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2965
2966 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2967
2968 if (vm_object_lock_upgrade(object) == FALSE) {
2969 /*
2970 * couldn't upgrade, so explicitly take the lock
2971 * exclusively and go relookup the page since we
2972 * will have dropped the object lock and
2973 * a different thread could have inserted
2974 * a page at this offset
2975 * no need for a full retry since we're
2976 * at the top level of the object chain
2977 */
2978 vm_object_lock(object);
2979
2980 continue;
2981 }
2982 }
143cc14e 2983 vm_map_unlock_read(map);
91447636
A
2984 if (real_map != map)
2985 vm_map_unlock(real_map);
143cc14e 2986
143cc14e 2987 result = PAGE_ASSERT_WAIT(m, interruptible);
1c79356b 2988
143cc14e
A
2989 vm_object_unlock(cur_object);
2990
2991 if (result == THREAD_WAITING) {
2992 result = thread_block(THREAD_CONTINUE_NULL);
2993
2994 counter(c_vm_fault_page_block_busy_kernel++);
2995 }
2996 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2997 goto RetryFault;
2998
2999 kr = KERN_ABORTED;
3000 goto done;
3001 }
316670eb
A
3002 if (m->laundry) {
3003 if (object != cur_object) {
3004 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3005 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3006
3007 vm_object_unlock(object);
3008 vm_object_unlock(cur_object);
3009
3010 vm_map_unlock_read(map);
3011 if (real_map != map)
3012 vm_map_unlock(real_map);
3013
3014 goto RetryFault;
3015 }
3016
3017 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3018
3019 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3020
3021 if (vm_object_lock_upgrade(object) == FALSE) {
3022 /*
3023 * couldn't upgrade, so explicitly take the lock
3024 * exclusively and go relookup the page since we
3025 * will have dropped the object lock and
3026 * a different thread could have inserted
3027 * a page at this offset
3028 * no need for a full retry since we're
3029 * at the top level of the object chain
3030 */
3031 vm_object_lock(object);
3032
3033 continue;
3034 }
3035 }
3036 m->pageout = FALSE;
3037
3038 vm_pageout_steal_laundry(m, FALSE);
3039 }
3040
2d21ac55
A
3041 if (m->phys_page == vm_page_guard_addr) {
3042 /*
3043 * Guard page: let the slow path deal with it
3044 */
3045 break;
3046 }
3047 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
143cc14e 3048 /*
2d21ac55 3049 * Unusual case... let the slow path deal with it
1c79356b
A
3050 */
3051 break;
3052 }
b0d623f7
A
3053 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3054 if (object != cur_object)
3055 vm_object_unlock(object);
3056 vm_map_unlock_read(map);
3057 if (real_map != map)
3058 vm_map_unlock(real_map);
3059 vm_object_unlock(cur_object);
3060 kr = KERN_MEMORY_ERROR;
3061 goto done;
3062 }
3063
91447636
A
3064 if (m->encrypted) {
3065 /*
3066 * ENCRYPTED SWAP:
3067 * We've soft-faulted (because it's not in the page
3068 * table) on an encrypted page.
2d21ac55 3069 * Keep the page "busy" so that no one messes with
91447636
A
3070 * it during the decryption.
3071 * Release the extra locks we're holding, keep only
3072 * the page's VM object lock.
2d21ac55
A
3073 *
3074 * in order to set 'busy' on 'm', we must
3075 * have object that 'm' belongs to locked exclusively
91447636 3076 */
2d21ac55 3077 if (object != cur_object) {
91447636 3078 vm_object_unlock(object);
2d21ac55
A
3079
3080 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3081
3082 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3083
3084 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3085 /*
3086 * couldn't upgrade so go do a full retry
3087 * immediately since we've already dropped
3088 * the top object lock associated with this page
3089 * and the current one got dropped due to the
3090 * failed upgrade... the state is no longer valid
3091 */
3092 vm_map_unlock_read(map);
3093 if (real_map != map)
3094 vm_map_unlock(real_map);
3095
3096 goto RetryFault;
3097 }
3098 }
3099 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3100
3101 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3102
3103 if (vm_object_lock_upgrade(object) == FALSE) {
3104 /*
3105 * couldn't upgrade, so explicitly take the lock
3106 * exclusively and go relookup the page since we
3107 * will have dropped the object lock and
3108 * a different thread could have inserted
3109 * a page at this offset
3110 * no need for a full retry since we're
3111 * at the top level of the object chain
3112 */
3113 vm_object_lock(object);
3114
3115 continue;
3116 }
91447636 3117 }
2d21ac55
A
3118 m->busy = TRUE;
3119
91447636
A
3120 vm_map_unlock_read(map);
3121 if (real_map != map)
3122 vm_map_unlock(real_map);
3123
3124 vm_page_decrypt(m, 0);
3125
3126 assert(m->busy);
3127 PAGE_WAKEUP_DONE(m);
91447636 3128
2d21ac55 3129 vm_object_unlock(cur_object);
91447636
A
3130 /*
3131 * Retry from the top, in case anything
3132 * changed while we were decrypting...
3133 */
3134 goto RetryFault;
3135 }
3136 ASSERT_PAGE_DECRYPTED(m);
3137
6d2010ae
A
3138 if(vm_page_is_slideable(m)) {
3139 /*
3140 * We might need to slide this page, and so,
3141 * we want to hold the VM object exclusively.
3142 */
3143 if (object != cur_object) {
3144 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3145 vm_object_unlock(object);
3146 vm_object_unlock(cur_object);
3147
3148 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3149
3150 vm_map_unlock_read(map);
3151 if (real_map != map)
3152 vm_map_unlock(real_map);
3153
3154 goto RetryFault;
3155 }
3156 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3157
3158 vm_object_unlock(object);
3159 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3160 vm_map_unlock_read(map);
3161 goto RetryFault;
3162 }
3163 }
3164
593a1d5f 3165 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
6d2010ae 3166upgrade_for_validation:
2d21ac55 3167 /*
4a3eedf9 3168 * We might need to validate this page
2d21ac55
A
3169 * against its code signature, so we
3170 * want to hold the VM object exclusively.
3171 */
3172 if (object != cur_object) {
3173 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3174 vm_object_unlock(object);
3175 vm_object_unlock(cur_object);
3176
3177 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3178
3179 vm_map_unlock_read(map);
3180 if (real_map != map)
3181 vm_map_unlock(real_map);
3182
3183 goto RetryFault;
3184 }
3185
3186 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3187
3188 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3189
3190 if (vm_object_lock_upgrade(object) == FALSE) {
3191 /*
3192 * couldn't upgrade, so explicitly take the lock
3193 * exclusively and go relookup the page since we
3194 * will have dropped the object lock and
3195 * a different thread could have inserted
3196 * a page at this offset
3197 * no need for a full retry since we're
3198 * at the top level of the object chain
3199 */
3200 vm_object_lock(object);
3201
3202 continue;
3203 }
3204 }
3205 }
1c79356b
A
3206 /*
3207 * Two cases of map in faults:
3208 * - At top level w/o copy object.
3209 * - Read fault anywhere.
3210 * --> must disallow write.
3211 */
3212
4a3eedf9 3213 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
6d2010ae 3214
2d21ac55 3215 goto FastPmapEnter;
4a3eedf9 3216 }
1c79356b
A
3217
3218 if ((fault_type & VM_PROT_WRITE) == 0) {
3219
1c79356b 3220 if (object != cur_object) {
c910b4d9
A
3221 /*
3222 * We still need to hold the top object
3223 * lock here to prevent a race between
3224 * a read fault (taking only "shared"
3225 * locks) and a write fault (taking
3226 * an "exclusive" lock on the top
3227 * object).
3228 * Otherwise, as soon as we release the
3229 * top lock, the write fault could
3230 * proceed and actually complete before
3231 * the read fault, and the copied page's
3232 * translation could then be overwritten
3233 * by the read fault's translation for
3234 * the original page.
3235 *
3236 * Let's just record what the top object
3237 * is and we'll release it later.
2d21ac55 3238 */
c910b4d9 3239 top_object = object;
2d21ac55
A
3240
3241 /*
3242 * switch to the object that has the new page
3243 */
1c79356b 3244 object = cur_object;
2d21ac55 3245 object_lock_type = cur_object_lock_type;
1c79356b 3246 }
1c79356b
A
3247FastPmapEnter:
3248 /*
2d21ac55
A
3249 * prepare for the pmap_enter...
3250 * object and map are both locked
3251 * m contains valid data
3252 * object == m->object
3253 * cur_object == NULL or it's been unlocked
3254 * no paging references on either object or cur_object
1c79356b 3255 */
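			/*
			 * The preconditions above, restated as assertions for
			 * exposition (not part of the original source).
			 */
#if 0	/* example only */
			assert(m != VM_PAGE_NULL);
			assert(m->object == object);
			vm_object_lock_assert_held(object);
#endif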
2d21ac55
A
3256 if (caller_pmap) {
3257 kr = vm_fault_enter(m,
3258 caller_pmap,
3259 caller_pmap_addr,
3260 prot,
6d2010ae 3261 fault_type,
2d21ac55
A
3262 wired,
3263 change_wiring,
3264 fault_info.no_cache,
6d2010ae 3265 fault_info.cs_bypass,
316670eb 3266 (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
2d21ac55 3267 &type_of_fault);
9bccf70c 3268 } else {
2d21ac55
A
3269 kr = vm_fault_enter(m,
3270 pmap,
3271 vaddr,
3272 prot,
6d2010ae 3273 fault_type,
2d21ac55
A
3274 wired,
3275 change_wiring,
3276 fault_info.no_cache,
6d2010ae 3277 fault_info.cs_bypass,
316670eb 3278 (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
2d21ac55 3279 &type_of_fault);
9bccf70c 3280 }
0b4e3aa0 3281
c910b4d9
A
3282 if (top_object != VM_OBJECT_NULL) {
3283 /*
3284 * It's safe to drop the top object
3285 * now that we've done our
3286 * vm_fault_enter(). Any other fault
3287 * in progress for that virtual
3288 * address will either find our page
3289 * and translation or put in a new page
3290 * and translation.
3291 */
3292 vm_object_unlock(top_object);
3293 top_object = VM_OBJECT_NULL;
3294 }
3295
2d21ac55
A
3296 if (need_collapse == TRUE)
3297 vm_object_collapse(object, offset, TRUE);
6d2010ae 3298
316670eb
A
3299 if (need_retry == FALSE &&
3300 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
2d21ac55
A
3301 /*
3302 * evaluate access pattern and update state
3303 * vm_fault_deactivate_behind depends on the
3304 * state being up to date
3305 */
3306 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
0c530ab8 3307
2d21ac55 3308 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
1c79356b 3309 }
1c79356b 3310 /*
2d21ac55 3311 * That's it, clean up and return.
1c79356b 3312 */
2d21ac55
A
3313 if (m->busy)
3314 PAGE_WAKEUP_DONE(m);
6601e61a 3315
1c79356b 3316 vm_object_unlock(object);
143cc14e 3317
1c79356b 3318 vm_map_unlock_read(map);
2d21ac55 3319 if (real_map != map)
91447636 3320 vm_map_unlock(real_map);
1c79356b 3321
316670eb
A
3322 if (need_retry == TRUE) {
3323 /*
3324 * vm_fault_enter couldn't complete the PMAP_ENTER...
3325 * at this point we don't hold any locks so it's safe
3326 * to ask the pmap layer to expand the page table to
3327 * accommodate this mapping... once expanded, we'll
3328 * re-drive the fault which should result in vm_fault_enter
3329 * being able to successfully enter the mapping this time around
3330 */
3331 (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER);
3332
3333 need_retry = FALSE;
3334 goto RetryFault;
3335 }
2d21ac55 3336 goto done;
1c79356b 3337 }
1c79356b 3338 /*
2d21ac55 3339 * COPY ON WRITE FAULT
b0d623f7
A
3340 */
3341 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3342
6d2010ae 3343 if ((throttle_delay = vm_page_throttled())) {
b0d623f7
A
3344 /*
3345 * drop all of our locks...
3346 * wait until the free queue is
3347 * pumped back up and then
3348 * redrive the fault
3349 */
3350 if (object != cur_object)
3351 vm_object_unlock(cur_object);
3352 vm_object_unlock(object);
3353 vm_map_unlock_read(map);
3354 if (real_map != map)
3355 vm_map_unlock(real_map);
3356
6d2010ae
A
3357 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3358
3359 delay(throttle_delay);
b0d623f7
A
3360
3361 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3362 THREAD_UNINT :
3363 THREAD_ABORTSAFE))
3364 goto RetryFault;
3365 kr = KERN_ABORTED;
3366 goto done;
3367 }
3368 /*
2d21ac55
A
3369 * If objects match, then
3370 * object->copy must not be NULL (else control
3371 * would be in previous code block), and we
3372 * have a potential push into the copy object
3373 * which we can't cope with here.
1c79356b 3374 */
2d21ac55
A
3375 if (cur_object == object) {
3376 /*
3377 * must take the slow path to
3378 * deal with the copy push
3379 */
1c79356b 3380 break;
2d21ac55 3381 }
6d2010ae 3382
1c79356b 3383 /*
2d21ac55
A
3384 * This is now a shadow based copy on write
3385 * fault -- it requires a copy up the shadow
3386 * chain.
6d2010ae
A
3387 */
3388
3389 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3390 VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3391 goto upgrade_for_validation;
3392 }
3393
3394 /*
2d21ac55
A
3395 * Allocate a page in the original top level
3396 * object. Give up if allocate fails. Also
3397 * need to remember current page, as it's the
3398 * source of the copy.
1c79356b 3399 *
2d21ac55
A
3400 * at this point we hold locks on both
3401 * object and cur_object... no need to take
3402 * paging refs or mark pages BUSY since
3403 * we don't drop either object lock until
3404 * the page has been copied and inserted
1c79356b
A
3405 */
3406 cur_m = m;
3407 m = vm_page_grab();
2d21ac55 3408
1c79356b 3409 if (m == VM_PAGE_NULL) {
2d21ac55
A
3410 /*
3411 * no free page currently available...
3412 * must take the slow path
3413 */
1c79356b
A
3414 break;
3415 }
1c79356b 3416 /*
2d21ac55 3417 * Now do the copy. Mark the source page busy...
1c79356b
A
3418 *
3419 * NOTE: This code holds the map lock across
3420 * the page copy.
3421 */
1c79356b
A
3422 vm_page_copy(cur_m, m);
3423 vm_page_insert(m, object, offset);
316670eb 3424 SET_PAGE_DIRTY(m, FALSE);
1c79356b
A
3425
3426 /*
2d21ac55 3427 * Now cope with the source page and object
1c79356b 3428 */
2d21ac55
A
3429 if (object->ref_count > 1 && cur_m->pmapped)
3430 pmap_disconnect(cur_m->phys_page);
1c79356b 3431
2d21ac55 3432 need_collapse = TRUE;
1c79356b 3433
2d21ac55
A
3434 if (!cur_object->internal &&
3435 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3436 /*
3437 * The object from which we've just
3438 * copied a page is most probably backed
3439 * by a vnode. We don't want to waste too
3440 * much time trying to collapse the VM objects
3441 * and create a bottleneck when several tasks
3442 * map the same file.
3443 */
3444 if (cur_object->copy == object) {
3445 /*
3446 * Shared mapping or no COW yet.
3447 * We can never collapse a copy
3448 * object into its backing object.
3449 */
3450 need_collapse = FALSE;
3451 } else if (cur_object->copy == object->shadow &&
3452 object->shadow->resident_page_count == 0) {
3453 /*
3454 * Shared mapping after a COW occurred.
3455 */
3456 need_collapse = FALSE;
3457 }
3458 }
1c79356b
A
3459 vm_object_unlock(cur_object);
3460
2d21ac55
A
3461 if (need_collapse == FALSE)
3462 vm_fault_collapse_skipped++;
3463 vm_fault_collapse_total++;
3464
3465 type_of_fault = DBG_COW_FAULT;
3466 VM_STAT_INCR(cow_faults);
3467 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3468 current_task()->cow_faults++;
1c79356b
A
3469
3470 goto FastPmapEnter;
1c79356b 3471
2d21ac55 3472 } else {
1c79356b 3473 /*
2d21ac55 3474 * No page at cur_object, cur_offset... m == NULL
1c79356b 3475 */
1c79356b 3476 if (cur_object->pager_created) {
2d21ac55
A
3477 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3478 /*
3479 * May have to talk to a pager...
3480 * take the slow path.
3481 */
3482 break;
3483 }
1c79356b 3484 /*
2d21ac55
A
3485 * existence map present and indicates
3486 * that the pager doesn't have this page
1c79356b 3487 */
1c79356b 3488 }
1c79356b 3489 if (cur_object->shadow == VM_OBJECT_NULL) {
2d21ac55
A
3490 /*
3491 * Zero fill fault. Page gets
3492 * inserted into the original object.
3493 */
b0d623f7
A
3494 if (cur_object->shadow_severed ||
3495 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3496 {
2d21ac55
A
3497 if (object != cur_object)
3498 vm_object_unlock(cur_object);
1c79356b 3499 vm_object_unlock(object);
2d21ac55 3500
1c79356b 3501 vm_map_unlock_read(map);
2d21ac55 3502 if (real_map != map)
91447636 3503 vm_map_unlock(real_map);
1c79356b 3504
2d21ac55
A
3505 kr = KERN_MEMORY_ERROR;
3506 goto done;
3507 }
6d2010ae 3508 if ((throttle_delay = vm_page_throttled())) {
2d21ac55
A
3509 /*
3510 * drop all of our locks...
3511 * wait until the free queue is
3512 * pumped back up and then
3513 * redrive the fault
3514 */
3515 if (object != cur_object)
3516 vm_object_unlock(cur_object);
3517 vm_object_unlock(object);
3518 vm_map_unlock_read(map);
3519 if (real_map != map)
3520 vm_map_unlock(real_map);
9bccf70c 3521
6d2010ae
A
3522 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3523
3524 delay(throttle_delay);
b0d623f7
A
3525
3526 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
2d21ac55
A
3527 THREAD_UNINT :
3528 THREAD_ABORTSAFE))
3529 goto RetryFault;
2d21ac55
A
3530 kr = KERN_ABORTED;
3531 goto done;
3532 }
3533 if (vm_backing_store_low) {
3534 /*
3535 * we are protecting the system from
3536 * backing store exhaustion...
3537 * must take the slow path if we're
3538 * not privileged
3539 */
3540 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3541 break;
1c79356b 3542 }
2d21ac55
A
3543 if (cur_object != object) {
3544 vm_object_unlock(cur_object);
1c79356b 3545
2d21ac55 3546 cur_object = object;
55e303ae 3547 }
2d21ac55 3548 if (object_lock_type == OBJECT_LOCK_SHARED) {
55e303ae 3549
2d21ac55
A
3550 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3551
3552 if (vm_object_lock_upgrade(object) == FALSE) {
3553 /*
3554 * couldn't upgrade so do a full retry on the fault
3555 * since we dropped the object lock which
3556 * could allow another thread to insert
3557 * a page at this offset
3558 */
3559 vm_map_unlock_read(map);
3560 if (real_map != map)
3561 vm_map_unlock(real_map);
3562
3563 goto RetryFault;
3564 }
1c79356b
A
3565 }
3566 m = vm_page_alloc(object, offset);
2d21ac55 3567
1c79356b 3568 if (m == VM_PAGE_NULL) {
2d21ac55
A
3569 /*
3570 * no free page currently available...
3571 * must take the slow path
3572 */
1c79356b
A
3573 break;
3574 }
1c79356b 3575
1c79356b 3576 /*
2d21ac55
A
3577 * Now zero fill page...
3578 * the page is probably going to
3579 * be written soon, so don't bother
3580 * to clear the modified bit
1c79356b 3581 *
2d21ac55
A
3582 * NOTE: This code holds the map
3583 * lock across the zero fill.
1c79356b 3584 */
2d21ac55 3585 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
143cc14e 3586
1c79356b
A
3587 goto FastPmapEnter;
3588 }
1c79356b 3589 /*
2d21ac55 3590 * On to the next level in the shadow chain
1c79356b 3591 */
6d2010ae 3592 cur_offset += cur_object->vo_shadow_offset;
1c79356b 3593 new_object = cur_object->shadow;
2d21ac55
A
3594
3595 /*
3596 * take the new_object's lock with the indicated state
3597 */
3598 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3599 vm_object_lock_shared(new_object);
3600 else
3601 vm_object_lock(new_object);
3602
1c79356b
A
3603 if (cur_object != object)
3604 vm_object_unlock(cur_object);
2d21ac55 3605
1c79356b
A
3606 cur_object = new_object;
3607
3608 continue;
3609 }
3610 }
1c79356b 3611 /*
2d21ac55
A
3612 * Cleanup from fast fault failure. Drop any object
3613 * lock other than original and drop map lock.
1c79356b 3614 */
1c79356b
A
3615 if (object != cur_object)
3616 vm_object_unlock(cur_object);
2d21ac55
A
3617
3618 /*
3619 * must own the object lock exclusively at this point
3620 */
3621 if (object_lock_type == OBJECT_LOCK_SHARED) {
3622 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3623
3624 if (vm_object_lock_upgrade(object) == FALSE) {
3625 /*
 3626 * couldn't upgrade, so explicitly
3627 * take the lock exclusively
3628 * no need to retry the fault at this
3629 * point since "vm_fault_page" will
3630 * completely re-evaluate the state
3631 */
3632 vm_object_lock(object);
3633 }
1c79356b 3634 }
143cc14e 3635
2d21ac55
A
3636handle_copy_delay:
3637 vm_map_unlock_read(map);
3638 if (real_map != map)
91447636 3639 vm_map_unlock(real_map);
1c79356b
A
3640
3641 /*
2d21ac55
A
3642 * Make a reference to this object to
3643 * prevent its disposal while we are messing with
3644 * it. Once we have the reference, the map is free
3645 * to be diddled. Since objects reference their
3646 * shadows (and copies), they will stay around as well.
1c79356b 3647 */
2d21ac55 3648 vm_object_reference_locked(object);
1c79356b
A
3649 vm_object_paging_begin(object);
3650
3651 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
55e303ae 3652
2d21ac55 3653 error_code = 0;
55e303ae 3654
1c79356b
A
3655 kr = vm_fault_page(object, offset, fault_type,
3656 (change_wiring && !wired),
1c79356b
A
3657 &prot, &result_page, &top_page,
3658 &type_of_fault,
2d21ac55
A
3659 &error_code, map->no_zero_fill,
3660 FALSE, &fault_info);
1c79356b
A
3661
3662 /*
2d21ac55
A
3663 * if kr != VM_FAULT_SUCCESS, then the paging reference
3664 * has been dropped and the object unlocked... the ref_count
3665 * is still held
3666 *
3667 * if kr == VM_FAULT_SUCCESS, then the paging reference
3668 * is still held along with the ref_count on the original object
3669 *
b0d623f7 3670 * the object is returned locked with a paging reference
2d21ac55
A
3671 *
3672 * if top_page != NULL, then it's BUSY and the
3673 * object it belongs to has a paging reference
3674 * but is returned unlocked
1c79356b 3675 */
b0d623f7
A
3676 if (kr != VM_FAULT_SUCCESS &&
3677 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
2d21ac55
A
3678 /*
3679 * we didn't succeed, lose the object reference immediately.
3680 */
1c79356b
A
3681 vm_object_deallocate(object);
3682
2d21ac55
A
3683 /*
3684 * See why we failed, and take corrective action.
3685 */
3686 switch (kr) {
1c79356b
A
3687 case VM_FAULT_MEMORY_SHORTAGE:
3688 if (vm_page_wait((change_wiring) ?
3689 THREAD_UNINT :
3690 THREAD_ABORTSAFE))
3691 goto RetryFault;
2d21ac55
A
3692 /*
3693 * fall thru
3694 */
1c79356b
A
3695 case VM_FAULT_INTERRUPTED:
3696 kr = KERN_ABORTED;
3697 goto done;
3698 case VM_FAULT_RETRY:
3699 goto RetryFault;
1c79356b
A
3700 case VM_FAULT_MEMORY_ERROR:
3701 if (error_code)
3702 kr = error_code;
3703 else
3704 kr = KERN_MEMORY_ERROR;
3705 goto done;
b0d623f7
A
3706 default:
3707 panic("vm_fault: unexpected error 0x%x from "
3708 "vm_fault_page()\n", kr);
2d21ac55 3709 }
1c79356b 3710 }
1c79356b
A
3711 m = result_page;
3712
2d21ac55 3713 if (m != VM_PAGE_NULL) {
0b4e3aa0
A
3714 assert((change_wiring && !wired) ?
3715 (top_page == VM_PAGE_NULL) :
3716 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3717 }
1c79356b
A
3718
3719 /*
2d21ac55
A
3720 * What to do with the resulting page from vm_fault_page
3721 * if it doesn't get entered into the physical map:
1c79356b 3722 */
1c79356b
A
3723#define RELEASE_PAGE(m) \
3724 MACRO_BEGIN \
3725 PAGE_WAKEUP_DONE(m); \
b0d623f7
A
3726 if (!m->active && !m->inactive && !m->throttled) { \
3727 vm_page_lockspin_queues(); \
3728 if (!m->active && !m->inactive && !m->throttled) \
3729 vm_page_activate(m); \
3730 vm_page_unlock_queues(); \
3731 } \
1c79356b
A
3732 MACRO_END
3733
3734 /*
2d21ac55
A
3735 * We must verify that the maps have not changed
3736 * since our last lookup.
1c79356b 3737 */
2d21ac55 3738 if (m != VM_PAGE_NULL) {
0b4e3aa0 3739 old_copy_object = m->object->copy;
0b4e3aa0 3740 vm_object_unlock(m->object);
b0d623f7 3741 } else {
0b4e3aa0 3742 old_copy_object = VM_OBJECT_NULL;
b0d623f7
A
3743 vm_object_unlock(object);
3744 }
2d21ac55
A
3745
3746 /*
3747 * no object locks are held at this point
3748 */
1c79356b
A
3749 if ((map != original_map) || !vm_map_verify(map, &version)) {
3750 vm_object_t retry_object;
3751 vm_object_offset_t retry_offset;
3752 vm_prot_t retry_prot;
3753
3754 /*
2d21ac55
A
3755 * To avoid trying to write_lock the map while another
3756 * thread has it read_locked (in vm_map_pageable), we
3757 * do not try for write permission. If the page is
3758 * still writable, we will get write permission. If it
3759 * is not, or has been marked needs_copy, we enter the
3760 * mapping without write permission, and will merely
3761 * take another fault.
1c79356b
A
3762 */
3763 map = original_map;
3764 vm_map_lock_read(map);
2d21ac55 3765
1c79356b 3766 kr = vm_map_lookup_locked(&map, vaddr,
2d21ac55
A
3767 fault_type & ~VM_PROT_WRITE,
3768 OBJECT_LOCK_EXCLUSIVE, &version,
3769 &retry_object, &retry_offset, &retry_prot,
3770 &wired,
3771 &fault_info,
3772 &real_map);
91447636 3773 pmap = real_map->pmap;
1c79356b
A
3774
3775 if (kr != KERN_SUCCESS) {
3776 vm_map_unlock_read(map);
2d21ac55
A
3777
3778 if (m != VM_PAGE_NULL) {
3779 /*
3780 * retake the lock so that
3781 * we can drop the paging reference
3782 * in vm_fault_cleanup and do the
3783 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3784 */
0b4e3aa0 3785 vm_object_lock(m->object);
2d21ac55 3786
0b4e3aa0 3787 RELEASE_PAGE(m);
2d21ac55
A
3788
3789 vm_fault_cleanup(m->object, top_page);
0b4e3aa0 3790 } else {
2d21ac55
A
3791 /*
3792 * retake the lock so that
3793 * we can drop the paging reference
3794 * in vm_fault_cleanup
3795 */
3796 vm_object_lock(object);
3797
3798 vm_fault_cleanup(object, top_page);
0b4e3aa0 3799 }
2d21ac55
A
3800 vm_object_deallocate(object);
3801
1c79356b
A
3802 goto done;
3803 }
1c79356b 3804 vm_object_unlock(retry_object);
1c79356b 3805
2d21ac55
A
3806 if ((retry_object != object) || (retry_offset != offset)) {
3807
1c79356b 3808 vm_map_unlock_read(map);
2d21ac55 3809 if (real_map != map)
91447636 3810 vm_map_unlock(real_map);
2d21ac55
A
3811
3812 if (m != VM_PAGE_NULL) {
3813 /*
3814 * retake the lock so that
3815 * we can drop the paging reference
3816 * in vm_fault_cleanup and do the
3817 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3818 */
3819 vm_object_lock(m->object);
3820
0b4e3aa0 3821 RELEASE_PAGE(m);
2d21ac55
A
3822
3823 vm_fault_cleanup(m->object, top_page);
0b4e3aa0 3824 } else {
2d21ac55
A
3825 /*
3826 * retake the lock so that
3827 * we can drop the paging reference
3828 * in vm_fault_cleanup
3829 */
3830 vm_object_lock(object);
3831
3832 vm_fault_cleanup(object, top_page);
0b4e3aa0 3833 }
2d21ac55
A
3834 vm_object_deallocate(object);
3835
1c79356b
A
3836 goto RetryFault;
3837 }
1c79356b 3838 /*
2d21ac55
A
3839 * Check whether the protection has changed or the object
3840 * has been copied while we left the map unlocked.
1c79356b
A
3841 */
3842 prot &= retry_prot;
0b4e3aa0 3843 }
2d21ac55 3844 if (m != VM_PAGE_NULL) {
0b4e3aa0 3845 vm_object_lock(m->object);
1c79356b 3846
2d21ac55
A
3847 if (m->object->copy != old_copy_object) {
3848 /*
3849 * The copy object changed while the top-level object
3850 * was unlocked, so take away write permission.
3851 */
0b4e3aa0 3852 prot &= ~VM_PROT_WRITE;
2d21ac55
A
3853 }
3854 } else
3855 vm_object_lock(object);
1c79356b
A
3856
3857 /*
2d21ac55
A
3858 * If we want to wire down this page, but no longer have
3859 * adequate permissions, we must start all over.
1c79356b 3860 */
2d21ac55 3861 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
1c79356b 3862
1c79356b 3863 vm_map_verify_done(map, &version);
2d21ac55 3864 if (real_map != map)
91447636 3865 vm_map_unlock(real_map);
1c79356b 3866
2d21ac55
A
3867 if (m != VM_PAGE_NULL) {
3868 RELEASE_PAGE(m);
91447636 3869
2d21ac55
A
3870 vm_fault_cleanup(m->object, top_page);
3871 } else
3872 vm_fault_cleanup(object, top_page);
0b4e3aa0 3873
2d21ac55 3874 vm_object_deallocate(object);
55e303ae 3875
2d21ac55
A
3876 goto RetryFault;
3877 }
3878 if (m != VM_PAGE_NULL) {
55e303ae 3879 /*
2d21ac55
A
3880 * Put this page into the physical map.
3881 * We had to do the unlock above because pmap_enter
3882 * may cause other faults. The page may be on
3883 * the pageout queues. If the pageout daemon comes
3884 * across the page, it will remove it from the queues.
55e303ae 3885 */
2d21ac55
A
3886 if (caller_pmap) {
3887 kr = vm_fault_enter(m,
3888 caller_pmap,
3889 caller_pmap_addr,
3890 prot,
6d2010ae 3891 fault_type,
2d21ac55
A
3892 wired,
3893 change_wiring,
3894 fault_info.no_cache,
6d2010ae 3895 fault_info.cs_bypass,
316670eb 3896 NULL,
2d21ac55
A
3897 &type_of_fault);
3898 } else {
3899 kr = vm_fault_enter(m,
3900 pmap,
3901 vaddr,
3902 prot,
6d2010ae 3903 fault_type,
2d21ac55
A
3904 wired,
3905 change_wiring,
3906 fault_info.no_cache,
6d2010ae 3907 fault_info.cs_bypass,
316670eb 3908 NULL,
2d21ac55
A
3909 &type_of_fault);
3910 }
3911 if (kr != KERN_SUCCESS) {
3912 /* abort this page fault */
3913 vm_map_verify_done(map, &version);
3914 if (real_map != map)
3915 vm_map_unlock(real_map);
3916 PAGE_WAKEUP_DONE(m);
3917 vm_fault_cleanup(m->object, top_page);
3918 vm_object_deallocate(object);
3919 goto done;
0b4e3aa0
A
3920 }
3921 } else {
3922
9bccf70c 3923 vm_map_entry_t entry;
91447636
A
3924 vm_map_offset_t laddr;
3925 vm_map_offset_t ldelta, hdelta;
143cc14e 3926
0b4e3aa0
A
3927 /*
3928 * do a pmap block mapping from the physical address
3929 * in the object
3930 */
9bccf70c 3931
2d21ac55 3932#ifdef ppc
55e303ae
A
3933 /* While we do not worry about execution protection in */
 3934 /* general, certain pages may have instruction execution */
3935 /* disallowed. We will check here, and if not allowed */
3936 /* to execute, we return with a protection failure. */
9bccf70c 3937
2d21ac55 3938 if ((fault_type & VM_PROT_EXECUTE) &&
6d2010ae 3939 (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
9bccf70c 3940
9bccf70c 3941 vm_map_verify_done(map, &version);
2d21ac55
A
3942
3943 if (real_map != map)
91447636 3944 vm_map_unlock(real_map);
2d21ac55 3945
9bccf70c
A
3946 vm_fault_cleanup(object, top_page);
3947 vm_object_deallocate(object);
2d21ac55 3948
9bccf70c
A
3949 kr = KERN_PROTECTION_FAILURE;
3950 goto done;
0b4e3aa0 3951 }
2d21ac55 3952#endif /* ppc */
1c79356b 3953
2d21ac55 3954 if (real_map != map)
91447636 3955 vm_map_unlock(real_map);
2d21ac55 3956
9bccf70c
A
3957 if (original_map != map) {
3958 vm_map_unlock_read(map);
3959 vm_map_lock_read(original_map);
3960 map = original_map;
3961 }
91447636 3962 real_map = map;
9bccf70c
A
3963
3964 laddr = vaddr;
3965 hdelta = 0xFFFFF000;
3966 ldelta = 0xFFFFF000;
3967
2d21ac55
A
3968 while (vm_map_lookup_entry(map, laddr, &entry)) {
3969 if (ldelta > (laddr - entry->vme_start))
9bccf70c 3970 ldelta = laddr - entry->vme_start;
2d21ac55 3971 if (hdelta > (entry->vme_end - laddr))
9bccf70c 3972 hdelta = entry->vme_end - laddr;
2d21ac55 3973 if (entry->is_sub_map) {
9bccf70c
A
3974
3975 laddr = (laddr - entry->vme_start)
3976 + entry->offset;
3977 vm_map_lock_read(entry->object.sub_map);
2d21ac55
A
3978
3979 if (map != real_map)
9bccf70c 3980 vm_map_unlock_read(map);
2d21ac55 3981 if (entry->use_pmap) {
91447636
A
3982 vm_map_unlock_read(real_map);
3983 real_map = entry->object.sub_map;
9bccf70c
A
3984 }
3985 map = entry->object.sub_map;
3986
3987 } else {
3988 break;
3989 }
3990 }
3991
2d21ac55
A
3992 if (vm_map_lookup_entry(map, laddr, &entry) &&
3993 (entry->object.vm_object != NULL) &&
3994 (entry->object.vm_object == object)) {
3995
b0d623f7 3996 int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
2d21ac55
A
3997 if (caller_pmap) {
3998 /*
3999 * Set up a block mapped area
4000 */
b0d623f7 4001 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
2d21ac55
A
4002 pmap_map_block(caller_pmap,
4003 (addr64_t)(caller_pmap_addr - ldelta),
6d2010ae 4004 (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
b0d623f7
A
4005 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4006 (uint32_t)((ldelta + hdelta) >> 12), prot,
4007 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
55e303ae 4008 } else {
2d21ac55
A
4009 /*
4010 * Set up a block mapped area
4011 */
b0d623f7 4012 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
2d21ac55
A
4013 pmap_map_block(real_map->pmap,
4014 (addr64_t)(vaddr - ldelta),
6d2010ae 4015 (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
b0d623f7
A
4016 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4017 (uint32_t)((ldelta + hdelta) >> 12), prot,
4018 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
9bccf70c
A
4019 }
4020 }
0b4e3aa0 4021 }
1c79356b
A
4022
4023 /*
2d21ac55 4024 * Unlock everything, and return
1c79356b 4025 */
1c79356b 4026 vm_map_verify_done(map, &version);
2d21ac55 4027 if (real_map != map)
91447636 4028 vm_map_unlock(real_map);
2d21ac55
A
4029
4030 if (m != VM_PAGE_NULL) {
0b4e3aa0 4031 PAGE_WAKEUP_DONE(m);
1c79356b 4032
2d21ac55
A
4033 vm_fault_cleanup(m->object, top_page);
4034 } else
4035 vm_fault_cleanup(object, top_page);
1c79356b 4036
2d21ac55
A
4037 vm_object_deallocate(object);
4038
4039#undef RELEASE_PAGE
91447636 4040
2d21ac55
A
4041 kr = KERN_SUCCESS;
4042done:
9bccf70c 4043 thread_interrupt_level(interruptible_state);
1c79356b 4044
316670eb
A
4045 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4046 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2d21ac55
A
4047 (int)((uint64_t)vaddr >> 32),
4048 (int)vaddr,
1c79356b 4049 kr,
2d21ac55 4050 type_of_fault,
1c79356b 4051 0);
143cc14e 4052
2d21ac55 4053 return (kr);
1c79356b
A
4054}
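
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * in-kernel caller resolving an ordinary read fault against "map" at
 * "fault_addr" (both names assumed) would drive vm_fault() roughly as
 * below, passing no caller-supplied pmap so the map's own pmap is used.
 */
#if 0	/* example only */
	kern_return_t kr;

	kr = vm_fault(map,
		      vm_map_trunc_page(fault_addr),
		      VM_PROT_READ,		/* fault_type */
		      FALSE,			/* not a wiring request */
		      THREAD_ABORTSAFE,		/* interruptible */
		      NULL,			/* caller_pmap */
		      0);			/* caller_pmap_addr */
#endif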
4055
4056/*
4057 * vm_fault_wire:
4058 *
4059 * Wire down a range of virtual addresses in a map.
4060 */
4061kern_return_t
4062vm_fault_wire(
4063 vm_map_t map,
4064 vm_map_entry_t entry,
9bccf70c 4065 pmap_t pmap,
91447636 4066 vm_map_offset_t pmap_addr)
1c79356b
A
4067{
4068
91447636
A
4069 register vm_map_offset_t va;
4070 register vm_map_offset_t end_addr = entry->vme_end;
1c79356b
A
4071 register kern_return_t rc;
4072
4073 assert(entry->in_transition);
4074
9bccf70c
A
4075 if ((entry->object.vm_object != NULL) &&
4076 !entry->is_sub_map &&
4077 entry->object.vm_object->phys_contiguous) {
4078 return KERN_SUCCESS;
4079 }
4080
1c79356b
A
4081 /*
4082 * Inform the physical mapping system that the
4083 * range of addresses may not fault, so that
4084 * page tables and such can be locked down as well.
4085 */
4086
9bccf70c
A
4087 pmap_pageable(pmap, pmap_addr,
4088 pmap_addr + (end_addr - entry->vme_start), FALSE);
1c79356b
A
4089
4090 /*
4091 * We simulate a fault to get the page and enter it
4092 * in the physical map.
4093 */
4094
4095 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4096 if ((rc = vm_fault_wire_fast(
9bccf70c
A
4097 map, va, entry, pmap,
4098 pmap_addr + (va - entry->vme_start)
4099 )) != KERN_SUCCESS) {
1c79356b 4100 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
9bccf70c
A
4101 (pmap == kernel_pmap) ?
4102 THREAD_UNINT : THREAD_ABORTSAFE,
4103 pmap, pmap_addr + (va - entry->vme_start));
2d21ac55 4104 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
1c79356b
A
4105 }
4106
4107 if (rc != KERN_SUCCESS) {
4108 struct vm_map_entry tmp_entry = *entry;
4109
4110 /* unwire wired pages */
4111 tmp_entry.vme_end = va;
9bccf70c
A
4112 vm_fault_unwire(map,
4113 &tmp_entry, FALSE, pmap, pmap_addr);
1c79356b
A
4114
4115 return rc;
4116 }
4117 }
4118 return KERN_SUCCESS;
4119}
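
/*
 * Illustrative note, not part of the original file: every per-page call
 * above rebases the faulting map address onto the caller-supplied pmap
 * window with the same arithmetic.  A minimal helper capturing that
 * relation (the helper name is hypothetical):
 */
#if 0	/* example only */
static inline vm_map_offset_t
wire_pmap_addr(vm_map_entry_t entry, vm_map_offset_t pmap_addr,
	       vm_map_offset_t va)
{
	/* offset of "va" within the entry, applied to the pmap base */
	return pmap_addr + (va - entry->vme_start);
}
#endif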
4120
4121/*
4122 * vm_fault_unwire:
4123 *
4124 * Unwire a range of virtual addresses in a map.
4125 */
4126void
4127vm_fault_unwire(
4128 vm_map_t map,
4129 vm_map_entry_t entry,
4130 boolean_t deallocate,
9bccf70c 4131 pmap_t pmap,
91447636 4132 vm_map_offset_t pmap_addr)
1c79356b 4133{
91447636
A
4134 register vm_map_offset_t va;
4135 register vm_map_offset_t end_addr = entry->vme_end;
1c79356b 4136 vm_object_t object;
2d21ac55 4137 struct vm_object_fault_info fault_info;
1c79356b
A
4138
4139 object = (entry->is_sub_map)
4140 ? VM_OBJECT_NULL : entry->object.vm_object;
4141
2d21ac55
A
4142 /*
4143 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4144 * do anything since such memory is wired by default. So we don't have
4145 * anything to undo here.
4146 */
4147
4148 if (object != VM_OBJECT_NULL && object->phys_contiguous)
4149 return;
4150
4151 fault_info.interruptible = THREAD_UNINT;
4152 fault_info.behavior = entry->behavior;
4153 fault_info.user_tag = entry->alias;
4154 fault_info.lo_offset = entry->offset;
4155 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4156 fault_info.no_cache = entry->no_cache;
b0d623f7 4157 fault_info.stealth = TRUE;
6d2010ae
A
4158 fault_info.io_sync = FALSE;
4159 fault_info.cs_bypass = FALSE;
0b4c1975 4160 fault_info.mark_zf_absent = FALSE;
316670eb 4161 fault_info.batch_pmap_op = FALSE;
2d21ac55 4162
1c79356b
A
4163 /*
4164 * Since the pages are wired down, we must be able to
4165 * get their mappings from the physical map system.
4166 */
4167
4168 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
1c79356b
A
4169
4170 if (object == VM_OBJECT_NULL) {
593a1d5f
A
4171 if (pmap) {
4172 pmap_change_wiring(pmap,
4173 pmap_addr + (va - entry->vme_start), FALSE);
4174 }
9bccf70c
A
4175 (void) vm_fault(map, va, VM_PROT_NONE,
4176 TRUE, THREAD_UNINT, pmap, pmap_addr);
1c79356b
A
4177 } else {
4178 vm_prot_t prot;
4179 vm_page_t result_page;
4180 vm_page_t top_page;
4181 vm_object_t result_object;
4182 vm_fault_return_t result;
4183
b0d623f7
A
4184 if (end_addr - va > (vm_size_t) -1) {
4185 /* 32-bit overflow */
4186 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4187 } else {
4188 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4189 assert(fault_info.cluster_size == end_addr - va);
4190 }
2d21ac55 4191
1c79356b
A
4192 do {
4193 prot = VM_PROT_NONE;
4194
4195 vm_object_lock(object);
4196 vm_object_paging_begin(object);
4197 XPR(XPR_VM_FAULT,
4198 "vm_fault_unwire -> vm_fault_page\n",
4199 0,0,0,0,0);
2d21ac55
A
4200 result = vm_fault_page(
4201 object,
4202 entry->offset + (va - entry->vme_start),
4203 VM_PROT_NONE, TRUE,
4204 &prot, &result_page, &top_page,
4205 (int *)0,
4206 NULL, map->no_zero_fill,
4207 FALSE, &fault_info);
1c79356b
A
4208 } while (result == VM_FAULT_RETRY);
4209
2d21ac55
A
4210 /*
4211 * If this was a mapping to a file on a device that has been forcibly
4212 * unmounted, then we won't get a page back from vm_fault_page(). Just
4213 * move on to the next one in case the remaining pages are mapped from
4214 * different objects. During a forced unmount, the object is terminated
 4215 * so the alive flag will be false if this happens. A forced unmount will
 4216 * occur when an external disk is unplugged before the user does an
4217 * eject, so we don't want to panic in that situation.
4218 */
4219
4220 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4221 continue;
4222
1c79356b
A
4223 if (result != VM_FAULT_SUCCESS)
4224 panic("vm_fault_unwire: failure");
4225
4226 result_object = result_page->object;
2d21ac55 4227
1c79356b 4228 if (deallocate) {
2d21ac55
A
4229 assert(result_page->phys_page !=
4230 vm_page_fictitious_addr);
91447636 4231 pmap_disconnect(result_page->phys_page);
1c79356b
A
4232 VM_PAGE_FREE(result_page);
4233 } else {
6d2010ae
A
4234 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4235 pmap_change_wiring(pmap,
4236 pmap_addr + (va - entry->vme_start), FALSE);
4237
4238
b0d623f7
A
4239 if (VM_PAGE_WIRED(result_page)) {
4240 vm_page_lockspin_queues();
0b4c1975 4241 vm_page_unwire(result_page, TRUE);
b0d623f7
A
4242 vm_page_unlock_queues();
4243 }
4244 if(entry->zero_wired_pages) {
4245 pmap_zero_page(result_page->phys_page);
4246 entry->zero_wired_pages = FALSE;
4247 }
4248
1c79356b
A
4249 PAGE_WAKEUP_DONE(result_page);
4250 }
1c79356b
A
4251 vm_fault_cleanup(result_object, top_page);
4252 }
4253 }
4254
4255 /*
4256 * Inform the physical mapping system that the range
4257 * of addresses may fault, so that page tables and
4258 * such may be unwired themselves.
4259 */
4260
9bccf70c
A
4261 pmap_pageable(pmap, pmap_addr,
4262 pmap_addr + (end_addr - entry->vme_start), TRUE);
1c79356b
A
4263
4264}
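
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller tearing down a wired, non-submap entry through the map's own
 * pmap would typically unwire without freeing the pages, e.g.:
 */
#if 0	/* example only */
	vm_fault_unwire(map, entry,
			FALSE,			/* deallocate: keep the pages */
			map->pmap,
			entry->vme_start);	/* pmap_addr for a top-level map */
#endif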
4265
4266/*
4267 * vm_fault_wire_fast:
4268 *
4269 * Handle common case of a wire down page fault at the given address.
4270 * If successful, the page is inserted into the associated physical map.
4271 * The map entry is passed in to avoid the overhead of a map lookup.
4272 *
4273 * NOTE: the given address should be truncated to the
4274 * proper page address.
4275 *
4276 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
4277 * a standard error specifying why the fault is fatal is returned.
4278 *
4279 * The map in question must be referenced, and remains so.
4280 * Caller has a read lock on the map.
4281 *
4282 * This is a stripped version of vm_fault() for wiring pages. Anything
4283 * other than the common case will return KERN_FAILURE, and the caller
4284 * is expected to call vm_fault().
4285 */
4286kern_return_t
4287vm_fault_wire_fast(
91447636
A
4288 __unused vm_map_t map,
4289 vm_map_offset_t va,
1c79356b 4290 vm_map_entry_t entry,
91447636
A
4291 pmap_t pmap,
4292 vm_map_offset_t pmap_addr)
1c79356b
A
4293{
4294 vm_object_t object;
4295 vm_object_offset_t offset;
4296 register vm_page_t m;
4297 vm_prot_t prot;
91447636 4298 thread_t thread = current_thread();
2d21ac55
A
4299 int type_of_fault;
4300 kern_return_t kr;
1c79356b 4301
2d21ac55 4302 VM_STAT_INCR(faults);
1c79356b 4303
91447636
A
4304 if (thread != THREAD_NULL && thread->task != TASK_NULL)
4305 thread->task->faults++;
1c79356b
A
4306
4307/*
4308 * Recovery actions
4309 */
4310
4311#undef RELEASE_PAGE
4312#define RELEASE_PAGE(m) { \
4313 PAGE_WAKEUP_DONE(m); \
2d21ac55 4314 vm_page_lockspin_queues(); \
0b4c1975 4315 vm_page_unwire(m, TRUE); \
1c79356b
A
4316 vm_page_unlock_queues(); \
4317}
4318
4319
4320#undef UNLOCK_THINGS
4321#define UNLOCK_THINGS { \
ff6e181a
A
4322 vm_object_paging_end(object); \
4323 vm_object_unlock(object); \
1c79356b
A
4324}
4325
4326#undef UNLOCK_AND_DEALLOCATE
4327#define UNLOCK_AND_DEALLOCATE { \
4328 UNLOCK_THINGS; \
4329 vm_object_deallocate(object); \
4330}
4331/*
4332 * Give up and have caller do things the hard way.
4333 */
4334
4335#define GIVE_UP { \
4336 UNLOCK_AND_DEALLOCATE; \
4337 return(KERN_FAILURE); \
4338}
4339
4340
4341 /*
4342 * If this entry is not directly to a vm_object, bail out.
4343 */
4344 if (entry->is_sub_map)
4345 return(KERN_FAILURE);
4346
4347 /*
4348 * Find the backing store object and offset into it.
4349 */
4350
4351 object = entry->object.vm_object;
4352 offset = (va - entry->vme_start) + entry->offset;
4353 prot = entry->protection;
4354
4355 /*
4356 * Make a reference to this object to prevent its
4357 * disposal while we are messing with it.
4358 */
4359
4360 vm_object_lock(object);
2d21ac55 4361 vm_object_reference_locked(object);
ff6e181a 4362 vm_object_paging_begin(object);
1c79356b
A
4363
4364 /*
4365 * INVARIANTS (through entire routine):
4366 *
4367 * 1) At all times, we must either have the object
4368 * lock or a busy page in some object to prevent
4369 * some other thread from trying to bring in
4370 * the same page.
4371 *
4372 * 2) Once we have a busy page, we must remove it from
4373 * the pageout queues, so that the pageout daemon
4374 * will not grab it away.
4375 *
4376 */
4377
4378 /*
4379 * Look for page in top-level object. If it's not there or
4380 * there's something going on, give up.
91447636
A
4381 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4382 * decrypt the page before wiring it down.
1c79356b
A
4383 */
4384 m = vm_page_lookup(object, offset);
91447636 4385 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
2d21ac55 4386 (m->unusual && ( m->error || m->restart || m->absent))) {
1c79356b
A
4387
4388 GIVE_UP;
4389 }
91447636 4390 ASSERT_PAGE_DECRYPTED(m);
1c79356b 4391
2d21ac55
A
4392 if (m->fictitious &&
4393 m->phys_page == vm_page_guard_addr) {
4394 /*
4395 * Guard pages are fictitious pages and are never
4396 * entered into a pmap, so let's say it's been wired...
4397 */
4398 kr = KERN_SUCCESS;
4399 goto done;
4400 }
4401
1c79356b
A
4402 /*
4403 * Wire the page down now. All bail outs beyond this
4404 * point must unwire the page.
4405 */
4406
2d21ac55 4407 vm_page_lockspin_queues();
1c79356b
A
4408 vm_page_wire(m);
4409 vm_page_unlock_queues();
4410
4411 /*
4412 * Mark page busy for other threads.
4413 */
4414 assert(!m->busy);
4415 m->busy = TRUE;
4416 assert(!m->absent);
4417
4418 /*
4419 * Give up if the page is being written and there's a copy object
4420 */
4421 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4422 RELEASE_PAGE(m);
4423 GIVE_UP;
4424 }
4425
4426 /*
4427 * Put this page into the physical map.
1c79356b 4428 */
2d21ac55
A
4429 type_of_fault = DBG_CACHE_HIT_FAULT;
4430 kr = vm_fault_enter(m,
4431 pmap,
4432 pmap_addr,
4433 prot,
6d2010ae 4434 prot,
2d21ac55
A
4435 TRUE,
4436 FALSE,
4437 FALSE,
6d2010ae 4438 FALSE,
316670eb 4439 NULL,
2d21ac55
A
4440 &type_of_fault);
4441
4442done:
1c79356b
A
4443 /*
4444 * Unlock everything, and return
4445 */
4446
4447 PAGE_WAKEUP_DONE(m);
4448 UNLOCK_AND_DEALLOCATE;
4449
2d21ac55 4450 return kr;
1c79356b
A
4451
4452}
4453
4454/*
4455 * Routine: vm_fault_copy_cleanup
4456 * Purpose:
4457 * Release a page used by vm_fault_copy.
4458 */
4459
4460void
4461vm_fault_copy_cleanup(
4462 vm_page_t page,
4463 vm_page_t top_page)
4464{
4465 vm_object_t object = page->object;
4466
4467 vm_object_lock(object);
4468 PAGE_WAKEUP_DONE(page);
b0d623f7
A
4469 if (!page->active && !page->inactive && !page->throttled) {
4470 vm_page_lockspin_queues();
4471 if (!page->active && !page->inactive && !page->throttled)
4472 vm_page_activate(page);
4473 vm_page_unlock_queues();
4474 }
1c79356b
A
4475 vm_fault_cleanup(object, top_page);
4476}
4477
4478void
4479vm_fault_copy_dst_cleanup(
4480 vm_page_t page)
4481{
4482 vm_object_t object;
4483
4484 if (page != VM_PAGE_NULL) {
4485 object = page->object;
4486 vm_object_lock(object);
2d21ac55 4487 vm_page_lockspin_queues();
0b4c1975 4488 vm_page_unwire(page, TRUE);
1c79356b
A
4489 vm_page_unlock_queues();
4490 vm_object_paging_end(object);
4491 vm_object_unlock(object);
4492 }
4493}
4494
4495/*
4496 * Routine: vm_fault_copy
4497 *
4498 * Purpose:
4499 * Copy pages from one virtual memory object to another --
4500 * neither the source nor destination pages need be resident.
4501 *
4502 * Before actually copying a page, the version associated with
 4503 * the destination address map will be verified.
4504 *
4505 * In/out conditions:
4506 * The caller must hold a reference, but not a lock, to
4507 * each of the source and destination objects and to the
4508 * destination map.
4509 *
4510 * Results:
4511 * Returns KERN_SUCCESS if no errors were encountered in
4512 * reading or writing the data. Returns KERN_INTERRUPTED if
4513 * the operation was interrupted (only possible if the
4514 * "interruptible" argument is asserted). Other return values
4515 * indicate a permanent error in copying the data.
4516 *
4517 * The actual amount of data copied will be returned in the
4518 * "copy_size" argument. In the event that the destination map
4519 * verification failed, this amount may be less than the amount
4520 * requested.
4521 */
4522kern_return_t
4523vm_fault_copy(
4524 vm_object_t src_object,
4525 vm_object_offset_t src_offset,
91447636 4526 vm_map_size_t *copy_size, /* INOUT */
1c79356b
A
4527 vm_object_t dst_object,
4528 vm_object_offset_t dst_offset,
4529 vm_map_t dst_map,
4530 vm_map_version_t *dst_version,
4531 int interruptible)
4532{
4533 vm_page_t result_page;
4534
4535 vm_page_t src_page;
4536 vm_page_t src_top_page;
4537 vm_prot_t src_prot;
4538
4539 vm_page_t dst_page;
4540 vm_page_t dst_top_page;
4541 vm_prot_t dst_prot;
4542
91447636 4543 vm_map_size_t amount_left;
1c79356b
A
4544 vm_object_t old_copy_object;
4545 kern_return_t error = 0;
b0d623f7 4546 vm_fault_return_t result;
1c79356b 4547
91447636 4548 vm_map_size_t part_size;
2d21ac55
A
4549 struct vm_object_fault_info fault_info_src;
4550 struct vm_object_fault_info fault_info_dst;
1c79356b
A
4551
4552 /*
4553 * In order not to confuse the clustered pageins, align
4554 * the different offsets on a page boundary.
4555 */
1c79356b
A
4556
4557#define RETURN(x) \
4558 MACRO_BEGIN \
91447636 4559 *copy_size -= amount_left; \
1c79356b
A
4560 MACRO_RETURN(x); \
4561 MACRO_END
4562
91447636 4563 amount_left = *copy_size;
2d21ac55
A
4564
4565 fault_info_src.interruptible = interruptible;
4566 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4567 fault_info_src.user_tag = 0;
4568 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4569 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4570 fault_info_src.no_cache = FALSE;
b0d623f7 4571 fault_info_src.stealth = TRUE;
6d2010ae
A
4572 fault_info_src.io_sync = FALSE;
4573 fault_info_src.cs_bypass = FALSE;
0b4c1975 4574 fault_info_src.mark_zf_absent = FALSE;
316670eb 4575 fault_info_src.batch_pmap_op = FALSE;
2d21ac55
A
4576
4577 fault_info_dst.interruptible = interruptible;
4578 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4579 fault_info_dst.user_tag = 0;
4580 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4581 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4582 fault_info_dst.no_cache = FALSE;
b0d623f7 4583 fault_info_dst.stealth = TRUE;
6d2010ae
A
4584 fault_info_dst.io_sync = FALSE;
4585 fault_info_dst.cs_bypass = FALSE;
0b4c1975 4586 fault_info_dst.mark_zf_absent = FALSE;
316670eb 4587 fault_info_dst.batch_pmap_op = FALSE;
2d21ac55 4588
1c79356b
A
4589 do { /* while (amount_left > 0) */
4590 /*
4591 * There may be a deadlock if both source and destination
4592 * pages are the same. To avoid this deadlock, the copy must
4593 * start by getting the destination page in order to apply
4594 * COW semantics if any.
4595 */
4596
4597 RetryDestinationFault: ;
4598
4599 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4600
4601 vm_object_lock(dst_object);
4602 vm_object_paging_begin(dst_object);
4603
b0d623f7
A
4604 if (amount_left > (vm_size_t) -1) {
4605 /* 32-bit overflow */
4606 fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4607 } else {
4608 fault_info_dst.cluster_size = (vm_size_t) amount_left;
4609 assert(fault_info_dst.cluster_size == amount_left);
4610 }
2d21ac55 4611
1c79356b 4612 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
b0d623f7
A
4613 result = vm_fault_page(dst_object,
4614 vm_object_trunc_page(dst_offset),
4615 VM_PROT_WRITE|VM_PROT_READ,
4616 FALSE,
4617 &dst_prot, &dst_page, &dst_top_page,
4618 (int *)0,
4619 &error,
4620 dst_map->no_zero_fill,
4621 FALSE, &fault_info_dst);
4622 switch (result) {
1c79356b
A
4623 case VM_FAULT_SUCCESS:
4624 break;
4625 case VM_FAULT_RETRY:
4626 goto RetryDestinationFault;
4627 case VM_FAULT_MEMORY_SHORTAGE:
4628 if (vm_page_wait(interruptible))
4629 goto RetryDestinationFault;
4630 /* fall thru */
4631 case VM_FAULT_INTERRUPTED:
4632 RETURN(MACH_SEND_INTERRUPTED);
b0d623f7
A
4633 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4634 /* success but no VM page: fail the copy */
4635 vm_object_paging_end(dst_object);
4636 vm_object_unlock(dst_object);
4637 /*FALLTHROUGH*/
1c79356b
A
4638 case VM_FAULT_MEMORY_ERROR:
4639 if (error)
4640 return (error);
4641 else
4642 return(KERN_MEMORY_ERROR);
b0d623f7
A
4643 default:
4644 panic("vm_fault_copy: unexpected error 0x%x from "
4645 "vm_fault_page()\n", result);
1c79356b
A
4646 }
4647 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4648
4649 old_copy_object = dst_page->object->copy;
4650
4651 /*
 4652 * There exists the possibility that the source and
4653 * destination page are the same. But we can't
4654 * easily determine that now. If they are the
4655 * same, the call to vm_fault_page() for the
4656 * destination page will deadlock. To prevent this we
4657 * wire the page so we can drop busy without having
4658 * the page daemon steal the page. We clean up the
4659 * top page but keep the paging reference on the object
4660 * holding the dest page so it doesn't go away.
4661 */
4662
2d21ac55 4663 vm_page_lockspin_queues();
1c79356b
A
4664 vm_page_wire(dst_page);
4665 vm_page_unlock_queues();
4666 PAGE_WAKEUP_DONE(dst_page);
4667 vm_object_unlock(dst_page->object);
4668
4669 if (dst_top_page != VM_PAGE_NULL) {
4670 vm_object_lock(dst_object);
4671 VM_PAGE_FREE(dst_top_page);
4672 vm_object_paging_end(dst_object);
4673 vm_object_unlock(dst_object);
4674 }
4675
4676 RetrySourceFault: ;
4677
4678 if (src_object == VM_OBJECT_NULL) {
4679 /*
4680 * No source object. We will just
4681 * zero-fill the page in dst_object.
4682 */
4683 src_page = VM_PAGE_NULL;
e3027f41 4684 result_page = VM_PAGE_NULL;
1c79356b
A
4685 } else {
4686 vm_object_lock(src_object);
4687 src_page = vm_page_lookup(src_object,
91447636 4688 vm_object_trunc_page(src_offset));
e3027f41 4689 if (src_page == dst_page) {
1c79356b 4690 src_prot = dst_prot;
e3027f41
A
4691 result_page = VM_PAGE_NULL;
4692 } else {
1c79356b
A
4693 src_prot = VM_PROT_READ;
4694 vm_object_paging_begin(src_object);
4695
b0d623f7
A
4696 if (amount_left > (vm_size_t) -1) {
4697 /* 32-bit overflow */
4698 fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4699 } else {
4700 fault_info_src.cluster_size = (vm_size_t) amount_left;
4701 assert(fault_info_src.cluster_size == amount_left);
4702 }
2d21ac55 4703
1c79356b
A
4704 XPR(XPR_VM_FAULT,
4705 "vm_fault_copy(2) -> vm_fault_page\n",
4706 0,0,0,0,0);
b0d623f7
A
4707 result = vm_fault_page(
4708 src_object,
4709 vm_object_trunc_page(src_offset),
4710 VM_PROT_READ, FALSE,
4711 &src_prot,
4712 &result_page, &src_top_page,
4713 (int *)0, &error, FALSE,
4714 FALSE, &fault_info_src);
4715
4716 switch (result) {
1c79356b
A
4717 case VM_FAULT_SUCCESS:
4718 break;
4719 case VM_FAULT_RETRY:
4720 goto RetrySourceFault;
4721 case VM_FAULT_MEMORY_SHORTAGE:
4722 if (vm_page_wait(interruptible))
4723 goto RetrySourceFault;
4724 /* fall thru */
4725 case VM_FAULT_INTERRUPTED:
4726 vm_fault_copy_dst_cleanup(dst_page);
4727 RETURN(MACH_SEND_INTERRUPTED);
b0d623f7
A
4728 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4729 /* success but no VM page: fail */
4730 vm_object_paging_end(src_object);
4731 vm_object_unlock(src_object);
4732 /*FALLTHROUGH*/
1c79356b
A
4733 case VM_FAULT_MEMORY_ERROR:
4734 vm_fault_copy_dst_cleanup(dst_page);
4735 if (error)
4736 return (error);
4737 else
4738 return(KERN_MEMORY_ERROR);
b0d623f7
A
4739 default:
4740 panic("vm_fault_copy(2): unexpected "
4741 "error 0x%x from "
4742 "vm_fault_page()\n", result);
1c79356b
A
4743 }
4744
1c79356b
A
4745
4746 assert((src_top_page == VM_PAGE_NULL) ==
e3027f41 4747 (result_page->object == src_object));
1c79356b
A
4748 }
4749 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
e3027f41 4750 vm_object_unlock(result_page->object);
1c79356b
A
4751 }
4752
4753 if (!vm_map_verify(dst_map, dst_version)) {
e3027f41
A
4754 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4755 vm_fault_copy_cleanup(result_page, src_top_page);
1c79356b
A
4756 vm_fault_copy_dst_cleanup(dst_page);
4757 break;
4758 }
4759
4760 vm_object_lock(dst_page->object);
4761
4762 if (dst_page->object->copy != old_copy_object) {
4763 vm_object_unlock(dst_page->object);
4764 vm_map_verify_done(dst_map, dst_version);
e3027f41
A
4765 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4766 vm_fault_copy_cleanup(result_page, src_top_page);
1c79356b
A
4767 vm_fault_copy_dst_cleanup(dst_page);
4768 break;
4769 }
4770 vm_object_unlock(dst_page->object);
4771
4772 /*
4773 * Copy the page, and note that it is dirty
4774 * immediately.
4775 */
4776
4777 if (!page_aligned(src_offset) ||
4778 !page_aligned(dst_offset) ||
4779 !page_aligned(amount_left)) {
4780
4781 vm_object_offset_t src_po,
4782 dst_po;
4783
91447636
A
4784 src_po = src_offset - vm_object_trunc_page(src_offset);
4785 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
1c79356b
A
4786
4787 if (dst_po > src_po) {
4788 part_size = PAGE_SIZE - dst_po;
4789 } else {
4790 part_size = PAGE_SIZE - src_po;
4791 }
4792 if (part_size > (amount_left)){
4793 part_size = amount_left;
4794 }
4795
e3027f41 4796 if (result_page == VM_PAGE_NULL) {
b0d623f7
A
4797 assert((vm_offset_t) dst_po == dst_po);
4798 assert((vm_size_t) part_size == part_size);
1c79356b 4799 vm_page_part_zero_fill(dst_page,
b0d623f7
A
4800 (vm_offset_t) dst_po,
4801 (vm_size_t) part_size);
1c79356b 4802 } else {
b0d623f7
A
4803 assert((vm_offset_t) src_po == src_po);
4804 assert((vm_offset_t) dst_po == dst_po);
4805 assert((vm_size_t) part_size == part_size);
4806 vm_page_part_copy(result_page,
4807 (vm_offset_t) src_po,
4808 dst_page,
4809 (vm_offset_t) dst_po,
4810 (vm_size_t)part_size);
1c79356b
A
4811 if(!dst_page->dirty){
4812 vm_object_lock(dst_object);
316670eb 4813 SET_PAGE_DIRTY(dst_page, TRUE);
1c79356b
A
4814 vm_object_unlock(dst_page->object);
4815 }
4816
4817 }
4818 } else {
4819 part_size = PAGE_SIZE;
4820
e3027f41 4821 if (result_page == VM_PAGE_NULL)
1c79356b
A
4822 vm_page_zero_fill(dst_page);
4823 else{
316670eb 4824 vm_object_lock(result_page->object);
e3027f41 4825 vm_page_copy(result_page, dst_page);
316670eb
A
4826 vm_object_unlock(result_page->object);
4827
1c79356b
A
4828 if(!dst_page->dirty){
4829 vm_object_lock(dst_object);
316670eb 4830 SET_PAGE_DIRTY(dst_page, TRUE);
1c79356b
A
4831 vm_object_unlock(dst_page->object);
4832 }
4833 }
4834
4835 }
4836
4837 /*
4838 * Unlock everything, and return
4839 */
4840
4841 vm_map_verify_done(dst_map, dst_version);
4842
e3027f41
A
4843 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4844 vm_fault_copy_cleanup(result_page, src_top_page);
1c79356b
A
4845 vm_fault_copy_dst_cleanup(dst_page);
4846
4847 amount_left -= part_size;
4848 src_offset += part_size;
4849 dst_offset += part_size;
4850 } while (amount_left > 0);
4851
4852 RETURN(KERN_SUCCESS);
4853#undef RETURN
4854
4855 /*NOTREACHED*/
4856}
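
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller copying "len" bytes ("len" and the object/offset names are
 * assumed).  copy_size is in/out: on return it holds the number of
 * bytes actually copied, which may fall short of the request if the
 * destination map verification failed.
 */
#if 0	/* example only */
	vm_map_size_t	copy_size = len;
	kern_return_t	kr;

	kr = vm_fault_copy(src_object, src_offset, &copy_size,
			   dst_object, dst_offset,
			   dst_map, &dst_version,
			   THREAD_UNINT);
	if (kr == KERN_SUCCESS && copy_size < len) {
		/* partial copy: revalidate the destination map and retry the rest */
	}
#endif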
4857
1c79356b
A
4858#if VM_FAULT_CLASSIFY
4859/*
4860 * Temporary statistics gathering support.
4861 */
4862
4863/*
4864 * Statistics arrays:
4865 */
4866#define VM_FAULT_TYPES_MAX 5
4867#define VM_FAULT_LEVEL_MAX 8
4868
4869int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4870
4871#define VM_FAULT_TYPE_ZERO_FILL 0
4872#define VM_FAULT_TYPE_MAP_IN 1
4873#define VM_FAULT_TYPE_PAGER 2
4874#define VM_FAULT_TYPE_COPY 3
4875#define VM_FAULT_TYPE_OTHER 4
4876
4877
4878void
4879vm_fault_classify(vm_object_t object,
4880 vm_object_offset_t offset,
4881 vm_prot_t fault_type)
4882{
4883 int type, level = 0;
4884 vm_page_t m;
4885
4886 while (TRUE) {
4887 m = vm_page_lookup(object, offset);
4888 if (m != VM_PAGE_NULL) {
2d21ac55 4889 if (m->busy || m->error || m->restart || m->absent) {
1c79356b
A
4890 type = VM_FAULT_TYPE_OTHER;
4891 break;
4892 }
4893 if (((fault_type & VM_PROT_WRITE) == 0) ||
4894 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4895 type = VM_FAULT_TYPE_MAP_IN;
4896 break;
4897 }
4898 type = VM_FAULT_TYPE_COPY;
4899 break;
4900 }
4901 else {
4902 if (object->pager_created) {
4903 type = VM_FAULT_TYPE_PAGER;
4904 break;
4905 }
4906 if (object->shadow == VM_OBJECT_NULL) {
4907 type = VM_FAULT_TYPE_ZERO_FILL;
4908 break;
4909 }
4910
6d2010ae 4911 offset += object->vo_shadow_offset;
1c79356b
A
4912 object = object->shadow;
4913 level++;
4914 continue;
4915 }
4916 }
4917
4918 if (level > VM_FAULT_LEVEL_MAX)
4919 level = VM_FAULT_LEVEL_MAX;
4920
4921 vm_fault_stats[type][level] += 1;
4922
4923 return;
4924}
4925
4926/* cleanup routine to call from debugger */
4927
4928void
4929vm_fault_classify_init(void)
4930{
4931 int type, level;
4932
4933 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4934 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4935 vm_fault_stats[type][level] = 0;
4936 }
4937 }
4938
4939 return;
4940}
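
/*
 * Illustrative sketch, not part of the original file: the table above
 * could be dumped from a debugger-callable routine along these lines:
 */
#if 0	/* example only */
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++)
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++)
			if (vm_fault_stats[type][level])
				printf("vm_fault type %d level %d: %d\n",
				       type, level, vm_fault_stats[type][level]);
#endif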
4941#endif /* VM_FAULT_CLASSIFY */
2d21ac55
A
4942
4943
4944extern int cs_validation;
4945
593a1d5f
A
4946void
4947vm_page_validate_cs_mapped(
4948 vm_page_t page,
4949 const void *kaddr)
4950{
4951 vm_object_t object;
4952 vm_object_offset_t offset;
4953 kern_return_t kr;
4954 memory_object_t pager;
4955 void *blobs;
4956 boolean_t validated, tainted;
4957
4958 assert(page->busy);
4959 vm_object_lock_assert_exclusive(page->object);
4960
4961 if (!cs_validation) {
4962 return;
4963 }
4964
4965 if (page->wpmapped && !page->cs_tainted) {
4966 /*
4967 * This page was mapped for "write" access sometime in the
4968 * past and could still be modifiable in the future.
4969 * Consider it tainted.
4970 * [ If the page was already found to be "tainted", no
4971 * need to re-validate. ]
4972 */
4973 page->cs_validated = TRUE;
4974 page->cs_tainted = TRUE;
4975 if (cs_debug) {
4976 printf("CODESIGNING: vm_page_validate_cs: "
4977 "page %p obj %p off 0x%llx "
4978 "was modified\n",
4979 page, page->object, page->offset);
4980 }
4981 vm_cs_validated_dirtied++;
4982 }
4983
4984 if (page->cs_validated) {
4985 return;
4986 }
4987
4988 vm_cs_validates++;
4989
4990 object = page->object;
4991 assert(object->code_signed);
4992 offset = page->offset;
4993
4994 if (!object->alive || object->terminating || object->pager == NULL) {
4995 /*
4996 * The object is terminating and we don't have its pager
4997 * so we can't validate the data...
4998 */
4999 return;
5000 }
5001 /*
5002 * Since we get here to validate a page that was brought in by
 5003 * the pager, we know that this pager is all set up and ready
5004 * by now.
5005 */
5006 assert(!object->internal);
5007 assert(object->pager != NULL);
5008 assert(object->pager_ready);
5009
5010 pager = object->pager;
b0d623f7 5011 assert(object->paging_in_progress);
593a1d5f
A
5012 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5013 if (kr != KERN_SUCCESS) {
5014 blobs = NULL;
5015 }
5016
5017 /* verify the SHA1 hash for this page */
5018 validated = cs_validate_page(blobs,
316670eb 5019 pager,
593a1d5f
A
5020 offset + object->paging_offset,
5021 (const void *)kaddr,
5022 &tainted);
5023
5024 page->cs_validated = validated;
5025 if (validated) {
5026 page->cs_tainted = tainted;
5027 }
5028}
5029
2d21ac55
A
5030void
5031vm_page_validate_cs(
5032 vm_page_t page)
5033{
5034 vm_object_t object;
5035 vm_object_offset_t offset;
5036 vm_map_offset_t koffset;
5037 vm_map_size_t ksize;
5038 vm_offset_t kaddr;
5039 kern_return_t kr;
2d21ac55
A
5040 boolean_t busy_page;
5041
4a3eedf9 5042 vm_object_lock_assert_held(page->object);
2d21ac55
A
5043
5044 if (!cs_validation) {
5045 return;
5046 }
5047
593a1d5f 5048 if (page->wpmapped && !page->cs_tainted) {
4a3eedf9
A
5049 vm_object_lock_assert_exclusive(page->object);
5050
5051 /*
593a1d5f
A
5052 * This page was mapped for "write" access sometime in the
5053 * past and could still be modifiable in the future.
5054 * Consider it tainted.
5055 * [ If the page was already found to be "tainted", no
5056 * need to re-validate. ]
4a3eedf9 5057 */
593a1d5f
A
5058 page->cs_validated = TRUE;
5059 page->cs_tainted = TRUE;
5060 if (cs_debug) {
5061 printf("CODESIGNING: vm_page_validate_cs: "
5062 "page %p obj %p off 0x%llx "
5063 "was modified\n",
5064 page, page->object, page->offset);
4a3eedf9 5065 }
593a1d5f 5066 vm_cs_validated_dirtied++;
4a3eedf9
A
5067 }
5068
5069 if (page->cs_validated) {
5070 return;
5071 }
5072
6d2010ae
A
5073#if CHECK_CS_VALIDATION_BITMAP
5074 if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5075 page->cs_validated = TRUE;
5076 page->cs_tainted = FALSE;
5077 vm_cs_bitmap_validated++;
5078 return;
5079 }
5080#endif
4a3eedf9
A
5081 vm_object_lock_assert_exclusive(page->object);
5082
2d21ac55
A
5083 object = page->object;
5084 assert(object->code_signed);
5085 offset = page->offset;
5086
5087 busy_page = page->busy;
5088 if (!busy_page) {
5089 /* keep page busy while we map (and unlock) the VM object */
5090 page->busy = TRUE;
5091 }
5092
5093 /*
5094 * Take a paging reference on the VM object
5095 * to protect it from collapse or bypass,
5096 * and keep it from disappearing too.
5097 */
5098 vm_object_paging_begin(object);
5099
5100 /* map the page in the kernel address space */
5101 koffset = 0;
5102 ksize = PAGE_SIZE_64;
5103 kr = vm_paging_map_object(&koffset,
5104 page,
5105 object,
5106 offset,
5107 &ksize,
593a1d5f 5108 VM_PROT_READ,
2d21ac55
A
5109 FALSE); /* can't unlock object ! */
5110 if (kr != KERN_SUCCESS) {
5111 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5112 }
5113 kaddr = CAST_DOWN(vm_offset_t, koffset);
5114
593a1d5f
A
5115 /* validate the mapped page */
5116 vm_page_validate_cs_mapped(page, (const void *) kaddr);
2d21ac55 5117
6d2010ae
A
5118#if CHECK_CS_VALIDATION_BITMAP
5119 if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5120 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5121 }
5122#endif
2d21ac55
A
5123 assert(page->busy);
5124 assert(object == page->object);
5125 vm_object_lock_assert_exclusive(object);
5126
2d21ac55
A
5127 if (!busy_page) {
5128 PAGE_WAKEUP_DONE(page);
5129 }
5130 if (koffset != 0) {
5131 /* unmap the map from the kernel address space */
5132 vm_paging_unmap_object(object, koffset, koffset + ksize);
5133 koffset = 0;
5134 ksize = 0;
5135 kaddr = 0;
5136 }
5137 vm_object_paging_end(object);
5138}
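
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller holding the object lock and about to rely on a code-signed
 * page could check it like this before deciding whether the mapping
 * constitutes a code-signing violation.
 */
#if 0	/* example only */
	if (page->object->code_signed && !page->cs_validated)
		vm_page_validate_cs(page);

	if (page->cs_tainted) {
		/* page was modified after signing: treat as a CS violation */
	}
#endif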