1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/kern_return.h>
71 #include <mach/message.h> /* for error codes */
72 #include <mach/vm_param.h>
73 #include <mach/vm_behavior.h>
74 #include <mach/memory_object.h>
75 /* For memory_object_data_{request,unlock} */
76
77 #include <kern/kern_types.h>
78 #include <kern/host_statistics.h>
79 #include <kern/counters.h>
80 #include <kern/task.h>
81 #include <kern/thread.h>
82 #include <kern/sched_prim.h>
83 #include <kern/host.h>
84 #include <kern/xpr.h>
85 #include <kern/mach_param.h>
86 #include <kern/macro_help.h>
87 #include <kern/zalloc.h>
88 #include <kern/misc_protos.h>
89
90 #include <ppc/proc_reg.h>
91
92 #include <vm/vm_fault.h>
93 #include <vm/task_working_set.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_kern.h>
98 #include <vm/pmap.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/vm_protos.h>
101
102 #include <sys/kdebug.h>
103
104 #define VM_FAULT_CLASSIFY 0
105 #define VM_FAULT_STATIC_CONFIG 1
106
107 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
108
109 unsigned int vm_object_absent_max = 50;
110
111 int vm_fault_debug = 0;
112
113 #if !VM_FAULT_STATIC_CONFIG
114 boolean_t vm_fault_dirty_handling = FALSE;
115 boolean_t vm_fault_interruptible = FALSE;
116 boolean_t software_reference_bits = TRUE;
117 #endif
118
119 #if MACH_KDB
120 extern struct db_watchpoint *db_watchpoint_list;
121 #endif /* MACH_KDB */
122
123
124 /* Forward declarations of internal routines. */
125 extern kern_return_t vm_fault_wire_fast(
126 vm_map_t map,
127 vm_map_offset_t va,
128 vm_map_entry_t entry,
129 pmap_t pmap,
130 vm_map_offset_t pmap_addr);
131
132 extern void vm_fault_continue(void);
133
134 extern void vm_fault_copy_cleanup(
135 vm_page_t page,
136 vm_page_t top_page);
137
138 extern void vm_fault_copy_dst_cleanup(
139 vm_page_t page);
140
141 #if VM_FAULT_CLASSIFY
142 extern void vm_fault_classify(vm_object_t object,
143 vm_object_offset_t offset,
144 vm_prot_t fault_type);
145
146 extern void vm_fault_classify_init(void);
147 #endif
148
149 /*
150 * Routine: vm_fault_init
151 * Purpose:
152 * Initialize our private data structures.
153 */
154 void
155 vm_fault_init(void)
156 {
157 }
158
159 /*
160 * Routine: vm_fault_cleanup
161 * Purpose:
162 * Clean up the result of vm_fault_page.
163 * Results:
164 * The paging reference for "object" is released.
165 * "object" is unlocked.
166 * If "top_page" is not null, "top_page" is
167 * freed and the paging reference for the object
168 * containing it is released.
169 *
170 * In/out conditions:
171 * "object" must be locked.
172 */
173 void
174 vm_fault_cleanup(
175 register vm_object_t object,
176 register vm_page_t top_page)
177 {
178 vm_object_paging_end(object);
179 vm_object_unlock(object);
180
181 if (top_page != VM_PAGE_NULL) {
182 object = top_page->object;
183 vm_object_lock(object);
184 VM_PAGE_FREE(top_page);
185 vm_object_paging_end(object);
186 vm_object_unlock(object);
187 }
188 }
189
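/*
 * Editor's illustrative sketch (not part of the original source): how a
 * hypothetical caller hands the result of vm_fault_page() back through
 * vm_fault_cleanup() once it is finished with the page.  The helper name
 * and the PAGE_WAKEUP_DONE() placement are assumptions for illustration.
 */
#if 0
static void
example_fault_done(
	vm_page_t	result_page,	/* busy; its object is locked */
	vm_page_t	top_page)	/* may be VM_PAGE_NULL */
{
	vm_object_t	object = result_page->object;

	PAGE_WAKEUP_DONE(result_page);	/* clear the busy bit */

	/*
	 * Releases the paging reference, unlocks "object" and, if
	 * top_page is non-null, frees it and releases the paging
	 * reference on its object, as documented above.
	 */
	vm_fault_cleanup(object, top_page);
}
#endif
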
190 #if MACH_CLUSTER_STATS
191 #define MAXCLUSTERPAGES 16
192 struct {
193 unsigned long pages_in_cluster;
194 unsigned long pages_at_higher_offsets;
195 unsigned long pages_at_lower_offsets;
196 } cluster_stats_in[MAXCLUSTERPAGES];
197 #define CLUSTER_STAT(clause) clause
198 #define CLUSTER_STAT_HIGHER(x) \
199 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
200 #define CLUSTER_STAT_LOWER(x) \
201 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
202 #define CLUSTER_STAT_CLUSTER(x) \
203 ((cluster_stats_in[(x)].pages_in_cluster)++)
204 #else /* MACH_CLUSTER_STATS */
205 #define CLUSTER_STAT(clause)
206 #endif /* MACH_CLUSTER_STATS */
207
208 /* XXX - temporary */
209 boolean_t vm_allow_clustered_pagein = FALSE;
210 int vm_pagein_cluster_used = 0;
211
212 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
213
214
215 boolean_t vm_page_deactivate_behind = TRUE;
216 /*
217 * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
218 */
219 int vm_default_ahead = 0;
220 int vm_default_behind = MAX_UPL_TRANSFER;
221
222 /*
223 * vm_fault_deactivate_behind
224 *
225 * Determine if sequential access is in progress
226 * in accordance with the behavior specified. If
228 * so, compute a potential page to deactivate and
228 * deactivate it.
229 *
230 * The object must be locked.
231 */
232 static
233 boolean_t
234 vm_fault_deactivate_behind(
235 vm_object_t object,
236 vm_object_offset_t offset,
237 vm_behavior_t behavior)
238 {
239 vm_page_t m;
240
241 #if TRACEFAULTPAGE
242 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
243 #endif
244
245 if (object == kernel_object) {
246 /*
247 * Do not deactivate pages from the kernel object: they
248 * are not intended to become pageable.
249 */
250 return FALSE;
251 }
252
253 switch (behavior) {
254 case VM_BEHAVIOR_RANDOM:
255 object->sequential = PAGE_SIZE_64;
256 m = VM_PAGE_NULL;
257 break;
258 case VM_BEHAVIOR_SEQUENTIAL:
259 if (offset &&
260 object->last_alloc == offset - PAGE_SIZE_64) {
261 object->sequential += PAGE_SIZE_64;
262 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
263 } else {
264 object->sequential = PAGE_SIZE_64; /* reset */
265 m = VM_PAGE_NULL;
266 }
267 break;
268 case VM_BEHAVIOR_RSEQNTL:
269 if (object->last_alloc &&
270 object->last_alloc == offset + PAGE_SIZE_64) {
271 object->sequential += PAGE_SIZE_64;
272 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
273 } else {
274 object->sequential = PAGE_SIZE_64; /* reset */
275 m = VM_PAGE_NULL;
276 }
277 break;
278 case VM_BEHAVIOR_DEFAULT:
279 default:
280 if (offset &&
281 object->last_alloc == offset - PAGE_SIZE_64) {
282 vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
283
284 object->sequential += PAGE_SIZE_64;
285 m = (offset >= behind &&
286 object->sequential >= behind) ?
287 vm_page_lookup(object, offset - behind) :
288 VM_PAGE_NULL;
289 } else if (object->last_alloc &&
290 object->last_alloc == offset + PAGE_SIZE_64) {
291 vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
292
293 object->sequential += PAGE_SIZE_64;
294 m = (offset < -behind &&
295 object->sequential >= behind) ?
296 vm_page_lookup(object, offset + behind) :
297 VM_PAGE_NULL;
298 } else {
299 object->sequential = PAGE_SIZE_64;
300 m = VM_PAGE_NULL;
301 }
302 break;
303 }
304
305 object->last_alloc = offset;
306
307 if (m) {
308 if (!m->busy) {
309 vm_page_lock_queues();
310 vm_page_deactivate(m);
311 vm_page_unlock_queues();
312 #if TRACEFAULTPAGE
313 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
314 #endif
315 }
316 return TRUE;
317 }
318 return FALSE;
319 }
320
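/*
 * Editor's illustrative sketch (not part of the original source): the
 * fault path drives the heuristic above once per fault, with the object
 * locked, mirroring the call made near the end of vm_fault_page().  The
 * helper name is an assumption for illustration only.
 */
#if 0
static void
example_note_fault(
	vm_object_t		object,		/* locked */
	vm_object_offset_t	offset,
	vm_behavior_t		behavior)	/* from the map entry */
{
	if (vm_page_deactivate_behind)
		vm_fault_deactivate_behind(object, offset, behavior);
}
#endif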
321
322 /*
323 * Routine: vm_fault_page
324 * Purpose:
325 * Find the resident page for the virtual memory
326 * specified by the given virtual memory object
327 * and offset.
328 * Additional arguments:
329 * The required permissions for the page are given
330 * in "fault_type". Desired permissions are included
331 * in "protection". The minimum and maximum valid offsets
332 * within the object for the relevant map entry are
333 * passed in "lo_offset" and "hi_offset" respectively and
334 * the expected page reference pattern is passed in "behavior".
335 * These three parameters are used to determine pagein cluster
336 * limits.
337 *
338 * If the desired page is known to be resident (for
339 * example, because it was previously wired down), asserting
340 * the "must_be_resident" parameter will speed the search.
341 *
342 * If the operation can be interrupted (by thread_abort
343 * or thread_terminate), then the "interruptible"
344 * parameter should be asserted.
345 *
346 * Results:
347 * The page containing the proper data is returned
348 * in "result_page".
349 *
350 * In/out conditions:
351 * The source object must be locked and referenced,
352 * and must donate one paging reference. The reference
353 * is not affected. The paging reference and lock are
354 * consumed.
355 *
356 * If the call succeeds, the object in which "result_page"
357 * resides is left locked and holding a paging reference.
358 * If this is not the original object, a busy page in the
359 * original object is returned in "top_page", to prevent other
360 * callers from pursuing this same data, along with a paging
361 * reference for the original object. The "top_page" should
362 * be destroyed when this guarantee is no longer required.
363 * The "result_page" is also left busy. It is not removed
364 * from the pageout queues.
365 */
366
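/*
 * Editor's illustrative sketch (not part of the original source): a
 * minimal caller honoring the contract above.  The helper name, the
 * argument choices and the error policy are assumptions; real callers
 * (e.g. vm_fault) retry on VM_FAULT_RETRY and wait on shortages.
 */
#if 0
static kern_return_t
example_get_page(
	vm_object_t		object,		/* locked + referenced */
	vm_object_offset_t	offset,
	vm_map_offset_t		lo_offset,
	vm_map_offset_t		hi_offset)
{
	vm_prot_t		prot = VM_PROT_READ;
	vm_page_t		result_page, top_page;
	kern_return_t		error_code;
	vm_fault_return_t	result;

	vm_object_paging_begin(object);	/* donate a paging reference */

	result = vm_fault_page(object, offset,
			       VM_PROT_READ,
			       FALSE,		/* must_be_resident */
			       THREAD_UNINT,	/* not interruptible */
			       lo_offset, hi_offset,
			       VM_BEHAVIOR_DEFAULT,
			       &prot,
			       &result_page, &top_page,
			       NULL,		/* type_of_fault */
			       &error_code,
			       FALSE,		/* no_zero_fill */
			       FALSE,		/* data_supply */
			       VM_MAP_NULL, 0);

	if (result != VM_FAULT_SUCCESS) {
		/*
		 * The object lock and the donated paging reference have
		 * already been cleaned up; the caller's own reference on
		 * "object" is unaffected.
		 */
		return KERN_FAILURE;
	}

	/*
	 * result_page is busy and its object is locked with a paging
	 * reference; top_page (if any) pins the top-level object.
	 * Hand everything back as in the vm_fault_cleanup() sketch.
	 */
	PAGE_WAKEUP_DONE(result_page);
	vm_fault_cleanup(result_page->object, top_page);

	return KERN_SUCCESS;
}
#endif
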
367 vm_fault_return_t
368 vm_fault_page(
369 /* Arguments: */
370 vm_object_t first_object, /* Object to begin search */
371 vm_object_offset_t first_offset, /* Offset into object */
372 vm_prot_t fault_type, /* What access is requested */
373 boolean_t must_be_resident,/* Must page be resident? */
374 int interruptible, /* how may fault be interrupted? */
375 vm_map_offset_t lo_offset, /* Map entry start */
376 vm_map_offset_t hi_offset, /* Map entry end */
377 vm_behavior_t behavior, /* Page reference behavior */
378 /* Modifies in place: */
379 vm_prot_t *protection, /* Protection for mapping */
380 /* Returns: */
381 vm_page_t *result_page, /* Page found, if successful */
382 vm_page_t *top_page, /* Page in top object, if
383 * not result_page. */
384 int *type_of_fault, /* if non-null, fill in with type of fault
385 * COW, zero-fill, etc... returned in trace point */
386 /* More arguments: */
387 kern_return_t *error_code, /* code if page is in error */
388 boolean_t no_zero_fill, /* don't zero fill absent pages */
389 boolean_t data_supply, /* treat as data_supply if
390 * it is a write fault and a full
391 * page is provided */
392 vm_map_t map,
393 __unused vm_map_offset_t vaddr)
394 {
395 register
396 vm_page_t m;
397 register
398 vm_object_t object;
399 register
400 vm_object_offset_t offset;
401 vm_page_t first_m;
402 vm_object_t next_object;
403 vm_object_t copy_object;
404 boolean_t look_for_page;
405 vm_prot_t access_required = fault_type;
406 vm_prot_t wants_copy_flag;
407 vm_object_size_t length;
408 vm_object_offset_t cluster_start, cluster_end;
409 CLUSTER_STAT(int pages_at_higher_offsets;)
410 CLUSTER_STAT(int pages_at_lower_offsets;)
411 kern_return_t wait_result;
412 boolean_t interruptible_state;
413 boolean_t bumped_pagein = FALSE;
414
415
416 #if MACH_PAGEMAP
417 /*
418 * MACH page map - an optional optimization where a bit map is maintained
419 * by the VM subsystem for internal objects to indicate which pages of
420 * the object currently reside on backing store. This existence map
421 * duplicates information maintained by the vnode pager. It is
422 * created at the time of the first pageout against the object, i.e.
423 * at the same time the pager for the object is created. The optimization
424 * is designed to eliminate pager interaction overhead, if it is
425 * 'known' that the page does not exist on backing store.
426 *
427 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
428 * either marked as paged out in the existence map for the object or no
429 * existence map exists for the object. LOOK_FOR() is one of the
430 * criteria in the decision to invoke the pager. It is also used as one
431 * of the criteria to terminate the scan for adjacent pages in a clustered
432 * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for
433 * permanent objects. Note also that if the pager for an internal object
434 * has not been created, the pager is not invoked regardless of the value
435 * of LOOK_FOR() and that clustered pagein scans are only done on an object
436 * for which a pager has been created.
437 *
438 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
439 * is marked as paged out in the existence map for the object. PAGED_OUT()
440 * PAGED_OUT() is used to determine if a page has already been pushed
441 * into a copy object in order to avoid a redundant page out operation.
442 */
443 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
444 != VM_EXTERNAL_STATE_ABSENT)
445 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
446 == VM_EXTERNAL_STATE_EXISTS)
447 #else /* MACH_PAGEMAP */
448 /*
449 * If the MACH page map optimization is not enabled,
450 * LOOK_FOR() always evaluates to TRUE. The pager will always be
451 * invoked to resolve missing pages in an object, assuming the pager
452 * has been created for the object. In a clustered page operation, the
453 * absence of a page on backing store cannot be used to terminate
454 * a scan for adjacent pages since that information is available only in
455 * the pager. Hence pages that may not be paged out are potentially
456 * included in a clustered request. The vnode pager is coded to deal
457 * with any combination of absent/present pages in a clustered
458 * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager
459 * will always be invoked to push a dirty page into a copy object assuming
460 * a pager has been created. If the page has already been pushed, the
461 * pager will ignore the new request.
462 */
463 #define LOOK_FOR(o, f) TRUE
464 #define PAGED_OUT(o, f) FALSE
465 #endif /* MACH_PAGEMAP */
466
467 /*
468 * Recovery actions
469 */
470 #define PREPARE_RELEASE_PAGE(m) \
471 MACRO_BEGIN \
472 vm_page_lock_queues(); \
473 MACRO_END
474
475 #define DO_RELEASE_PAGE(m) \
476 MACRO_BEGIN \
477 PAGE_WAKEUP_DONE(m); \
478 if (!m->active && !m->inactive) \
479 vm_page_activate(m); \
480 vm_page_unlock_queues(); \
481 MACRO_END
482
483 #define RELEASE_PAGE(m) \
484 MACRO_BEGIN \
485 PREPARE_RELEASE_PAGE(m); \
486 DO_RELEASE_PAGE(m); \
487 MACRO_END
488
489 #if TRACEFAULTPAGE
490 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
491 #endif
492
493
494
495 #if !VM_FAULT_STATIC_CONFIG
496 if (vm_fault_dirty_handling
497 #if MACH_KDB
498 /*
499 * If there are watchpoints set, then
500 * we don't want to give away write permission
501 * on a read fault. Make the task write fault,
502 * so that the watchpoint code notices the access.
503 */
504 || db_watchpoint_list
505 #endif /* MACH_KDB */
506 ) {
507 /*
508 * If we aren't asking for write permission,
509 * then don't give it away. We're using write
510 * faults to set the dirty bit.
511 */
512 if (!(fault_type & VM_PROT_WRITE))
513 *protection &= ~VM_PROT_WRITE;
514 }
515
516 if (!vm_fault_interruptible)
517 interruptible = THREAD_UNINT;
518 #else /* STATIC_CONFIG */
519 #if MACH_KDB
520 /*
521 * If there are watchpoints set, then
522 * we don't want to give away write permission
523 * on a read fault. Make the task write fault,
524 * so that the watchpoint code notices the access.
525 */
526 if (db_watchpoint_list) {
527 /*
528 * If we aren't asking for write permission,
529 * then don't give it away. We're using write
530 * faults to set the dirty bit.
531 */
532 if (!(fault_type & VM_PROT_WRITE))
533 *protection &= ~VM_PROT_WRITE;
534 }
535
536 #endif /* MACH_KDB */
537 #endif /* STATIC_CONFIG */
538
539 interruptible_state = thread_interrupt_level(interruptible);
540
541 /*
542 * INVARIANTS (through entire routine):
543 *
544 * 1) At all times, we must either have the object
545 * lock or a busy page in some object to prevent
546 * some other thread from trying to bring in
547 * the same page.
548 *
549 * Note that we cannot hold any locks during the
550 * pager access or when waiting for memory, so
551 * we use a busy page then.
552 *
553 * Note also that we aren't as concerned about more than
554 * one thread attempting to memory_object_data_unlock
555 * the same page at once, so we don't hold the page
556 * as busy then, but do record the highest unlock
557 * value so far. [Unlock requests may also be delivered
558 * out of order.]
559 *
560 * 2) To prevent another thread from racing us down the
561 * shadow chain and entering a new page in the top
562 * object before we do, we must keep a busy page in
563 * the top object while following the shadow chain.
564 *
565 * 3) We must increment paging_in_progress on any object
566 * for which we have a busy page
567 *
568 * 4) We leave busy pages on the pageout queues.
569 * If the pageout daemon comes across a busy page,
570 * it will remove the page from the pageout queues.
571 */
572
573 /*
574 * Search for the page at object/offset.
575 */
576
577 object = first_object;
578 offset = first_offset;
579 first_m = VM_PAGE_NULL;
580 access_required = fault_type;
581
582 XPR(XPR_VM_FAULT,
583 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
584 (integer_t)object, offset, fault_type, *protection, 0);
585
586 /*
587 * See whether this page is resident
588 */
589
590 while (TRUE) {
591 #if TRACEFAULTPAGE
592 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
593 #endif
594 if (!object->alive) {
595 vm_fault_cleanup(object, first_m);
596 thread_interrupt_level(interruptible_state);
597 return(VM_FAULT_MEMORY_ERROR);
598 }
599 m = vm_page_lookup(object, offset);
600 #if TRACEFAULTPAGE
601 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
602 #endif
603 if (m != VM_PAGE_NULL) {
604 /*
605 * If the page was pre-paged as part of a
606 * cluster, record the fact.
607 * If we were passed a valid pointer for
608 * "type_of_fault", then we came from
609 * vm_fault... we'll let it deal with
610 * this condition, since it
611 * needs to see m->clustered to correctly
612 * account the pageins.
613 */
614 if (type_of_fault == NULL && m->clustered) {
615 vm_pagein_cluster_used++;
616 m->clustered = FALSE;
617 }
618
619 /*
620 * If the page is being brought in,
621 * wait for it and then retry.
622 *
623 * A possible optimization: if the page
624 * is known to be resident, we can ignore
625 * pages that are absent (regardless of
626 * whether they're busy).
627 */
628
629 if (m->busy) {
630 #if TRACEFAULTPAGE
631 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
632 #endif
633 wait_result = PAGE_SLEEP(object, m, interruptible);
634 XPR(XPR_VM_FAULT,
635 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
636 (integer_t)object, offset,
637 (integer_t)m, 0, 0);
638 counter(c_vm_fault_page_block_busy_kernel++);
639
640 if (wait_result != THREAD_AWAKENED) {
641 vm_fault_cleanup(object, first_m);
642 thread_interrupt_level(interruptible_state);
643 if (wait_result == THREAD_RESTART)
644 {
645 return(VM_FAULT_RETRY);
646 }
647 else
648 {
649 return(VM_FAULT_INTERRUPTED);
650 }
651 }
652 continue;
653 }
654
655 if (m->encrypted) {
656 /*
657 * ENCRYPTED SWAP:
658 * the user needs access to a page that we
659 * encrypted before paging it out.
660 * Decrypt the page now.
661 * Keep it busy to prevent anyone from
662 * accessing it during the decryption.
663 */
664 m->busy = TRUE;
665 vm_page_decrypt(m, 0);
666 assert(object == m->object);
667 assert(m->busy);
668 PAGE_WAKEUP_DONE(m);
669
670 /*
671 * Retry from the top, in case
672 * something changed while we were
673 * decrypting.
674 */
675 continue;
676 }
677 ASSERT_PAGE_DECRYPTED(m);
678
679 /*
680 * If the page is in error, give up now.
681 */
682
683 if (m->error) {
684 #if TRACEFAULTPAGE
685 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
686 #endif
687 if (error_code)
688 *error_code = m->page_error;
689 VM_PAGE_FREE(m);
690 vm_fault_cleanup(object, first_m);
691 thread_interrupt_level(interruptible_state);
692 return(VM_FAULT_MEMORY_ERROR);
693 }
694
695 /*
696 * If the pager wants us to restart
697 * at the top of the chain,
698 * typically because it has moved the
699 * page to another pager, then do so.
700 */
701
702 if (m->restart) {
703 #if TRACEFAULTPAGE
704 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
705 #endif
706 VM_PAGE_FREE(m);
707 vm_fault_cleanup(object, first_m);
708 thread_interrupt_level(interruptible_state);
709 return(VM_FAULT_RETRY);
710 }
711
712 /*
713 * If the page isn't busy, but is absent,
714 * then it was deemed "unavailable".
715 */
716
717 if (m->absent) {
718 /*
719 * Remove the non-existent page (unless it's
720 * in the top object) and move on down to the
721 * next object (if there is one).
722 */
723 #if TRACEFAULTPAGE
724 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
725 #endif
726
727 next_object = object->shadow;
728 if (next_object == VM_OBJECT_NULL) {
729 vm_page_t real_m;
730
731 assert(!must_be_resident);
732
733 if (object->shadow_severed) {
734 vm_fault_cleanup(
735 object, first_m);
736 thread_interrupt_level(interruptible_state);
737 return VM_FAULT_MEMORY_ERROR;
738 }
739
740 /*
741 * Absent page at bottom of shadow
742 * chain; zero fill the page we left
743 * busy in the first object, and flush
744 * the absent page. But first we
745 * need to allocate a real page.
746 */
747 if (VM_PAGE_THROTTLED() ||
748 (real_m = vm_page_grab())
749 == VM_PAGE_NULL) {
750 vm_fault_cleanup(
751 object, first_m);
752 thread_interrupt_level(
753 interruptible_state);
754 return(
755 VM_FAULT_MEMORY_SHORTAGE);
756 }
757
758 /*
759 * Are we protecting the system from
760 * backing store exhaustion?  If so,
761 * sleep unless we are privileged.
762 */
763
764 if(vm_backing_store_low) {
765 if(!(current_task()->priv_flags
766 & VM_BACKING_STORE_PRIV)) {
767 assert_wait((event_t)
768 &vm_backing_store_low,
769 THREAD_UNINT);
770 vm_fault_cleanup(object,
771 first_m);
772 thread_block(THREAD_CONTINUE_NULL);
773 thread_interrupt_level(
774 interruptible_state);
775 return(VM_FAULT_RETRY);
776 }
777 }
778
779
780 XPR(XPR_VM_FAULT,
781 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
782 (integer_t)object, offset,
783 (integer_t)m,
784 (integer_t)first_object, 0);
785 if (object != first_object) {
786 VM_PAGE_FREE(m);
787 vm_object_paging_end(object);
788 vm_object_unlock(object);
789 object = first_object;
790 offset = first_offset;
791 m = first_m;
792 first_m = VM_PAGE_NULL;
793 vm_object_lock(object);
794 }
795
796 VM_PAGE_FREE(m);
797 assert(real_m->busy);
798 vm_page_insert(real_m, object, offset);
799 m = real_m;
800
801 /*
802 * Drop the lock while zero filling
803 * page. Then break because this
804 * is the page we wanted. Checking
805 * the page lock is a waste of time;
806 * this page was either absent or
807 * newly allocated -- in both cases
808 * it can't be page locked by a pager.
809 */
810 m->no_isync = FALSE;
811
812 if (!no_zero_fill) {
813 vm_object_unlock(object);
814 vm_page_zero_fill(m);
815 vm_object_lock(object);
816
817 if (type_of_fault)
818 *type_of_fault = DBG_ZERO_FILL_FAULT;
819 VM_STAT(zero_fill_count++);
820 }
821 if (bumped_pagein == TRUE) {
822 VM_STAT(pageins--);
823 current_task()->pageins--;
824 }
825 vm_page_lock_queues();
826 VM_PAGE_QUEUES_REMOVE(m);
827 m->page_ticket = vm_page_ticket;
828 assert(!m->laundry);
829 assert(m->object != kernel_object);
830 assert(m->pageq.next == NULL &&
831 m->pageq.prev == NULL);
832 if(m->object->size > 0x200000) {
833 m->zero_fill = TRUE;
834 /* depends on the queues lock */
835 vm_zf_count += 1;
836 queue_enter(&vm_page_queue_zf,
837 m, vm_page_t, pageq);
838 } else {
839 queue_enter(
840 &vm_page_queue_inactive,
841 m, vm_page_t, pageq);
842 }
843 vm_page_ticket_roll++;
844 if(vm_page_ticket_roll ==
845 VM_PAGE_TICKETS_IN_ROLL) {
846 vm_page_ticket_roll = 0;
847 if(vm_page_ticket ==
848 VM_PAGE_TICKET_ROLL_IDS)
849 vm_page_ticket= 0;
850 else
851 vm_page_ticket++;
852 }
853 m->inactive = TRUE;
854 vm_page_inactive_count++;
855 vm_page_unlock_queues();
856 break;
857 } else {
858 if (must_be_resident) {
859 vm_object_paging_end(object);
860 } else if (object != first_object) {
861 vm_object_paging_end(object);
862 VM_PAGE_FREE(m);
863 } else {
864 first_m = m;
865 m->absent = FALSE;
866 m->unusual = FALSE;
867 vm_object_absent_release(object);
868 m->busy = TRUE;
869
870 vm_page_lock_queues();
871 VM_PAGE_QUEUES_REMOVE(m);
872 vm_page_unlock_queues();
873 }
874 XPR(XPR_VM_FAULT,
875 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
876 (integer_t)object, offset,
877 (integer_t)next_object,
878 offset+object->shadow_offset,0);
879 offset += object->shadow_offset;
880 hi_offset += object->shadow_offset;
881 lo_offset += object->shadow_offset;
882 access_required = VM_PROT_READ;
883 vm_object_lock(next_object);
884 vm_object_unlock(object);
885 object = next_object;
886 vm_object_paging_begin(object);
887 continue;
888 }
889 }
890
891 if ((m->cleaning)
892 && ((object != first_object) ||
893 (object->copy != VM_OBJECT_NULL))
894 && (fault_type & VM_PROT_WRITE)) {
895 /*
896 * This is a copy-on-write fault that will
897 * cause us to revoke access to this page, but
898 * this page is in the process of being cleaned
899 * in a clustered pageout. We must wait until
900 * the cleaning operation completes before
901 * revoking access to the original page,
902 * otherwise we might attempt to remove a
903 * wired mapping.
904 */
905 #if TRACEFAULTPAGE
906 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
907 #endif
908 XPR(XPR_VM_FAULT,
909 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
910 (integer_t)object, offset,
911 (integer_t)m, 0, 0);
912 /* take an extra ref so that object won't die */
913 assert(object->ref_count > 0);
914 object->ref_count++;
915 vm_object_res_reference(object);
916 vm_fault_cleanup(object, first_m);
917 counter(c_vm_fault_page_block_backoff_kernel++);
918 vm_object_lock(object);
919 assert(object->ref_count > 0);
920 m = vm_page_lookup(object, offset);
921 if (m != VM_PAGE_NULL && m->cleaning) {
922 PAGE_ASSERT_WAIT(m, interruptible);
923 vm_object_unlock(object);
924 wait_result = thread_block(THREAD_CONTINUE_NULL);
925 vm_object_deallocate(object);
926 goto backoff;
927 } else {
928 vm_object_unlock(object);
929 vm_object_deallocate(object);
930 thread_interrupt_level(interruptible_state);
931 return VM_FAULT_RETRY;
932 }
933 }
934
935 /*
936 * If the desired access to this page has
937 * been locked out, request that it be unlocked.
938 */
939
940 if (access_required & m->page_lock) {
941 if ((access_required & m->unlock_request) != access_required) {
942 vm_prot_t new_unlock_request;
943 kern_return_t rc;
944
945 #if TRACEFAULTPAGE
946 dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */
947 #endif
948 if (!object->pager_ready) {
949 XPR(XPR_VM_FAULT,
950 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
951 access_required,
952 (integer_t)object, offset,
953 (integer_t)m, 0);
954 /* take an extra ref */
955 assert(object->ref_count > 0);
956 object->ref_count++;
957 vm_object_res_reference(object);
958 vm_fault_cleanup(object,
959 first_m);
960 counter(c_vm_fault_page_block_backoff_kernel++);
961 vm_object_lock(object);
962 assert(object->ref_count > 0);
963 if (!object->pager_ready) {
964 wait_result = vm_object_assert_wait(
965 object,
966 VM_OBJECT_EVENT_PAGER_READY,
967 interruptible);
968 vm_object_unlock(object);
969 if (wait_result == THREAD_WAITING)
970 wait_result = thread_block(THREAD_CONTINUE_NULL);
971 vm_object_deallocate(object);
972 goto backoff;
973 } else {
974 vm_object_unlock(object);
975 vm_object_deallocate(object);
976 thread_interrupt_level(interruptible_state);
977 return VM_FAULT_RETRY;
978 }
979 }
980
981 new_unlock_request = m->unlock_request =
982 (access_required | m->unlock_request);
983 vm_object_unlock(object);
984 XPR(XPR_VM_FAULT,
985 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
986 (integer_t)object, offset,
987 (integer_t)m, new_unlock_request, 0);
988 if ((rc = memory_object_data_unlock(
989 object->pager,
990 offset + object->paging_offset,
991 PAGE_SIZE,
992 new_unlock_request))
993 != KERN_SUCCESS) {
994 if (vm_fault_debug)
995 printf("vm_fault: memory_object_data_unlock failed\n");
996 vm_object_lock(object);
997 vm_fault_cleanup(object, first_m);
998 thread_interrupt_level(interruptible_state);
999 return((rc == MACH_SEND_INTERRUPTED) ?
1000 VM_FAULT_INTERRUPTED :
1001 VM_FAULT_MEMORY_ERROR);
1002 }
1003 vm_object_lock(object);
1004 continue;
1005 }
1006
1007 XPR(XPR_VM_FAULT,
1008 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
1009 access_required, (integer_t)object,
1010 offset, (integer_t)m, 0);
1011 /* take an extra ref so object won't die */
1012 assert(object->ref_count > 0);
1013 object->ref_count++;
1014 vm_object_res_reference(object);
1015 vm_fault_cleanup(object, first_m);
1016 counter(c_vm_fault_page_block_backoff_kernel++);
1017 vm_object_lock(object);
1018 assert(object->ref_count > 0);
1019 m = vm_page_lookup(object, offset);
1020 if (m != VM_PAGE_NULL &&
1021 (access_required & m->page_lock) &&
1022 !((access_required & m->unlock_request) != access_required)) {
1023 PAGE_ASSERT_WAIT(m, interruptible);
1024 vm_object_unlock(object);
1025 wait_result = thread_block(THREAD_CONTINUE_NULL);
1026 vm_object_deallocate(object);
1027 goto backoff;
1028 } else {
1029 vm_object_unlock(object);
1030 vm_object_deallocate(object);
1031 thread_interrupt_level(interruptible_state);
1032 return VM_FAULT_RETRY;
1033 }
1034 }
1035 /*
1036 * We mark the page busy and leave it on
1037 * the pageout queues. If the pageout
1038 * daemon comes across it, then it will
1039 * remove the page.
1040 */
1041
1042 #if TRACEFAULTPAGE
1043 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1044 #endif
1045
1046 #if !VM_FAULT_STATIC_CONFIG
1047 if (!software_reference_bits) {
1048 vm_page_lock_queues();
1049 if (m->inactive)
1050 vm_stat.reactivations++;
1051
1052 VM_PAGE_QUEUES_REMOVE(m);
1053 vm_page_unlock_queues();
1054 }
1055 #endif
1056 XPR(XPR_VM_FAULT,
1057 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1058 (integer_t)object, offset, (integer_t)m, 0, 0);
1059 assert(!m->busy);
1060 m->busy = TRUE;
1061 assert(!m->absent);
1062 break;
1063 }
1064
1065 look_for_page =
1066 (object->pager_created) &&
1067 LOOK_FOR(object, offset) &&
1068 (!data_supply);
1069
1070 #if TRACEFAULTPAGE
1071 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1072 #endif
1073 if ((look_for_page || (object == first_object))
1074 && !must_be_resident
1075 && !(object->phys_contiguous)) {
1076 /*
1077 * Allocate a new page for this object/offset
1078 * pair.
1079 */
1080
1081 m = vm_page_grab_fictitious();
1082 #if TRACEFAULTPAGE
1083 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1084 #endif
1085 if (m == VM_PAGE_NULL) {
1086 vm_fault_cleanup(object, first_m);
1087 thread_interrupt_level(interruptible_state);
1088 return(VM_FAULT_FICTITIOUS_SHORTAGE);
1089 }
1090 vm_page_insert(m, object, offset);
1091 }
1092
1093 if ((look_for_page && !must_be_resident)) {
1094 kern_return_t rc;
1095
1096 /*
1097 * If the memory manager is not ready, we
1098 * cannot make requests.
1099 */
1100 if (!object->pager_ready) {
1101 #if TRACEFAULTPAGE
1102 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1103 #endif
1104 if(m != VM_PAGE_NULL)
1105 VM_PAGE_FREE(m);
1106 XPR(XPR_VM_FAULT,
1107 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1108 (integer_t)object, offset, 0, 0, 0);
1109 /* take an extra ref so object won't die */
1110 assert(object->ref_count > 0);
1111 object->ref_count++;
1112 vm_object_res_reference(object);
1113 vm_fault_cleanup(object, first_m);
1114 counter(c_vm_fault_page_block_backoff_kernel++);
1115 vm_object_lock(object);
1116 assert(object->ref_count > 0);
1117 if (!object->pager_ready) {
1118 wait_result = vm_object_assert_wait(object,
1119 VM_OBJECT_EVENT_PAGER_READY,
1120 interruptible);
1121 vm_object_unlock(object);
1122 if (wait_result == THREAD_WAITING)
1123 wait_result = thread_block(THREAD_CONTINUE_NULL);
1124 vm_object_deallocate(object);
1125 goto backoff;
1126 } else {
1127 vm_object_unlock(object);
1128 vm_object_deallocate(object);
1129 thread_interrupt_level(interruptible_state);
1130 return VM_FAULT_RETRY;
1131 }
1132 }
1133
1134 if(object->phys_contiguous) {
1135 if(m != VM_PAGE_NULL) {
1136 VM_PAGE_FREE(m);
1137 m = VM_PAGE_NULL;
1138 }
1139 goto no_clustering;
1140 }
1141 if (object->internal) {
1142 /*
1143 * Requests to the default pager
1144 * must reserve a real page in advance,
1145 * because the pager's data_provided call
1146 * won't block for pages. IMPORTANT:
1147 * this acts as a throttling mechanism
1148 * for data_requests to the default
1149 * pager.
1150 */
1151
1152 #if TRACEFAULTPAGE
1153 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1154 #endif
1155 if (m->fictitious && !vm_page_convert(m)) {
1156 VM_PAGE_FREE(m);
1157 vm_fault_cleanup(object, first_m);
1158 thread_interrupt_level(interruptible_state);
1159 return(VM_FAULT_MEMORY_SHORTAGE);
1160 }
1161 } else if (object->absent_count >
1162 vm_object_absent_max) {
1163 /*
1164 * If there are too many outstanding page
1165 * requests pending on this object, we
1166 * wait for them to be resolved now.
1167 */
1168
1169 #if TRACEFAULTPAGE
1170 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1171 #endif
1172 if(m != VM_PAGE_NULL)
1173 VM_PAGE_FREE(m);
1174 /* take an extra ref so object won't die */
1175 assert(object->ref_count > 0);
1176 object->ref_count++;
1177 vm_object_res_reference(object);
1178 vm_fault_cleanup(object, first_m);
1179 counter(c_vm_fault_page_block_backoff_kernel++);
1180 vm_object_lock(object);
1181 assert(object->ref_count > 0);
1182 if (object->absent_count > vm_object_absent_max) {
1183 vm_object_absent_assert_wait(object,
1184 interruptible);
1185 vm_object_unlock(object);
1186 wait_result = thread_block(THREAD_CONTINUE_NULL);
1187 vm_object_deallocate(object);
1188 goto backoff;
1189 } else {
1190 vm_object_unlock(object);
1191 vm_object_deallocate(object);
1192 thread_interrupt_level(interruptible_state);
1193 return VM_FAULT_RETRY;
1194 }
1195 }
1196
1197 /*
1198 * Indicate that the page is waiting for data
1199 * from the memory manager.
1200 */
1201
1202 if(m != VM_PAGE_NULL) {
1203
1204 m->list_req_pending = TRUE;
1205 m->absent = TRUE;
1206 m->unusual = TRUE;
1207 object->absent_count++;
1208
1209 }
1210
1211 no_clustering:
1212 cluster_start = offset;
1213 length = PAGE_SIZE;
1214
1215 /*
1216 * lengthen the cluster by the pages in the working set
1217 */
1218 if((map != NULL) &&
1219 (current_task()->dynamic_working_set != 0)) {
1220 cluster_end = cluster_start + length;
1221 /* tws values for start and end are just
1222 * suggestions. Therefore, as long as
1223 * build_cluster does not use pointers or
1224 * take action based on values that
1225 * could be affected by re-entrance, we
1226 * do not need to take the map lock.
1227 */
1228 cluster_end = offset + PAGE_SIZE_64;
1229 tws_build_cluster(
1230 current_task()->dynamic_working_set,
1231 object, &cluster_start,
1232 &cluster_end, 0x40000);
1233 length = cluster_end - cluster_start;
1234 }
1235 #if TRACEFAULTPAGE
1236 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1237 #endif
1238 /*
1239 * We have a busy page, so we can
1240 * release the object lock.
1241 */
1242 vm_object_unlock(object);
1243
1244 /*
1245 * Call the memory manager to retrieve the data.
1246 */
1247
1248 if (type_of_fault)
1249 *type_of_fault = ((int)length << 8) | DBG_PAGEIN_FAULT;
1250 VM_STAT(pageins++);
1251 current_task()->pageins++;
1252 bumped_pagein = TRUE;
1253
1254 /*
1255 * If this object uses a copy_call strategy,
1256 * and we are interested in a copy of this object
1257 * (having gotten here only by following a
1258 * shadow chain), then tell the memory manager
1259 * via a flag added to the desired_access
1260 * parameter, so that it can detect a race
1261 * between our walking down the shadow chain
1262 * and its pushing pages up into a copy of
1263 * the object that it manages.
1264 */
1265
1266 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1267 object != first_object) {
1268 wants_copy_flag = VM_PROT_WANTS_COPY;
1269 } else {
1270 wants_copy_flag = VM_PROT_NONE;
1271 }
1272
1273 XPR(XPR_VM_FAULT,
1274 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1275 (integer_t)object, offset, (integer_t)m,
1276 access_required | wants_copy_flag, 0);
1277
1278 rc = memory_object_data_request(object->pager,
1279 cluster_start + object->paging_offset,
1280 length,
1281 access_required | wants_copy_flag);
1282
1283
1284 #if TRACEFAULTPAGE
1285 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1286 #endif
1287 if (rc != KERN_SUCCESS) {
1288 if (rc != MACH_SEND_INTERRUPTED
1289 && vm_fault_debug)
1290 printf("%s(0x%x, 0x%llx, 0x%llx, 0x%x) failed, rc=%d\n",
1291 "memory_object_data_request",
1292 object->pager,
1293 cluster_start + object->paging_offset,
1294 length, access_required, rc);
1295 /*
1296 * Don't want to leave a busy page around,
1297 * but the data request may have blocked,
1298 * so check if it's still there and busy.
1299 */
1300 if(!object->phys_contiguous) {
1301 vm_object_lock(object);
1302 for (; length; length -= PAGE_SIZE,
1303 cluster_start += PAGE_SIZE_64) {
1304 vm_page_t p;
1305 if ((p = vm_page_lookup(object,
1306 cluster_start))
1307 && p->absent && p->busy
1308 && p != first_m) {
1309 VM_PAGE_FREE(p);
1310 }
1311 }
1312 }
1313 vm_fault_cleanup(object, first_m);
1314 thread_interrupt_level(interruptible_state);
1315 return((rc == MACH_SEND_INTERRUPTED) ?
1316 VM_FAULT_INTERRUPTED :
1317 VM_FAULT_MEMORY_ERROR);
1318 }
1319
1320 vm_object_lock(object);
1321 if ((interruptible != THREAD_UNINT) &&
1322 (current_thread()->state & TH_ABORT)) {
1323 vm_fault_cleanup(object, first_m);
1324 thread_interrupt_level(interruptible_state);
1325 return(VM_FAULT_INTERRUPTED);
1326 }
1327 if (m == VM_PAGE_NULL &&
1328 object->phys_contiguous) {
1329 /*
1330 * No page here means that the object we
1331 * initially looked up was "physically
1332 * contiguous" (i.e. device memory). However,
1333 * with Virtual VRAM, the object might not
1334 * be backed by that device memory anymore,
1335 * so we're done here only if the object is
1336 * still "phys_contiguous".
1337 * Otherwise, if the object is no longer
1338 * "phys_contiguous", we need to retry the
1339 * page fault against the object's new backing
1340 * store (different memory object).
1341 */
1342 break;
1343 }
1344
1345 /*
1346 * Retry with same object/offset, since new data may
1347 * be in a different page (i.e., m is meaningless at
1348 * this point).
1349 */
1350 continue;
1351 }
1352
1353 /*
1354 * The only case in which we get here is if
1355 * object has no pager (or unwiring). If the pager doesn't
1356 * have the page this is handled in the m->absent case above
1357 * (and if you change things here you should look above).
1358 */
1359 #if TRACEFAULTPAGE
1360 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1361 #endif
1362 if (object == first_object)
1363 first_m = m;
1364 else
1365 assert(m == VM_PAGE_NULL);
1366
1367 XPR(XPR_VM_FAULT,
1368 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1369 (integer_t)object, offset, (integer_t)m,
1370 (integer_t)object->shadow, 0);
1371 /*
1372 * Move on to the next object. Lock the next
1373 * object before unlocking the current one.
1374 */
1375 next_object = object->shadow;
1376 if (next_object == VM_OBJECT_NULL) {
1377 assert(!must_be_resident);
1378 /*
1379 * If there's no object left, fill the page
1380 * in the top object with zeros. But first we
1381 * need to allocate a real page.
1382 */
1383
1384 if (object != first_object) {
1385 vm_object_paging_end(object);
1386 vm_object_unlock(object);
1387
1388 object = first_object;
1389 offset = first_offset;
1390 vm_object_lock(object);
1391 }
1392
1393 m = first_m;
1394 assert(m->object == object);
1395 first_m = VM_PAGE_NULL;
1396
1397 if(m == VM_PAGE_NULL) {
1398 m = vm_page_grab();
1399 if (m == VM_PAGE_NULL) {
1400 vm_fault_cleanup(
1401 object, VM_PAGE_NULL);
1402 thread_interrupt_level(
1403 interruptible_state);
1404 return(VM_FAULT_MEMORY_SHORTAGE);
1405 }
1406 vm_page_insert(
1407 m, object, offset);
1408 }
1409
1410 if (object->shadow_severed) {
1411 VM_PAGE_FREE(m);
1412 vm_fault_cleanup(object, VM_PAGE_NULL);
1413 thread_interrupt_level(interruptible_state);
1414 return VM_FAULT_MEMORY_ERROR;
1415 }
1416
1417 /*
1418 * Are we protecting the system from
1419 * backing store exhaustion?  If so,
1420 * sleep unless we are privileged.
1421 */
1422
1423 if(vm_backing_store_low) {
1424 if(!(current_task()->priv_flags
1425 & VM_BACKING_STORE_PRIV)) {
1426 assert_wait((event_t)
1427 &vm_backing_store_low,
1428 THREAD_UNINT);
1429 VM_PAGE_FREE(m);
1430 vm_fault_cleanup(object, VM_PAGE_NULL);
1431 thread_block(THREAD_CONTINUE_NULL);
1432 thread_interrupt_level(
1433 interruptible_state);
1434 return(VM_FAULT_RETRY);
1435 }
1436 }
1437
1438 if (VM_PAGE_THROTTLED() ||
1439 (m->fictitious && !vm_page_convert(m))) {
1440 VM_PAGE_FREE(m);
1441 vm_fault_cleanup(object, VM_PAGE_NULL);
1442 thread_interrupt_level(interruptible_state);
1443 return(VM_FAULT_MEMORY_SHORTAGE);
1444 }
1445 m->no_isync = FALSE;
1446
1447 if (!no_zero_fill) {
1448 vm_object_unlock(object);
1449 vm_page_zero_fill(m);
1450 vm_object_lock(object);
1451
1452 if (type_of_fault)
1453 *type_of_fault = DBG_ZERO_FILL_FAULT;
1454 VM_STAT(zero_fill_count++);
1455 }
1456 if (bumped_pagein == TRUE) {
1457 VM_STAT(pageins--);
1458 current_task()->pageins--;
1459 }
1460 vm_page_lock_queues();
1461 VM_PAGE_QUEUES_REMOVE(m);
1462 assert(!m->laundry);
1463 assert(m->object != kernel_object);
1464 assert(m->pageq.next == NULL &&
1465 m->pageq.prev == NULL);
1466 if(m->object->size > 0x200000) {
1467 m->zero_fill = TRUE;
1468 /* depends on the queues lock */
1469 vm_zf_count += 1;
1470 queue_enter(&vm_page_queue_zf,
1471 m, vm_page_t, pageq);
1472 } else {
1473 queue_enter(
1474 &vm_page_queue_inactive,
1475 m, vm_page_t, pageq);
1476 }
1477 m->page_ticket = vm_page_ticket;
1478 vm_page_ticket_roll++;
1479 if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1480 vm_page_ticket_roll = 0;
1481 if(vm_page_ticket ==
1482 VM_PAGE_TICKET_ROLL_IDS)
1483 vm_page_ticket= 0;
1484 else
1485 vm_page_ticket++;
1486 }
1487 m->inactive = TRUE;
1488 vm_page_inactive_count++;
1489 vm_page_unlock_queues();
1490 #if 0
1491 pmap_clear_modify(m->phys_page);
1492 #endif
1493 break;
1494 }
1495 else {
1496 if ((object != first_object) || must_be_resident)
1497 vm_object_paging_end(object);
1498 offset += object->shadow_offset;
1499 hi_offset += object->shadow_offset;
1500 lo_offset += object->shadow_offset;
1501 access_required = VM_PROT_READ;
1502 vm_object_lock(next_object);
1503 vm_object_unlock(object);
1504 object = next_object;
1505 vm_object_paging_begin(object);
1506 }
1507 }
1508
1509 /*
1510 * PAGE HAS BEEN FOUND.
1511 *
1512 * This page (m) is:
1513 * busy, so that we can play with it;
1514 * not absent, so that nobody else will fill it;
1515 * possibly eligible for pageout;
1516 *
1517 * The top-level page (first_m) is:
1518 * VM_PAGE_NULL if the page was found in the
1519 * top-level object;
1520 * busy, not absent, and ineligible for pageout.
1521 *
1522 * The current object (object) is locked. A paging
1523 * reference is held for the current and top-level
1524 * objects.
1525 */
1526
1527 #if TRACEFAULTPAGE
1528 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1529 #endif
1530 #if EXTRA_ASSERTIONS
1531 if(m != VM_PAGE_NULL) {
1532 assert(m->busy && !m->absent);
1533 assert((first_m == VM_PAGE_NULL) ||
1534 (first_m->busy && !first_m->absent &&
1535 !first_m->active && !first_m->inactive));
1536 }
1537 #endif /* EXTRA_ASSERTIONS */
1538
1539 /*
1540 * ENCRYPTED SWAP:
1541 * If we found a page, we must have decrypted it before we
1542 * get here...
1543 */
1544 if (m != VM_PAGE_NULL) {
1545 ASSERT_PAGE_DECRYPTED(m);
1546 }
1547
1548 XPR(XPR_VM_FAULT,
1549 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1550 (integer_t)object, offset, (integer_t)m,
1551 (integer_t)first_object, (integer_t)first_m);
1552 /*
1553 * If the page is being written, but isn't
1554 * already owned by the top-level object,
1555 * we have to copy it into a new page owned
1556 * by the top-level object.
1557 */
1558
1559 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1560 /*
1561 * We only really need to copy if we
1562 * want to write it.
1563 */
1564
1565 #if TRACEFAULTPAGE
1566 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1567 #endif
1568 if (fault_type & VM_PROT_WRITE) {
1569 vm_page_t copy_m;
1570
1571 assert(!must_be_resident);
1572
1573 /*
1574 * Are we protecting the system from
1575 * backing store exhaustion?  If so,
1576 * sleep unless we are privileged.
1577 */
1578
1579 if(vm_backing_store_low) {
1580 if(!(current_task()->priv_flags
1581 & VM_BACKING_STORE_PRIV)) {
1582 assert_wait((event_t)
1583 &vm_backing_store_low,
1584 THREAD_UNINT);
1585 RELEASE_PAGE(m);
1586 vm_fault_cleanup(object, first_m);
1587 thread_block(THREAD_CONTINUE_NULL);
1588 thread_interrupt_level(
1589 interruptible_state);
1590 return(VM_FAULT_RETRY);
1591 }
1592 }
1593
1594 /*
1595 * If we try to collapse first_object at this
1596 * point, we may deadlock when we try to get
1597 * the lock on an intermediate object (since we
1598 * have the bottom object locked). We can't
1599 * unlock the bottom object, because the page
1600 * we found may move (by collapse) if we do.
1601 *
1602 * Instead, we first copy the page. Then, when
1603 * we have no more use for the bottom object,
1604 * we unlock it and try to collapse.
1605 *
1606 * Note that we copy the page even if we didn't
1607 * need to... that's the breaks.
1608 */
1609
1610 /*
1611 * Allocate a page for the copy
1612 */
1613 copy_m = vm_page_grab();
1614 if (copy_m == VM_PAGE_NULL) {
1615 RELEASE_PAGE(m);
1616 vm_fault_cleanup(object, first_m);
1617 thread_interrupt_level(interruptible_state);
1618 return(VM_FAULT_MEMORY_SHORTAGE);
1619 }
1620
1621
1622 XPR(XPR_VM_FAULT,
1623 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1624 (integer_t)object, offset,
1625 (integer_t)m, (integer_t)copy_m, 0);
1626 vm_page_copy(m, copy_m);
1627
1628 /*
1629 * If another map is truly sharing this
1630 * page with us, we have to flush all
1631 * uses of the original page, since we
1632 * can't distinguish those which want the
1633 * original from those which need the
1634 * new copy.
1635 *
1636 * XXXO If we know that only one map has
1637 * access to this page, then we could
1638 * avoid the pmap_disconnect() call.
1639 */
1640
1641 vm_page_lock_queues();
1642 assert(!m->cleaning);
1643 pmap_disconnect(m->phys_page);
1644 vm_page_deactivate(m);
1645 copy_m->dirty = TRUE;
1646 /*
1647 * Setting reference here prevents this fault from
1648 * being counted as a (per-thread) reactivate as well
1649 * as a copy-on-write.
1650 */
1651 first_m->reference = TRUE;
1652 vm_page_unlock_queues();
1653
1654 /*
1655 * We no longer need the old page or object.
1656 */
1657
1658 PAGE_WAKEUP_DONE(m);
1659 vm_object_paging_end(object);
1660 vm_object_unlock(object);
1661
1662 if (type_of_fault)
1663 *type_of_fault = DBG_COW_FAULT;
1664 VM_STAT(cow_faults++);
1665 current_task()->cow_faults++;
1666 object = first_object;
1667 offset = first_offset;
1668
1669 vm_object_lock(object);
1670 VM_PAGE_FREE(first_m);
1671 first_m = VM_PAGE_NULL;
1672 assert(copy_m->busy);
1673 vm_page_insert(copy_m, object, offset);
1674 m = copy_m;
1675
1676 /*
1677 * Now that we've gotten the copy out of the
1678 * way, let's try to collapse the top object.
1679 * But we have to play ugly games with
1680 * paging_in_progress to do that...
1681 */
1682
1683 vm_object_paging_end(object);
1684 vm_object_collapse(object, offset);
1685 vm_object_paging_begin(object);
1686
1687 }
1688 else {
1689 *protection &= (~VM_PROT_WRITE);
1690 }
1691 }
1692
1693 /*
1694 * Now check whether the page needs to be pushed into the
1695 * copy object. The use of asymmetric copy on write for
1696 * shared temporary objects means that we may do two copies to
1697 * satisfy the fault; one above to get the page from a
1698 * shadowed object, and one here to push it into the copy.
1699 */
1700
1701 while ((copy_object = first_object->copy) != VM_OBJECT_NULL &&
1702 (m!= VM_PAGE_NULL)) {
1703 vm_object_offset_t copy_offset;
1704 vm_page_t copy_m;
1705
1706 #if TRACEFAULTPAGE
1707 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1708 #endif
1709 /*
1710 * If the page is being written, but hasn't been
1711 * copied to the copy-object, we have to copy it there.
1712 */
1713
1714 if ((fault_type & VM_PROT_WRITE) == 0) {
1715 *protection &= ~VM_PROT_WRITE;
1716 break;
1717 }
1718
1719 /*
1720 * If the page was guaranteed to be resident,
1721 * we must have already performed the copy.
1722 */
1723
1724 if (must_be_resident)
1725 break;
1726
1727 /*
1728 * Try to get the lock on the copy_object.
1729 */
1730 if (!vm_object_lock_try(copy_object)) {
1731 vm_object_unlock(object);
1732
1733 mutex_pause(); /* wait a bit */
1734
1735 vm_object_lock(object);
1736 continue;
1737 }
1738
1739 /*
1740 * Make another reference to the copy-object,
1741 * to keep it from disappearing during the
1742 * copy.
1743 */
1744 assert(copy_object->ref_count > 0);
1745 copy_object->ref_count++;
1746 VM_OBJ_RES_INCR(copy_object);
1747
1748 /*
1749 * Does the page exist in the copy?
1750 */
1751 copy_offset = first_offset - copy_object->shadow_offset;
1752 if (copy_object->size <= copy_offset)
1753 /*
1754 * Copy object doesn't cover this page -- do nothing.
1755 */
1756 ;
1757 else if ((copy_m =
1758 vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1759 /* Page currently exists in the copy object */
1760 if (copy_m->busy) {
1761 /*
1762 * If the page is being brought
1763 * in, wait for it and then retry.
1764 */
1765 RELEASE_PAGE(m);
1766 /* take an extra ref so object won't die */
1767 assert(copy_object->ref_count > 0);
1768 copy_object->ref_count++;
1769 vm_object_res_reference(copy_object);
1770 vm_object_unlock(copy_object);
1771 vm_fault_cleanup(object, first_m);
1772 counter(c_vm_fault_page_block_backoff_kernel++);
1773 vm_object_lock(copy_object);
1774 assert(copy_object->ref_count > 0);
1775 VM_OBJ_RES_DECR(copy_object);
1776 copy_object->ref_count--;
1777 assert(copy_object->ref_count > 0);
1778 copy_m = vm_page_lookup(copy_object, copy_offset);
1779 /*
1780 * ENCRYPTED SWAP:
1781 * it's OK if the "copy_m" page is encrypted,
1782 * because we're not moving it nor handling its
1783 * contents.
1784 */
1785 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1786 PAGE_ASSERT_WAIT(copy_m, interruptible);
1787 vm_object_unlock(copy_object);
1788 wait_result = thread_block(THREAD_CONTINUE_NULL);
1789 vm_object_deallocate(copy_object);
1790 goto backoff;
1791 } else {
1792 vm_object_unlock(copy_object);
1793 vm_object_deallocate(copy_object);
1794 thread_interrupt_level(interruptible_state);
1795 return VM_FAULT_RETRY;
1796 }
1797 }
1798 }
1799 else if (!PAGED_OUT(copy_object, copy_offset)) {
1800 /*
1801 * If PAGED_OUT is TRUE, then the page used to exist
1802 * in the copy-object, and has already been paged out.
1803 * We don't need to repeat this. If PAGED_OUT is
1804 * FALSE, then either we don't know (!pager_created,
1805 * for example) or it hasn't been paged out.
1806 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1807 * We must copy the page to the copy object.
1808 */
1809
1810 /*
1811 * Are we protecting the system from
1812 * backing store exhaustion?  If so,
1813 * sleep unless we are privileged.
1814 */
1815
1816 if(vm_backing_store_low) {
1817 if(!(current_task()->priv_flags
1818 & VM_BACKING_STORE_PRIV)) {
1819 assert_wait((event_t)
1820 &vm_backing_store_low,
1821 THREAD_UNINT);
1822 RELEASE_PAGE(m);
1823 VM_OBJ_RES_DECR(copy_object);
1824 copy_object->ref_count--;
1825 assert(copy_object->ref_count > 0);
1826 vm_object_unlock(copy_object);
1827 vm_fault_cleanup(object, first_m);
1828 thread_block(THREAD_CONTINUE_NULL);
1829 thread_interrupt_level(
1830 interruptible_state);
1831 return(VM_FAULT_RETRY);
1832 }
1833 }
1834
1835 /*
1836 * Allocate a page for the copy
1837 */
1838 copy_m = vm_page_alloc(copy_object, copy_offset);
1839 if (copy_m == VM_PAGE_NULL) {
1840 RELEASE_PAGE(m);
1841 VM_OBJ_RES_DECR(copy_object);
1842 copy_object->ref_count--;
1843 assert(copy_object->ref_count > 0);
1844 vm_object_unlock(copy_object);
1845 vm_fault_cleanup(object, first_m);
1846 thread_interrupt_level(interruptible_state);
1847 return(VM_FAULT_MEMORY_SHORTAGE);
1848 }
1849
1850 /*
1851 * Must copy page into copy-object.
1852 */
1853
1854 vm_page_copy(m, copy_m);
1855
1856 /*
1857 * If the old page was in use by any users
1858 * of the copy-object, it must be removed
1859 * from all pmaps. (We can't know which
1860 * pmaps use it.)
1861 */
1862
1863 vm_page_lock_queues();
1864 assert(!m->cleaning);
1865 pmap_disconnect(m->phys_page);
1866 copy_m->dirty = TRUE;
1867 vm_page_unlock_queues();
1868
1869 /*
1870 * If there's a pager, then immediately
1871 * page out this page, using the "initialize"
1872 * option. Else, we use the copy.
1873 */
1874
1875 if
1876 #if MACH_PAGEMAP
1877 ((!copy_object->pager_created) ||
1878 vm_external_state_get(
1879 copy_object->existence_map, copy_offset)
1880 == VM_EXTERNAL_STATE_ABSENT)
1881 #else
1882 (!copy_object->pager_created)
1883 #endif
1884 {
1885 vm_page_lock_queues();
1886 vm_page_activate(copy_m);
1887 vm_page_unlock_queues();
1888 PAGE_WAKEUP_DONE(copy_m);
1889 }
1890 else {
1891 assert(copy_m->busy == TRUE);
1892
1893 /*
1894 * The page is already ready for pageout:
1895 * not on pageout queues and busy.
1896 * Unlock everything except the
1897 * copy_object itself.
1898 */
1899
1900 vm_object_unlock(object);
1901
1902 /*
1903 * Write the page to the copy-object,
1904 * flushing it from the kernel.
1905 */
1906
1907 vm_pageout_initialize_page(copy_m);
1908
1909 /*
1910 * Since the pageout may have
1911 * temporarily dropped the
1912 * copy_object's lock, we
1913 * check whether we'll have
1914 * to deallocate the hard way.
1915 */
1916
1917 if ((copy_object->shadow != object) ||
1918 (copy_object->ref_count == 1)) {
1919 vm_object_unlock(copy_object);
1920 vm_object_deallocate(copy_object);
1921 vm_object_lock(object);
1922 continue;
1923 }
1924
1925 /*
1926 * Pick back up the old object's
1927 * lock. [It is safe to do so,
1928 * since it must be deeper in the
1929 * object tree.]
1930 */
1931
1932 vm_object_lock(object);
1933 }
1934
1935 /*
1936 * Because we're pushing a page upward
1937 * in the object tree, we must restart
1938 * any faults that are waiting here.
1939 * [Note that this is an expansion of
1940 * PAGE_WAKEUP that uses the THREAD_RESTART
1941 * wait result]. Can't turn off the page's
1942 * busy bit because we're not done with it.
1943 */
1944
1945 if (m->wanted) {
1946 m->wanted = FALSE;
1947 thread_wakeup_with_result((event_t) m,
1948 THREAD_RESTART);
1949 }
1950 }
1951
1952 /*
1953 * The reference count on copy_object must be
1954 * at least 2: one for our extra reference,
1955 * and at least one from the outside world
1956 * (we checked that when we last locked
1957 * copy_object).
1958 */
1959 copy_object->ref_count--;
1960 assert(copy_object->ref_count > 0);
1961 VM_OBJ_RES_DECR(copy_object);
1962 vm_object_unlock(copy_object);
1963
1964 break;
1965 }
1966
1967 *result_page = m;
1968 *top_page = first_m;
1969
1970 XPR(XPR_VM_FAULT,
1971 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1972 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1973 /*
1974 * If the page can be written, assume that it will be.
1975 * [Earlier, we restricted the permission to allow write
1976 * access only if the fault so required, so we don't
1977 * mark read-only data as dirty.]
1978 */
1979
1980
1981 if(m != VM_PAGE_NULL) {
1982 #if !VM_FAULT_STATIC_CONFIG
1983 if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE))
1984 m->dirty = TRUE;
1985 #endif
1986 if (vm_page_deactivate_behind)
1987 vm_fault_deactivate_behind(object, offset, behavior);
1988 } else {
1989 vm_object_unlock(object);
1990 }
1991 thread_interrupt_level(interruptible_state);
1992
1993 #if TRACEFAULTPAGE
1994 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1995 #endif
1996 return(VM_FAULT_SUCCESS);
1997
1998 #if 0
1999 block_and_backoff:
2000 vm_fault_cleanup(object, first_m);
2001
2002 counter(c_vm_fault_page_block_backoff_kernel++);
2003 thread_block(THREAD_CONTINUE_NULL);
2004 #endif
2005
2006 backoff:
2007 thread_interrupt_level(interruptible_state);
2008 if (wait_result == THREAD_INTERRUPTED)
2009 return VM_FAULT_INTERRUPTED;
2010 return VM_FAULT_RETRY;
2011
2012 #undef RELEASE_PAGE
2013 }
2014
2015 /*
2016 * Routine: vm_fault_tws_insert
2017 * Purpose:
2018 * Add fault information to the task working set.
2019 * Implementation:
2020 * We always insert the base object/offset pair
2021 * rather than the actual object/offset.
2022 * Assumptions:
2023 * Map and real_map locked.
2024 * Object locked and referenced.
2025 * Returns:
2026 * TRUE if startup file should be written.
2027 * With object locked and still referenced.
2028 * But we may drop the object lock temporarily.
2029 */
2030 static boolean_t
2031 vm_fault_tws_insert(
2032 vm_map_t map,
2033 vm_map_t real_map,
2034 vm_map_offset_t vaddr,
2035 vm_object_t object,
2036 vm_object_offset_t offset)
2037 {
2038 tws_hash_line_t line;
2039 task_t task;
2040 kern_return_t kr;
2041 boolean_t result = FALSE;
2042
2043 /* Avoid possible map lock deadlock issues */
2044 if (map == kernel_map || map == kalloc_map ||
2045 real_map == kernel_map || real_map == kalloc_map)
2046 return result;
2047
2048 task = current_task();
2049 if (task->dynamic_working_set != 0) {
2050 vm_object_t base_object;
2051 vm_object_t base_shadow;
2052 vm_object_offset_t base_offset;
2053 base_object = object;
2054 base_offset = offset;
2055 while ((base_shadow = base_object->shadow)) {
2056 vm_object_lock(base_shadow);
2057 vm_object_unlock(base_object);
2058 base_offset +=
2059 base_object->shadow_offset;
2060 base_object = base_shadow;
2061 }
2062 kr = tws_lookup(
2063 task->dynamic_working_set,
2064 base_offset, base_object,
2065 &line);
2066 if (kr == KERN_OPERATION_TIMED_OUT){
2067 result = TRUE;
2068 if (base_object != object) {
2069 vm_object_unlock(base_object);
2070 vm_object_lock(object);
2071 }
2072 } else if (kr != KERN_SUCCESS) {
2073 if(base_object != object)
2074 vm_object_reference_locked(base_object);
2075 kr = tws_insert(
2076 task->dynamic_working_set,
2077 base_offset, base_object,
2078 vaddr, real_map);
2079 if(base_object != object) {
2080 vm_object_unlock(base_object);
2081 vm_object_deallocate(base_object);
2082 }
2083 if(kr == KERN_NO_SPACE) {
2084 if (base_object == object)
2085 vm_object_unlock(object);
2086 tws_expand_working_set(
2087 task->dynamic_working_set,
2088 TWS_HASH_LINE_COUNT,
2089 FALSE);
2090 if (base_object == object)
2091 vm_object_lock(object);
2092 } else if(kr == KERN_OPERATION_TIMED_OUT) {
2093 result = TRUE;
2094 }
2095 if(base_object != object)
2096 vm_object_lock(object);
2097 } else if (base_object != object) {
2098 vm_object_unlock(base_object);
2099 vm_object_lock(object);
2100 }
2101 }
2102 return result;
2103 }
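/*
 * Illustrative sketch (not part of the build): how a fault path is
 * expected to use vm_fault_tws_insert().  The caller holds the map,
 * real_map and object locks, passes the faulting address and the
 * object/offset it resolved to, and remembers the result so that
 * tws_send_startup_info() can be called once all locks are dropped.
 * The routine and variable names below are hypothetical.
 */
#if 0
static void
example_tws_usage(
	vm_map_t		map,
	vm_map_t		real_map,
	vm_map_offset_t		vaddr,
	vm_object_t		object,		/* locked by caller */
	vm_object_offset_t	offset)
{
	int	write_startup_file;

	/* record the fault in the task working set */
	write_startup_file =
		vm_fault_tws_insert(map, real_map, vaddr, object, offset);

	/* ... drop the object and map locks here ... */

	/* flush the startup file only after everything is unlocked */
	if (write_startup_file)
		tws_send_startup_info(current_task());
}
#endif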
2104
2105 /*
2106 * Routine: vm_fault
2107 * Purpose:
2108 * Handle page faults, including pseudo-faults
2109 * used to change the wiring status of pages.
2110 * Returns:
2111 * Explicit continuations have been removed.
2112 * Implementation:
2113 * vm_fault and vm_fault_page save mucho state
2114 * in the moral equivalent of a closure. The state
2115 * structure is allocated when first entering vm_fault
2116 * and deallocated when leaving vm_fault.
2117 */
2118
2119 extern int _map_enter_debug;
2120
2121 kern_return_t
2122 vm_fault(
2123 vm_map_t map,
2124 vm_map_offset_t vaddr,
2125 vm_prot_t fault_type,
2126 boolean_t change_wiring,
2127 int interruptible,
2128 pmap_t caller_pmap,
2129 vm_map_offset_t caller_pmap_addr)
2130 {
2131 vm_map_version_t version; /* Map version for verification */
2132 boolean_t wired; /* Should mapping be wired down? */
2133 vm_object_t object; /* Top-level object */
2134 vm_object_offset_t offset; /* Top-level offset */
2135 vm_prot_t prot; /* Protection for mapping */
2136 vm_behavior_t behavior; /* Expected paging behavior */
2137 vm_map_offset_t lo_offset, hi_offset;
2138 vm_object_t old_copy_object; /* Saved copy object */
2139 vm_page_t result_page; /* Result of vm_fault_page */
2140 vm_page_t top_page; /* Placeholder page */
2141 kern_return_t kr;
2142
2143 register
2144 vm_page_t m; /* Fast access to result_page */
2145 kern_return_t error_code = 0; /* page error reasons */
2146 register
2147 vm_object_t cur_object;
2148 register
2149 vm_object_offset_t cur_offset;
2150 vm_page_t cur_m;
2151 vm_object_t new_object;
2152 int type_of_fault;
2153 vm_map_t real_map = map;
2154 vm_map_t original_map = map;
2155 pmap_t pmap = NULL;
2156 boolean_t interruptible_state;
2157 unsigned int cache_attr;
2158 int write_startup_file = 0;
2159 boolean_t need_activation;
2160 vm_prot_t full_fault_type;
2161
2162 if (get_preemption_level() != 0)
2163 return (KERN_FAILURE);
2164
2165 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
2166 vaddr,
2167 0,
2168 0,
2169 0,
2170 0);
2171
2172 /* at present we do not fully check for execute permission */
2173 /* we generally treat it as read except in certain device */
2174 /* memory settings */
2175 full_fault_type = fault_type;
2176 if(fault_type & VM_PROT_EXECUTE) {
2177 fault_type &= ~VM_PROT_EXECUTE;
2178 fault_type |= VM_PROT_READ;
2179 }
2180
2181 interruptible_state = thread_interrupt_level(interruptible);
2182
2183 /*
2184 * assume we will hit a page in the cache;
2185 * otherwise, explicitly override with
2186 * the real fault type once we determine it
2187 */
2188 type_of_fault = DBG_CACHE_HIT_FAULT;
2189
2190 VM_STAT(faults++);
2191 current_task()->faults++;
2192
2193 RetryFault: ;
2194
2195 /*
2196 * Find the backing store object and offset into
2197 * it to begin the search.
2198 */
2199 map = original_map;
2200 vm_map_lock_read(map);
2201 kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
2202 &object, &offset,
2203 &prot, &wired,
2204 &behavior, &lo_offset, &hi_offset, &real_map);
2205
2206 //if (_map_enter_debug)printf("vm_map_lookup_locked(map=0x%x, addr=0x%llx, prot=%d wired=%d) = %d\n", map, vaddr, prot, wired, kr);
2207
2208 pmap = real_map->pmap;
2209
2210 if (kr != KERN_SUCCESS) {
2211 vm_map_unlock_read(map);
2212 goto done;
2213 }
2214
2215 /*
2216 * If the page is wired, we must fault for the current protection
2217 * value, to avoid further faults.
2218 */
2219
2220 if (wired)
2221 fault_type = prot | VM_PROT_WRITE;
2222
2223 #if VM_FAULT_CLASSIFY
2224 /*
2225 * Temporary data gathering code
2226 */
2227 vm_fault_classify(object, offset, fault_type);
2228 #endif
2229 /*
2230 * Fast fault code. The basic idea is to do as much as
2231 * possible while holding the map lock and object locks.
2232 * Busy pages are not used until the object lock has to
2233 * be dropped to do something (copy, zero fill, pmap enter).
2234 * Similarly, paging references aren't acquired until that
2235 * point, and object references aren't used.
2236 *
2237 * If we can figure out what to do
2238 * (zero fill, copy on write, pmap enter) while holding
2239 * the locks, then it gets done. Otherwise, we give up,
2240 * and use the original fault path (which doesn't hold
2241 * the map lock, and relies on busy pages).
2242 * The give up cases include:
2243 * - Have to talk to pager.
2244 * - Page is busy, absent or in error.
2245 * - Pager has locked out desired access.
2246 * - Fault needs to be restarted.
2247 * - Have to push page into copy object.
2248 *
2249 * The code is an infinite loop that moves one level down
2250 * the shadow chain each time. cur_object and cur_offset
2251 * refer to the current object being examined. object and offset
2252 * are the original object from the map. The loop is at the
2253 * top level if and only if object and cur_object are the same.
2254 *
2255 * Invariants: Map lock is held throughout. Lock is held on
2256 * original object and cur_object (if different) when
2257 * continuing or exiting loop.
2258 *
2259 */
2260
2261
2262 /*
2263 * If this page is to be inserted in a copy delay object
2264 * for writing, and if the object has a copy, then the
2265 * copy delay strategy is implemented in the slow fault path.
2266 */
2267 if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2268 object->copy == VM_OBJECT_NULL ||
2269 (fault_type & VM_PROT_WRITE) == 0) {
2270 cur_object = object;
2271 cur_offset = offset;
2272
2273 while (TRUE) {
2274 m = vm_page_lookup(cur_object, cur_offset);
2275 if (m != VM_PAGE_NULL) {
2276 if (m->busy) {
2277 wait_result_t result;
2278
2279 if (object != cur_object)
2280 vm_object_unlock(object);
2281
2282 vm_map_unlock_read(map);
2283 if (real_map != map)
2284 vm_map_unlock(real_map);
2285
2286 #if !VM_FAULT_STATIC_CONFIG
2287 if (!vm_fault_interruptible)
2288 interruptible = THREAD_UNINT;
2289 #endif
2290 result = PAGE_ASSERT_WAIT(m, interruptible);
2291
2292 vm_object_unlock(cur_object);
2293
2294 if (result == THREAD_WAITING) {
2295 result = thread_block(THREAD_CONTINUE_NULL);
2296
2297 counter(c_vm_fault_page_block_busy_kernel++);
2298 }
2299 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2300 goto RetryFault;
2301
2302 kr = KERN_ABORTED;
2303 goto done;
2304 }
2305 if (m->unusual && (m->error || m->restart || m->private
2306 || m->absent || (fault_type & m->page_lock))) {
2307
2308 /*
2309 * Unusual case. Give up.
2310 */
2311 break;
2312 }
2313
2314 if (m->encrypted) {
2315 /*
2316 * ENCRYPTED SWAP:
2317 * We've soft-faulted (because it's not in the page
2318 * table) on an encrypted page.
2319 * Keep the page "busy" so that no one messes with
2320 * it during the decryption.
2321 * Release the extra locks we're holding, keep only
2322 * the page's VM object lock.
2323 */
2324 m->busy = TRUE;
2325 if (object != cur_object) {
2326 vm_object_unlock(object);
2327 }
2328 vm_map_unlock_read(map);
2329 if (real_map != map)
2330 vm_map_unlock(real_map);
2331
2332 vm_page_decrypt(m, 0);
2333
2334 assert(m->busy);
2335 PAGE_WAKEUP_DONE(m);
2336 vm_object_unlock(m->object);
2337
2338 /*
2339 * Retry from the top, in case anything
2340 * changed while we were decrypting...
2341 */
2342 goto RetryFault;
2343 }
2344 ASSERT_PAGE_DECRYPTED(m);
2345
2346 /*
2347 * Two cases of map in faults:
2348 * - At top level w/o copy object.
2349 * - Read fault anywhere.
2350 * --> must disallow write.
2351 */
2352
2353 if (object == cur_object &&
2354 object->copy == VM_OBJECT_NULL)
2355 goto FastMapInFault;
2356
2357 if ((fault_type & VM_PROT_WRITE) == 0) {
2358 boolean_t sequential;
2359
2360 prot &= ~VM_PROT_WRITE;
2361
2362 /*
2363 * Set up to map the page ...
2364 * mark the page busy, drop
2365 * locks and take a paging reference
2366 * on the object with the page.
2367 */
2368
2369 if (object != cur_object) {
2370 vm_object_unlock(object);
2371 object = cur_object;
2372 }
2373 FastMapInFault:
2374 m->busy = TRUE;
2375
2376 vm_object_paging_begin(object);
2377
2378 FastPmapEnter:
2379 /*
2380 * Check a couple of global reasons to
2381 * be conservative about write access.
2382 * Then do the pmap_enter.
2383 */
2384 #if !VM_FAULT_STATIC_CONFIG
2385 if (vm_fault_dirty_handling
2386 #if MACH_KDB
2387 || db_watchpoint_list
2388 #endif
2389 && (fault_type & VM_PROT_WRITE) == 0)
2390 prot &= ~VM_PROT_WRITE;
2391 #else /* STATIC_CONFIG */
2392 #if MACH_KDB
2393 if (db_watchpoint_list
2394 && (fault_type & VM_PROT_WRITE) == 0)
2395 prot &= ~VM_PROT_WRITE;
2396 #endif /* MACH_KDB */
2397 #endif /* STATIC_CONFIG */
2398 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2399
2400 sequential = FALSE;
2401 need_activation = FALSE;
2402
2403 if (m->no_isync == TRUE) {
2404 m->no_isync = FALSE;
2405 pmap_sync_page_data_phys(m->phys_page);
2406
2407 if ((type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2408 /*
2409 * found it in the cache, but this
2410 * is the first fault-in of the page (no_isync == TRUE)
2411 * so it must have come in as part of
2412 * a cluster... account 1 pagein against it
2413 */
2414 VM_STAT(pageins++);
2415 current_task()->pageins++;
2416 type_of_fault = DBG_PAGEIN_FAULT;
2417 sequential = TRUE;
2418 }
2419 if (m->clustered)
2420 need_activation = TRUE;
2421
2422 } else if (cache_attr != VM_WIMG_DEFAULT) {
2423 pmap_sync_page_attributes_phys(m->phys_page);
2424 }
2425
2426 if(caller_pmap) {
2427 PMAP_ENTER(caller_pmap,
2428 caller_pmap_addr, m,
2429 prot, cache_attr, wired);
2430 } else {
2431 PMAP_ENTER(pmap, vaddr, m,
2432 prot, cache_attr, wired);
2433 }
2434
2435 /*
2436 * Hold queues lock to manipulate
2437 * the page queues. The change-wiring
2438 * case is obvious. In the soft-ref-bits
2439 * case, activate the page only if it fell
2440 * off the paging queues; otherwise just
2441 * activate it if it's inactive.
2442 *
2443 * NOTE: original vm_fault code will
2444 * move active page to back of active
2445 * queue. This code doesn't.
2446 */
2447 vm_page_lock_queues();
2448
2449 if (m->clustered) {
2450 vm_pagein_cluster_used++;
2451 m->clustered = FALSE;
2452 }
2453 m->reference = TRUE;
2454
2455 if (change_wiring) {
2456 if (wired)
2457 vm_page_wire(m);
2458 else
2459 vm_page_unwire(m);
2460 }
2461 #if VM_FAULT_STATIC_CONFIG
2462 else {
2463 if ((!m->active && !m->inactive) || ((need_activation == TRUE) && !m->active))
2464 vm_page_activate(m);
2465 }
2466 #else
2467 else if (software_reference_bits) {
2468 if (!m->active && !m->inactive)
2469 vm_page_activate(m);
2470 }
2471 else if (!m->active) {
2472 vm_page_activate(m);
2473 }
2474 #endif
2475 vm_page_unlock_queues();
2476
2477 /*
2478 * That's it, clean up and return.
2479 */
2480 PAGE_WAKEUP_DONE(m);
2481
2482 sequential = (sequential && vm_page_deactivate_behind) ?
2483 vm_fault_deactivate_behind(object, cur_offset, behavior) :
2484 FALSE;
2485
2486 /*
2487 * Add non-sequential pages to the working set.
2488 * The sequential pages will be brought in through
2489 * normal clustering behavior.
2490 */
2491 if (!sequential && !object->private) {
2492 write_startup_file =
2493 vm_fault_tws_insert(map, real_map, vaddr,
2494 object, cur_offset);
2495 }
2496
2497 vm_object_paging_end(object);
2498 vm_object_unlock(object);
2499
2500 vm_map_unlock_read(map);
2501 if(real_map != map)
2502 vm_map_unlock(real_map);
2503
2504 if(write_startup_file)
2505 tws_send_startup_info(current_task());
2506
2507 thread_interrupt_level(interruptible_state);
2508
2509
2510 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2511 vaddr,
2512 type_of_fault & 0xff,
2513 KERN_SUCCESS,
2514 type_of_fault >> 8,
2515 0);
2516
2517 return KERN_SUCCESS;
2518 }
2519
2520 /*
2521 * Copy on write fault. If objects match, then
2522 * object->copy must not be NULL (else control
2523 * would be in previous code block), and we
2524 * have a potential push into the copy object
2525 * with which we won't cope here.
2526 */
2527
2528 if (cur_object == object)
2529 break;
2530 /*
2531 * This is now a shadow based copy on write
2532 * fault -- it requires a copy up the shadow
2533 * chain.
2534 *
2535 * Allocate a page in the original top level
2536 * object. Give up if allocate fails. Also
2537 * need to remember current page, as it's the
2538 * source of the copy.
2539 */
2540 cur_m = m;
2541 m = vm_page_grab();
2542 if (m == VM_PAGE_NULL) {
2543 break;
2544 }
2545 /*
2546 * Now do the copy. Mark the source busy
2547 * and take out paging references on both
2548 * objects.
2549 *
2550 * NOTE: This code holds the map lock across
2551 * the page copy.
2552 */
2553
2554 cur_m->busy = TRUE;
2555 vm_page_copy(cur_m, m);
2556 vm_page_insert(m, object, offset);
2557
2558 vm_object_paging_begin(cur_object);
2559 vm_object_paging_begin(object);
2560
2561 type_of_fault = DBG_COW_FAULT;
2562 VM_STAT(cow_faults++);
2563 current_task()->cow_faults++;
2564
2565 /*
2566 * Now cope with the source page and object
2567 * If the top object has a ref count of 1
2568 * then no other map can access it, and hence
2569 * it's not necessary to do the pmap_disconnect.
2570 */
2571
2572 vm_page_lock_queues();
2573 vm_page_deactivate(cur_m);
2574 m->dirty = TRUE;
2575 pmap_disconnect(cur_m->phys_page);
2576 vm_page_unlock_queues();
2577
2578 PAGE_WAKEUP_DONE(cur_m);
2579 vm_object_paging_end(cur_object);
2580 vm_object_unlock(cur_object);
2581
2582 /*
2583 * Slight hack to call vm_object_collapse()
2584 * and then reuse the common map-in code.
2585 * Note that the object lock was taken above.
2586 */
2587
2588 vm_object_paging_end(object);
2589 vm_object_collapse(object, offset);
2590 vm_object_paging_begin(object);
2591
2592 goto FastPmapEnter;
2593 }
2594 else {
2595
2596 /*
2597 * No page at cur_object, cur_offset
2598 */
2599
2600 if (cur_object->pager_created) {
2601
2602 /*
2603 * Have to talk to the pager. Give up.
2604 */
2605 break;
2606 }
2607
2608
2609 if (cur_object->shadow == VM_OBJECT_NULL) {
2610
2611 if (cur_object->shadow_severed) {
2612 vm_object_paging_end(object);
2613 vm_object_unlock(object);
2614 vm_map_unlock_read(map);
2615 if(real_map != map)
2616 vm_map_unlock(real_map);
2617
2618 if(write_startup_file)
2619 tws_send_startup_info(
2620 current_task());
2621
2622 thread_interrupt_level(interruptible_state);
2623
2624 return KERN_MEMORY_ERROR;
2625 }
2626
2627 /*
2628 * Zero fill fault. Page gets
2629 * filled in top object. Insert
2630 * page, then drop any lower lock.
2631 * Give up if no page.
2632 */
2633 if (VM_PAGE_THROTTLED()) {
2634 break;
2635 }
2636
2637 /*
2638 * Are we protecting the system from
2639 * backing store exhaustion? If so,
2640 * sleep unless we are privileged.
2641 */
2642 if(vm_backing_store_low) {
2643 if(!(current_task()->priv_flags
2644 & VM_BACKING_STORE_PRIV))
2645 break;
2646 }
2647 m = vm_page_alloc(object, offset);
2648 if (m == VM_PAGE_NULL) {
2649 break;
2650 }
2651 /*
2652 * This is a zero-fill or initial fill
2653 * page fault. As such, we consider it
2654 * undefined with respect to instruction
2655 * execution; i.e., it is the responsibility
2656 * of higher layers to call for an instruction
2657 * sync after changing the contents and before
2658 * sending a program into this area. We
2659 * choose this approach for performance.
2660 */
2661
2662 m->no_isync = FALSE;
2663
2664 if (cur_object != object)
2665 vm_object_unlock(cur_object);
2666
2667 vm_object_paging_begin(object);
2668 vm_object_unlock(object);
2669
2670 /*
2671 * Now zero fill page and map it.
2672 * the page is probably going to
2673 * be written soon, so don't bother
2674 * to clear the modified bit
2675 *
2676 * NOTE: This code holds the map
2677 * lock across the zero fill.
2678 */
2679
2680 if (!map->no_zero_fill) {
2681 vm_page_zero_fill(m);
2682 type_of_fault = DBG_ZERO_FILL_FAULT;
2683 VM_STAT(zero_fill_count++);
2684 }
2685 vm_page_lock_queues();
2686 VM_PAGE_QUEUES_REMOVE(m);
2687
2688 m->page_ticket = vm_page_ticket;
2689 assert(!m->laundry);
2690 assert(m->object != kernel_object);
2691 assert(m->pageq.next == NULL &&
2692 m->pageq.prev == NULL);
2693 if(m->object->size > 0x200000) {
2694 m->zero_fill = TRUE;
2695 /* depends on the queues lock */
2696 vm_zf_count += 1;
2697 queue_enter(&vm_page_queue_zf,
2698 m, vm_page_t, pageq);
2699 } else {
2700 queue_enter(
2701 &vm_page_queue_inactive,
2702 m, vm_page_t, pageq);
2703 }
2704 vm_page_ticket_roll++;
2705 if(vm_page_ticket_roll ==
2706 VM_PAGE_TICKETS_IN_ROLL) {
2707 vm_page_ticket_roll = 0;
2708 if(vm_page_ticket ==
2709 VM_PAGE_TICKET_ROLL_IDS)
2710 vm_page_ticket= 0;
2711 else
2712 vm_page_ticket++;
2713 }
2714
2715 m->inactive = TRUE;
2716 vm_page_inactive_count++;
2717 vm_page_unlock_queues();
2718 vm_object_lock(object);
2719
2720 goto FastPmapEnter;
2721 }
2722
2723 /*
2724 * On to the next level
2725 */
2726
2727 cur_offset += cur_object->shadow_offset;
2728 new_object = cur_object->shadow;
2729 vm_object_lock(new_object);
2730 if (cur_object != object)
2731 vm_object_unlock(cur_object);
2732 cur_object = new_object;
2733
2734 continue;
2735 }
2736 }
2737
2738 /*
2739 * Cleanup from fast fault failure. Drop any object
2740 * lock other than original and drop map lock.
2741 */
2742
2743 if (object != cur_object)
2744 vm_object_unlock(cur_object);
2745 }
2746 vm_map_unlock_read(map);
2747
2748 if(real_map != map)
2749 vm_map_unlock(real_map);
2750
2751 /*
2752 * Make a reference to this object to
2753 * prevent its disposal while we are messing with
2754 * it. Once we have the reference, the map is free
2755 * to be diddled. Since objects reference their
2756 * shadows (and copies), they will stay around as well.
2757 */
2758
2759 assert(object->ref_count > 0);
2760 object->ref_count++;
2761 vm_object_res_reference(object);
2762 vm_object_paging_begin(object);
2763
2764 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2765
2766 if (!object->private) {
2767 write_startup_file =
2768 vm_fault_tws_insert(map, real_map, vaddr, object, offset);
2769 }
2770
2771 kr = vm_fault_page(object, offset, fault_type,
2772 (change_wiring && !wired),
2773 interruptible,
2774 lo_offset, hi_offset, behavior,
2775 &prot, &result_page, &top_page,
2776 &type_of_fault,
2777 &error_code, map->no_zero_fill, FALSE, map, vaddr);
2778
2779 /*
2780 * If we didn't succeed, lose the object reference immediately.
2781 */
2782
2783 if (kr != VM_FAULT_SUCCESS)
2784 vm_object_deallocate(object);
2785
2786 /*
2787 * See why we failed, and take corrective action.
2788 */
2789
2790 switch (kr) {
2791 case VM_FAULT_SUCCESS:
2792 break;
2793 case VM_FAULT_MEMORY_SHORTAGE:
2794 if (vm_page_wait((change_wiring) ?
2795 THREAD_UNINT :
2796 THREAD_ABORTSAFE))
2797 goto RetryFault;
2798 /* fall thru */
2799 case VM_FAULT_INTERRUPTED:
2800 kr = KERN_ABORTED;
2801 goto done;
2802 case VM_FAULT_RETRY:
2803 goto RetryFault;
2804 case VM_FAULT_FICTITIOUS_SHORTAGE:
2805 vm_page_more_fictitious();
2806 goto RetryFault;
2807 case VM_FAULT_MEMORY_ERROR:
2808 if (error_code)
2809 kr = error_code;
2810 else
2811 kr = KERN_MEMORY_ERROR;
2812 goto done;
2813 }
2814
2815 m = result_page;
2816
2817 if(m != VM_PAGE_NULL) {
2818 assert((change_wiring && !wired) ?
2819 (top_page == VM_PAGE_NULL) :
2820 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2821 }
2822
2823 /*
2824 * How to clean up the result of vm_fault_page. This
2825 * happens whether the mapping is entered or not.
2826 */
2827
2828 #define UNLOCK_AND_DEALLOCATE \
2829 MACRO_BEGIN \
2830 vm_fault_cleanup(m->object, top_page); \
2831 vm_object_deallocate(object); \
2832 MACRO_END
2833
2834 /*
2835 * What to do with the resulting page from vm_fault_page
2836 * if it doesn't get entered into the physical map:
2837 */
2838
2839 #define RELEASE_PAGE(m) \
2840 MACRO_BEGIN \
2841 PAGE_WAKEUP_DONE(m); \
2842 vm_page_lock_queues(); \
2843 if (!m->active && !m->inactive) \
2844 vm_page_activate(m); \
2845 vm_page_unlock_queues(); \
2846 MACRO_END
2847
2848 /*
2849 * We must verify that the maps have not changed
2850 * since our last lookup.
2851 */
2852
2853 if(m != VM_PAGE_NULL) {
2854 old_copy_object = m->object->copy;
2855 vm_object_unlock(m->object);
2856 } else {
2857 old_copy_object = VM_OBJECT_NULL;
2858 }
2859 if ((map != original_map) || !vm_map_verify(map, &version)) {
2860 vm_object_t retry_object;
2861 vm_object_offset_t retry_offset;
2862 vm_prot_t retry_prot;
2863
2864 /*
2865 * To avoid trying to write_lock the map while another
2866 * thread has it read_locked (in vm_map_pageable), we
2867 * do not try for write permission. If the page is
2868 * still writable, we will get write permission. If it
2869 * is not, or has been marked needs_copy, we enter the
2870 * mapping without write permission, and will merely
2871 * take another fault.
2872 */
2873 map = original_map;
2874 vm_map_lock_read(map);
2875 kr = vm_map_lookup_locked(&map, vaddr,
2876 fault_type & ~VM_PROT_WRITE, &version,
2877 &retry_object, &retry_offset, &retry_prot,
2878 &wired, &behavior, &lo_offset, &hi_offset,
2879 &real_map);
2880 pmap = real_map->pmap;
2881
2882 if (kr != KERN_SUCCESS) {
2883 vm_map_unlock_read(map);
2884 if(m != VM_PAGE_NULL) {
2885 vm_object_lock(m->object);
2886 RELEASE_PAGE(m);
2887 UNLOCK_AND_DEALLOCATE;
2888 } else {
2889 vm_object_deallocate(object);
2890 }
2891 goto done;
2892 }
2893
2894 vm_object_unlock(retry_object);
2895 if(m != VM_PAGE_NULL) {
2896 vm_object_lock(m->object);
2897 } else {
2898 vm_object_lock(object);
2899 }
2900
2901 if ((retry_object != object) ||
2902 (retry_offset != offset)) {
2903 vm_map_unlock_read(map);
2904 if(real_map != map)
2905 vm_map_unlock(real_map);
2906 if(m != VM_PAGE_NULL) {
2907 RELEASE_PAGE(m);
2908 UNLOCK_AND_DEALLOCATE;
2909 } else {
2910 vm_object_deallocate(object);
2911 }
2912 goto RetryFault;
2913 }
2914
2915 /*
2916 * Check whether the protection has changed or the object
2917 * has been copied while we left the map unlocked.
2918 */
2919 prot &= retry_prot;
2920 if(m != VM_PAGE_NULL) {
2921 vm_object_unlock(m->object);
2922 } else {
2923 vm_object_unlock(object);
2924 }
2925 }
2926 if(m != VM_PAGE_NULL) {
2927 vm_object_lock(m->object);
2928 } else {
2929 vm_object_lock(object);
2930 }
2931
2932 /*
2933 * If the copy object changed while the top-level object
2934 * was unlocked, then we must take away write permission.
2935 */
2936
2937 if(m != VM_PAGE_NULL) {
2938 if (m->object->copy != old_copy_object)
2939 prot &= ~VM_PROT_WRITE;
2940 }
2941
2942 /*
2943 * If we want to wire down this page, but no longer have
2944 * adequate permissions, we must start all over.
2945 */
2946
2947 if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2948 vm_map_verify_done(map, &version);
2949 if(real_map != map)
2950 vm_map_unlock(real_map);
2951 if(m != VM_PAGE_NULL) {
2952 RELEASE_PAGE(m);
2953 UNLOCK_AND_DEALLOCATE;
2954 } else {
2955 vm_object_deallocate(object);
2956 }
2957 goto RetryFault;
2958 }
2959
2960 /*
2961 * Put this page into the physical map.
2962 * We had to do the unlock above because pmap_enter
2963 * may cause other faults. The page may be on
2964 * the pageout queues. If the pageout daemon comes
2965 * across the page, it will remove it from the queues.
2966 */
2967 need_activation = FALSE;
2968
2969 if (m != VM_PAGE_NULL) {
2970 if (m->no_isync == TRUE) {
2971 pmap_sync_page_data_phys(m->phys_page);
2972
2973 if ((type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2974 /*
2975 * found it in the cache, but this
2976 * is the first fault-in of the page (no_isync == TRUE)
2977 * so it must have come in as part of
2978 * a cluster... account 1 pagein against it
2979 */
2980 VM_STAT(pageins++);
2981 current_task()->pageins++;
2982
2983 type_of_fault = DBG_PAGEIN_FAULT;
2984 }
2985 if (m->clustered) {
2986 need_activation = TRUE;
2987 }
2988 m->no_isync = FALSE;
2989 }
2990 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2991
2992 if(caller_pmap) {
2993 PMAP_ENTER(caller_pmap,
2994 caller_pmap_addr, m,
2995 prot, cache_attr, wired);
2996 } else {
2997 PMAP_ENTER(pmap, vaddr, m,
2998 prot, cache_attr, wired);
2999 }
3000
3001 /*
3002 * Add working set information for private objects here.
3003 */
3004 if (m->object->private) {
3005 write_startup_file =
3006 vm_fault_tws_insert(map, real_map, vaddr,
3007 m->object, m->offset);
3008 }
3009 } else {
3010
3011 #ifndef i386
3012 vm_map_entry_t entry;
3013 vm_map_offset_t laddr;
3014 vm_map_offset_t ldelta, hdelta;
3015
3016 /*
3017 * do a pmap block mapping from the physical address
3018 * in the object
3019 */
3020
3021 /* While we do not worry about execution protection in */
3022 /* general, certain pages may have instruction execution */
3023 /* disallowed. We will check here, and if not allowed */
3024 /* to execute, we return with a protection failure. */
3025
3026 if((full_fault_type & VM_PROT_EXECUTE) &&
3027 (!pmap_eligible_for_execute((ppnum_t)
3028 (object->shadow_offset >> 12)))) {
3029
3030 vm_map_verify_done(map, &version);
3031 if(real_map != map)
3032 vm_map_unlock(real_map);
3033 vm_fault_cleanup(object, top_page);
3034 vm_object_deallocate(object);
3035 kr = KERN_PROTECTION_FAILURE;
3036 goto done;
3037 }
3038
3039 if(real_map != map) {
3040 vm_map_unlock(real_map);
3041 }
3042 if (original_map != map) {
3043 vm_map_unlock_read(map);
3044 vm_map_lock_read(original_map);
3045 map = original_map;
3046 }
3047 real_map = map;
3048
3049 laddr = vaddr;
3050 hdelta = 0xFFFFF000;
3051 ldelta = 0xFFFFF000;
3052
3053
3054 while(vm_map_lookup_entry(map, laddr, &entry)) {
3055 if(ldelta > (laddr - entry->vme_start))
3056 ldelta = laddr - entry->vme_start;
3057 if(hdelta > (entry->vme_end - laddr))
3058 hdelta = entry->vme_end - laddr;
3059 if(entry->is_sub_map) {
3060
3061 laddr = (laddr - entry->vme_start)
3062 + entry->offset;
3063 vm_map_lock_read(entry->object.sub_map);
3064 if(map != real_map)
3065 vm_map_unlock_read(map);
3066 if(entry->use_pmap) {
3067 vm_map_unlock_read(real_map);
3068 real_map = entry->object.sub_map;
3069 }
3070 map = entry->object.sub_map;
3071
3072 } else {
3073 break;
3074 }
3075 }
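/*
 * Illustrative example of the deltas computed above (values are
 * hypothetical): for a single entry spanning [0x10000, 0x18000)
 * and laddr 0x13000, ldelta = 0x3000 and hdelta = 0x5000, so the
 * block mapping set up below covers (ldelta + hdelta) >> 12 = 8
 * pages, starting ldelta bytes below the faulting address.  When
 * submaps are traversed, each delta is the minimum over all of
 * the entries walked.
 */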
3076
3077 if(vm_map_lookup_entry(map, laddr, &entry) &&
3078 (entry->object.vm_object != NULL) &&
3079 (entry->object.vm_object == object)) {
3080
3081
3082 if(caller_pmap) {
3083 /* Set up a block mapped area */
3084 pmap_map_block(caller_pmap,
3085 (addr64_t)(caller_pmap_addr - ldelta),
3086 (((vm_map_offset_t)
3087 (entry->object.vm_object->shadow_offset))
3088 + entry->offset +
3089 (laddr - entry->vme_start)
3090 - ldelta) >> 12,
3091 ((ldelta + hdelta) >> 12), prot,
3092 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3093 } else {
3094 /* Set up a block mapped area */
3095 pmap_map_block(real_map->pmap,
3096 (addr64_t)(vaddr - ldelta),
3097 (((vm_map_offset_t)
3098 (entry->object.vm_object->shadow_offset))
3099 + entry->offset +
3100 (laddr - entry->vme_start) - ldelta) >> 12,
3101 ((ldelta + hdelta) >> 12), prot,
3102 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3103 }
3104 }
3105 #else
3106 #ifdef notyet
3107 if(caller_pmap) {
3108 pmap_enter(caller_pmap, caller_pmap_addr,
3109 object->shadow_offset>>12, prot, 0, TRUE);
3110 } else {
3111 pmap_enter(pmap, vaddr,
3112 object->shadow_offset>>12, prot, 0, TRUE);
3113 }
3114 /* Map it in */
3115 #endif
3116 #endif
3117
3118 }
3119
3120 /*
3121 * If the page is not wired down and isn't already
3122 * on a pageout queue, then put it where the
3123 * pageout daemon can find it.
3124 */
3125 if(m != VM_PAGE_NULL) {
3126 vm_page_lock_queues();
3127
3128 if (m->clustered) {
3129 vm_pagein_cluster_used++;
3130 m->clustered = FALSE;
3131 }
3132 m->reference = TRUE;
3133
3134 if (change_wiring) {
3135 if (wired)
3136 vm_page_wire(m);
3137 else
3138 vm_page_unwire(m);
3139 }
3140 #if VM_FAULT_STATIC_CONFIG
3141 else {
3142 if ((!m->active && !m->inactive) || ((need_activation == TRUE) && !m->active))
3143 vm_page_activate(m);
3144 }
3145 #else
3146 else if (software_reference_bits) {
3147 if (!m->active && !m->inactive)
3148 vm_page_activate(m);
3149 m->reference = TRUE;
3150 } else {
3151 vm_page_activate(m);
3152 }
3153 #endif
3154 vm_page_unlock_queues();
3155 }
3156
3157 /*
3158 * Unlock everything, and return
3159 */
3160
3161 vm_map_verify_done(map, &version);
3162 if(real_map != map)
3163 vm_map_unlock(real_map);
3164 if(m != VM_PAGE_NULL) {
3165 PAGE_WAKEUP_DONE(m);
3166 UNLOCK_AND_DEALLOCATE;
3167 } else {
3168 vm_fault_cleanup(object, top_page);
3169 vm_object_deallocate(object);
3170 }
3171 kr = KERN_SUCCESS;
3172
3173 #undef UNLOCK_AND_DEALLOCATE
3174 #undef RELEASE_PAGE
3175
3176 done:
3177 if(write_startup_file)
3178 tws_send_startup_info(current_task());
3179
3180 thread_interrupt_level(interruptible_state);
3181
3182 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
3183 vaddr,
3184 type_of_fault & 0xff,
3185 kr,
3186 type_of_fault >> 8,
3187 0);
3188
3189 return(kr);
3190 }
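/*
 * Illustrative sketch (not part of the build): a minimal caller of
 * vm_fault(), roughly as a machine-dependent trap handler might issue
 * it.  The pmap is resolved from the map itself, so no caller pmap is
 * supplied, and the wiring state is not being changed.  The routine
 * name and arguments are hypothetical.
 */
#if 0
static kern_return_t
example_handle_user_fault(
	vm_map_t	map,		/* faulting task's map */
	vm_map_offset_t	fault_addr,	/* faulting virtual address */
	vm_prot_t	fault_type)	/* VM_PROT_READ and/or VM_PROT_WRITE */
{
	kern_return_t	kr;

	kr = vm_fault(map,
		      fault_addr,
		      fault_type,
		      FALSE,		/* not changing wiring */
		      THREAD_ABORTSAFE,	/* user faults may be aborted */
		      NULL,		/* no caller pmap ... */
		      0);		/* ... so no caller pmap address */

	/*
	 * KERN_SUCCESS means the page is now mapped; any other return
	 * would be turned into an exception by the trap handler.
	 */
	return kr;
}
#endif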
3191
3192 /*
3193 * vm_fault_wire:
3194 *
3195 * Wire down a range of virtual addresses in a map.
3196 */
3197 kern_return_t
3198 vm_fault_wire(
3199 vm_map_t map,
3200 vm_map_entry_t entry,
3201 pmap_t pmap,
3202 vm_map_offset_t pmap_addr)
3203 {
3204
3205 register vm_map_offset_t va;
3206 register vm_map_offset_t end_addr = entry->vme_end;
3207 register kern_return_t rc;
3208
3209 assert(entry->in_transition);
3210
3211 if ((entry->object.vm_object != NULL) &&
3212 !entry->is_sub_map &&
3213 entry->object.vm_object->phys_contiguous) {
3214 return KERN_SUCCESS;
3215 }
3216
3217 /*
3218 * Inform the physical mapping system that the
3219 * range of addresses may not fault, so that
3220 * page tables and such can be locked down as well.
3221 */
3222
3223 pmap_pageable(pmap, pmap_addr,
3224 pmap_addr + (end_addr - entry->vme_start), FALSE);
3225
3226 /*
3227 * We simulate a fault to get the page and enter it
3228 * in the physical map.
3229 */
3230
3231 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3232 if ((rc = vm_fault_wire_fast(
3233 map, va, entry, pmap,
3234 pmap_addr + (va - entry->vme_start)
3235 )) != KERN_SUCCESS) {
3236 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3237 (pmap == kernel_pmap) ?
3238 THREAD_UNINT : THREAD_ABORTSAFE,
3239 pmap, pmap_addr + (va - entry->vme_start));
3240 }
3241
3242 if (rc != KERN_SUCCESS) {
3243 struct vm_map_entry tmp_entry = *entry;
3244
3245 /* unwire wired pages */
3246 tmp_entry.vme_end = va;
3247 vm_fault_unwire(map,
3248 &tmp_entry, FALSE, pmap, pmap_addr);
3249
3250 return rc;
3251 }
3252 }
3253 return KERN_SUCCESS;
3254 }
3255
3256 /*
3257 * vm_fault_unwire:
3258 *
3259 * Unwire a range of virtual addresses in a map.
3260 */
3261 void
3262 vm_fault_unwire(
3263 vm_map_t map,
3264 vm_map_entry_t entry,
3265 boolean_t deallocate,
3266 pmap_t pmap,
3267 vm_map_offset_t pmap_addr)
3268 {
3269 register vm_map_offset_t va;
3270 register vm_map_offset_t end_addr = entry->vme_end;
3271 vm_object_t object;
3272
3273 object = (entry->is_sub_map)
3274 ? VM_OBJECT_NULL : entry->object.vm_object;
3275
3276 /*
3277 * Since the pages are wired down, we must be able to
3278 * get their mappings from the physical map system.
3279 */
3280
3281 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3282 pmap_change_wiring(pmap,
3283 pmap_addr + (va - entry->vme_start), FALSE);
3284
3285 if (object == VM_OBJECT_NULL) {
3286 (void) vm_fault(map, va, VM_PROT_NONE,
3287 TRUE, THREAD_UNINT, pmap, pmap_addr);
3288 } else if (object->phys_contiguous) {
3289 continue;
3290 } else {
3291 vm_prot_t prot;
3292 vm_page_t result_page;
3293 vm_page_t top_page;
3294 vm_object_t result_object;
3295 vm_fault_return_t result;
3296
3297 do {
3298 prot = VM_PROT_NONE;
3299
3300 vm_object_lock(object);
3301 vm_object_paging_begin(object);
3302 XPR(XPR_VM_FAULT,
3303 "vm_fault_unwire -> vm_fault_page\n",
3304 0,0,0,0,0);
3305 result = vm_fault_page(object,
3306 entry->offset +
3307 (va - entry->vme_start),
3308 VM_PROT_NONE, TRUE,
3309 THREAD_UNINT,
3310 entry->offset,
3311 entry->offset +
3312 (entry->vme_end
3313 - entry->vme_start),
3314 entry->behavior,
3315 &prot,
3316 &result_page,
3317 &top_page,
3318 (int *)0,
3319 0, map->no_zero_fill,
3320 FALSE, NULL, 0);
3321 } while (result == VM_FAULT_RETRY);
3322
3323 if (result != VM_FAULT_SUCCESS)
3324 panic("vm_fault_unwire: failure");
3325
3326 result_object = result_page->object;
3327 if (deallocate) {
3328 assert(!result_page->fictitious);
3329 pmap_disconnect(result_page->phys_page);
3330 VM_PAGE_FREE(result_page);
3331 } else {
3332 vm_page_lock_queues();
3333 vm_page_unwire(result_page);
3334 vm_page_unlock_queues();
3335 PAGE_WAKEUP_DONE(result_page);
3336 }
3337
3338 vm_fault_cleanup(result_object, top_page);
3339 }
3340 }
3341
3342 /*
3343 * Inform the physical mapping system that the range
3344 * of addresses may fault, so that page tables and
3345 * such may be unwired themselves.
3346 */
3347
3348 pmap_pageable(pmap, pmap_addr,
3349 pmap_addr + (end_addr - entry->vme_start), TRUE);
3350
3351 }
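/*
 * Illustrative sketch (not part of the build): the expected pairing of
 * vm_fault_wire() and vm_fault_unwire() over a single map entry, as a
 * wiring path might drive it.  The sketch assumes the entry has already
 * been marked in_transition by the caller, is not a submap, and is
 * being wired into the map's own pmap, so pmap_addr mirrors
 * entry->vme_start.  All names are hypothetical.
 */
#if 0
static kern_return_t
example_wire_entry(
	vm_map_t	map,
	vm_map_entry_t	entry)		/* in_transition, not a submap */
{
	kern_return_t	kr;

	/* fault every page in the entry and wire it down */
	kr = vm_fault_wire(map, entry, map->pmap, entry->vme_start);
	if (kr != KERN_SUCCESS) {
		/* vm_fault_wire already unwired any partial work */
		return kr;
	}

	/* ... the range is now safe from pageout ... */

	/* later: release the wiring; FALSE keeps the pages resident */
	vm_fault_unwire(map, entry, FALSE, map->pmap, entry->vme_start);

	return KERN_SUCCESS;
}
#endif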
3352
3353 /*
3354 * vm_fault_wire_fast:
3355 *
3356 * Handle common case of a wire down page fault at the given address.
3357 * If successful, the page is inserted into the associated physical map.
3358 * The map entry is passed in to avoid the overhead of a map lookup.
3359 *
3360 * NOTE: the given address should be truncated to the
3361 * proper page address.
3362 *
3363 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3364 * a standard error specifying why the fault is fatal is returned.
3365 *
3366 * The map in question must be referenced, and remains so.
3367 * Caller has a read lock on the map.
3368 *
3369 * This is a stripped version of vm_fault() for wiring pages. Anything
3370 * other than the common case will return KERN_FAILURE, and the caller
3371 * is expected to call vm_fault().
3372 */
3373 kern_return_t
3374 vm_fault_wire_fast(
3375 __unused vm_map_t map,
3376 vm_map_offset_t va,
3377 vm_map_entry_t entry,
3378 pmap_t pmap,
3379 vm_map_offset_t pmap_addr)
3380 {
3381 vm_object_t object;
3382 vm_object_offset_t offset;
3383 register vm_page_t m;
3384 vm_prot_t prot;
3385 thread_t thread = current_thread();
3386 unsigned int cache_attr;
3387
3388 VM_STAT(faults++);
3389
3390 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3391 thread->task->faults++;
3392
3393 /*
3394 * Recovery actions
3395 */
3396
3397 #undef RELEASE_PAGE
3398 #define RELEASE_PAGE(m) { \
3399 PAGE_WAKEUP_DONE(m); \
3400 vm_page_lock_queues(); \
3401 vm_page_unwire(m); \
3402 vm_page_unlock_queues(); \
3403 }
3404
3405
3406 #undef UNLOCK_THINGS
3407 #define UNLOCK_THINGS { \
3408 vm_object_paging_end(object); \
3409 vm_object_unlock(object); \
3410 }
3411
3412 #undef UNLOCK_AND_DEALLOCATE
3413 #define UNLOCK_AND_DEALLOCATE { \
3414 UNLOCK_THINGS; \
3415 vm_object_deallocate(object); \
3416 }
3417 /*
3418 * Give up and have caller do things the hard way.
3419 */
3420
3421 #define GIVE_UP { \
3422 UNLOCK_AND_DEALLOCATE; \
3423 return(KERN_FAILURE); \
3424 }
3425
3426
3427 /*
3428 * If this entry is not directly to a vm_object, bail out.
3429 */
3430 if (entry->is_sub_map)
3431 return(KERN_FAILURE);
3432
3433 /*
3434 * Find the backing store object and offset into it.
3435 */
3436
3437 object = entry->object.vm_object;
3438 offset = (va - entry->vme_start) + entry->offset;
3439 prot = entry->protection;
3440
3441 /*
3442 * Make a reference to this object to prevent its
3443 * disposal while we are messing with it.
3444 */
3445
3446 vm_object_lock(object);
3447 assert(object->ref_count > 0);
3448 object->ref_count++;
3449 vm_object_res_reference(object);
3450 vm_object_paging_begin(object);
3451
3452 /*
3453 * INVARIANTS (through entire routine):
3454 *
3455 * 1) At all times, we must either have the object
3456 * lock or a busy page in some object to prevent
3457 * some other thread from trying to bring in
3458 * the same page.
3459 *
3460 * 2) Once we have a busy page, we must remove it from
3461 * the pageout queues, so that the pageout daemon
3462 * will not grab it away.
3463 *
3464 */
3465
3466 /*
3467 * Look for page in top-level object. If it's not there or
3468 * there's something going on, give up.
3469 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3470 * decrypt the page before wiring it down.
3471 */
3472 m = vm_page_lookup(object, offset);
3473 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3474 (m->unusual && ( m->error || m->restart || m->absent ||
3475 prot & m->page_lock))) {
3476
3477 GIVE_UP;
3478 }
3479 ASSERT_PAGE_DECRYPTED(m);
3480
3481 /*
3482 * Wire the page down now. All bail outs beyond this
3483 * point must unwire the page.
3484 */
3485
3486 vm_page_lock_queues();
3487 vm_page_wire(m);
3488 vm_page_unlock_queues();
3489
3490 /*
3491 * Mark page busy for other threads.
3492 */
3493 assert(!m->busy);
3494 m->busy = TRUE;
3495 assert(!m->absent);
3496
3497 /*
3498 * Give up if the page is being written and there's a copy object
3499 */
3500 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3501 RELEASE_PAGE(m);
3502 GIVE_UP;
3503 }
3504
3505 /*
3506 * Put this page into the physical map.
3507 * We have to unlock the object because pmap_enter
3508 * may cause other faults.
3509 */
3510 if (m->no_isync == TRUE) {
3511 pmap_sync_page_data_phys(m->phys_page);
3512
3513 m->no_isync = FALSE;
3514 }
3515
3516 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3517
3518 PMAP_ENTER(pmap, pmap_addr, m, prot, cache_attr, TRUE);
3519
3520 /*
3521 * Unlock everything, and return
3522 */
3523
3524 PAGE_WAKEUP_DONE(m);
3525 UNLOCK_AND_DEALLOCATE;
3526
3527 return(KERN_SUCCESS);
3528
3529 }
3530
3531 /*
3532 * Routine: vm_fault_copy_cleanup
3533 * Purpose:
3534 * Release a page used by vm_fault_copy.
3535 */
3536
3537 void
3538 vm_fault_copy_cleanup(
3539 vm_page_t page,
3540 vm_page_t top_page)
3541 {
3542 vm_object_t object = page->object;
3543
3544 vm_object_lock(object);
3545 PAGE_WAKEUP_DONE(page);
3546 vm_page_lock_queues();
3547 if (!page->active && !page->inactive)
3548 vm_page_activate(page);
3549 vm_page_unlock_queues();
3550 vm_fault_cleanup(object, top_page);
3551 }
3552
3553 void
3554 vm_fault_copy_dst_cleanup(
3555 vm_page_t page)
3556 {
3557 vm_object_t object;
3558
3559 if (page != VM_PAGE_NULL) {
3560 object = page->object;
3561 vm_object_lock(object);
3562 vm_page_lock_queues();
3563 vm_page_unwire(page);
3564 vm_page_unlock_queues();
3565 vm_object_paging_end(object);
3566 vm_object_unlock(object);
3567 }
3568 }
3569
3570 /*
3571 * Routine: vm_fault_copy
3572 *
3573 * Purpose:
3574 * Copy pages from one virtual memory object to another --
3575 * neither the source nor destination pages need be resident.
3576 *
3577 * Before actually copying a page, the version associated with
3578 * the destination address map will be verified.
3579 *
3580 * In/out conditions:
3581 * The caller must hold a reference, but not a lock, to
3582 * each of the source and destination objects and to the
3583 * destination map.
3584 *
3585 * Results:
3586 * Returns KERN_SUCCESS if no errors were encountered in
3587 * reading or writing the data. Returns KERN_INTERRUPTED if
3588 * the operation was interrupted (only possible if the
3589 * "interruptible" argument is asserted). Other return values
3590 * indicate a permanent error in copying the data.
3591 *
3592 * The actual amount of data copied will be returned in the
3593 * "copy_size" argument. In the event that the destination map
3594 * verification failed, this amount may be less than the amount
3595 * requested.
3596 */
3597 kern_return_t
3598 vm_fault_copy(
3599 vm_object_t src_object,
3600 vm_object_offset_t src_offset,
3601 vm_map_size_t *copy_size, /* INOUT */
3602 vm_object_t dst_object,
3603 vm_object_offset_t dst_offset,
3604 vm_map_t dst_map,
3605 vm_map_version_t *dst_version,
3606 int interruptible)
3607 {
3608 vm_page_t result_page;
3609
3610 vm_page_t src_page;
3611 vm_page_t src_top_page;
3612 vm_prot_t src_prot;
3613
3614 vm_page_t dst_page;
3615 vm_page_t dst_top_page;
3616 vm_prot_t dst_prot;
3617
3618 vm_map_size_t amount_left;
3619 vm_object_t old_copy_object;
3620 kern_return_t error = 0;
3621
3622 vm_map_size_t part_size;
3623
3624 /*
3625 * In order not to confuse the clustered pageins, align
3626 * the different offsets on a page boundary.
3627 */
3628 vm_object_offset_t src_lo_offset = vm_object_trunc_page(src_offset);
3629 vm_object_offset_t dst_lo_offset = vm_object_trunc_page(dst_offset);
3630 vm_object_offset_t src_hi_offset = vm_object_round_page(src_offset + *copy_size);
3631 vm_object_offset_t dst_hi_offset = vm_object_round_page(dst_offset + *copy_size);
3632
3633 #define RETURN(x) \
3634 MACRO_BEGIN \
3635 *copy_size -= amount_left; \
3636 MACRO_RETURN(x); \
3637 MACRO_END
3638
3639 amount_left = *copy_size;
3640 do { /* while (amount_left > 0) */
3641 /*
3642 * There may be a deadlock if both source and destination
3643 * pages are the same. To avoid this deadlock, the copy must
3644 * start by getting the destination page in order to apply
3645 * COW semantics if any.
3646 */
3647
3648 RetryDestinationFault: ;
3649
3650 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3651
3652 vm_object_lock(dst_object);
3653 vm_object_paging_begin(dst_object);
3654
3655 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3656 switch (vm_fault_page(dst_object,
3657 vm_object_trunc_page(dst_offset),
3658 VM_PROT_WRITE|VM_PROT_READ,
3659 FALSE,
3660 interruptible,
3661 dst_lo_offset,
3662 dst_hi_offset,
3663 VM_BEHAVIOR_SEQUENTIAL,
3664 &dst_prot,
3665 &dst_page,
3666 &dst_top_page,
3667 (int *)0,
3668 &error,
3669 dst_map->no_zero_fill,
3670 FALSE, NULL, 0)) {
3671 case VM_FAULT_SUCCESS:
3672 break;
3673 case VM_FAULT_RETRY:
3674 goto RetryDestinationFault;
3675 case VM_FAULT_MEMORY_SHORTAGE:
3676 if (vm_page_wait(interruptible))
3677 goto RetryDestinationFault;
3678 /* fall thru */
3679 case VM_FAULT_INTERRUPTED:
3680 RETURN(MACH_SEND_INTERRUPTED);
3681 case VM_FAULT_FICTITIOUS_SHORTAGE:
3682 vm_page_more_fictitious();
3683 goto RetryDestinationFault;
3684 case VM_FAULT_MEMORY_ERROR:
3685 if (error)
3686 return (error);
3687 else
3688 return(KERN_MEMORY_ERROR);
3689 }
3690 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3691
3692 old_copy_object = dst_page->object->copy;
3693
3694 /*
3695 * There exists the possibility that the source and
3696 * destination pages are the same. But we can't
3697 * easily determine that now. If they are the
3698 * same, the call to vm_fault_page() for the
3699 * destination page will deadlock. To prevent this we
3700 * wire the page so we can drop busy without having
3701 * the page daemon steal the page. We clean up the
3702 * top page but keep the paging reference on the object
3703 * holding the dest page so it doesn't go away.
3704 */
3705
3706 vm_page_lock_queues();
3707 vm_page_wire(dst_page);
3708 vm_page_unlock_queues();
3709 PAGE_WAKEUP_DONE(dst_page);
3710 vm_object_unlock(dst_page->object);
3711
3712 if (dst_top_page != VM_PAGE_NULL) {
3713 vm_object_lock(dst_object);
3714 VM_PAGE_FREE(dst_top_page);
3715 vm_object_paging_end(dst_object);
3716 vm_object_unlock(dst_object);
3717 }
3718
3719 RetrySourceFault: ;
3720
3721 if (src_object == VM_OBJECT_NULL) {
3722 /*
3723 * No source object. We will just
3724 * zero-fill the page in dst_object.
3725 */
3726 src_page = VM_PAGE_NULL;
3727 result_page = VM_PAGE_NULL;
3728 } else {
3729 vm_object_lock(src_object);
3730 src_page = vm_page_lookup(src_object,
3731 vm_object_trunc_page(src_offset));
3732 if (src_page == dst_page) {
3733 src_prot = dst_prot;
3734 result_page = VM_PAGE_NULL;
3735 } else {
3736 src_prot = VM_PROT_READ;
3737 vm_object_paging_begin(src_object);
3738
3739 XPR(XPR_VM_FAULT,
3740 "vm_fault_copy(2) -> vm_fault_page\n",
3741 0,0,0,0,0);
3742 switch (vm_fault_page(src_object,
3743 vm_object_trunc_page(src_offset),
3744 VM_PROT_READ,
3745 FALSE,
3746 interruptible,
3747 src_lo_offset,
3748 src_hi_offset,
3749 VM_BEHAVIOR_SEQUENTIAL,
3750 &src_prot,
3751 &result_page,
3752 &src_top_page,
3753 (int *)0,
3754 &error,
3755 FALSE,
3756 FALSE, NULL, 0)) {
3757
3758 case VM_FAULT_SUCCESS:
3759 break;
3760 case VM_FAULT_RETRY:
3761 goto RetrySourceFault;
3762 case VM_FAULT_MEMORY_SHORTAGE:
3763 if (vm_page_wait(interruptible))
3764 goto RetrySourceFault;
3765 /* fall thru */
3766 case VM_FAULT_INTERRUPTED:
3767 vm_fault_copy_dst_cleanup(dst_page);
3768 RETURN(MACH_SEND_INTERRUPTED);
3769 case VM_FAULT_FICTITIOUS_SHORTAGE:
3770 vm_page_more_fictitious();
3771 goto RetrySourceFault;
3772 case VM_FAULT_MEMORY_ERROR:
3773 vm_fault_copy_dst_cleanup(dst_page);
3774 if (error)
3775 return (error);
3776 else
3777 return(KERN_MEMORY_ERROR);
3778 }
3779
3780
3781 assert((src_top_page == VM_PAGE_NULL) ==
3782 (result_page->object == src_object));
3783 }
3784 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3785 vm_object_unlock(result_page->object);
3786 }
3787
3788 if (!vm_map_verify(dst_map, dst_version)) {
3789 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3790 vm_fault_copy_cleanup(result_page, src_top_page);
3791 vm_fault_copy_dst_cleanup(dst_page);
3792 break;
3793 }
3794
3795 vm_object_lock(dst_page->object);
3796
3797 if (dst_page->object->copy != old_copy_object) {
3798 vm_object_unlock(dst_page->object);
3799 vm_map_verify_done(dst_map, dst_version);
3800 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3801 vm_fault_copy_cleanup(result_page, src_top_page);
3802 vm_fault_copy_dst_cleanup(dst_page);
3803 break;
3804 }
3805 vm_object_unlock(dst_page->object);
3806
3807 /*
3808 * Copy the page, and note that it is dirty
3809 * immediately.
3810 */
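/*
 * Worked example of the unaligned case below (illustrative values,
 * assuming a 0x1000-byte page): with src_offset 0x1200 and dst_offset
 * 0x3600, src_po = 0x200 and dst_po = 0x600.  Since dst_po > src_po,
 * part_size = PAGE_SIZE - dst_po = 0xA00, clamped to amount_left if
 * that is smaller; after advancing by part_size the next iteration
 * starts page-aligned in the destination.
 */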
3811
3812 if (!page_aligned(src_offset) ||
3813 !page_aligned(dst_offset) ||
3814 !page_aligned(amount_left)) {
3815
3816 vm_object_offset_t src_po,
3817 dst_po;
3818
3819 src_po = src_offset - vm_object_trunc_page(src_offset);
3820 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3821
3822 if (dst_po > src_po) {
3823 part_size = PAGE_SIZE - dst_po;
3824 } else {
3825 part_size = PAGE_SIZE - src_po;
3826 }
3827 if (part_size > (amount_left)){
3828 part_size = amount_left;
3829 }
3830
3831 if (result_page == VM_PAGE_NULL) {
3832 vm_page_part_zero_fill(dst_page,
3833 dst_po, part_size);
3834 } else {
3835 vm_page_part_copy(result_page, src_po,
3836 dst_page, dst_po, part_size);
3837 if(!dst_page->dirty){
3838 vm_object_lock(dst_object);
3839 dst_page->dirty = TRUE;
3840 vm_object_unlock(dst_page->object);
3841 }
3842
3843 }
3844 } else {
3845 part_size = PAGE_SIZE;
3846
3847 if (result_page == VM_PAGE_NULL)
3848 vm_page_zero_fill(dst_page);
3849 else{
3850 vm_page_copy(result_page, dst_page);
3851 if(!dst_page->dirty){
3852 vm_object_lock(dst_object);
3853 dst_page->dirty = TRUE;
3854 vm_object_unlock(dst_page->object);
3855 }
3856 }
3857
3858 }
3859
3860 /*
3861 * Unlock everything, and return
3862 */
3863
3864 vm_map_verify_done(dst_map, dst_version);
3865
3866 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3867 vm_fault_copy_cleanup(result_page, src_top_page);
3868 vm_fault_copy_dst_cleanup(dst_page);
3869
3870 amount_left -= part_size;
3871 src_offset += part_size;
3872 dst_offset += part_size;
3873 } while (amount_left > 0);
3874
3875 RETURN(KERN_SUCCESS);
3876 #undef RETURN
3877
3878 /*NOTREACHED*/
3879 }
3880
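/*
 * Illustrative sketch (not part of the build): a minimal caller of
 * vm_fault_copy().  The destination map version is assumed to have
 * been captured by an earlier lookup on dst_map, and the caller holds
 * references (but no locks) on both objects and on the map, as the
 * routine's header comment requires.  All names are hypothetical.
 */
#if 0
static kern_return_t
example_copy_range(
	vm_object_t		src_object,
	vm_object_offset_t	src_offset,
	vm_object_t		dst_object,
	vm_object_offset_t	dst_offset,
	vm_map_t		dst_map,
	vm_map_version_t	*dst_version,	/* from a prior map lookup */
	vm_map_size_t		size)
{
	vm_map_size_t	copy_size = size;	/* INOUT: amount actually copied */
	kern_return_t	kr;

	kr = vm_fault_copy(src_object, src_offset,
			   &copy_size,
			   dst_object, dst_offset,
			   dst_map, dst_version,
			   THREAD_ABORTSAFE);

	if (kr == KERN_SUCCESS && copy_size < size) {
		/*
		 * Destination map verification failed part way through;
		 * the caller would normally re-lookup the map and retry
		 * the remaining (size - copy_size) bytes.
		 */
	}
	return kr;
}
#endif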
3881 #ifdef notdef
3882
3883 /*
3884 * Routine: vm_fault_page_overwrite
3885 *
3886 * Description:
3887 * A form of vm_fault_page that assumes that the
3888 * resulting page will be overwritten in its entirety,
3889 * making it unnecessary to obtain the correct *contents*
3890 * of the page.
3891 *
3892 * Implementation:
3893 * XXX Untested. Also unused. Eventually, this technology
3894 * could be used in vm_fault_copy() to advantage.
3895 */
3896 vm_fault_return_t
3897 vm_fault_page_overwrite(
3898 register
3899 vm_object_t dst_object,
3900 vm_object_offset_t dst_offset,
3901 vm_page_t *result_page) /* OUT */
3902 {
3903 register
3904 vm_page_t dst_page;
3905 kern_return_t wait_result;
3906
3907 #define interruptible THREAD_UNINT /* XXX */
3908
3909 while (TRUE) {
3910 /*
3911 * Look for a page at this offset
3912 */
3913
3914 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3915 == VM_PAGE_NULL) {
3916 /*
3917 * No page, no problem... just allocate one.
3918 */
3919
3920 dst_page = vm_page_alloc(dst_object, dst_offset);
3921 if (dst_page == VM_PAGE_NULL) {
3922 vm_object_unlock(dst_object);
3923 VM_PAGE_WAIT();
3924 vm_object_lock(dst_object);
3925 continue;
3926 }
3927
3928 /*
3929 * Pretend that the memory manager
3930 * write-protected the page.
3931 *
3932 * Note that we will be asking for write
3933 * permission without asking for the data
3934 * first.
3935 */
3936
3937 dst_page->overwriting = TRUE;
3938 dst_page->page_lock = VM_PROT_WRITE;
3939 dst_page->absent = TRUE;
3940 dst_page->unusual = TRUE;
3941 dst_object->absent_count++;
3942
3943 break;
3944
3945 /*
3946 * When we bail out, we might have to throw
3947 * away the page created here.
3948 */
3949
3950 #define DISCARD_PAGE \
3951 MACRO_BEGIN \
3952 vm_object_lock(dst_object); \
3953 dst_page = vm_page_lookup(dst_object, dst_offset); \
3954 if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3955 VM_PAGE_FREE(dst_page); \
3956 vm_object_unlock(dst_object); \
3957 MACRO_END
3958 }
3959
3960 /*
3961 * If the page is write-protected...
3962 */
3963
3964 if (dst_page->page_lock & VM_PROT_WRITE) {
3965 /*
3966 * ... and an unlock request hasn't been sent
3967 */
3968
3969 if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3970 vm_prot_t u;
3971 kern_return_t rc;
3972
3973 /*
3974 * ... then send one now.
3975 */
3976
3977 if (!dst_object->pager_ready) {
3978 wait_result = vm_object_assert_wait(dst_object,
3979 VM_OBJECT_EVENT_PAGER_READY,
3980 interruptible);
3981 vm_object_unlock(dst_object);
3982 if (wait_result == THREAD_WAITING)
3983 wait_result = thread_block(THREAD_CONTINUE_NULL);
3984 if (wait_result != THREAD_AWAKENED) {
3985 DISCARD_PAGE;
3986 return(VM_FAULT_INTERRUPTED);
3987 }
3988 continue;
3989 }
3990
3991 u = dst_page->unlock_request |= VM_PROT_WRITE;
3992 vm_object_unlock(dst_object);
3993
3994 if ((rc = memory_object_data_unlock(
3995 dst_object->pager,
3996 dst_offset + dst_object->paging_offset,
3997 PAGE_SIZE,
3998 u)) != KERN_SUCCESS) {
3999 if (vm_fault_debug)
4000 printf("vm_object_overwrite: memory_object_data_unlock failed\n");
4001 DISCARD_PAGE;
4002 return((rc == MACH_SEND_INTERRUPTED) ?
4003 VM_FAULT_INTERRUPTED :
4004 VM_FAULT_MEMORY_ERROR);
4005 }
4006 vm_object_lock(dst_object);
4007 continue;
4008 }
4009
4010 /* ... fall through to wait below */
4011 } else {
4012 /*
4013 * If the page isn't being used for other
4014 * purposes, then we're done.
4015 */
4016 if ( ! (dst_page->busy || dst_page->absent ||
4017 dst_page->error || dst_page->restart) )
4018 break;
4019 }
4020
4021 wait_result = PAGE_ASSERT_WAIT(dst_page, interruptible);
4022 vm_object_unlock(dst_object);
4023 if (wait_result == THREAD_WAITING)
4024 wait_result = thread_block(THREAD_CONTINUE_NULL);
4025 if (wait_result != THREAD_AWAKENED) {
4026 DISCARD_PAGE;
4027 return(VM_FAULT_INTERRUPTED);
4028 }
4029 }
4030
4031 *result_page = dst_page;
4032 return(VM_FAULT_SUCCESS);
4033
4034 #undef interruptible
4035 #undef DISCARD_PAGE
4036 }
4037
4038 #endif /* notdef */
4039
4040 #if VM_FAULT_CLASSIFY
4041 /*
4042 * Temporary statistics gathering support.
4043 */
4044
4045 /*
4046 * Statistics arrays:
4047 */
4048 #define VM_FAULT_TYPES_MAX 5
4049 #define VM_FAULT_LEVEL_MAX 8
4050
4051 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4052
4053 #define VM_FAULT_TYPE_ZERO_FILL 0
4054 #define VM_FAULT_TYPE_MAP_IN 1
4055 #define VM_FAULT_TYPE_PAGER 2
4056 #define VM_FAULT_TYPE_COPY 3
4057 #define VM_FAULT_TYPE_OTHER 4
4058
4059
4060 void
4061 vm_fault_classify(vm_object_t object,
4062 vm_object_offset_t offset,
4063 vm_prot_t fault_type)
4064 {
4065 int type, level = 0;
4066 vm_page_t m;
4067
4068 while (TRUE) {
4069 m = vm_page_lookup(object, offset);
4070 if (m != VM_PAGE_NULL) {
4071 if (m->busy || m->error || m->restart || m->absent ||
4072 fault_type & m->page_lock) {
4073 type = VM_FAULT_TYPE_OTHER;
4074 break;
4075 }
4076 if (((fault_type & VM_PROT_WRITE) == 0) ||
4077 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4078 type = VM_FAULT_TYPE_MAP_IN;
4079 break;
4080 }
4081 type = VM_FAULT_TYPE_COPY;
4082 break;
4083 }
4084 else {
4085 if (object->pager_created) {
4086 type = VM_FAULT_TYPE_PAGER;
4087 break;
4088 }
4089 if (object->shadow == VM_OBJECT_NULL) {
4090 type = VM_FAULT_TYPE_ZERO_FILL;
4091 break;
4092 }
4093
4094 offset += object->shadow_offset;
4095 object = object->shadow;
4096 level++;
4097 continue;
4098 }
4099 }
4100
4101 if (level > VM_FAULT_LEVEL_MAX)
4102 level = VM_FAULT_LEVEL_MAX;
4103
4104 vm_fault_stats[type][level] += 1;
4105
4106 return;
4107 }
4108
4109 /* cleanup routine to call from debugger */
4110
4111 void
4112 vm_fault_classify_init(void)
4113 {
4114 int type, level;
4115
4116 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4117 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4118 vm_fault_stats[type][level] = 0;
4119 }
4120 }
4121
4122 return;
4123 }
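/*
 * Illustrative sketch (not part of the build): a debugger-callable dump
 * of the classification counters accumulated above, in the same spirit
 * as vm_fault_classify_init().  The routine name is hypothetical.
 */
#if 0
void
vm_fault_classify_dump(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			if (vm_fault_stats[type][level] != 0)
				printf("vm_fault_classify: type %d level %d count %d\n",
				       type, level, vm_fault_stats[type][level]);
		}
	}
}
#endif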
4124 #endif /* VM_FAULT_CLASSIFY */