1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58 /*
59 */
60 /*
61 * File: vm_fault.c
62 * Author: Avadis Tevanian, Jr., Michael Wayne Young
63 *
64 * Page fault handling module.
65 */
66
67 #include <mach_cluster_stats.h>
68 #include <mach_pagemap.h>
69 #include <mach_kdb.h>
70
71 #include <mach/mach_types.h>
72 #include <mach/kern_return.h>
73 #include <mach/message.h> /* for error codes */
74 #include <mach/vm_param.h>
75 #include <mach/vm_behavior.h>
76 #include <mach/memory_object.h>
77 /* For memory_object_data_{request,unlock} */
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/task_working_set.h>
96 #include <vm/vm_map.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_kern.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_pageout.h>
102 #include <vm/vm_protos.h>
103
104 #include <sys/kdebug.h>
105
106 #define VM_FAULT_CLASSIFY 0
107 #define VM_FAULT_STATIC_CONFIG 1
108
109 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
110
111 unsigned int vm_object_absent_max = 50;
112
113 int vm_fault_debug = 0;
114
115 #if !VM_FAULT_STATIC_CONFIG
116 boolean_t vm_fault_dirty_handling = FALSE;
117 boolean_t vm_fault_interruptible = FALSE;
118 boolean_t software_reference_bits = TRUE;
119 #endif
120
121 #if MACH_KDB
122 extern struct db_watchpoint *db_watchpoint_list;
123 #endif /* MACH_KDB */
124
125
126 /* Forward declarations of internal routines. */
127 extern kern_return_t vm_fault_wire_fast(
128 vm_map_t map,
129 vm_map_offset_t va,
130 vm_map_entry_t entry,
131 pmap_t pmap,
132 vm_map_offset_t pmap_addr);
133
134 extern void vm_fault_continue(void);
135
136 extern void vm_fault_copy_cleanup(
137 vm_page_t page,
138 vm_page_t top_page);
139
140 extern void vm_fault_copy_dst_cleanup(
141 vm_page_t page);
142
143 #if VM_FAULT_CLASSIFY
144 extern void vm_fault_classify(vm_object_t object,
145 vm_object_offset_t offset,
146 vm_prot_t fault_type);
147
148 extern void vm_fault_classify_init(void);
149 #endif
150
151 /*
152 * Routine: vm_fault_init
153 * Purpose:
154 * Initialize our private data structures.
155 */
156 void
157 vm_fault_init(void)
158 {
159 }
160
161 /*
162 * Routine: vm_fault_cleanup
163 * Purpose:
164 * Clean up the result of vm_fault_page.
165 * Results:
166 * The paging reference for "object" is released.
167 * "object" is unlocked.
168 * If "top_page" is not null, "top_page" is
169 * freed and the paging reference for the object
170 * containing it is released.
171 *
172 * In/out conditions:
173 * "object" must be locked.
174 */
175 void
176 vm_fault_cleanup(
177 register vm_object_t object,
178 register vm_page_t top_page)
179 {
180 vm_object_paging_end(object);
181 vm_object_unlock(object);
182
183 if (top_page != VM_PAGE_NULL) {
184 object = top_page->object;
185 vm_object_lock(object);
186 VM_PAGE_FREE(top_page);
187 vm_object_paging_end(object);
188 vm_object_unlock(object);
189 }
190 }
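
/*
 * A rough back-out sketch (illustrative only; the names and ordering are
 * assumptions loosely modeled on the error paths in vm_fault()):  once
 * vm_fault_page() has handed back "result_page" and "top_page", a caller
 * that cannot proceed releases them roughly like this:
 *
 *	m = result_page;			m->object is locked and holds
 *						a paging reference
 *	PAGE_WAKEUP_DONE(m);			clear the busy bit, wake waiters
 *	vm_fault_cleanup(m->object, top_page);	drop the paging reference,
 *						unlock, and dispose of
 *						top_page if there is one
 */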
191
192 #if MACH_CLUSTER_STATS
193 #define MAXCLUSTERPAGES 16
194 struct {
195 unsigned long pages_in_cluster;
196 unsigned long pages_at_higher_offsets;
197 unsigned long pages_at_lower_offsets;
198 } cluster_stats_in[MAXCLUSTERPAGES];
199 #define CLUSTER_STAT(clause) clause
200 #define CLUSTER_STAT_HIGHER(x) \
201 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
202 #define CLUSTER_STAT_LOWER(x) \
203 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
204 #define CLUSTER_STAT_CLUSTER(x) \
205 ((cluster_stats_in[(x)].pages_in_cluster)++)
206 #else /* MACH_CLUSTER_STATS */
207 #define CLUSTER_STAT(clause)
208 #endif /* MACH_CLUSTER_STATS */
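
/*
 * Illustrative usage sketch (an assumption about intent, not code quoted
 * from below): CLUSTER_STAT() makes statistics-only statements vanish when
 * MACH_CLUSTER_STATS is off, e.g.
 *
 *	CLUSTER_STAT(int pages_at_higher_offsets;)	declaration kept only
 *							in the stats build
 *	CLUSTER_STAT(pages_at_higher_offsets++;)	counter bump compiles
 *							to nothing otherwise
 *
 * The cluster counters declared in vm_fault_page() below rely on this.
 */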
209
210 /* XXX - temporary */
211 boolean_t vm_allow_clustered_pagein = FALSE;
212 int vm_pagein_cluster_used = 0;
213
214 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
215
216
217 boolean_t vm_page_deactivate_behind = TRUE;
218 /*
219 * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
220 */
221 int vm_default_ahead = 0;
222 int vm_default_behind = MAX_UPL_TRANSFER;
223
224 /*
225 * vm_fault_deactivate_behind
226 *
227 * Determine if sequential access is in progress
228 * in accordance with the behavior specified. If
229 * so, compute a potential page to deactivate and
230 * deactivate it.
231 *
232 * The object must be locked.
233 */
234 static
235 boolean_t
236 vm_fault_deactivate_behind(
237 vm_object_t object,
238 vm_object_offset_t offset,
239 vm_behavior_t behavior)
240 {
241 vm_page_t m;
242
243 #if TRACEFAULTPAGE
244 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
245 #endif
246
247 if (object == kernel_object) {
248 /*
249 * Do not deactivate pages from the kernel object: they
250 * are not intended to become pageable.
251 */
252 return FALSE;
253 }
254
255 switch (behavior) {
256 case VM_BEHAVIOR_RANDOM:
257 object->sequential = PAGE_SIZE_64;
258 m = VM_PAGE_NULL;
259 break;
260 case VM_BEHAVIOR_SEQUENTIAL:
261 if (offset &&
262 object->last_alloc == offset - PAGE_SIZE_64) {
263 object->sequential += PAGE_SIZE_64;
264 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
265 } else {
266 object->sequential = PAGE_SIZE_64; /* reset */
267 m = VM_PAGE_NULL;
268 }
269 break;
270 case VM_BEHAVIOR_RSEQNTL:
271 if (object->last_alloc &&
272 object->last_alloc == offset + PAGE_SIZE_64) {
273 object->sequential += PAGE_SIZE_64;
274 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
275 } else {
276 object->sequential = PAGE_SIZE_64; /* reset */
277 m = VM_PAGE_NULL;
278 }
279 break;
280 case VM_BEHAVIOR_DEFAULT:
281 default:
282 if (offset &&
283 object->last_alloc == offset - PAGE_SIZE_64) {
284 vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
285
286 object->sequential += PAGE_SIZE_64;
287 m = (offset >= behind &&
288 object->sequential >= behind) ?
289 vm_page_lookup(object, offset - behind) :
290 VM_PAGE_NULL;
291 } else if (object->last_alloc &&
292 object->last_alloc == offset + PAGE_SIZE_64) {
293 vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
294
295 object->sequential += PAGE_SIZE_64;
296 m = (offset < -behind &&
297 object->sequential >= behind) ?
298 vm_page_lookup(object, offset + behind) :
299 VM_PAGE_NULL;
300 } else {
301 object->sequential = PAGE_SIZE_64;
302 m = VM_PAGE_NULL;
303 }
304 break;
305 }
306
307 object->last_alloc = offset;
308
309 if (m) {
310 if (!m->busy) {
311 vm_page_lock_queues();
312 vm_page_deactivate(m);
313 vm_page_unlock_queues();
314 #if TRACEFAULTPAGE
315 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
316 #endif
317 }
318 return TRUE;
319 }
320 return FALSE;
321 }
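
/*
 * Worked example (illustrative): with VM_BEHAVIOR_DEFAULT and
 * behind = vm_default_behind * PAGE_SIZE_64, a run of forward faults at
 * consecutive offsets drives the detection above as follows:
 *
 *	fault at 0:		reset branch, sequential = PAGE_SIZE_64,
 *				last_alloc = 0
 *	fault at PAGE_SIZE_64:	last_alloc == offset - PAGE_SIZE_64, so
 *				sequential += PAGE_SIZE_64
 *	...
 *	once offset >= behind and sequential >= behind, the page at
 *	(offset - behind) is looked up and, if present and not busy,
 *	deactivated -- the trailing pages of a long sequential run are
 *	handed back before the pageout daemon has to find them.
 */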
322
323
324 /*
325 * Routine: vm_fault_page
326 * Purpose:
327 * Find the resident page for the virtual memory
328 * specified by the given virtual memory object
329 * and offset.
330 * Additional arguments:
331 * The required permissions for the page are given
332 * in "fault_type". Desired permissions are included
333 * in "protection". The minimum and maximum valid offsets
334 * within the object for the relevant map entry are
335 * passed in "lo_offset" and "hi_offset" respectively and
336 * the expected page reference pattern is passed in "behavior".
337 * These three parameters are used to determine pagein cluster
338 * limits.
339 *
340 * If the desired page is known to be resident (for
341 * example, because it was previously wired down), asserting
342 * the "unwiring" parameter will speed the search.
343 *
344 * If the operation can be interrupted (by thread_abort
345 * or thread_terminate), then the "interruptible"
346 * parameter should be asserted.
347 *
348 * Results:
349 * The page containing the proper data is returned
350 * in "result_page".
351 *
352 * In/out conditions:
353 * The source object must be locked and referenced,
354 * and must donate one paging reference. The reference
355 * is not affected. The paging reference and lock are
356 * consumed.
357 *
358 * If the call succeeds, the object in which "result_page"
359 * resides is left locked and holding a paging reference.
360 * If this is not the original object, a busy page in the
361 * original object is returned in "top_page", to prevent other
362 * callers from pursuing this same data, along with a paging
363 * reference for the original object. The "top_page" should
364 * be destroyed when this guarantee is no longer required.
365 * The "result_page" is also left busy. It is not removed
366 * from the pageout queues.
367 */
368
369 vm_fault_return_t
370 vm_fault_page(
371 /* Arguments: */
372 vm_object_t first_object, /* Object to begin search */
373 vm_object_offset_t first_offset, /* Offset into object */
374 vm_prot_t fault_type, /* What access is requested */
375 boolean_t must_be_resident,/* Must page be resident? */
376 int interruptible, /* how may fault be interrupted? */
377 vm_map_offset_t lo_offset, /* Map entry start */
378 vm_map_offset_t hi_offset, /* Map entry end */
379 vm_behavior_t behavior, /* Page reference behavior */
380 /* Modifies in place: */
381 vm_prot_t *protection, /* Protection for mapping */
382 /* Returns: */
383 vm_page_t *result_page, /* Page found, if successful */
384 vm_page_t *top_page, /* Page in top object, if
385 * not result_page. */
386 int *type_of_fault, /* if non-null, fill in with type of fault
387 * COW, zero-fill, etc... returned in trace point */
388 /* More arguments: */
389 kern_return_t *error_code, /* code if page is in error */
390 boolean_t no_zero_fill, /* don't zero fill absent pages */
391 boolean_t data_supply, /* treat as data_supply if
392 * it is a write fault and a full
393 * page is provided */
394 vm_map_t map,
395 __unused vm_map_offset_t vaddr)
396 {
397 register
398 vm_page_t m;
399 register
400 vm_object_t object;
401 register
402 vm_object_offset_t offset;
403 vm_page_t first_m;
404 vm_object_t next_object;
405 vm_object_t copy_object;
406 boolean_t look_for_page;
407 vm_prot_t access_required = fault_type;
408 vm_prot_t wants_copy_flag;
409 vm_object_size_t length;
410 vm_object_offset_t cluster_start, cluster_end;
411 CLUSTER_STAT(int pages_at_higher_offsets;)
412 CLUSTER_STAT(int pages_at_lower_offsets;)
413 kern_return_t wait_result;
414 boolean_t interruptible_state;
415 boolean_t bumped_pagein = FALSE;
416
417
418 #if MACH_PAGEMAP
419 /*
420 * MACH page map - an optional optimization where a bit map is maintained
421 * by the VM subsystem for internal objects to indicate which pages of
422 * the object currently reside on backing store. This existence map
423 * duplicates information maintained by the vnode pager. It is
424 * created at the time of the first pageout against the object, i.e.
425 * at the same time the pager for the object is created. The optimization
426 * is designed to eliminate pager interaction overhead, if it is
427 * 'known' that the page does not exist on backing store.
428 *
429 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
430 * either marked as paged out in the existence map for the object or no
431 * existence map exists for the object. LOOK_FOR() is one of the
432 * criteria in the decision to invoke the pager. It is also used as one
433 * of the criteria to terminate the scan for adjacent pages in a clustered
434 * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for
435 * permanent objects. Note also that if the pager for an internal object
436 * has not been created, the pager is not invoked regardless of the value
437 * of LOOK_FOR() and that clustered pagein scans are only done on an object
438 * for which a pager has been created.
439 *
440 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
441 * is marked as paged out in the existence map for the object.
442 * PAGED_OUT() is used to determine if a page has already been pushed
443 * into a copy object in order to avoid a redundant page out operation.
444 */
445 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
446 != VM_EXTERNAL_STATE_ABSENT)
447 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
448 == VM_EXTERNAL_STATE_EXISTS)
449 #else /* MACH_PAGEMAP */
450 /*
451 * If the MACH page map optimization is not enabled,
452 * LOOK_FOR() always evaluates to TRUE. The pager will always be
453 * invoked to resolve missing pages in an object, assuming the pager
454 * has been created for the object. In a clustered page operation, the
455 * absence of a page on backing store cannot be used to terminate
456 * a scan for adjacent pages since that information is available only in
457 * the pager. Hence pages that may not be paged out are potentially
458 * included in a clustered request. The vnode pager is coded to deal
459 * with any combination of absent/present pages in a clustered
460 * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager
461 * will always be invoked to push a dirty page into a copy object assuming
462 * a pager has been created. If the page has already been pushed, the
463 * pager will ignore the new request.
464 */
465 #define LOOK_FOR(o, f) TRUE
466 #define PAGED_OUT(o, f) FALSE
467 #endif /* MACH_PAGEMAP */
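
/*
 * Condensed sketch (restating the decision made further down in this
 * routine, not new behavior): LOOK_FOR() is one of the inputs that decide
 * whether the pager is asked for the page at all:
 *
 *	look_for_page = (object->pager_created) &&
 *			LOOK_FOR(object, offset) &&
 *			(!data_supply);
 *
 * With MACH_PAGEMAP, a page known to be absent from backing store therefore
 * short-circuits the memory_object_data_request(); without it, the pager is
 * always consulted once it has been created.
 */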
468
469 /*
470 * Recovery actions
471 */
472 #define PREPARE_RELEASE_PAGE(m) \
473 MACRO_BEGIN \
474 vm_page_lock_queues(); \
475 MACRO_END
476
477 #define DO_RELEASE_PAGE(m) \
478 MACRO_BEGIN \
479 PAGE_WAKEUP_DONE(m); \
480 if (!m->active && !m->inactive) \
481 vm_page_activate(m); \
482 vm_page_unlock_queues(); \
483 MACRO_END
484
485 #define RELEASE_PAGE(m) \
486 MACRO_BEGIN \
487 PREPARE_RELEASE_PAGE(m); \
488 DO_RELEASE_PAGE(m); \
489 MACRO_END
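
/*
 * Expansion sketch (derived from the macros above, added for clarity):
 * RELEASE_PAGE(m) is equivalent to
 *
 *	vm_page_lock_queues();
 *	PAGE_WAKEUP_DONE(m);			clear busy, wake waiters
 *	if (!m->active && !m->inactive)
 *		vm_page_activate(m);		put the page back on a queue
 *	vm_page_unlock_queues();
 *
 * i.e. the page is released in a state the pageout daemon can work with.
 */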
490
491 #if TRACEFAULTPAGE
492 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
493 #endif
494
495
496
497 #if !VM_FAULT_STATIC_CONFIG
498 if (vm_fault_dirty_handling
499 #if MACH_KDB
500 /*
501 * If there are watchpoints set, then
502 * we don't want to give away write permission
503 * on a read fault. Make the task take a write fault,
504 * so that the watchpoint code notices the access.
505 */
506 || db_watchpoint_list
507 #endif /* MACH_KDB */
508 ) {
509 /*
510 * If we aren't asking for write permission,
511 * then don't give it away. We're using write
512 * faults to set the dirty bit.
513 */
514 if (!(fault_type & VM_PROT_WRITE))
515 *protection &= ~VM_PROT_WRITE;
516 }
517
518 if (!vm_fault_interruptible)
519 interruptible = THREAD_UNINT;
520 #else /* STATIC_CONFIG */
521 #if MACH_KDB
522 /*
523 * If there are watchpoints set, then
524 * we don't want to give away write permission
525 * on a read fault. Make the task take a write fault,
526 * so that the watchpoint code notices the access.
527 */
528 if (db_watchpoint_list) {
529 /*
530 * If we aren't asking for write permission,
531 * then don't give it away. We're using write
532 * faults to set the dirty bit.
533 */
534 if (!(fault_type & VM_PROT_WRITE))
535 *protection &= ~VM_PROT_WRITE;
536 }
537
538 #endif /* MACH_KDB */
539 #endif /* STATIC_CONFIG */
540
541 interruptible_state = thread_interrupt_level(interruptible);
542
543 /*
544 * INVARIANTS (through entire routine):
545 *
546 * 1) At all times, we must either have the object
547 * lock or a busy page in some object to prevent
548 * some other thread from trying to bring in
549 * the same page.
550 *
551 * Note that we cannot hold any locks during the
552 * pager access or when waiting for memory, so
553 * we use a busy page then.
554 *
555 * Note also that we aren't as concerned about more than
556 * one thread attempting to memory_object_data_unlock
557 * the same page at once, so we don't hold the page
558 * as busy then, but do record the highest unlock
559 * value so far. [Unlock requests may also be delivered
560 * out of order.]
561 *
562 * 2) To prevent another thread from racing us down the
563 * shadow chain and entering a new page in the top
564 * object before we do, we must keep a busy page in
565 * the top object while following the shadow chain.
566 *
567 * 3) We must increment paging_in_progress on any object
568 * for which we have a busy page
569 *
570 * 4) We leave busy pages on the pageout queues.
571 * If the pageout daemon comes across a busy page,
572 * it will remove the page from the pageout queues.
573 */
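
/*
 * Schematic of invariant (1), the busy-page handoff used throughout the
 * loop below (an illustrative summary, not additional code):
 *
 *	vm_object_lock(object);
 *	m->busy = TRUE;			claim the page
 *	vm_object_unlock(object);	safe: the busy bit keeps other
 *					faulting threads out
 *	... pager call or other blocking work, with no locks held ...
 *	vm_object_lock(object);
 *	PAGE_WAKEUP_DONE(m);		drop the claim, wake any waiters
 */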
574
575 /*
576 * Search for the page at object/offset.
577 */
578
579 object = first_object;
580 offset = first_offset;
581 first_m = VM_PAGE_NULL;
582 access_required = fault_type;
583
584 XPR(XPR_VM_FAULT,
585 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
586 (integer_t)object, offset, fault_type, *protection, 0);
587
588 /*
589 * See whether this page is resident
590 */
591
592 while (TRUE) {
593 #if TRACEFAULTPAGE
594 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
595 #endif
596 if (!object->alive) {
597 vm_fault_cleanup(object, first_m);
598 thread_interrupt_level(interruptible_state);
599 return(VM_FAULT_MEMORY_ERROR);
600 }
601 m = vm_page_lookup(object, offset);
602 #if TRACEFAULTPAGE
603 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
604 #endif
605 if (m != VM_PAGE_NULL) {
606 /*
607 * If the page was pre-paged as part of a
608 * cluster, record the fact.
609 * If we were passed a valid pointer for
610 * "type_of_fault", than we came from
611 * vm_fault... we'll let it deal with
612 * this condition, since it
613 * needs to see m->clustered to correctly
614 * account the pageins.
615 */
616 if (type_of_fault == NULL && m->clustered) {
617 vm_pagein_cluster_used++;
618 m->clustered = FALSE;
619 }
620
621 /*
622 * If the page is being brought in,
623 * wait for it and then retry.
624 *
625 * A possible optimization: if the page
626 * is known to be resident, we can ignore
627 * pages that are absent (regardless of
628 * whether they're busy).
629 */
630
631 if (m->busy) {
632 #if TRACEFAULTPAGE
633 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
634 #endif
635 wait_result = PAGE_SLEEP(object, m, interruptible);
636 XPR(XPR_VM_FAULT,
637 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
638 (integer_t)object, offset,
639 (integer_t)m, 0, 0);
640 counter(c_vm_fault_page_block_busy_kernel++);
641
642 if (wait_result != THREAD_AWAKENED) {
643 vm_fault_cleanup(object, first_m);
644 thread_interrupt_level(interruptible_state);
645 if (wait_result == THREAD_RESTART)
646 {
647 return(VM_FAULT_RETRY);
648 }
649 else
650 {
651 return(VM_FAULT_INTERRUPTED);
652 }
653 }
654 continue;
655 }
656
657 if (m->encrypted) {
658 /*
659 * ENCRYPTED SWAP:
660 * the user needs access to a page that we
661 * encrypted before paging it out.
662 * Decrypt the page now.
663 * Keep it busy to prevent anyone from
664 * accessing it during the decryption.
665 */
666 m->busy = TRUE;
667 vm_page_decrypt(m, 0);
668 assert(object == m->object);
669 assert(m->busy);
670 PAGE_WAKEUP_DONE(m);
671
672 /*
673 * Retry from the top, in case
674 * something changed while we were
675 * decrypting.
676 */
677 continue;
678 }
679 ASSERT_PAGE_DECRYPTED(m);
680
681 /*
682 * If the page is in error, give up now.
683 */
684
685 if (m->error) {
686 #if TRACEFAULTPAGE
687 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
688 #endif
689 if (error_code)
690 *error_code = m->page_error;
691 VM_PAGE_FREE(m);
692 vm_fault_cleanup(object, first_m);
693 thread_interrupt_level(interruptible_state);
694 return(VM_FAULT_MEMORY_ERROR);
695 }
696
697 /*
698 * If the pager wants us to restart
699 * at the top of the chain,
700 * typically because it has moved the
701 * page to another pager, then do so.
702 */
703
704 if (m->restart) {
705 #if TRACEFAULTPAGE
706 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
707 #endif
708 VM_PAGE_FREE(m);
709 vm_fault_cleanup(object, first_m);
710 thread_interrupt_level(interruptible_state);
711 return(VM_FAULT_RETRY);
712 }
713
714 /*
715 * If the page isn't busy, but is absent,
716 * then it was deemed "unavailable".
717 */
718
719 if (m->absent) {
720 /*
721 * Remove the non-existent page (unless it's
722 * in the top object) and move on down to the
723 * next object (if there is one).
724 */
725 #if TRACEFAULTPAGE
726 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
727 #endif
728
729 next_object = object->shadow;
730 if (next_object == VM_OBJECT_NULL) {
731 vm_page_t real_m;
732
733 assert(!must_be_resident);
734
735 if (object->shadow_severed) {
736 vm_fault_cleanup(
737 object, first_m);
738 thread_interrupt_level(interruptible_state);
739 return VM_FAULT_MEMORY_ERROR;
740 }
741
742 /*
743 * Absent page at bottom of shadow
744 * chain; zero fill the page we left
745 * busy in the first object, and flush
746 * the absent page. But first we
747 * need to allocate a real page.
748 */
749 if (VM_PAGE_THROTTLED() ||
750 (real_m = vm_page_grab())
751 == VM_PAGE_NULL) {
752 vm_fault_cleanup(
753 object, first_m);
754 thread_interrupt_level(
755 interruptible_state);
756 return(
757 VM_FAULT_MEMORY_SHORTAGE);
758 }
759
760 /*
761 * Are we protecting the system from
762 * backing store exhaustion? If so,
763 * sleep unless we are privileged.
764 */
765
766 if(vm_backing_store_low) {
767 if(!(current_task()->priv_flags
768 & VM_BACKING_STORE_PRIV)) {
769 assert_wait((event_t)
770 &vm_backing_store_low,
771 THREAD_UNINT);
772 vm_fault_cleanup(object,
773 first_m);
774 thread_block(THREAD_CONTINUE_NULL);
775 thread_interrupt_level(
776 interruptible_state);
777 return(VM_FAULT_RETRY);
778 }
779 }
780
781
782 XPR(XPR_VM_FAULT,
783 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
784 (integer_t)object, offset,
785 (integer_t)m,
786 (integer_t)first_object, 0);
787 if (object != first_object) {
788 VM_PAGE_FREE(m);
789 vm_object_paging_end(object);
790 vm_object_unlock(object);
791 object = first_object;
792 offset = first_offset;
793 m = first_m;
794 first_m = VM_PAGE_NULL;
795 vm_object_lock(object);
796 }
797
798 VM_PAGE_FREE(m);
799 assert(real_m->busy);
800 vm_page_insert(real_m, object, offset);
801 m = real_m;
802
803 /*
804 * Drop the lock while zero filling
805 * page. Then break because this
806 * is the page we wanted. Checking
807 * the page lock is a waste of time;
808 * this page was either absent or
809 * newly allocated -- in both cases
810 * it can't be page locked by a pager.
811 */
812 m->no_isync = FALSE;
813
814 if (!no_zero_fill) {
815 vm_object_unlock(object);
816 vm_page_zero_fill(m);
817 vm_object_lock(object);
818
819 if (type_of_fault)
820 *type_of_fault = DBG_ZERO_FILL_FAULT;
821 VM_STAT(zero_fill_count++);
822 }
823 if (bumped_pagein == TRUE) {
824 VM_STAT(pageins--);
825 current_task()->pageins--;
826 }
827 vm_page_lock_queues();
828 VM_PAGE_QUEUES_REMOVE(m);
829 m->page_ticket = vm_page_ticket;
830 assert(!m->laundry);
831 assert(m->object != kernel_object);
832 assert(m->pageq.next == NULL &&
833 m->pageq.prev == NULL);
834 if(m->object->size > 0x200000) {
835 m->zero_fill = TRUE;
836 /* depends on the queues lock */
837 vm_zf_count += 1;
838 queue_enter(&vm_page_queue_zf,
839 m, vm_page_t, pageq);
840 } else {
841 queue_enter(
842 &vm_page_queue_inactive,
843 m, vm_page_t, pageq);
844 }
845 vm_page_ticket_roll++;
846 if(vm_page_ticket_roll ==
847 VM_PAGE_TICKETS_IN_ROLL) {
848 vm_page_ticket_roll = 0;
849 if(vm_page_ticket ==
850 VM_PAGE_TICKET_ROLL_IDS)
851 vm_page_ticket= 0;
852 else
853 vm_page_ticket++;
854 }
855 m->inactive = TRUE;
856 vm_page_inactive_count++;
857 vm_page_unlock_queues();
858 break;
859 } else {
860 if (must_be_resident) {
861 vm_object_paging_end(object);
862 } else if (object != first_object) {
863 vm_object_paging_end(object);
864 VM_PAGE_FREE(m);
865 } else {
866 first_m = m;
867 m->absent = FALSE;
868 m->unusual = FALSE;
869 vm_object_absent_release(object);
870 m->busy = TRUE;
871
872 vm_page_lock_queues();
873 VM_PAGE_QUEUES_REMOVE(m);
874 vm_page_unlock_queues();
875 }
876 XPR(XPR_VM_FAULT,
877 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
878 (integer_t)object, offset,
879 (integer_t)next_object,
880 offset+object->shadow_offset,0);
881 offset += object->shadow_offset;
882 hi_offset += object->shadow_offset;
883 lo_offset += object->shadow_offset;
884 access_required = VM_PROT_READ;
885 vm_object_lock(next_object);
886 vm_object_unlock(object);
887 object = next_object;
888 vm_object_paging_begin(object);
889 continue;
890 }
891 }
892
893 if ((m->cleaning)
894 && ((object != first_object) ||
895 (object->copy != VM_OBJECT_NULL))
896 && (fault_type & VM_PROT_WRITE)) {
897 /*
898 * This is a copy-on-write fault that will
899 * cause us to revoke access to this page, but
900 * this page is in the process of being cleaned
901 * in a clustered pageout. We must wait until
902 * the cleaning operation completes before
903 * revoking access to the original page,
904 * otherwise we might attempt to remove a
905 * wired mapping.
906 */
907 #if TRACEFAULTPAGE
908 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
909 #endif
910 XPR(XPR_VM_FAULT,
911 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
912 (integer_t)object, offset,
913 (integer_t)m, 0, 0);
914 /* take an extra ref so that object won't die */
915 assert(object->ref_count > 0);
916 object->ref_count++;
917 vm_object_res_reference(object);
918 vm_fault_cleanup(object, first_m);
919 counter(c_vm_fault_page_block_backoff_kernel++);
920 vm_object_lock(object);
921 assert(object->ref_count > 0);
922 m = vm_page_lookup(object, offset);
923 if (m != VM_PAGE_NULL && m->cleaning) {
924 PAGE_ASSERT_WAIT(m, interruptible);
925 vm_object_unlock(object);
926 wait_result = thread_block(THREAD_CONTINUE_NULL);
927 vm_object_deallocate(object);
928 goto backoff;
929 } else {
930 vm_object_unlock(object);
931 vm_object_deallocate(object);
932 thread_interrupt_level(interruptible_state);
933 return VM_FAULT_RETRY;
934 }
935 }
936
937 /*
938 * If the desired access to this page has
939 * been locked out, request that it be unlocked.
940 */
941
942 if (access_required & m->page_lock) {
943 if ((access_required & m->unlock_request) != access_required) {
944 vm_prot_t new_unlock_request;
945 kern_return_t rc;
946
947 #if TRACEFAULTPAGE
948 dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */
949 #endif
950 if (!object->pager_ready) {
951 XPR(XPR_VM_FAULT,
952 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
953 access_required,
954 (integer_t)object, offset,
955 (integer_t)m, 0);
956 /* take an extra ref */
957 assert(object->ref_count > 0);
958 object->ref_count++;
959 vm_object_res_reference(object);
960 vm_fault_cleanup(object,
961 first_m);
962 counter(c_vm_fault_page_block_backoff_kernel++);
963 vm_object_lock(object);
964 assert(object->ref_count > 0);
965 if (!object->pager_ready) {
966 wait_result = vm_object_assert_wait(
967 object,
968 VM_OBJECT_EVENT_PAGER_READY,
969 interruptible);
970 vm_object_unlock(object);
971 if (wait_result == THREAD_WAITING)
972 wait_result = thread_block(THREAD_CONTINUE_NULL);
973 vm_object_deallocate(object);
974 goto backoff;
975 } else {
976 vm_object_unlock(object);
977 vm_object_deallocate(object);
978 thread_interrupt_level(interruptible_state);
979 return VM_FAULT_RETRY;
980 }
981 }
982
983 new_unlock_request = m->unlock_request =
984 (access_required | m->unlock_request);
985 vm_object_unlock(object);
986 XPR(XPR_VM_FAULT,
987 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
988 (integer_t)object, offset,
989 (integer_t)m, new_unlock_request, 0);
990 if ((rc = memory_object_data_unlock(
991 object->pager,
992 offset + object->paging_offset,
993 PAGE_SIZE,
994 new_unlock_request))
995 != KERN_SUCCESS) {
996 if (vm_fault_debug)
997 printf("vm_fault: memory_object_data_unlock failed\n");
998 vm_object_lock(object);
999 vm_fault_cleanup(object, first_m);
1000 thread_interrupt_level(interruptible_state);
1001 return((rc == MACH_SEND_INTERRUPTED) ?
1002 VM_FAULT_INTERRUPTED :
1003 VM_FAULT_MEMORY_ERROR);
1004 }
1005 vm_object_lock(object);
1006 continue;
1007 }
1008
1009 XPR(XPR_VM_FAULT,
1010 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
1011 access_required, (integer_t)object,
1012 offset, (integer_t)m, 0);
1013 /* take an extra ref so object won't die */
1014 assert(object->ref_count > 0);
1015 object->ref_count++;
1016 vm_object_res_reference(object);
1017 vm_fault_cleanup(object, first_m);
1018 counter(c_vm_fault_page_block_backoff_kernel++);
1019 vm_object_lock(object);
1020 assert(object->ref_count > 0);
1021 m = vm_page_lookup(object, offset);
1022 if (m != VM_PAGE_NULL &&
1023 (access_required & m->page_lock) &&
1024 !((access_required & m->unlock_request) != access_required)) {
1025 PAGE_ASSERT_WAIT(m, interruptible);
1026 vm_object_unlock(object);
1027 wait_result = thread_block(THREAD_CONTINUE_NULL);
1028 vm_object_deallocate(object);
1029 goto backoff;
1030 } else {
1031 vm_object_unlock(object);
1032 vm_object_deallocate(object);
1033 thread_interrupt_level(interruptible_state);
1034 return VM_FAULT_RETRY;
1035 }
1036 }
1037 /*
1038 * We mark the page busy and leave it on
1039 * the pageout queues. If the pageout
1040 * daemon comes across it, then it will
1041 * remove the page.
1042 */
1043
1044 #if TRACEFAULTPAGE
1045 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1046 #endif
1047
1048 #if !VM_FAULT_STATIC_CONFIG
1049 if (!software_reference_bits) {
1050 vm_page_lock_queues();
1051 if (m->inactive)
1052 vm_stat.reactivations++;
1053
1054 VM_PAGE_QUEUES_REMOVE(m);
1055 vm_page_unlock_queues();
1056 }
1057 #endif
1058 XPR(XPR_VM_FAULT,
1059 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1060 (integer_t)object, offset, (integer_t)m, 0, 0);
1061 assert(!m->busy);
1062 m->busy = TRUE;
1063 assert(!m->absent);
1064 break;
1065 }
1066
1067 look_for_page =
1068 (object->pager_created) &&
1069 LOOK_FOR(object, offset) &&
1070 (!data_supply);
1071
1072 #if TRACEFAULTPAGE
1073 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1074 #endif
1075 if ((look_for_page || (object == first_object))
1076 && !must_be_resident
1077 && !(object->phys_contiguous)) {
1078 /*
1079 * Allocate a new page for this object/offset
1080 * pair.
1081 */
1082
1083 m = vm_page_grab_fictitious();
1084 #if TRACEFAULTPAGE
1085 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1086 #endif
1087 if (m == VM_PAGE_NULL) {
1088 vm_fault_cleanup(object, first_m);
1089 thread_interrupt_level(interruptible_state);
1090 return(VM_FAULT_FICTITIOUS_SHORTAGE);
1091 }
1092 vm_page_insert(m, object, offset);
1093 }
1094
1095 if ((look_for_page && !must_be_resident)) {
1096 kern_return_t rc;
1097
1098 /*
1099 * If the memory manager is not ready, we
1100 * cannot make requests.
1101 */
1102 if (!object->pager_ready) {
1103 #if TRACEFAULTPAGE
1104 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1105 #endif
1106 if(m != VM_PAGE_NULL)
1107 VM_PAGE_FREE(m);
1108 XPR(XPR_VM_FAULT,
1109 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1110 (integer_t)object, offset, 0, 0, 0);
1111 /* take an extra ref so object won't die */
1112 assert(object->ref_count > 0);
1113 object->ref_count++;
1114 vm_object_res_reference(object);
1115 vm_fault_cleanup(object, first_m);
1116 counter(c_vm_fault_page_block_backoff_kernel++);
1117 vm_object_lock(object);
1118 assert(object->ref_count > 0);
1119 if (!object->pager_ready) {
1120 wait_result = vm_object_assert_wait(object,
1121 VM_OBJECT_EVENT_PAGER_READY,
1122 interruptible);
1123 vm_object_unlock(object);
1124 if (wait_result == THREAD_WAITING)
1125 wait_result = thread_block(THREAD_CONTINUE_NULL);
1126 vm_object_deallocate(object);
1127 goto backoff;
1128 } else {
1129 vm_object_unlock(object);
1130 vm_object_deallocate(object);
1131 thread_interrupt_level(interruptible_state);
1132 return VM_FAULT_RETRY;
1133 }
1134 }
1135
1136 if(object->phys_contiguous) {
1137 if(m != VM_PAGE_NULL) {
1138 VM_PAGE_FREE(m);
1139 m = VM_PAGE_NULL;
1140 }
1141 goto no_clustering;
1142 }
1143 if (object->internal) {
1144 /*
1145 * Requests to the default pager
1146 * must reserve a real page in advance,
1147 * because the pager's data-provided call
1148 * won't block for pages. IMPORTANT:
1149 * this acts as a throttling mechanism
1150 * for data_requests to the default
1151 * pager.
1152 */
1153
1154 #if TRACEFAULTPAGE
1155 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1156 #endif
1157 if (m->fictitious && !vm_page_convert(m)) {
1158 VM_PAGE_FREE(m);
1159 vm_fault_cleanup(object, first_m);
1160 thread_interrupt_level(interruptible_state);
1161 return(VM_FAULT_MEMORY_SHORTAGE);
1162 }
1163 } else if (object->absent_count >
1164 vm_object_absent_max) {
1165 /*
1166 * If there are too many outstanding page
1167 * requests pending on this object, we
1168 * wait for them to be resolved now.
1169 */
1170
1171 #if TRACEFAULTPAGE
1172 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1173 #endif
1174 if(m != VM_PAGE_NULL)
1175 VM_PAGE_FREE(m);
1176 /* take an extra ref so object won't die */
1177 assert(object->ref_count > 0);
1178 object->ref_count++;
1179 vm_object_res_reference(object);
1180 vm_fault_cleanup(object, first_m);
1181 counter(c_vm_fault_page_block_backoff_kernel++);
1182 vm_object_lock(object);
1183 assert(object->ref_count > 0);
1184 if (object->absent_count > vm_object_absent_max) {
1185 vm_object_absent_assert_wait(object,
1186 interruptible);
1187 vm_object_unlock(object);
1188 wait_result = thread_block(THREAD_CONTINUE_NULL);
1189 vm_object_deallocate(object);
1190 goto backoff;
1191 } else {
1192 vm_object_unlock(object);
1193 vm_object_deallocate(object);
1194 thread_interrupt_level(interruptible_state);
1195 return VM_FAULT_RETRY;
1196 }
1197 }
1198
1199 /*
1200 * Indicate that the page is waiting for data
1201 * from the memory manager.
1202 */
1203
1204 if(m != VM_PAGE_NULL) {
1205
1206 m->list_req_pending = TRUE;
1207 m->absent = TRUE;
1208 m->unusual = TRUE;
1209 object->absent_count++;
1210
1211 }
1212
1213 no_clustering:
1214 cluster_start = offset;
1215 length = PAGE_SIZE;
1216
1217 /*
1218 * lengthen the cluster by the pages in the working set
1219 */
1220 if((map != NULL) &&
1221 (current_task()->dynamic_working_set != 0)) {
1222 cluster_end = cluster_start + length;
1223 * tws values for start and end are just
1224 * suggestions. Therefore, as long as
1225 * build_cluster does not use pointers or
1226 * take action based on values that
1227 * could be affected by re-entrance we
1228 * do not need to take the map lock.
1229 */
1230 cluster_end = offset + PAGE_SIZE_64;
1231 tws_build_cluster(
1232 current_task()->dynamic_working_set,
1233 object, &cluster_start,
1234 &cluster_end, 0x40000);
1235 length = cluster_end - cluster_start;
1236 }
1237 #if TRACEFAULTPAGE
1238 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1239 #endif
1240 /*
1241 * We have a busy page, so we can
1242 * release the object lock.
1243 */
1244 vm_object_unlock(object);
1245
1246 /*
1247 * Call the memory manager to retrieve the data.
1248 */
1249
1250 if (type_of_fault)
1251 *type_of_fault = ((int)length << 8) | DBG_PAGEIN_FAULT;
1252 VM_STAT(pageins++);
1253 current_task()->pageins++;
1254 bumped_pagein = TRUE;
1255
1256 /*
1257 * If this object uses a copy_call strategy,
1258 * and we are interested in a copy of this object
1259 * (having gotten here only by following a
1260 * shadow chain), then tell the memory manager
1261 * via a flag added to the desired_access
1262 * parameter, so that it can detect a race
1263 * between our walking down the shadow chain
1264 * and its pushing pages up into a copy of
1265 * the object that it manages.
1266 */
1267
1268 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1269 object != first_object) {
1270 wants_copy_flag = VM_PROT_WANTS_COPY;
1271 } else {
1272 wants_copy_flag = VM_PROT_NONE;
1273 }
1274
1275 XPR(XPR_VM_FAULT,
1276 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1277 (integer_t)object, offset, (integer_t)m,
1278 access_required | wants_copy_flag, 0);
1279
1280 rc = memory_object_data_request(object->pager,
1281 cluster_start + object->paging_offset,
1282 length,
1283 access_required | wants_copy_flag);
1284
1285
1286 #if TRACEFAULTPAGE
1287 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1288 #endif
1289 if (rc != KERN_SUCCESS) {
1290 if (rc != MACH_SEND_INTERRUPTED
1291 && vm_fault_debug)
1292 printf("%s(0x%x, 0x%xll, 0x%xll, 0x%x) failed, rc=%d\n",
1293 "memory_object_data_request",
1294 object->pager,
1295 cluster_start + object->paging_offset,
1296 length, access_required, rc);
1297 /*
1298 * Don't want to leave a busy page around,
1299 * but the data request may have blocked,
1300 * so check if it's still there and busy.
1301 */
1302 if(!object->phys_contiguous) {
1303 vm_object_lock(object);
1304 for (; length; length -= PAGE_SIZE,
1305 cluster_start += PAGE_SIZE_64) {
1306 vm_page_t p;
1307 if ((p = vm_page_lookup(object,
1308 cluster_start))
1309 && p->absent && p->busy
1310 && p != first_m) {
1311 VM_PAGE_FREE(p);
1312 }
1313 }
1314 }
1315 vm_fault_cleanup(object, first_m);
1316 thread_interrupt_level(interruptible_state);
1317 return((rc == MACH_SEND_INTERRUPTED) ?
1318 VM_FAULT_INTERRUPTED :
1319 VM_FAULT_MEMORY_ERROR);
1320 }
1321
1322 vm_object_lock(object);
1323 if ((interruptible != THREAD_UNINT) &&
1324 (current_thread()->state & TH_ABORT)) {
1325 vm_fault_cleanup(object, first_m);
1326 thread_interrupt_level(interruptible_state);
1327 return(VM_FAULT_INTERRUPTED);
1328 }
1329 if (m == VM_PAGE_NULL &&
1330 object->phys_contiguous) {
1331 /*
1332 * No page here means that the object we
1333 * initially looked up was "physically
1334 * contiguous" (i.e. device memory). However,
1335 * with Virtual VRAM, the object might not
1336 * be backed by that device memory anymore,
1337 * so we're done here only if the object is
1338 * still "phys_contiguous".
1339 * Otherwise, if the object is no longer
1340 * "phys_contiguous", we need to retry the
1341 * page fault against the object's new backing
1342 * store (different memory object).
1343 */
1344 break;
1345 }
1346
1347 /*
1348 * Retry with same object/offset, since new data may
1349 * be in a different page (i.e., m is meaningless at
1350 * this point).
1351 */
1352 continue;
1353 }
1354
1355 /*
1356 * The only case in which we get here is if
1357 * object has no pager (or unwiring). If the pager doesn't
1358 * have the page this is handled in the m->absent case above
1359 * (and if you change things here you should look above).
1360 */
1361 #if TRACEFAULTPAGE
1362 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1363 #endif
1364 if (object == first_object)
1365 first_m = m;
1366 else
1367 assert(m == VM_PAGE_NULL);
1368
1369 XPR(XPR_VM_FAULT,
1370 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1371 (integer_t)object, offset, (integer_t)m,
1372 (integer_t)object->shadow, 0);
1373 /*
1374 * Move on to the next object. Lock the next
1375 * object before unlocking the current one.
1376 */
1377 next_object = object->shadow;
1378 if (next_object == VM_OBJECT_NULL) {
1379 assert(!must_be_resident);
1380 /*
1381 * If there's no object left, fill the page
1382 * in the top object with zeros. But first we
1383 * need to allocate a real page.
1384 */
1385
1386 if (object != first_object) {
1387 vm_object_paging_end(object);
1388 vm_object_unlock(object);
1389
1390 object = first_object;
1391 offset = first_offset;
1392 vm_object_lock(object);
1393 }
1394
1395 m = first_m;
1396 assert(m->object == object);
1397 first_m = VM_PAGE_NULL;
1398
1399 if(m == VM_PAGE_NULL) {
1400 m = vm_page_grab();
1401 if (m == VM_PAGE_NULL) {
1402 vm_fault_cleanup(
1403 object, VM_PAGE_NULL);
1404 thread_interrupt_level(
1405 interruptible_state);
1406 return(VM_FAULT_MEMORY_SHORTAGE);
1407 }
1408 vm_page_insert(
1409 m, object, offset);
1410 }
1411
1412 if (object->shadow_severed) {
1413 VM_PAGE_FREE(m);
1414 vm_fault_cleanup(object, VM_PAGE_NULL);
1415 thread_interrupt_level(interruptible_state);
1416 return VM_FAULT_MEMORY_ERROR;
1417 }
1418
1419 /*
1420 * Are we protecting the system from
1421 * backing store exhaustion? If so,
1422 * sleep unless we are privileged.
1423 */
1424
1425 if(vm_backing_store_low) {
1426 if(!(current_task()->priv_flags
1427 & VM_BACKING_STORE_PRIV)) {
1428 assert_wait((event_t)
1429 &vm_backing_store_low,
1430 THREAD_UNINT);
1431 VM_PAGE_FREE(m);
1432 vm_fault_cleanup(object, VM_PAGE_NULL);
1433 thread_block(THREAD_CONTINUE_NULL);
1434 thread_interrupt_level(
1435 interruptible_state);
1436 return(VM_FAULT_RETRY);
1437 }
1438 }
1439
1440 if (VM_PAGE_THROTTLED() ||
1441 (m->fictitious && !vm_page_convert(m))) {
1442 VM_PAGE_FREE(m);
1443 vm_fault_cleanup(object, VM_PAGE_NULL);
1444 thread_interrupt_level(interruptible_state);
1445 return(VM_FAULT_MEMORY_SHORTAGE);
1446 }
1447 m->no_isync = FALSE;
1448
1449 if (!no_zero_fill) {
1450 vm_object_unlock(object);
1451 vm_page_zero_fill(m);
1452 vm_object_lock(object);
1453
1454 if (type_of_fault)
1455 *type_of_fault = DBG_ZERO_FILL_FAULT;
1456 VM_STAT(zero_fill_count++);
1457 }
1458 if (bumped_pagein == TRUE) {
1459 VM_STAT(pageins--);
1460 current_task()->pageins--;
1461 }
1462 vm_page_lock_queues();
1463 VM_PAGE_QUEUES_REMOVE(m);
1464 assert(!m->laundry);
1465 assert(m->object != kernel_object);
1466 assert(m->pageq.next == NULL &&
1467 m->pageq.prev == NULL);
1468 if(m->object->size > 0x200000) {
1469 m->zero_fill = TRUE;
1470 /* depends on the queues lock */
1471 vm_zf_count += 1;
1472 queue_enter(&vm_page_queue_zf,
1473 m, vm_page_t, pageq);
1474 } else {
1475 queue_enter(
1476 &vm_page_queue_inactive,
1477 m, vm_page_t, pageq);
1478 }
1479 m->page_ticket = vm_page_ticket;
1480 vm_page_ticket_roll++;
1481 if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1482 vm_page_ticket_roll = 0;
1483 if(vm_page_ticket ==
1484 VM_PAGE_TICKET_ROLL_IDS)
1485 vm_page_ticket= 0;
1486 else
1487 vm_page_ticket++;
1488 }
1489 m->inactive = TRUE;
1490 vm_page_inactive_count++;
1491 vm_page_unlock_queues();
1492 #if 0
1493 pmap_clear_modify(m->phys_page);
1494 #endif
1495 break;
1496 }
1497 else {
1498 if ((object != first_object) || must_be_resident)
1499 vm_object_paging_end(object);
1500 offset += object->shadow_offset;
1501 hi_offset += object->shadow_offset;
1502 lo_offset += object->shadow_offset;
1503 access_required = VM_PROT_READ;
1504 vm_object_lock(next_object);
1505 vm_object_unlock(object);
1506 object = next_object;
1507 vm_object_paging_begin(object);
1508 }
1509 }
1510
1511 /*
1512 * PAGE HAS BEEN FOUND.
1513 *
1514 * This page (m) is:
1515 * busy, so that we can play with it;
1516 * not absent, so that nobody else will fill it;
1517 * possibly eligible for pageout;
1518 *
1519 * The top-level page (first_m) is:
1520 * VM_PAGE_NULL if the page was found in the
1521 * top-level object;
1522 * busy, not absent, and ineligible for pageout.
1523 *
1524 * The current object (object) is locked. A paging
1525 * reference is held for the current and top-level
1526 * objects.
1527 */
1528
1529 #if TRACEFAULTPAGE
1530 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1531 #endif
1532 #if EXTRA_ASSERTIONS
1533 if(m != VM_PAGE_NULL) {
1534 assert(m->busy && !m->absent);
1535 assert((first_m == VM_PAGE_NULL) ||
1536 (first_m->busy && !first_m->absent &&
1537 !first_m->active && !first_m->inactive));
1538 }
1539 #endif /* EXTRA_ASSERTIONS */
1540
1541 /*
1542 * ENCRYPTED SWAP:
1543 * If we found a page, we must have decrypted it before we
1544 * get here...
1545 */
1546 if (m != VM_PAGE_NULL) {
1547 ASSERT_PAGE_DECRYPTED(m);
1548 }
1549
1550 XPR(XPR_VM_FAULT,
1551 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1552 (integer_t)object, offset, (integer_t)m,
1553 (integer_t)first_object, (integer_t)first_m);
1554 /*
1555 * If the page is being written, but isn't
1556 * already owned by the top-level object,
1557 * we have to copy it into a new page owned
1558 * by the top-level object.
1559 */
1560
1561 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1562 /*
1563 * We only really need to copy if we
1564 * want to write it.
1565 */
1566
1567 #if TRACEFAULTPAGE
1568 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1569 #endif
1570 if (fault_type & VM_PROT_WRITE) {
1571 vm_page_t copy_m;
1572
1573 assert(!must_be_resident);
1574
1575 /*
1576 * Are we protecting the system from
1577 * backing store exhaustion? If so,
1578 * sleep unless we are privileged.
1579 */
1580
1581 if(vm_backing_store_low) {
1582 if(!(current_task()->priv_flags
1583 & VM_BACKING_STORE_PRIV)) {
1584 assert_wait((event_t)
1585 &vm_backing_store_low,
1586 THREAD_UNINT);
1587 RELEASE_PAGE(m);
1588 vm_fault_cleanup(object, first_m);
1589 thread_block(THREAD_CONTINUE_NULL);
1590 thread_interrupt_level(
1591 interruptible_state);
1592 return(VM_FAULT_RETRY);
1593 }
1594 }
1595
1596 /*
1597 * If we try to collapse first_object at this
1598 * point, we may deadlock when we try to get
1599 * the lock on an intermediate object (since we
1600 * have the bottom object locked). We can't
1601 * unlock the bottom object, because the page
1602 * we found may move (by collapse) if we do.
1603 *
1604 * Instead, we first copy the page. Then, when
1605 * we have no more use for the bottom object,
1606 * we unlock it and try to collapse.
1607 *
1608 * Note that we copy the page even if we didn't
1609 * need to... that's the breaks.
1610 */
1611
1612 /*
1613 * Allocate a page for the copy
1614 */
1615 copy_m = vm_page_grab();
1616 if (copy_m == VM_PAGE_NULL) {
1617 RELEASE_PAGE(m);
1618 vm_fault_cleanup(object, first_m);
1619 thread_interrupt_level(interruptible_state);
1620 return(VM_FAULT_MEMORY_SHORTAGE);
1621 }
1622
1623
1624 XPR(XPR_VM_FAULT,
1625 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1626 (integer_t)object, offset,
1627 (integer_t)m, (integer_t)copy_m, 0);
1628 vm_page_copy(m, copy_m);
1629
1630 /*
1631 * If another map is truly sharing this
1632 * page with us, we have to flush all
1633 * uses of the original page, since we
1634 * can't distinguish those which want the
1635 * original from those which need the
1636 * new copy.
1637 *
1638 * XXXO If we know that only one map has
1639 * access to this page, then we could
1640 * avoid the pmap_disconnect() call.
1641 */
1642
1643 vm_page_lock_queues();
1644 assert(!m->cleaning);
1645 pmap_disconnect(m->phys_page);
1646 vm_page_deactivate(m);
1647 copy_m->dirty = TRUE;
1648 /*
1649 * Setting reference here prevents this fault from
1650 * being counted as a (per-thread) reactivate as well
1651 * as a copy-on-write.
1652 */
1653 first_m->reference = TRUE;
1654 vm_page_unlock_queues();
1655
1656 /*
1657 * We no longer need the old page or object.
1658 */
1659
1660 PAGE_WAKEUP_DONE(m);
1661 vm_object_paging_end(object);
1662 vm_object_unlock(object);
1663
1664 if (type_of_fault)
1665 *type_of_fault = DBG_COW_FAULT;
1666 VM_STAT(cow_faults++);
1667 current_task()->cow_faults++;
1668 object = first_object;
1669 offset = first_offset;
1670
1671 vm_object_lock(object);
1672 VM_PAGE_FREE(first_m);
1673 first_m = VM_PAGE_NULL;
1674 assert(copy_m->busy);
1675 vm_page_insert(copy_m, object, offset);
1676 m = copy_m;
1677
1678 /*
1679 * Now that we've gotten the copy out of the
1680 * way, let's try to collapse the top object.
1681 * But we have to play ugly games with
1682 * paging_in_progress to do that...
1683 */
1684
1685 vm_object_paging_end(object);
1686 vm_object_collapse(object, offset);
1687 vm_object_paging_begin(object);
1688
1689 }
1690 else {
1691 *protection &= (~VM_PROT_WRITE);
1692 }
1693 }
1694
1695 /*
1696 * Now check whether the page needs to be pushed into the
1697 * copy object. The use of asymmetric copy on write for
1698 * shared temporary objects means that we may do two copies to
1699 * satisfy the fault; one above to get the page from a
1700 * shadowed object, and one here to push it into the copy.
1701 */
1702
1703 while ((copy_object = first_object->copy) != VM_OBJECT_NULL &&
1704 (m!= VM_PAGE_NULL)) {
1705 vm_object_offset_t copy_offset;
1706 vm_page_t copy_m;
1707
1708 #if TRACEFAULTPAGE
1709 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1710 #endif
1711 /*
1712 * If the page is being written, but hasn't been
1713 * copied to the copy-object, we have to copy it there.
1714 */
1715
1716 if ((fault_type & VM_PROT_WRITE) == 0) {
1717 *protection &= ~VM_PROT_WRITE;
1718 break;
1719 }
1720
1721 /*
1722 * If the page was guaranteed to be resident,
1723 * we must have already performed the copy.
1724 */
1725
1726 if (must_be_resident)
1727 break;
1728
1729 /*
1730 * Try to get the lock on the copy_object.
1731 */
1732 if (!vm_object_lock_try(copy_object)) {
1733 vm_object_unlock(object);
1734
1735 mutex_pause(); /* wait a bit */
1736
1737 vm_object_lock(object);
1738 continue;
1739 }
1740
1741 /*
1742 * Make another reference to the copy-object,
1743 * to keep it from disappearing during the
1744 * copy.
1745 */
1746 assert(copy_object->ref_count > 0);
1747 copy_object->ref_count++;
1748 VM_OBJ_RES_INCR(copy_object);
1749
1750 /*
1751 * Does the page exist in the copy?
1752 */
1753 copy_offset = first_offset - copy_object->shadow_offset;
1754 if (copy_object->size <= copy_offset)
1755 /*
1756 * Copy object doesn't cover this page -- do nothing.
1757 */
1758 ;
1759 else if ((copy_m =
1760 vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1761 /* Page currently exists in the copy object */
1762 if (copy_m->busy) {
1763 /*
1764 * If the page is being brought
1765 * in, wait for it and then retry.
1766 */
1767 RELEASE_PAGE(m);
1768 /* take an extra ref so object won't die */
1769 assert(copy_object->ref_count > 0);
1770 copy_object->ref_count++;
1771 vm_object_res_reference(copy_object);
1772 vm_object_unlock(copy_object);
1773 vm_fault_cleanup(object, first_m);
1774 counter(c_vm_fault_page_block_backoff_kernel++);
1775 vm_object_lock(copy_object);
1776 assert(copy_object->ref_count > 0);
1777 VM_OBJ_RES_DECR(copy_object);
1778 copy_object->ref_count--;
1779 assert(copy_object->ref_count > 0);
1780 copy_m = vm_page_lookup(copy_object, copy_offset);
1781 /*
1782 * ENCRYPTED SWAP:
1783 * it's OK if the "copy_m" page is encrypted,
1784 * because we're not moving it nor handling its
1785 * contents.
1786 */
1787 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1788 PAGE_ASSERT_WAIT(copy_m, interruptible);
1789 vm_object_unlock(copy_object);
1790 wait_result = thread_block(THREAD_CONTINUE_NULL);
1791 vm_object_deallocate(copy_object);
1792 goto backoff;
1793 } else {
1794 vm_object_unlock(copy_object);
1795 vm_object_deallocate(copy_object);
1796 thread_interrupt_level(interruptible_state);
1797 return VM_FAULT_RETRY;
1798 }
1799 }
1800 }
1801 else if (!PAGED_OUT(copy_object, copy_offset)) {
1802 /*
1803 * If PAGED_OUT is TRUE, then the page used to exist
1804 * in the copy-object, and has already been paged out.
1805 * We don't need to repeat this. If PAGED_OUT is
1806 * FALSE, then either we don't know (!pager_created,
1807 * for example) or it hasn't been paged out.
1808 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1809 * We must copy the page to the copy object.
1810 */
1811
1812 /*
1813 * Are we protecting the system from
1814 * backing store exhaustion? If so,
1815 * sleep unless we are privileged.
1816 */
1817
1818 if(vm_backing_store_low) {
1819 if(!(current_task()->priv_flags
1820 & VM_BACKING_STORE_PRIV)) {
1821 assert_wait((event_t)
1822 &vm_backing_store_low,
1823 THREAD_UNINT);
1824 RELEASE_PAGE(m);
1825 VM_OBJ_RES_DECR(copy_object);
1826 copy_object->ref_count--;
1827 assert(copy_object->ref_count > 0);
1828 vm_object_unlock(copy_object);
1829 vm_fault_cleanup(object, first_m);
1830 thread_block(THREAD_CONTINUE_NULL);
1831 thread_interrupt_level(
1832 interruptible_state);
1833 return(VM_FAULT_RETRY);
1834 }
1835 }
1836
1837 /*
1838 * Allocate a page for the copy
1839 */
1840 copy_m = vm_page_alloc(copy_object, copy_offset);
1841 if (copy_m == VM_PAGE_NULL) {
1842 RELEASE_PAGE(m);
1843 VM_OBJ_RES_DECR(copy_object);
1844 copy_object->ref_count--;
1845 assert(copy_object->ref_count > 0);
1846 vm_object_unlock(copy_object);
1847 vm_fault_cleanup(object, first_m);
1848 thread_interrupt_level(interruptible_state);
1849 return(VM_FAULT_MEMORY_SHORTAGE);
1850 }
1851
1852 /*
1853 * Must copy page into copy-object.
1854 */
1855
1856 vm_page_copy(m, copy_m);
1857
1858 /*
1859 * If the old page was in use by any users
1860 * of the copy-object, it must be removed
1861 * from all pmaps. (We can't know which
1862 * pmaps use it.)
1863 */
1864
1865 vm_page_lock_queues();
1866 assert(!m->cleaning);
1867 pmap_disconnect(m->phys_page);
1868 copy_m->dirty = TRUE;
1869 vm_page_unlock_queues();
1870
1871 /*
1872 * If there's a pager, then immediately
1873 * page out this page, using the "initialize"
1874 * option. Else, we use the copy.
1875 */
1876
1877 if
1878 #if MACH_PAGEMAP
1879 ((!copy_object->pager_created) ||
1880 vm_external_state_get(
1881 copy_object->existence_map, copy_offset)
1882 == VM_EXTERNAL_STATE_ABSENT)
1883 #else
1884 (!copy_object->pager_created)
1885 #endif
1886 {
1887 vm_page_lock_queues();
1888 vm_page_activate(copy_m);
1889 vm_page_unlock_queues();
1890 PAGE_WAKEUP_DONE(copy_m);
1891 }
1892 else {
1893 assert(copy_m->busy == TRUE);
1894
1895 /*
1896 * The page is already ready for pageout:
1897 * not on pageout queues and busy.
1898 * Unlock everything except the
1899 * copy_object itself.
1900 */
1901
1902 vm_object_unlock(object);
1903
1904 /*
1905 * Write the page to the copy-object,
1906 * flushing it from the kernel.
1907 */
1908
1909 vm_pageout_initialize_page(copy_m);
1910
1911 /*
1912 * Since the pageout may have
1913 * temporarily dropped the
1914 * copy_object's lock, we
1915 * check whether we'll have
1916 * to deallocate the hard way.
1917 */
1918
1919 if ((copy_object->shadow != object) ||
1920 (copy_object->ref_count == 1)) {
1921 vm_object_unlock(copy_object);
1922 vm_object_deallocate(copy_object);
1923 vm_object_lock(object);
1924 continue;
1925 }
1926
1927 /*
1928 * Pick back up the old object's
1929 * lock. [It is safe to do so,
1930 * since it must be deeper in the
1931 * object tree.]
1932 */
1933
1934 vm_object_lock(object);
1935 }
1936
1937 /*
1938 * Because we're pushing a page upward
1939 * in the object tree, we must restart
1940 * any faults that are waiting here.
1941 * [Note that this is an expansion of
1942 * PAGE_WAKEUP that uses the THREAD_RESTART
1943 * wait result]. Can't turn off the page's
1944 * busy bit because we're not done with it.
1945 */
1946
1947 if (m->wanted) {
1948 m->wanted = FALSE;
1949 thread_wakeup_with_result((event_t) m,
1950 THREAD_RESTART);
1951 }
1952 }
1953
1954 /*
1955 * The reference count on copy_object must be
1956 * at least 2: one for our extra reference,
1957 * and at least one from the outside world
1958 * (we checked that when we last locked
1959 * copy_object).
1960 */
1961 copy_object->ref_count--;
1962 assert(copy_object->ref_count > 0);
1963 VM_OBJ_RES_DECR(copy_object);
1964 vm_object_unlock(copy_object);
1965
1966 break;
1967 }
1968
1969 *result_page = m;
1970 *top_page = first_m;
1971
1972 XPR(XPR_VM_FAULT,
1973 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1974 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1975 /*
1976 * If the page can be written, assume that it will be.
1977 * [Earlier, we restricted the permission to allow write
1978 * access only if the fault so required, so we don't
1979 * mark read-only data as dirty.]
1980 */
1981
1982
1983 if(m != VM_PAGE_NULL) {
1984 #if !VM_FAULT_STATIC_CONFIG
1985 if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE))
1986 m->dirty = TRUE;
1987 #endif
1988 if (vm_page_deactivate_behind)
1989 vm_fault_deactivate_behind(object, offset, behavior);
1990 } else {
1991 vm_object_unlock(object);
1992 }
1993 thread_interrupt_level(interruptible_state);
1994
1995 #if TRACEFAULTPAGE
1996 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1997 #endif
1998 return(VM_FAULT_SUCCESS);
1999
2000 #if 0
2001 block_and_backoff:
2002 vm_fault_cleanup(object, first_m);
2003
2004 counter(c_vm_fault_page_block_backoff_kernel++);
2005 thread_block(THREAD_CONTINUE_NULL);
2006 #endif
2007
2008 backoff:
2009 thread_interrupt_level(interruptible_state);
2010 if (wait_result == THREAD_INTERRUPTED)
2011 return VM_FAULT_INTERRUPTED;
2012 return VM_FAULT_RETRY;
2013
2014 #undef RELEASE_PAGE
2015 }
2016
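/*
 * Note on the contract above: on VM_FAULT_SUCCESS, the page handed back
 * in *result_page (when it is not VM_PAGE_NULL) is still busy, and
 * *top_page names the placeholder page, if any, left in the top-level
 * object.  The RELEASE_PAGE and UNLOCK_AND_DEALLOCATE macros defined in
 * vm_fault() below show the cleanup a caller is expected to perform.
 */
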
2017 /*
2018 * Routine: vm_fault_tws_insert
2019 * Purpose:
2020 * Add fault information to the task working set.
2021 * Implementation:
2022 * We always insert the base object/offset pair
2023 * rather than the actual object/offset.
2024 * Assumptions:
2025 * Map and real_map locked.
2026 * Object locked and referenced.
2027 * Returns:
2028 * TRUE if startup file should be written.
2029 * With object locked and still referenced.
2030 * But we may drop the object lock temporarily.
2031 */
2032 static boolean_t
2033 vm_fault_tws_insert(
2034 vm_map_t map,
2035 vm_map_t real_map,
2036 vm_map_offset_t vaddr,
2037 vm_object_t object,
2038 vm_object_offset_t offset)
2039 {
2040 tws_hash_line_t line;
2041 task_t task;
2042 kern_return_t kr;
2043 boolean_t result = FALSE;
2044
2045 /* Avoid possible map lock deadlock issues */
2046 if (map == kernel_map || map == kalloc_map ||
2047 real_map == kernel_map || real_map == kalloc_map)
2048 return result;
2049
2050 task = current_task();
2051 if (task->dynamic_working_set != 0) {
2052 vm_object_t base_object;
2053 vm_object_t base_shadow;
2054 vm_object_offset_t base_offset;
2055 base_object = object;
2056 base_offset = offset;
2057 while ((base_shadow = base_object->shadow)) {
2058 vm_object_lock(base_shadow);
2059 vm_object_unlock(base_object);
2060 base_offset +=
2061 base_object->shadow_offset;
2062 base_object = base_shadow;
2063 }
2064 kr = tws_lookup(
2065 task->dynamic_working_set,
2066 base_offset, base_object,
2067 &line);
2068 if (kr == KERN_OPERATION_TIMED_OUT){
2069 result = TRUE;
2070 if (base_object != object) {
2071 vm_object_unlock(base_object);
2072 vm_object_lock(object);
2073 }
2074 } else if (kr != KERN_SUCCESS) {
2075 if(base_object != object)
2076 vm_object_reference_locked(base_object);
2077 kr = tws_insert(
2078 task->dynamic_working_set,
2079 base_offset, base_object,
2080 vaddr, real_map);
2081 if(base_object != object) {
2082 vm_object_unlock(base_object);
2083 vm_object_deallocate(base_object);
2084 }
2085 if(kr == KERN_NO_SPACE) {
2086 if (base_object == object)
2087 vm_object_unlock(object);
2088 tws_expand_working_set(
2089 task->dynamic_working_set,
2090 TWS_HASH_LINE_COUNT,
2091 FALSE);
2092 if (base_object == object)
2093 vm_object_lock(object);
2094 } else if(kr == KERN_OPERATION_TIMED_OUT) {
2095 result = TRUE;
2096 }
2097 if(base_object != object)
2098 vm_object_lock(object);
2099 } else if (base_object != object) {
2100 vm_object_unlock(base_object);
2101 vm_object_lock(object);
2102 }
2103 }
2104 return result;
2105 }
2106
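/*
 * Usage note: vm_fault() below records faults through this helper and,
 * if any call returns TRUE, finishes by calling
 * tws_send_startup_info(current_task()) to write the startup file.
 */
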
2107 /*
2108 * Routine: vm_fault
2109 * Purpose:
2110 * Handle page faults, including pseudo-faults
2111 * used to change the wiring status of pages.
2112 * Returns:
2113 * A kern_return_t fault status. (Explicit continuations have been removed.)
2114 * Implementation:
2115 * vm_fault and vm_fault_page save mucho state
2116 * in the moral equivalent of a closure. The state
2117 * structure is allocated when first entering vm_fault
2118 * and deallocated when leaving vm_fault.
2119 */
2120
2121 extern int _map_enter_debug;
2122
2123 kern_return_t
2124 vm_fault(
2125 vm_map_t map,
2126 vm_map_offset_t vaddr,
2127 vm_prot_t fault_type,
2128 boolean_t change_wiring,
2129 int interruptible,
2130 pmap_t caller_pmap,
2131 vm_map_offset_t caller_pmap_addr)
2132 {
2133 vm_map_version_t version; /* Map version for verification */
2134 boolean_t wired; /* Should mapping be wired down? */
2135 vm_object_t object; /* Top-level object */
2136 vm_object_offset_t offset; /* Top-level offset */
2137 vm_prot_t prot; /* Protection for mapping */
2138 vm_behavior_t behavior; /* Expected paging behavior */
2139 vm_map_offset_t lo_offset, hi_offset;
2140 vm_object_t old_copy_object; /* Saved copy object */
2141 vm_page_t result_page; /* Result of vm_fault_page */
2142 vm_page_t top_page; /* Placeholder page */
2143 kern_return_t kr;
2144
2145 register
2146 vm_page_t m; /* Fast access to result_page */
2147 kern_return_t error_code = 0; /* page error reasons */
2148 register
2149 vm_object_t cur_object;
2150 register
2151 vm_object_offset_t cur_offset;
2152 vm_page_t cur_m;
2153 vm_object_t new_object;
2154 int type_of_fault;
2155 vm_map_t real_map = map;
2156 vm_map_t original_map = map;
2157 pmap_t pmap = NULL;
2158 boolean_t interruptible_state;
2159 unsigned int cache_attr;
2160 int write_startup_file = 0;
2161 boolean_t need_activation;
2162 vm_prot_t full_fault_type;
2163
2164 if (get_preemption_level() != 0)
2165 return (KERN_FAILURE);
2166
2167 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
2168 vaddr,
2169 0,
2170 0,
2171 0,
2172 0);
2173
2174 /* at present we do not fully check for execute permission */
2175 /* we generally treat it as read except in certain device */
2176 /* memory settings */
2177 full_fault_type = fault_type;
2178 if(fault_type & VM_PROT_EXECUTE) {
2179 fault_type &= ~VM_PROT_EXECUTE;
2180 fault_type |= VM_PROT_READ;
2181 }
2182
2183 interruptible_state = thread_interrupt_level(interruptible);
2184
2185 /*
2186 * assume we will hit a page in the cache;
2187 * we explicitly override this with
2188 * the real fault type once we determine it
2189 */
2190 type_of_fault = DBG_CACHE_HIT_FAULT;
2191
2192 VM_STAT(faults++);
2193 current_task()->faults++;
2194
2195 RetryFault: ;
2196
2197 /*
2198 * Find the backing store object and offset into
2199 * it to begin the search.
2200 */
2201 map = original_map;
2202 vm_map_lock_read(map);
2203 kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
2204 &object, &offset,
2205 &prot, &wired,
2206 &behavior, &lo_offset, &hi_offset, &real_map);
2207
2208 //if (_map_enter_debug)printf("vm_map_lookup_locked(map=0x%x, addr=0x%llx, prot=%d wired=%d) = %d\n", map, vaddr, prot, wired, kr);
2209
2210 pmap = real_map->pmap;
2211
2212 if (kr != KERN_SUCCESS) {
2213 vm_map_unlock_read(map);
2214 goto done;
2215 }
2216
2217 /*
2218 * If the page is wired, we must fault for the current protection
2219 * value, to avoid further faults.
2220 */
2221
2222 if (wired)
2223 fault_type = prot | VM_PROT_WRITE;
2224
2225 #if VM_FAULT_CLASSIFY
2226 /*
2227 * Temporary data gathering code
2228 */
2229 vm_fault_classify(object, offset, fault_type);
2230 #endif
2231 /*
2232 * Fast fault code. The basic idea is to do as much as
2233 * possible while holding the map lock and object locks.
2234 * Busy pages are not used until the object lock has to
2235 * be dropped to do something (copy, zero fill, pmap enter).
2236 * Similarly, paging references aren't acquired until that
2237 * point, and object references aren't used.
2238 *
2239 * If we can figure out what to do
2240 * (zero fill, copy on write, pmap enter) while holding
2241 * the locks, then it gets done. Otherwise, we give up,
2242 * and use the original fault path (which doesn't hold
2243 * the map lock, and relies on busy pages).
2244 * The give up cases include:
2245 * - Have to talk to pager.
2246 * - Page is busy, absent or in error.
2247 * - Pager has locked out desired access.
2248 * - Fault needs to be restarted.
2249 * - Have to push page into copy object.
2250 *
2251 * The code is an infinite loop that moves one level down
2252 * the shadow chain each time. cur_object and cur_offset
2253 * refer to the current object being examined. object and offset
2254 * are the original object from the map. The loop is at the
2255 * top level if and only if object and cur_object are the same.
2256 *
2257 * Invariants: Map lock is held throughout. Lock is held on
2258 * original object and cur_object (if different) when
2259 * continuing or exiting loop.
2260 *
2261 */
2262
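	/*
	 * In outline, the fast-path loop below does (a condensed sketch,
	 * not a substitute for the code that follows):
	 *
	 *	cur_object = object;  cur_offset = offset;
	 *	while (TRUE) {
	 *		m = vm_page_lookup(cur_object, cur_offset);
	 *		if (m != VM_PAGE_NULL) {
	 *			wait and retry on a busy page, decrypt and
	 *			retry an encrypted one, map it in or push
	 *			it up for copy-on-write, or give up on an
	 *			"unusual" page;
	 *		} else if (cur_object->pager_created) {
	 *			give up -- must talk to the pager;
	 *		} else if (cur_object->shadow == VM_OBJECT_NULL) {
	 *			zero-fill in the top-level object (or fail
	 *			if the shadow chain was severed);
	 *		} else {
	 *			cur_offset += cur_object->shadow_offset;
	 *			cur_object = cur_object->shadow;
	 *		}
	 *	}
	 */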
2263
2264 /*
2265 * If this page is to be inserted in a copy delay object
2266 * for writing, and if the object has a copy, then the
2267 * copy delay strategy is implemented in the slow fault page.
2268 */
2269 if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2270 object->copy == VM_OBJECT_NULL ||
2271 (fault_type & VM_PROT_WRITE) == 0) {
2272 cur_object = object;
2273 cur_offset = offset;
2274
2275 while (TRUE) {
2276 m = vm_page_lookup(cur_object, cur_offset);
2277 if (m != VM_PAGE_NULL) {
2278 if (m->busy) {
2279 wait_result_t result;
2280
2281 if (object != cur_object)
2282 vm_object_unlock(object);
2283
2284 vm_map_unlock_read(map);
2285 if (real_map != map)
2286 vm_map_unlock(real_map);
2287
2288 #if !VM_FAULT_STATIC_CONFIG
2289 if (!vm_fault_interruptible)
2290 interruptible = THREAD_UNINT;
2291 #endif
2292 result = PAGE_ASSERT_WAIT(m, interruptible);
2293
2294 vm_object_unlock(cur_object);
2295
2296 if (result == THREAD_WAITING) {
2297 result = thread_block(THREAD_CONTINUE_NULL);
2298
2299 counter(c_vm_fault_page_block_busy_kernel++);
2300 }
2301 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2302 goto RetryFault;
2303
2304 kr = KERN_ABORTED;
2305 goto done;
2306 }
2307 if (m->unusual && (m->error || m->restart || m->private
2308 || m->absent || (fault_type & m->page_lock))) {
2309
2310 /*
2311 * Unusual case. Give up.
2312 */
2313 break;
2314 }
2315
2316 if (m->encrypted) {
2317 /*
2318 * ENCRYPTED SWAP:
2319 * We've soft-faulted (because it's not in the page
2320 * table) on an encrypted page.
2321 * Keep the page "busy" so that no one messes with
2322 * it during the decryption.
2323 * Release the extra locks we're holding, keep only
2324 * the page's VM object lock.
2325 */
2326 m->busy = TRUE;
2327 if (object != cur_object) {
2328 vm_object_unlock(object);
2329 }
2330 vm_map_unlock_read(map);
2331 if (real_map != map)
2332 vm_map_unlock(real_map);
2333
2334 vm_page_decrypt(m, 0);
2335
2336 assert(m->busy);
2337 PAGE_WAKEUP_DONE(m);
2338 vm_object_unlock(m->object);
2339
2340 /*
2341 * Retry from the top, in case anything
2342 * changed while we were decrypting...
2343 */
2344 goto RetryFault;
2345 }
2346 ASSERT_PAGE_DECRYPTED(m);
2347
2348 /*
2349 * Two cases of map-in faults:
2350 * - At top level w/o copy object.
2351 * - Read fault anywhere.
2352 * --> must disallow write.
2353 */
2354
2355 if (object == cur_object &&
2356 object->copy == VM_OBJECT_NULL)
2357 goto FastMapInFault;
2358
2359 if ((fault_type & VM_PROT_WRITE) == 0) {
2360 boolean_t sequential;
2361
2362 prot &= ~VM_PROT_WRITE;
2363
2364 /*
2365 * Set up to map the page ...
2366 * mark the page busy, drop
2367 * locks and take a paging reference
2368 * on the object with the page.
2369 */
2370
2371 if (object != cur_object) {
2372 vm_object_unlock(object);
2373 object = cur_object;
2374 }
2375 FastMapInFault:
2376 m->busy = TRUE;
2377
2378 vm_object_paging_begin(object);
2379
2380 FastPmapEnter:
2381 /*
2382 * Check a couple of global reasons to
2383 * be conservative about write access.
2384 * Then do the pmap_enter.
2385 */
2386 #if !VM_FAULT_STATIC_CONFIG
2387 if (vm_fault_dirty_handling
2388 #if MACH_KDB
2389 || db_watchpoint_list
2390 #endif
2391 && (fault_type & VM_PROT_WRITE) == 0)
2392 prot &= ~VM_PROT_WRITE;
2393 #else /* STATIC_CONFIG */
2394 #if MACH_KDB
2395 if (db_watchpoint_list
2396 && (fault_type & VM_PROT_WRITE) == 0)
2397 prot &= ~VM_PROT_WRITE;
2398 #endif /* MACH_KDB */
2399 #endif /* STATIC_CONFIG */
2400 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2401
2402 sequential = FALSE;
2403 need_activation = FALSE;
2404
2405 if (m->no_isync == TRUE) {
2406 m->no_isync = FALSE;
2407 pmap_sync_page_data_phys(m->phys_page);
2408
2409 if ((type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2410 /*
2411 * found it in the cache, but this
2412 * is the first fault-in of the page (no_isync == TRUE)
2413 * so it must have come in as part of
2414 * a cluster... account 1 pagein against it
2415 */
2416 VM_STAT(pageins++);
2417 current_task()->pageins++;
2418 type_of_fault = DBG_PAGEIN_FAULT;
2419 sequential = TRUE;
2420 }
2421 if (m->clustered)
2422 need_activation = TRUE;
2423
2424 } else if (cache_attr != VM_WIMG_DEFAULT) {
2425 pmap_sync_page_attributes_phys(m->phys_page);
2426 }
2427
2428 if(caller_pmap) {
2429 PMAP_ENTER(caller_pmap,
2430 caller_pmap_addr, m,
2431 prot, cache_attr, wired);
2432 } else {
2433 PMAP_ENTER(pmap, vaddr, m,
2434 prot, cache_attr, wired);
2435 }
2436
2437 /*
2438 * Hold the queues lock to manipulate
2439 * the page queues. The change-wiring
2440 * case is obvious. In the software_reference_bits
2441 * case, activate the page only if it fell
2442 * off the paging queues; otherwise just
2443 * activate it if it's inactive.
2444 *
2445 * NOTE: original vm_fault code will
2446 * move active page to back of active
2447 * queue. This code doesn't.
2448 */
2449 vm_page_lock_queues();
2450
2451 if (m->clustered) {
2452 vm_pagein_cluster_used++;
2453 m->clustered = FALSE;
2454 }
2455 m->reference = TRUE;
2456
2457 if (change_wiring) {
2458 if (wired)
2459 vm_page_wire(m);
2460 else
2461 vm_page_unwire(m);
2462 }
2463 #if VM_FAULT_STATIC_CONFIG
2464 else {
2465 if ((!m->active && !m->inactive) || ((need_activation == TRUE) && !m->active))
2466 vm_page_activate(m);
2467 }
2468 #else
2469 else if (software_reference_bits) {
2470 if (!m->active && !m->inactive)
2471 vm_page_activate(m);
2472 }
2473 else if (!m->active) {
2474 vm_page_activate(m);
2475 }
2476 #endif
2477 vm_page_unlock_queues();
2478
2479 /*
2480 * That's it, clean up and return.
2481 */
2482 PAGE_WAKEUP_DONE(m);
2483
2484 sequential = (sequential && vm_page_deactivate_behind) ?
2485 vm_fault_deactivate_behind(object, cur_offset, behavior) :
2486 FALSE;
2487
2488 /*
2489 * Add non-sequential pages to the working set.
2490 * The sequential pages will be brought in through
2491 * normal clustering behavior.
2492 */
2493 if (!sequential && !object->private) {
2494 write_startup_file =
2495 vm_fault_tws_insert(map, real_map, vaddr,
2496 object, cur_offset);
2497 }
2498
2499 vm_object_paging_end(object);
2500 vm_object_unlock(object);
2501
2502 vm_map_unlock_read(map);
2503 if(real_map != map)
2504 vm_map_unlock(real_map);
2505
2506 if(write_startup_file)
2507 tws_send_startup_info(current_task());
2508
2509 thread_interrupt_level(interruptible_state);
2510
2511
2512 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2513 vaddr,
2514 type_of_fault & 0xff,
2515 KERN_SUCCESS,
2516 type_of_fault >> 8,
2517 0);
2518
2519 return KERN_SUCCESS;
2520 }
2521
2522 /*
2523 * Copy on write fault. If objects match, then
2524 * object->copy must not be NULL (else control
2525 * would be in previous code block), and we
2526 * have a potential push into the copy object
2527 * with which we won't cope here.
2528 */
2529
2530 if (cur_object == object)
2531 break;
2532 /*
2533 * This is now a shadow based copy on write
2534 * fault -- it requires a copy up the shadow
2535 * chain.
2536 *
2537 * Allocate a page in the original top level
2538 * object. Give up if allocate fails. Also
2539 * need to remember current page, as it's the
2540 * source of the copy.
2541 */
2542 cur_m = m;
2543 m = vm_page_grab();
2544 if (m == VM_PAGE_NULL) {
2545 break;
2546 }
2547 /*
2548 * Now do the copy. Mark the source busy
2549 * and take out paging references on both
2550 * objects.
2551 *
2552 * NOTE: This code holds the map lock across
2553 * the page copy.
2554 */
2555
2556 cur_m->busy = TRUE;
2557 vm_page_copy(cur_m, m);
2558 vm_page_insert(m, object, offset);
2559
2560 vm_object_paging_begin(cur_object);
2561 vm_object_paging_begin(object);
2562
2563 type_of_fault = DBG_COW_FAULT;
2564 VM_STAT(cow_faults++);
2565 current_task()->cow_faults++;
2566
2567 /*
2568 * Now cope with the source page and object.
2569 * If the top object has a ref count of 1,
2570 * then no other map can access it, and hence
2571 * it's not necessary to do the pmap_disconnect.
2572 */
2573
2574 vm_page_lock_queues();
2575 vm_page_deactivate(cur_m);
2576 m->dirty = TRUE;
2577 pmap_disconnect(cur_m->phys_page);
2578 vm_page_unlock_queues();
2579
2580 PAGE_WAKEUP_DONE(cur_m);
2581 vm_object_paging_end(cur_object);
2582 vm_object_unlock(cur_object);
2583
2584 /*
2585 * Slight hack to call vm_object_collapse()
2586 * and then reuse the common map-in code.
2587 * Note that the object lock was taken above.
2588 */
2589
2590 vm_object_paging_end(object);
2591 vm_object_collapse(object, offset);
2592 vm_object_paging_begin(object);
2593
2594 goto FastPmapEnter;
2595 }
2596 else {
2597
2598 /*
2599 * No page at cur_object, cur_offset
2600 */
2601
2602 if (cur_object->pager_created) {
2603
2604 /*
2605 * Have to talk to the pager. Give up.
2606 */
2607 break;
2608 }
2609
2610
2611 if (cur_object->shadow == VM_OBJECT_NULL) {
2612
2613 if (cur_object->shadow_severed) {
2614 vm_object_paging_end(object);
2615 vm_object_unlock(object);
2616 vm_map_unlock_read(map);
2617 if(real_map != map)
2618 vm_map_unlock(real_map);
2619
2620 if(write_startup_file)
2621 tws_send_startup_info(
2622 current_task());
2623
2624 thread_interrupt_level(interruptible_state);
2625
2626 return KERN_MEMORY_ERROR;
2627 }
2628
2629 /*
2630 * Zero fill fault. Page gets
2631 * filled in top object. Insert
2632 * page, then drop any lower lock.
2633 * Give up if no page.
2634 */
2635 if (VM_PAGE_THROTTLED()) {
2636 break;
2637 }
2638
2639 /*
2640 * Are we protecting the system from
2641 * backing store exhaustion? If so,
2642 * sleep unless we are privileged.
2643 */
2644 if(vm_backing_store_low) {
2645 if(!(current_task()->priv_flags
2646 & VM_BACKING_STORE_PRIV))
2647 break;
2648 }
2649 m = vm_page_alloc(object, offset);
2650 if (m == VM_PAGE_NULL) {
2651 break;
2652 }
2653 /*
2654 * This is a zero-fill or initial fill
2655 * page fault. As such, we consider it
2656 * undefined with respect to instruction
2657 * execution; i.e., it is the responsibility
2658 * of higher layers to call for an instruction
2659 * sync after changing the contents and before
2660 * sending a program into this area. We
2661 * choose this approach for performance.
2662 */
2663
2664 m->no_isync = FALSE;
2665
2666 if (cur_object != object)
2667 vm_object_unlock(cur_object);
2668
2669 vm_object_paging_begin(object);
2670 vm_object_unlock(object);
2671
2672 /*
2673 * Now zero fill the page and map it.
2674 * The page is probably going to
2675 * be written soon, so don't bother
2676 * to clear the modified bit
2677 *
2678 * NOTE: This code holds the map
2679 * lock across the zero fill.
2680 */
2681
2682 if (!map->no_zero_fill) {
2683 vm_page_zero_fill(m);
2684 type_of_fault = DBG_ZERO_FILL_FAULT;
2685 VM_STAT(zero_fill_count++);
2686 }
2687 vm_page_lock_queues();
2688 VM_PAGE_QUEUES_REMOVE(m);
2689
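				/*
				 * Stamp the page with the current ticket and
				 * queue it: pages belonging to large (> 2MB)
				 * objects go on the zero-fill queue, the rest
				 * on the inactive queue.  After
				 * VM_PAGE_TICKETS_IN_ROLL insertions the
				 * ticket id advances, wrapping back to 0 once
				 * it reaches VM_PAGE_TICKET_ROLL_IDS.
				 */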
2690 m->page_ticket = vm_page_ticket;
2691 assert(!m->laundry);
2692 assert(m->object != kernel_object);
2693 assert(m->pageq.next == NULL &&
2694 m->pageq.prev == NULL);
2695 if(m->object->size > 0x200000) {
2696 m->zero_fill = TRUE;
2697 /* depends on the queues lock */
2698 vm_zf_count += 1;
2699 queue_enter(&vm_page_queue_zf,
2700 m, vm_page_t, pageq);
2701 } else {
2702 queue_enter(
2703 &vm_page_queue_inactive,
2704 m, vm_page_t, pageq);
2705 }
2706 vm_page_ticket_roll++;
2707 if(vm_page_ticket_roll ==
2708 VM_PAGE_TICKETS_IN_ROLL) {
2709 vm_page_ticket_roll = 0;
2710 if(vm_page_ticket ==
2711 VM_PAGE_TICKET_ROLL_IDS)
2712 vm_page_ticket= 0;
2713 else
2714 vm_page_ticket++;
2715 }
2716
2717 m->inactive = TRUE;
2718 vm_page_inactive_count++;
2719 vm_page_unlock_queues();
2720 vm_object_lock(object);
2721
2722 goto FastPmapEnter;
2723 }
2724
2725 /*
2726 * On to the next level
2727 */
2728
2729 cur_offset += cur_object->shadow_offset;
2730 new_object = cur_object->shadow;
2731 vm_object_lock(new_object);
2732 if (cur_object != object)
2733 vm_object_unlock(cur_object);
2734 cur_object = new_object;
2735
2736 continue;
2737 }
2738 }
2739
2740 /*
2741 * Cleanup from fast fault failure. Drop any object
2742 * lock other than original and drop map lock.
2743 */
2744
2745 if (object != cur_object)
2746 vm_object_unlock(cur_object);
2747 }
2748 vm_map_unlock_read(map);
2749
2750 if(real_map != map)
2751 vm_map_unlock(real_map);
2752
2753 /*
2754 * Make a reference to this object to
2755 * prevent its disposal while we are messing with
2756 * it. Once we have the reference, the map is free
2757 * to be diddled. Since objects reference their
2758 * shadows (and copies), they will stay around as well.
2759 */
2760
2761 assert(object->ref_count > 0);
2762 object->ref_count++;
2763 vm_object_res_reference(object);
2764 vm_object_paging_begin(object);
2765
2766 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2767
2768 if (!object->private) {
2769 write_startup_file =
2770 vm_fault_tws_insert(map, real_map, vaddr, object, offset);
2771 }
2772
2773 kr = vm_fault_page(object, offset, fault_type,
2774 (change_wiring && !wired),
2775 interruptible,
2776 lo_offset, hi_offset, behavior,
2777 &prot, &result_page, &top_page,
2778 &type_of_fault,
2779 &error_code, map->no_zero_fill, FALSE, map, vaddr);
2780
2781 /*
2782 * If we didn't succeed, lose the object reference immediately.
2783 */
2784
2785 if (kr != VM_FAULT_SUCCESS)
2786 vm_object_deallocate(object);
2787
2788 /*
2789 * See why we failed, and take corrective action.
2790 */
2791
2792 switch (kr) {
2793 case VM_FAULT_SUCCESS:
2794 break;
2795 case VM_FAULT_MEMORY_SHORTAGE:
2796 if (vm_page_wait((change_wiring) ?
2797 THREAD_UNINT :
2798 THREAD_ABORTSAFE))
2799 goto RetryFault;
2800 /* fall thru */
2801 case VM_FAULT_INTERRUPTED:
2802 kr = KERN_ABORTED;
2803 goto done;
2804 case VM_FAULT_RETRY:
2805 goto RetryFault;
2806 case VM_FAULT_FICTITIOUS_SHORTAGE:
2807 vm_page_more_fictitious();
2808 goto RetryFault;
2809 case VM_FAULT_MEMORY_ERROR:
2810 if (error_code)
2811 kr = error_code;
2812 else
2813 kr = KERN_MEMORY_ERROR;
2814 goto done;
2815 }
2816
2817 m = result_page;
2818
2819 if(m != VM_PAGE_NULL) {
2820 assert((change_wiring && !wired) ?
2821 (top_page == VM_PAGE_NULL) :
2822 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2823 }
2824
2825 /*
2826 * How to clean up the result of vm_fault_page. This
2827 * happens whether the mapping is entered or not.
2828 */
2829
2830 #define UNLOCK_AND_DEALLOCATE \
2831 MACRO_BEGIN \
2832 vm_fault_cleanup(m->object, top_page); \
2833 vm_object_deallocate(object); \
2834 MACRO_END
2835
2836 /*
2837 * What to do with the resulting page from vm_fault_page
2838 * if it doesn't get entered into the physical map:
2839 */
2840
2841 #define RELEASE_PAGE(m) \
2842 MACRO_BEGIN \
2843 PAGE_WAKEUP_DONE(m); \
2844 vm_page_lock_queues(); \
2845 if (!m->active && !m->inactive) \
2846 vm_page_activate(m); \
2847 vm_page_unlock_queues(); \
2848 MACRO_END
2849
2850 /*
2851 * We must verify that the maps have not changed
2852 * since our last lookup.
2853 */
2854
2855 if(m != VM_PAGE_NULL) {
2856 old_copy_object = m->object->copy;
2857 vm_object_unlock(m->object);
2858 } else {
2859 old_copy_object = VM_OBJECT_NULL;
2860 }
2861 if ((map != original_map) || !vm_map_verify(map, &version)) {
2862 vm_object_t retry_object;
2863 vm_object_offset_t retry_offset;
2864 vm_prot_t retry_prot;
2865
2866 /*
2867 * To avoid trying to write_lock the map while another
2868 * thread has it read_locked (in vm_map_pageable), we
2869 * do not try for write permission. If the page is
2870 * still writable, we will get write permission. If it
2871 * is not, or has been marked needs_copy, we enter the
2872 * mapping without write permission, and will merely
2873 * take another fault.
2874 */
2875 map = original_map;
2876 vm_map_lock_read(map);
2877 kr = vm_map_lookup_locked(&map, vaddr,
2878 fault_type & ~VM_PROT_WRITE, &version,
2879 &retry_object, &retry_offset, &retry_prot,
2880 &wired, &behavior, &lo_offset, &hi_offset,
2881 &real_map);
2882 pmap = real_map->pmap;
2883
2884 if (kr != KERN_SUCCESS) {
2885 vm_map_unlock_read(map);
2886 if(m != VM_PAGE_NULL) {
2887 vm_object_lock(m->object);
2888 RELEASE_PAGE(m);
2889 UNLOCK_AND_DEALLOCATE;
2890 } else {
2891 vm_object_deallocate(object);
2892 }
2893 goto done;
2894 }
2895
2896 vm_object_unlock(retry_object);
2897 if(m != VM_PAGE_NULL) {
2898 vm_object_lock(m->object);
2899 } else {
2900 vm_object_lock(object);
2901 }
2902
2903 if ((retry_object != object) ||
2904 (retry_offset != offset)) {
2905 vm_map_unlock_read(map);
2906 if(real_map != map)
2907 vm_map_unlock(real_map);
2908 if(m != VM_PAGE_NULL) {
2909 RELEASE_PAGE(m);
2910 UNLOCK_AND_DEALLOCATE;
2911 } else {
2912 vm_object_deallocate(object);
2913 }
2914 goto RetryFault;
2915 }
2916
2917 /*
2918 * Check whether the protection has changed or the object
2919 * has been copied while we left the map unlocked.
2920 */
2921 prot &= retry_prot;
2922 if(m != VM_PAGE_NULL) {
2923 vm_object_unlock(m->object);
2924 } else {
2925 vm_object_unlock(object);
2926 }
2927 }
2928 if(m != VM_PAGE_NULL) {
2929 vm_object_lock(m->object);
2930 } else {
2931 vm_object_lock(object);
2932 }
2933
2934 /*
2935 * If the copy object changed while the top-level object
2936 * was unlocked, then we must take away write permission.
2937 */
2938
2939 if(m != VM_PAGE_NULL) {
2940 if (m->object->copy != old_copy_object)
2941 prot &= ~VM_PROT_WRITE;
2942 }
2943
2944 /*
2945 * If we want to wire down this page, but no longer have
2946 * adequate permissions, we must start all over.
2947 */
2948
2949 if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2950 vm_map_verify_done(map, &version);
2951 if(real_map != map)
2952 vm_map_unlock(real_map);
2953 if(m != VM_PAGE_NULL) {
2954 RELEASE_PAGE(m);
2955 UNLOCK_AND_DEALLOCATE;
2956 } else {
2957 vm_object_deallocate(object);
2958 }
2959 goto RetryFault;
2960 }
2961
2962 /*
2963 * Put this page into the physical map.
2964 * We had to do the unlock above because pmap_enter
2965 * may cause other faults. The page may be on
2966 * the pageout queues. If the pageout daemon comes
2967 * across the page, it will remove it from the queues.
2968 */
2969 need_activation = FALSE;
2970
2971 if (m != VM_PAGE_NULL) {
2972 if (m->no_isync == TRUE) {
2973 pmap_sync_page_data_phys(m->phys_page);
2974
2975 if ((type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2976 /*
2977 * found it in the cache, but this
2978 * is the first fault-in of the page (no_isync == TRUE)
2979 * so it must have come in as part of
2980 * a cluster... account 1 pagein against it
2981 */
2982 VM_STAT(pageins++);
2983 current_task()->pageins++;
2984
2985 type_of_fault = DBG_PAGEIN_FAULT;
2986 }
2987 if (m->clustered) {
2988 need_activation = TRUE;
2989 }
2990 m->no_isync = FALSE;
2991 }
2992 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2993
2994 if(caller_pmap) {
2995 PMAP_ENTER(caller_pmap,
2996 caller_pmap_addr, m,
2997 prot, cache_attr, wired);
2998 } else {
2999 PMAP_ENTER(pmap, vaddr, m,
3000 prot, cache_attr, wired);
3001 }
3002
3003 /*
3004 * Add working set information for private objects here.
3005 */
3006 if (m->object->private) {
3007 write_startup_file =
3008 vm_fault_tws_insert(map, real_map, vaddr,
3009 m->object, m->offset);
3010 }
3011 } else {
3012
3013 #ifndef i386
3014 vm_map_entry_t entry;
3015 vm_map_offset_t laddr;
3016 vm_map_offset_t ldelta, hdelta;
3017
3018 /*
3019 * do a pmap block mapping from the physical address
3020 * in the object
3021 */
3022
3023 /* While we do not worry about execution protection in */
3024 /* general, certain pages may have instruction execution */
3025 /* disallowed. We will check here, and if not allowed */
3026 /* to execute, we return with a protection failure. */
3027
3028 if((full_fault_type & VM_PROT_EXECUTE) &&
3029 (!pmap_eligible_for_execute((ppnum_t)
3030 (object->shadow_offset >> 12)))) {
3031
3032 vm_map_verify_done(map, &version);
3033 if(real_map != map)
3034 vm_map_unlock(real_map);
3035 vm_fault_cleanup(object, top_page);
3036 vm_object_deallocate(object);
3037 kr = KERN_PROTECTION_FAILURE;
3038 goto done;
3039 }
3040
3041 if(real_map != map) {
3042 vm_map_unlock(real_map);
3043 }
3044 if (original_map != map) {
3045 vm_map_unlock_read(map);
3046 vm_map_lock_read(original_map);
3047 map = original_map;
3048 }
3049 real_map = map;
3050
3051 laddr = vaddr;
3052 hdelta = 0xFFFFF000;
3053 ldelta = 0xFFFFF000;
3054
3055
3056 while(vm_map_lookup_entry(map, laddr, &entry)) {
3057 if(ldelta > (laddr - entry->vme_start))
3058 ldelta = laddr - entry->vme_start;
3059 if(hdelta > (entry->vme_end - laddr))
3060 hdelta = entry->vme_end - laddr;
3061 if(entry->is_sub_map) {
3062
3063 laddr = (laddr - entry->vme_start)
3064 + entry->offset;
3065 vm_map_lock_read(entry->object.sub_map);
3066 if(map != real_map)
3067 vm_map_unlock_read(map);
3068 if(entry->use_pmap) {
3069 vm_map_unlock_read(real_map);
3070 real_map = entry->object.sub_map;
3071 }
3072 map = entry->object.sub_map;
3073
3074 } else {
3075 break;
3076 }
3077 }
3078
3079 if(vm_map_lookup_entry(map, laddr, &entry) &&
3080 (entry->object.vm_object != NULL) &&
3081 (entry->object.vm_object == object)) {
3082
3083
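			/*
			 * Set up the block mapping: the physical page number
			 * passed to pmap_map_block() is (shadow_offset +
			 * entry->offset + (laddr - vme_start) - ldelta) >> 12,
			 * i.e. expressed in 4KB pages, and the mapping spans
			 * (ldelta + hdelta) >> 12 pages, where ldelta and
			 * hdelta were clipped to the enclosing map entries in
			 * the walk above.
			 */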
3084 if(caller_pmap) {
3085 /* Set up a block mapped area */
3086 pmap_map_block(caller_pmap,
3087 (addr64_t)(caller_pmap_addr - ldelta),
3088 (((vm_map_offset_t)
3089 (entry->object.vm_object->shadow_offset))
3090 + entry->offset +
3091 (laddr - entry->vme_start)
3092 - ldelta) >> 12,
3093 ((ldelta + hdelta) >> 12), prot,
3094 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3095 } else {
3096 /* Set up a block mapped area */
3097 pmap_map_block(real_map->pmap,
3098 (addr64_t)(vaddr - ldelta),
3099 (((vm_map_offset_t)
3100 (entry->object.vm_object->shadow_offset))
3101 + entry->offset +
3102 (laddr - entry->vme_start) - ldelta) >> 12,
3103 ((ldelta + hdelta) >> 12), prot,
3104 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3105 }
3106 }
3107 #else
3108 #ifdef notyet
3109 if(caller_pmap) {
3110 pmap_enter(caller_pmap, caller_pmap_addr,
3111 object->shadow_offset>>12, prot, 0, TRUE);
3112 } else {
3113 pmap_enter(pmap, vaddr,
3114 object->shadow_offset>>12, prot, 0, TRUE);
3115 }
3116 /* Map it in */
3117 #endif
3118 #endif
3119
3120 }
3121
3122 /*
3123 * If the page is not wired down and isn't already
3124 * on a pageout queue, then put it where the
3125 * pageout daemon can find it.
3126 */
3127 if(m != VM_PAGE_NULL) {
3128 vm_page_lock_queues();
3129
3130 if (m->clustered) {
3131 vm_pagein_cluster_used++;
3132 m->clustered = FALSE;
3133 }
3134 m->reference = TRUE;
3135
3136 if (change_wiring) {
3137 if (wired)
3138 vm_page_wire(m);
3139 else
3140 vm_page_unwire(m);
3141 }
3142 #if VM_FAULT_STATIC_CONFIG
3143 else {
3144 if ((!m->active && !m->inactive) || ((need_activation == TRUE) && !m->active))
3145 vm_page_activate(m);
3146 }
3147 #else
3148 else if (software_reference_bits) {
3149 if (!m->active && !m->inactive)
3150 vm_page_activate(m);
3151 m->reference = TRUE;
3152 } else {
3153 vm_page_activate(m);
3154 }
3155 #endif
3156 vm_page_unlock_queues();
3157 }
3158
3159 /*
3160 * Unlock everything, and return
3161 */
3162
3163 vm_map_verify_done(map, &version);
3164 if(real_map != map)
3165 vm_map_unlock(real_map);
3166 if(m != VM_PAGE_NULL) {
3167 PAGE_WAKEUP_DONE(m);
3168 UNLOCK_AND_DEALLOCATE;
3169 } else {
3170 vm_fault_cleanup(object, top_page);
3171 vm_object_deallocate(object);
3172 }
3173 kr = KERN_SUCCESS;
3174
3175 #undef UNLOCK_AND_DEALLOCATE
3176 #undef RELEASE_PAGE
3177
3178 done:
3179 if(write_startup_file)
3180 tws_send_startup_info(current_task());
3181
3182 thread_interrupt_level(interruptible_state);
3183
3184 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
3185 vaddr,
3186 type_of_fault & 0xff,
3187 kr,
3188 type_of_fault >> 8,
3189 0);
3190
3191 return(kr);
3192 }
3193
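/*
 * Illustrative only: a minimal sketch (kept out of the build with
 * "#if 0") of how a hypothetical trap handler might call vm_fault()
 * for an ordinary, non-wiring fault.  The names fault_map, fault_va
 * and is_write are assumptions, not part of this file.  Passing a
 * NULL caller_pmap makes vm_fault() fall back to real_map->pmap, as
 * the PMAP_ENTER calls above show; THREAD_ABORTSAFE matches the
 * interruptibility vm_fault_wire() below uses for non-kernel pmaps.
 */
#if 0
static void
example_handle_fault(
	vm_map_t	fault_map,	/* hypothetical: faulting task's map */
	vm_map_offset_t	fault_va,	/* hypothetical: faulting address */
	boolean_t	is_write)
{
	kern_return_t	kr;

	kr = vm_fault(fault_map,
		      fault_va,
		      is_write ? (VM_PROT_READ | VM_PROT_WRITE)
			       : VM_PROT_READ,
		      FALSE,		/* change_wiring */
		      THREAD_ABORTSAFE,	/* interruptible */
		      NULL,		/* caller_pmap: use real_map->pmap */
		      0);		/* caller_pmap_addr */

	if (kr != KERN_SUCCESS) {
		/* deliver an exception, retry, etc. -- not shown here */
	}
}
#endif
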
3194 /*
3195 * vm_fault_wire:
3196 *
3197 * Wire down a range of virtual addresses in a map.
3198 */
3199 kern_return_t
3200 vm_fault_wire(
3201 vm_map_t map,
3202 vm_map_entry_t entry,
3203 pmap_t pmap,
3204 vm_map_offset_t pmap_addr)
3205 {
3206
3207 register vm_map_offset_t va;
3208 register vm_map_offset_t end_addr = entry->vme_end;
3209 register kern_return_t rc;
3210
3211 assert(entry->in_transition);
3212
3213 if ((entry->object.vm_object != NULL) &&
3214 !entry->is_sub_map &&
3215 entry->object.vm_object->phys_contiguous) {
3216 return KERN_SUCCESS;
3217 }
3218
3219 /*
3220 * Inform the physical mapping system that the
3221 * range of addresses may not fault, so that
3222 * page tables and such can be locked down as well.
3223 */
3224
3225 pmap_pageable(pmap, pmap_addr,
3226 pmap_addr + (end_addr - entry->vme_start), FALSE);
3227
3228 /*
3229 * We simulate a fault to get the page and enter it
3230 * in the physical map.
3231 */
3232
3233 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3234 if ((rc = vm_fault_wire_fast(
3235 map, va, entry, pmap,
3236 pmap_addr + (va - entry->vme_start)
3237 )) != KERN_SUCCESS) {
3238 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3239 (pmap == kernel_pmap) ?
3240 THREAD_UNINT : THREAD_ABORTSAFE,
3241 pmap, pmap_addr + (va - entry->vme_start));
3242 }
3243
3244 if (rc != KERN_SUCCESS) {
3245 struct vm_map_entry tmp_entry = *entry;
3246
3247 /* unwire wired pages */
3248 tmp_entry.vme_end = va;
3249 vm_fault_unwire(map,
3250 &tmp_entry, FALSE, pmap, pmap_addr);
3251
3252 return rc;
3253 }
3254 }
3255 return KERN_SUCCESS;
3256 }
3257
3258 /*
3259 * vm_fault_unwire:
3260 *
3261 * Unwire a range of virtual addresses in a map.
3262 */
3263 void
3264 vm_fault_unwire(
3265 vm_map_t map,
3266 vm_map_entry_t entry,
3267 boolean_t deallocate,
3268 pmap_t pmap,
3269 vm_map_offset_t pmap_addr)
3270 {
3271 register vm_map_offset_t va;
3272 register vm_map_offset_t end_addr = entry->vme_end;
3273 vm_object_t object;
3274
3275 object = (entry->is_sub_map)
3276 ? VM_OBJECT_NULL : entry->object.vm_object;
3277
3278 /*
3279 * Since the pages are wired down, we must be able to
3280 * get their mappings from the physical map system.
3281 */
3282
3283 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3284 pmap_change_wiring(pmap,
3285 pmap_addr + (va - entry->vme_start), FALSE);
3286
3287 if (object == VM_OBJECT_NULL) {
3288 (void) vm_fault(map, va, VM_PROT_NONE,
3289 TRUE, THREAD_UNINT, pmap, pmap_addr);
3290 } else if (object->phys_contiguous) {
3291 continue;
3292 } else {
3293 vm_prot_t prot;
3294 vm_page_t result_page;
3295 vm_page_t top_page;
3296 vm_object_t result_object;
3297 vm_fault_return_t result;
3298
3299 do {
3300 prot = VM_PROT_NONE;
3301
3302 vm_object_lock(object);
3303 vm_object_paging_begin(object);
3304 XPR(XPR_VM_FAULT,
3305 "vm_fault_unwire -> vm_fault_page\n",
3306 0,0,0,0,0);
3307 result = vm_fault_page(object,
3308 entry->offset +
3309 (va - entry->vme_start),
3310 VM_PROT_NONE, TRUE,
3311 THREAD_UNINT,
3312 entry->offset,
3313 entry->offset +
3314 (entry->vme_end
3315 - entry->vme_start),
3316 entry->behavior,
3317 &prot,
3318 &result_page,
3319 &top_page,
3320 (int *)0,
3321 0, map->no_zero_fill,
3322 FALSE, NULL, 0);
3323 } while (result == VM_FAULT_RETRY);
3324
3325 if (result != VM_FAULT_SUCCESS)
3326 panic("vm_fault_unwire: failure");
3327
3328 result_object = result_page->object;
3329 if (deallocate) {
3330 assert(!result_page->fictitious);
3331 pmap_disconnect(result_page->phys_page);
3332 VM_PAGE_FREE(result_page);
3333 } else {
3334 vm_page_lock_queues();
3335 vm_page_unwire(result_page);
3336 vm_page_unlock_queues();
3337 PAGE_WAKEUP_DONE(result_page);
3338 }
3339
3340 vm_fault_cleanup(result_object, top_page);
3341 }
3342 }
3343
3344 /*
3345 * Inform the physical mapping system that the range
3346 * of addresses may fault, so that page tables and
3347 * such may be unwired themselves.
3348 */
3349
3350 pmap_pageable(pmap, pmap_addr,
3351 pmap_addr + (end_addr - entry->vme_start), TRUE);
3352
3353 }
3354
3355 /*
3356 * vm_fault_wire_fast:
3357 *
3358 * Handle common case of a wire down page fault at the given address.
3359 * If successful, the page is inserted into the associated physical map.
3360 * The map entry is passed in to avoid the overhead of a map lookup.
3361 *
3362 * NOTE: the given address should be truncated to the
3363 * proper page address.
3364 *
3365 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3366 * a standard error specifying why the fault is fatal is returned.
3367 *
3368 * The map in question must be referenced, and remains so.
3369 * Caller has a read lock on the map.
3370 *
3371 * This is a stripped version of vm_fault() for wiring pages. Anything
3372 * other than the common case will return KERN_FAILURE, and the caller
3373 * is expected to call vm_fault().
3374 */
3375 kern_return_t
3376 vm_fault_wire_fast(
3377 __unused vm_map_t map,
3378 vm_map_offset_t va,
3379 vm_map_entry_t entry,
3380 pmap_t pmap,
3381 vm_map_offset_t pmap_addr)
3382 {
3383 vm_object_t object;
3384 vm_object_offset_t offset;
3385 register vm_page_t m;
3386 vm_prot_t prot;
3387 thread_t thread = current_thread();
3388 unsigned int cache_attr;
3389
3390 VM_STAT(faults++);
3391
3392 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3393 thread->task->faults++;
3394
3395 /*
3396 * Recovery actions
3397 */
3398
3399 #undef RELEASE_PAGE
3400 #define RELEASE_PAGE(m) { \
3401 PAGE_WAKEUP_DONE(m); \
3402 vm_page_lock_queues(); \
3403 vm_page_unwire(m); \
3404 vm_page_unlock_queues(); \
3405 }
3406
3407
3408 #undef UNLOCK_THINGS
3409 #define UNLOCK_THINGS { \
3410 vm_object_paging_end(object); \
3411 vm_object_unlock(object); \
3412 }
3413
3414 #undef UNLOCK_AND_DEALLOCATE
3415 #define UNLOCK_AND_DEALLOCATE { \
3416 UNLOCK_THINGS; \
3417 vm_object_deallocate(object); \
3418 }
3419 /*
3420 * Give up and have caller do things the hard way.
3421 */
3422
3423 #define GIVE_UP { \
3424 UNLOCK_AND_DEALLOCATE; \
3425 return(KERN_FAILURE); \
3426 }
3427
3428
3429 /*
3430 * If this entry is not directly to a vm_object, bail out.
3431 */
3432 if (entry->is_sub_map)
3433 return(KERN_FAILURE);
3434
3435 /*
3436 * Find the backing store object and offset into it.
3437 */
3438
3439 object = entry->object.vm_object;
3440 offset = (va - entry->vme_start) + entry->offset;
3441 prot = entry->protection;
3442
3443 /*
3444 * Make a reference to this object to prevent its
3445 * disposal while we are messing with it.
3446 */
3447
3448 vm_object_lock(object);
3449 assert(object->ref_count > 0);
3450 object->ref_count++;
3451 vm_object_res_reference(object);
3452 vm_object_paging_begin(object);
3453
3454 /*
3455 * INVARIANTS (through entire routine):
3456 *
3457 * 1) At all times, we must either have the object
3458 * lock or a busy page in some object to prevent
3459 * some other thread from trying to bring in
3460 * the same page.
3461 *
3462 * 2) Once we have a busy page, we must remove it from
3463 * the pageout queues, so that the pageout daemon
3464 * will not grab it away.
3465 *
3466 */
3467
3468 /*
3469 * Look for page in top-level object. If it's not there or
3470 * there's something going on, give up.
3471 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3472 * decrypt the page before wiring it down.
3473 */
3474 m = vm_page_lookup(object, offset);
3475 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3476 (m->unusual && ( m->error || m->restart || m->absent ||
3477 prot & m->page_lock))) {
3478
3479 GIVE_UP;
3480 }
3481 ASSERT_PAGE_DECRYPTED(m);
3482
3483 /*
3484 * Wire the page down now. All bail outs beyond this
3485 * point must unwire the page.
3486 */
3487
3488 vm_page_lock_queues();
3489 vm_page_wire(m);
3490 vm_page_unlock_queues();
3491
3492 /*
3493 * Mark page busy for other threads.
3494 */
3495 assert(!m->busy);
3496 m->busy = TRUE;
3497 assert(!m->absent);
3498
3499 /*
3500 * Give up if the page is being written and there's a copy object
3501 */
3502 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3503 RELEASE_PAGE(m);
3504 GIVE_UP;
3505 }
3506
3507 /*
3508 * Put this page into the physical map.
3509 * We have to unlock the object because pmap_enter
3510 * may cause other faults.
3511 */
3512 if (m->no_isync == TRUE) {
3513 pmap_sync_page_data_phys(m->phys_page);
3514
3515 m->no_isync = FALSE;
3516 }
3517
3518 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3519
3520 PMAP_ENTER(pmap, pmap_addr, m, prot, cache_attr, TRUE);
3521
3522 /*
3523 * Unlock everything, and return
3524 */
3525
3526 PAGE_WAKEUP_DONE(m);
3527 UNLOCK_AND_DEALLOCATE;
3528
3529 return(KERN_SUCCESS);
3530
3531 }
3532
3533 /*
3534 * Routine: vm_fault_copy_cleanup
3535 * Purpose:
3536 * Release a page used by vm_fault_copy.
3537 */
3538
3539 void
3540 vm_fault_copy_cleanup(
3541 vm_page_t page,
3542 vm_page_t top_page)
3543 {
3544 vm_object_t object = page->object;
3545
3546 vm_object_lock(object);
3547 PAGE_WAKEUP_DONE(page);
3548 vm_page_lock_queues();
3549 if (!page->active && !page->inactive)
3550 vm_page_activate(page);
3551 vm_page_unlock_queues();
3552 vm_fault_cleanup(object, top_page);
3553 }
3554
3555 void
3556 vm_fault_copy_dst_cleanup(
3557 vm_page_t page)
3558 {
3559 vm_object_t object;
3560
3561 if (page != VM_PAGE_NULL) {
3562 object = page->object;
3563 vm_object_lock(object);
3564 vm_page_lock_queues();
3565 vm_page_unwire(page);
3566 vm_page_unlock_queues();
3567 vm_object_paging_end(object);
3568 vm_object_unlock(object);
3569 }
3570 }
3571
3572 /*
3573 * Routine: vm_fault_copy
3574 *
3575 * Purpose:
3576 * Copy pages from one virtual memory object to another --
3577 * neither the source nor destination pages need be resident.
3578 *
3579 * Before actually copying a page, the version associated with
3580 * the destination address map will be verified.
3581 *
3582 * In/out conditions:
3583 * The caller must hold a reference, but not a lock, to
3584 * each of the source and destination objects and to the
3585 * destination map.
3586 *
3587 * Results:
3588 * Returns KERN_SUCCESS if no errors were encountered in
3589 * reading or writing the data. Returns KERN_INTERRUPTED if
3590 * the operation was interrupted (only possible if the
3591 * "interruptible" argument is asserted). Other return values
3592 * indicate a permanent error in copying the data.
3593 *
3594 * The actual amount of data copied will be returned in the
3595 * "copy_size" argument. In the event that the destination map
3596 * verification failed, this amount may be less than the amount
3597 * requested.
3598 */
3599 kern_return_t
3600 vm_fault_copy(
3601 vm_object_t src_object,
3602 vm_object_offset_t src_offset,
3603 vm_map_size_t *copy_size, /* INOUT */
3604 vm_object_t dst_object,
3605 vm_object_offset_t dst_offset,
3606 vm_map_t dst_map,
3607 vm_map_version_t *dst_version,
3608 int interruptible)
3609 {
3610 vm_page_t result_page;
3611
3612 vm_page_t src_page;
3613 vm_page_t src_top_page;
3614 vm_prot_t src_prot;
3615
3616 vm_page_t dst_page;
3617 vm_page_t dst_top_page;
3618 vm_prot_t dst_prot;
3619
3620 vm_map_size_t amount_left;
3621 vm_object_t old_copy_object;
3622 kern_return_t error = 0;
3623
3624 vm_map_size_t part_size;
3625
3626 /*
3627 * In order not to confuse the clustered pageins, align
3628 * the different offsets on a page boundary.
3629 */
3630 vm_object_offset_t src_lo_offset = vm_object_trunc_page(src_offset);
3631 vm_object_offset_t dst_lo_offset = vm_object_trunc_page(dst_offset);
3632 vm_object_offset_t src_hi_offset = vm_object_round_page(src_offset + *copy_size);
3633 vm_object_offset_t dst_hi_offset = vm_object_round_page(dst_offset + *copy_size);
3634
3635 #define RETURN(x) \
3636 MACRO_BEGIN \
3637 *copy_size -= amount_left; \
3638 MACRO_RETURN(x); \
3639 MACRO_END
3640
3641 amount_left = *copy_size;
3642 do { /* while (amount_left > 0) */
3643 /*
3644 * There may be a deadlock if both source and destination
3645 * pages are the same. To avoid this deadlock, the copy must
3646 * start by getting the destination page in order to apply
3647 * COW semantics if any.
3648 */
3649
3650 RetryDestinationFault: ;
3651
3652 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3653
3654 vm_object_lock(dst_object);
3655 vm_object_paging_begin(dst_object);
3656
3657 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3658 switch (vm_fault_page(dst_object,
3659 vm_object_trunc_page(dst_offset),
3660 VM_PROT_WRITE|VM_PROT_READ,
3661 FALSE,
3662 interruptible,
3663 dst_lo_offset,
3664 dst_hi_offset,
3665 VM_BEHAVIOR_SEQUENTIAL,
3666 &dst_prot,
3667 &dst_page,
3668 &dst_top_page,
3669 (int *)0,
3670 &error,
3671 dst_map->no_zero_fill,
3672 FALSE, NULL, 0)) {
3673 case VM_FAULT_SUCCESS:
3674 break;
3675 case VM_FAULT_RETRY:
3676 goto RetryDestinationFault;
3677 case VM_FAULT_MEMORY_SHORTAGE:
3678 if (vm_page_wait(interruptible))
3679 goto RetryDestinationFault;
3680 /* fall thru */
3681 case VM_FAULT_INTERRUPTED:
3682 RETURN(MACH_SEND_INTERRUPTED);
3683 case VM_FAULT_FICTITIOUS_SHORTAGE:
3684 vm_page_more_fictitious();
3685 goto RetryDestinationFault;
3686 case VM_FAULT_MEMORY_ERROR:
3687 if (error)
3688 return (error);
3689 else
3690 return(KERN_MEMORY_ERROR);
3691 }
3692 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3693
3694 old_copy_object = dst_page->object->copy;
3695
3696 /*
3697 * There exists the possibility that the source and
3698 * destination page are the same. But we can't
3699 * easily determine that now. If they are the
3700 * same, the call to vm_fault_page() for the
3701 * destination page will deadlock. To prevent this we
3702 * wire the page so we can drop busy without having
3703 * the page daemon steal the page. We clean up the
3704 * top page but keep the paging reference on the object
3705 * holding the dest page so it doesn't go away.
3706 */
3707
3708 vm_page_lock_queues();
3709 vm_page_wire(dst_page);
3710 vm_page_unlock_queues();
3711 PAGE_WAKEUP_DONE(dst_page);
3712 vm_object_unlock(dst_page->object);
3713
3714 if (dst_top_page != VM_PAGE_NULL) {
3715 vm_object_lock(dst_object);
3716 VM_PAGE_FREE(dst_top_page);
3717 vm_object_paging_end(dst_object);
3718 vm_object_unlock(dst_object);
3719 }
3720
3721 RetrySourceFault: ;
3722
3723 if (src_object == VM_OBJECT_NULL) {
3724 /*
3725 * No source object. We will just
3726 * zero-fill the page in dst_object.
3727 */
3728 src_page = VM_PAGE_NULL;
3729 result_page = VM_PAGE_NULL;
3730 } else {
3731 vm_object_lock(src_object);
3732 src_page = vm_page_lookup(src_object,
3733 vm_object_trunc_page(src_offset));
3734 if (src_page == dst_page) {
3735 src_prot = dst_prot;
3736 result_page = VM_PAGE_NULL;
3737 } else {
3738 src_prot = VM_PROT_READ;
3739 vm_object_paging_begin(src_object);
3740
3741 XPR(XPR_VM_FAULT,
3742 "vm_fault_copy(2) -> vm_fault_page\n",
3743 0,0,0,0,0);
3744 switch (vm_fault_page(src_object,
3745 vm_object_trunc_page(src_offset),
3746 VM_PROT_READ,
3747 FALSE,
3748 interruptible,
3749 src_lo_offset,
3750 src_hi_offset,
3751 VM_BEHAVIOR_SEQUENTIAL,
3752 &src_prot,
3753 &result_page,
3754 &src_top_page,
3755 (int *)0,
3756 &error,
3757 FALSE,
3758 FALSE, NULL, 0)) {
3759
3760 case VM_FAULT_SUCCESS:
3761 break;
3762 case VM_FAULT_RETRY:
3763 goto RetrySourceFault;
3764 case VM_FAULT_MEMORY_SHORTAGE:
3765 if (vm_page_wait(interruptible))
3766 goto RetrySourceFault;
3767 /* fall thru */
3768 case VM_FAULT_INTERRUPTED:
3769 vm_fault_copy_dst_cleanup(dst_page);
3770 RETURN(MACH_SEND_INTERRUPTED);
3771 case VM_FAULT_FICTITIOUS_SHORTAGE:
3772 vm_page_more_fictitious();
3773 goto RetrySourceFault;
3774 case VM_FAULT_MEMORY_ERROR:
3775 vm_fault_copy_dst_cleanup(dst_page);
3776 if (error)
3777 return (error);
3778 else
3779 return(KERN_MEMORY_ERROR);
3780 }
3781
3782
3783 assert((src_top_page == VM_PAGE_NULL) ==
3784 (result_page->object == src_object));
3785 }
3786 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3787 vm_object_unlock(result_page->object);
3788 }
3789
3790 if (!vm_map_verify(dst_map, dst_version)) {
3791 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3792 vm_fault_copy_cleanup(result_page, src_top_page);
3793 vm_fault_copy_dst_cleanup(dst_page);
3794 break;
3795 }
3796
3797 vm_object_lock(dst_page->object);
3798
3799 if (dst_page->object->copy != old_copy_object) {
3800 vm_object_unlock(dst_page->object);
3801 vm_map_verify_done(dst_map, dst_version);
3802 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3803 vm_fault_copy_cleanup(result_page, src_top_page);
3804 vm_fault_copy_dst_cleanup(dst_page);
3805 break;
3806 }
3807 vm_object_unlock(dst_page->object);
3808
3809 /*
3810 * Copy the page, and note that it is dirty
3811 * immediately.
3812 */
3813
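		/*
		 * Worked example for the unaligned case below (illustrative
		 * numbers, assuming 4KB pages): with src_offset 0x1200 and
		 * dst_offset 0x2400, src_po = 0x200 and dst_po = 0x400;
		 * dst_po is the larger, so part_size = PAGE_SIZE - 0x400 =
		 * 0xC00 bytes, clipped further to amount_left if less than
		 * that remains to copy.
		 */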
3814 if (!page_aligned(src_offset) ||
3815 !page_aligned(dst_offset) ||
3816 !page_aligned(amount_left)) {
3817
3818 vm_object_offset_t src_po,
3819 dst_po;
3820
3821 src_po = src_offset - vm_object_trunc_page(src_offset);
3822 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3823
3824 if (dst_po > src_po) {
3825 part_size = PAGE_SIZE - dst_po;
3826 } else {
3827 part_size = PAGE_SIZE - src_po;
3828 }
3829 if (part_size > (amount_left)){
3830 part_size = amount_left;
3831 }
3832
3833 if (result_page == VM_PAGE_NULL) {
3834 vm_page_part_zero_fill(dst_page,
3835 dst_po, part_size);
3836 } else {
3837 vm_page_part_copy(result_page, src_po,
3838 dst_page, dst_po, part_size);
3839 if(!dst_page->dirty){
3840 vm_object_lock(dst_object);
3841 dst_page->dirty = TRUE;
3842 vm_object_unlock(dst_page->object);
3843 }
3844
3845 }
3846 } else {
3847 part_size = PAGE_SIZE;
3848
3849 if (result_page == VM_PAGE_NULL)
3850 vm_page_zero_fill(dst_page);
3851 else{
3852 vm_page_copy(result_page, dst_page);
3853 if(!dst_page->dirty){
3854 vm_object_lock(dst_object);
3855 dst_page->dirty = TRUE;
3856 vm_object_unlock(dst_page->object);
3857 }
3858 }
3859
3860 }
3861
3862 /*
3863 * Unlock everything, and return
3864 */
3865
3866 vm_map_verify_done(dst_map, dst_version);
3867
3868 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3869 vm_fault_copy_cleanup(result_page, src_top_page);
3870 vm_fault_copy_dst_cleanup(dst_page);
3871
3872 amount_left -= part_size;
3873 src_offset += part_size;
3874 dst_offset += part_size;
3875 } while (amount_left > 0);
3876
3877 RETURN(KERN_SUCCESS);
3878 #undef RETURN
3879
3880 /*NOTREACHED*/
3881 }
3882
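/*
 * Illustrative only: a minimal sketch (kept out of the build with
 * "#if 0") of a vm_fault_copy() call.  The names src_object,
 * src_offset, dst_object, dst_offset, dst_map and dst_version are
 * assumptions standing in for state the caller already set up; per
 * the header comment above, the caller holds references (not locks)
 * on the two objects and the destination map, and dst_version comes
 * from an earlier lookup on dst_map.
 */
#if 0
	vm_map_size_t	copy_size = PAGE_SIZE;	/* hypothetical: one page */
	kern_return_t	kr;

	kr = vm_fault_copy(src_object, src_offset,
			   &copy_size,		/* IN: requested, OUT: copied */
			   dst_object, dst_offset,
			   dst_map, &dst_version,
			   THREAD_ABORTSAFE);
	/*
	 * copy_size now holds the amount actually copied; it can be less
	 * than requested if the destination map verification failed part
	 * way through.
	 */
#endif
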
3883 #ifdef notdef
3884
3885 /*
3886 * Routine: vm_fault_page_overwrite
3887 *
3888 * Description:
3889 * A form of vm_fault_page that assumes that the
3890 * resulting page will be overwritten in its entirety,
3891 * making it unnecessary to obtain the correct *contents*
3892 * of the page.
3893 *
3894 * Implementation:
3895 * XXX Untested. Also unused. Eventually, this technology
3896 * could be used in vm_fault_copy() to advantage.
3897 */
3898 vm_fault_return_t
3899 vm_fault_page_overwrite(
3900 register
3901 vm_object_t dst_object,
3902 vm_object_offset_t dst_offset,
3903 vm_page_t *result_page) /* OUT */
3904 {
3905 register
3906 vm_page_t dst_page;
3907 kern_return_t wait_result;
3908
3909 #define interruptible THREAD_UNINT /* XXX */
3910
3911 while (TRUE) {
3912 /*
3913 * Look for a page at this offset
3914 */
3915
3916 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3917 == VM_PAGE_NULL) {
3918 /*
3919 * No page, no problem... just allocate one.
3920 */
3921
3922 dst_page = vm_page_alloc(dst_object, dst_offset);
3923 if (dst_page == VM_PAGE_NULL) {
3924 vm_object_unlock(dst_object);
3925 VM_PAGE_WAIT();
3926 vm_object_lock(dst_object);
3927 continue;
3928 }
3929
3930 /*
3931 * Pretend that the memory manager
3932 * write-protected the page.
3933 *
3934 * Note that we will be asking for write
3935 * permission without asking for the data
3936 * first.
3937 */
3938
3939 dst_page->overwriting = TRUE;
3940 dst_page->page_lock = VM_PROT_WRITE;
3941 dst_page->absent = TRUE;
3942 dst_page->unusual = TRUE;
3943 dst_object->absent_count++;
3944
3945 break;
3946
3947 /*
3948 * When we bail out, we might have to throw
3949 * away the page created here.
3950 */
3951
3952 #define DISCARD_PAGE \
3953 MACRO_BEGIN \
3954 vm_object_lock(dst_object); \
3955 dst_page = vm_page_lookup(dst_object, dst_offset); \
3956 if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3957 VM_PAGE_FREE(dst_page); \
3958 vm_object_unlock(dst_object); \
3959 MACRO_END
3960 }
3961
3962 /*
3963 * If the page is write-protected...
3964 */
3965
3966 if (dst_page->page_lock & VM_PROT_WRITE) {
3967 /*
3968 * ... and an unlock request hasn't been sent
3969 */
3970
3971 if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3972 vm_prot_t u;
3973 kern_return_t rc;
3974
3975 /*
3976 * ... then send one now.
3977 */
3978
3979 if (!dst_object->pager_ready) {
3980 wait_result = vm_object_assert_wait(dst_object,
3981 VM_OBJECT_EVENT_PAGER_READY,
3982 interruptible);
3983 vm_object_unlock(dst_object);
3984 if (wait_result == THREAD_WAITING)
3985 wait_result = thread_block(THREAD_CONTINUE_NULL);
3986 if (wait_result != THREAD_AWAKENED) {
3987 DISCARD_PAGE;
3988 return(VM_FAULT_INTERRUPTED);
3989 }
3990 continue;
3991 }
3992
3993 u = dst_page->unlock_request |= VM_PROT_WRITE;
3994 vm_object_unlock(dst_object);
3995
3996 if ((rc = memory_object_data_unlock(
3997 dst_object->pager,
3998 dst_offset + dst_object->paging_offset,
3999 PAGE_SIZE,
4000 u)) != KERN_SUCCESS) {
4001 if (vm_fault_debug)
4002 printf("vm_object_overwrite: memory_object_data_unlock failed\n");
4003 DISCARD_PAGE;
4004 return((rc == MACH_SEND_INTERRUPTED) ?
4005 VM_FAULT_INTERRUPTED :
4006 VM_FAULT_MEMORY_ERROR);
4007 }
4008 vm_object_lock(dst_object);
4009 continue;
4010 }
4011
4012 /* ... fall through to wait below */
4013 } else {
4014 /*
4015 * If the page isn't being used for other
4016 * purposes, then we're done.
4017 */
4018 if ( ! (dst_page->busy || dst_page->absent ||
4019 dst_page->error || dst_page->restart) )
4020 break;
4021 }
4022
4023 wait_result = PAGE_ASSERT_WAIT(dst_page, interruptible);
4024 vm_object_unlock(dst_object);
4025 if (wait_result == THREAD_WAITING)
4026 wait_result = thread_block(THREAD_CONTINUE_NULL);
4027 if (wait_result != THREAD_AWAKENED) {
4028 DISCARD_PAGE;
4029 return(VM_FAULT_INTERRUPTED);
4030 }
4031 }
4032
4033 *result_page = dst_page;
4034 return(VM_FAULT_SUCCESS);
4035
4036 #undef interruptible
4037 #undef DISCARD_PAGE
4038 }
4039
4040 #endif /* notdef */
4041
4042 #if VM_FAULT_CLASSIFY
4043 /*
4044 * Temporary statistics gathering support.
4045 */
4046
4047 /*
4048 * Statistics arrays:
4049 */
4050 #define VM_FAULT_TYPES_MAX 5
4051 #define VM_FAULT_LEVEL_MAX 8
4052
4053 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4054
4055 #define VM_FAULT_TYPE_ZERO_FILL 0
4056 #define VM_FAULT_TYPE_MAP_IN 1
4057 #define VM_FAULT_TYPE_PAGER 2
4058 #define VM_FAULT_TYPE_COPY 3
4059 #define VM_FAULT_TYPE_OTHER 4
4060
4061
4062 void
4063 vm_fault_classify(vm_object_t object,
4064 vm_object_offset_t offset,
4065 vm_prot_t fault_type)
4066 {
4067 int type, level = 0;
4068 vm_page_t m;
4069
4070 while (TRUE) {
4071 m = vm_page_lookup(object, offset);
4072 if (m != VM_PAGE_NULL) {
4073 if (m->busy || m->error || m->restart || m->absent ||
4074 fault_type & m->page_lock) {
4075 type = VM_FAULT_TYPE_OTHER;
4076 break;
4077 }
4078 if (((fault_type & VM_PROT_WRITE) == 0) ||
4079 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4080 type = VM_FAULT_TYPE_MAP_IN;
4081 break;
4082 }
4083 type = VM_FAULT_TYPE_COPY;
4084 break;
4085 }
4086 else {
4087 if (object->pager_created) {
4088 type = VM_FAULT_TYPE_PAGER;
4089 break;
4090 }
4091 if (object->shadow == VM_OBJECT_NULL) {
4092 type = VM_FAULT_TYPE_ZERO_FILL;
4093 break;
4094 }
4095
4096 offset += object->shadow_offset;
4097 object = object->shadow;
4098 level++;
4099 continue;
4100 }
4101 }
4102
4103 if (level > VM_FAULT_LEVEL_MAX)
4104 level = VM_FAULT_LEVEL_MAX;
4105
4106 vm_fault_stats[type][level] += 1;
4107
4108 return;
4109 }
4110
4111 /* cleanup routine to call from debugger */
4112
4113 void
4114 vm_fault_classify_init(void)
4115 {
4116 int type, level;
4117
4118 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4119 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4120 vm_fault_stats[type][level] = 0;
4121 }
4122 }
4123
4124 return;
4125 }
4126 #endif /* VM_FAULT_CLASSIFY */