1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * @OSF_COPYRIGHT@
25 */
26 /*
27 * Mach Operating System
28 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
29 * All Rights Reserved.
30 *
31 * Permission to use, copy, modify and distribute this software and its
32 * documentation is hereby granted, provided that both the copyright
33 * notice and this permission notice appear in all copies of the
34 * software, derivative works or modified versions, and any portions
35 * thereof, and that both notices appear in supporting documentation.
36 *
37 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
38 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
39 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
40 *
41 * Carnegie Mellon requests users of this software to return to
42 *
43 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
44 * School of Computer Science
45 * Carnegie Mellon University
46 * Pittsburgh PA 15213-3890
47 *
48 * any improvements or extensions that they make and grant Carnegie Mellon
49 * the rights to redistribute these changes.
50 */
51 /*
52 */
53 /*
54 * File: vm_fault.c
55 * Author: Avadis Tevanian, Jr., Michael Wayne Young
56 *
57 * Page fault handling module.
58 */
59
60 #include <mach_cluster_stats.h>
61 #include <mach_pagemap.h>
62 #include <mach_kdb.h>
63
64 #include <mach/mach_types.h>
65 #include <mach/kern_return.h>
66 #include <mach/message.h> /* for error codes */
67 #include <mach/vm_param.h>
68 #include <mach/vm_behavior.h>
69 #include <mach/memory_object.h>
70 /* For memory_object_data_{request,unlock} */
71
72 #include <kern/kern_types.h>
73 #include <kern/host_statistics.h>
74 #include <kern/counters.h>
75 #include <kern/task.h>
76 #include <kern/thread.h>
77 #include <kern/sched_prim.h>
78 #include <kern/host.h>
79 #include <kern/xpr.h>
80 #include <kern/mach_param.h>
81 #include <kern/macro_help.h>
82 #include <kern/zalloc.h>
83 #include <kern/misc_protos.h>
84
85 #include <ppc/proc_reg.h>
86
87 #include <vm/vm_fault.h>
88 #include <vm/task_working_set.h>
89 #include <vm/vm_map.h>
90 #include <vm/vm_object.h>
91 #include <vm/vm_page.h>
92 #include <vm/vm_kern.h>
93 #include <vm/pmap.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_protos.h>
96
97 #include <sys/kdebug.h>
98
99 #define VM_FAULT_CLASSIFY 0
100 #define VM_FAULT_STATIC_CONFIG 1
101
102 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
103
104 unsigned int vm_object_absent_max = 50;
105
106 int vm_fault_debug = 0;
107
108 #if !VM_FAULT_STATIC_CONFIG
109 boolean_t vm_fault_dirty_handling = FALSE;
110 boolean_t vm_fault_interruptible = FALSE;
111 boolean_t software_reference_bits = TRUE;
112 #endif
113
114 #if MACH_KDB
115 extern struct db_watchpoint *db_watchpoint_list;
116 #endif /* MACH_KDB */
117
118
119 /* Forward declarations of internal routines. */
120 extern kern_return_t vm_fault_wire_fast(
121 vm_map_t map,
122 vm_map_offset_t va,
123 vm_map_entry_t entry,
124 pmap_t pmap,
125 vm_map_offset_t pmap_addr);
126
127 extern void vm_fault_continue(void);
128
129 extern void vm_fault_copy_cleanup(
130 vm_page_t page,
131 vm_page_t top_page);
132
133 extern void vm_fault_copy_dst_cleanup(
134 vm_page_t page);
135
136 #if VM_FAULT_CLASSIFY
137 extern void vm_fault_classify(vm_object_t object,
138 vm_object_offset_t offset,
139 vm_prot_t fault_type);
140
141 extern void vm_fault_classify_init(void);
142 #endif
143
144 /*
145 * Routine: vm_fault_init
146 * Purpose:
147 * Initialize our private data structures.
148 */
149 void
150 vm_fault_init(void)
151 {
152 }
153
154 /*
155 * Routine: vm_fault_cleanup
156 * Purpose:
157 * Clean up the result of vm_fault_page.
158 * Results:
159 * The paging reference for "object" is released.
160 * "object" is unlocked.
161 * If "top_page" is not null, "top_page" is
162 * freed and the paging reference for the object
163 * containing it is released.
164 *
165 * In/out conditions:
166 * "object" must be locked.
167 */
168 void
169 vm_fault_cleanup(
170 register vm_object_t object,
171 register vm_page_t top_page)
172 {
173 vm_object_paging_end(object);
174 vm_object_unlock(object);
175
176 if (top_page != VM_PAGE_NULL) {
177 object = top_page->object;
178 vm_object_lock(object);
179 VM_PAGE_FREE(top_page);
180 vm_object_paging_end(object);
181 vm_object_unlock(object);
182 }
183 }
184
185 #if MACH_CLUSTER_STATS
186 #define MAXCLUSTERPAGES 16
187 struct {
188 unsigned long pages_in_cluster;
189 unsigned long pages_at_higher_offsets;
190 unsigned long pages_at_lower_offsets;
191 } cluster_stats_in[MAXCLUSTERPAGES];
192 #define CLUSTER_STAT(clause) clause
193 #define CLUSTER_STAT_HIGHER(x) \
194 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
195 #define CLUSTER_STAT_LOWER(x) \
196 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
197 #define CLUSTER_STAT_CLUSTER(x) \
198 ((cluster_stats_in[(x)].pages_in_cluster)++)
199 #else /* MACH_CLUSTER_STATS */
200 #define CLUSTER_STAT(clause)
201 #endif /* MACH_CLUSTER_STATS */
202
203 /* XXX - temporary */
204 boolean_t vm_allow_clustered_pagein = FALSE;
205 int vm_pagein_cluster_used = 0;
206
207 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
208
209
210 boolean_t vm_page_deactivate_behind = TRUE;
211 /*
212 * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
213 */
214 int vm_default_ahead = 0;
215 int vm_default_behind = MAX_UPL_TRANSFER;
216
217 /*
 218  * vm_fault_deactivate_behind
219 *
220 * Determine if sequential access is in progress
221 * in accordance with the behavior specified. If
 222  * so, compute a potential page to deactivate and
223 * deactivate it.
224 *
225 * The object must be locked.
226 */
227 static
228 boolean_t
229 vm_fault_deactivate_behind(
230 vm_object_t object,
231 vm_object_offset_t offset,
232 vm_behavior_t behavior)
233 {
234 vm_page_t m;
235
236 #if TRACEFAULTPAGE
237 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
238 #endif
239
240 if (object == kernel_object) {
241 /*
242 * Do not deactivate pages from the kernel object: they
243 * are not intended to become pageable.
244 */
245 return FALSE;
246 }
247
248 switch (behavior) {
249 case VM_BEHAVIOR_RANDOM:
250 object->sequential = PAGE_SIZE_64;
251 m = VM_PAGE_NULL;
252 break;
253 case VM_BEHAVIOR_SEQUENTIAL:
254 if (offset &&
255 object->last_alloc == offset - PAGE_SIZE_64) {
256 object->sequential += PAGE_SIZE_64;
257 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
258 } else {
259 object->sequential = PAGE_SIZE_64; /* reset */
260 m = VM_PAGE_NULL;
261 }
262 break;
263 case VM_BEHAVIOR_RSEQNTL:
264 if (object->last_alloc &&
265 object->last_alloc == offset + PAGE_SIZE_64) {
266 object->sequential += PAGE_SIZE_64;
267 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
268 } else {
269 object->sequential = PAGE_SIZE_64; /* reset */
270 m = VM_PAGE_NULL;
271 }
272 break;
273 case VM_BEHAVIOR_DEFAULT:
274 default:
275 if (offset &&
276 object->last_alloc == offset - PAGE_SIZE_64) {
277 vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
278
279 object->sequential += PAGE_SIZE_64;
280 m = (offset >= behind &&
281 object->sequential >= behind) ?
282 vm_page_lookup(object, offset - behind) :
283 VM_PAGE_NULL;
284 } else if (object->last_alloc &&
285 object->last_alloc == offset + PAGE_SIZE_64) {
286 vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
287
288 object->sequential += PAGE_SIZE_64;
289 m = (offset < -behind &&
290 object->sequential >= behind) ?
291 vm_page_lookup(object, offset + behind) :
292 VM_PAGE_NULL;
293 } else {
294 object->sequential = PAGE_SIZE_64;
295 m = VM_PAGE_NULL;
296 }
297 break;
298 }
299
300 object->last_alloc = offset;
301
302 if (m) {
303 if (!m->busy) {
304 vm_page_lock_queues();
305 vm_page_deactivate(m);
306 vm_page_unlock_queues();
307 #if TRACEFAULTPAGE
308 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
309 #endif
310 }
311 return TRUE;
312 }
313 return FALSE;
314 }
315
316
317 /*
318 * Routine: vm_fault_page
319 * Purpose:
320 * Find the resident page for the virtual memory
321 * specified by the given virtual memory object
322 * and offset.
323 * Additional arguments:
 324  *              The required permissions for the page are given
325 * in "fault_type". Desired permissions are included
326 * in "protection". The minimum and maximum valid offsets
327 * within the object for the relevant map entry are
328 * passed in "lo_offset" and "hi_offset" respectively and
329 * the expected page reference pattern is passed in "behavior".
330 * These three parameters are used to determine pagein cluster
331 * limits.
332 *
333 * If the desired page is known to be resident (for
334 * example, because it was previously wired down), asserting
 335  *              the "must_be_resident" parameter will speed the search.
336 *
337 * If the operation can be interrupted (by thread_abort
338 * or thread_terminate), then the "interruptible"
339 * parameter should be asserted.
340 *
341 * Results:
342 * The page containing the proper data is returned
343 * in "result_page".
344 *
345 * In/out conditions:
346 * The source object must be locked and referenced,
347 * and must donate one paging reference. The reference
348 * is not affected. The paging reference and lock are
349 * consumed.
350 *
351 * If the call succeeds, the object in which "result_page"
352 * resides is left locked and holding a paging reference.
353 * If this is not the original object, a busy page in the
354 * original object is returned in "top_page", to prevent other
355 * callers from pursuing this same data, along with a paging
356 * reference for the original object. The "top_page" should
357 * be destroyed when this guarantee is no longer required.
358 * The "result_page" is also left busy. It is not removed
359 * from the pageout queues.
360 */
361
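/*
 * Illustrative caller pattern (not an actual call site in this file;
 * the local variable names are hypothetical).  Per the contract above,
 * the caller locks the object and donates a paging reference before
 * calling, and is responsible for the busy result_page and top_page
 * afterwards:
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);
 *	kr = vm_fault_page(object, offset, fault_type, FALSE,
 *			   interruptible, lo_offset, hi_offset, behavior,
 *			   &prot, &result_page, &top_page, &type_of_fault,
 *			   &error_code, FALSE, FALSE, map, vaddr);
 *	if (kr == VM_FAULT_SUCCESS) {
 *		... use result_page, then PAGE_WAKEUP_DONE(result_page) ...
 *		... release top_page and the paging references, as
 *		    vm_fault_cleanup() does ...
 *	}
 */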
362 vm_fault_return_t
363 vm_fault_page(
364 /* Arguments: */
365 vm_object_t first_object, /* Object to begin search */
366 vm_object_offset_t first_offset, /* Offset into object */
367 vm_prot_t fault_type, /* What access is requested */
368 boolean_t must_be_resident,/* Must page be resident? */
369 int interruptible, /* how may fault be interrupted? */
370 vm_map_offset_t lo_offset, /* Map entry start */
371 vm_map_offset_t hi_offset, /* Map entry end */
372 vm_behavior_t behavior, /* Page reference behavior */
373 /* Modifies in place: */
374 vm_prot_t *protection, /* Protection for mapping */
375 /* Returns: */
376 vm_page_t *result_page, /* Page found, if successful */
377 vm_page_t *top_page, /* Page in top object, if
378 * not result_page. */
379 int *type_of_fault, /* if non-null, fill in with type of fault
380 * COW, zero-fill, etc... returned in trace point */
381 /* More arguments: */
382 kern_return_t *error_code, /* code if page is in error */
383 boolean_t no_zero_fill, /* don't zero fill absent pages */
384 boolean_t data_supply, /* treat as data_supply if
385 * it is a write fault and a full
386 * page is provided */
387 vm_map_t map,
388 __unused vm_map_offset_t vaddr)
389 {
390 register
391 vm_page_t m;
392 register
393 vm_object_t object;
394 register
395 vm_object_offset_t offset;
396 vm_page_t first_m;
397 vm_object_t next_object;
398 vm_object_t copy_object;
399 boolean_t look_for_page;
400 vm_prot_t access_required = fault_type;
401 vm_prot_t wants_copy_flag;
402 vm_object_size_t length;
403 vm_object_offset_t cluster_start, cluster_end;
404 CLUSTER_STAT(int pages_at_higher_offsets;)
405 CLUSTER_STAT(int pages_at_lower_offsets;)
406 kern_return_t wait_result;
407 boolean_t interruptible_state;
408 boolean_t bumped_pagein = FALSE;
409
410
411 #if MACH_PAGEMAP
412 /*
413 * MACH page map - an optional optimization where a bit map is maintained
414 * by the VM subsystem for internal objects to indicate which pages of
415 * the object currently reside on backing store. This existence map
416 * duplicates information maintained by the vnode pager. It is
417 * created at the time of the first pageout against the object, i.e.
 418  * at the same time the pager for the object is created. The optimization
419 * is designed to eliminate pager interaction overhead, if it is
420 * 'known' that the page does not exist on backing store.
421 *
422 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
423 * either marked as paged out in the existence map for the object or no
424 * existence map exists for the object. LOOK_FOR() is one of the
425 * criteria in the decision to invoke the pager. It is also used as one
426 * of the criteria to terminate the scan for adjacent pages in a clustered
427 * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for
428 * permanent objects. Note also that if the pager for an internal object
429 * has not been created, the pager is not invoked regardless of the value
430 * of LOOK_FOR() and that clustered pagein scans are only done on an object
431 * for which a pager has been created.
432 *
433 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 434  * is marked as paged out in the existence map for the object.
 435  * PAGED_OUT() is used to determine if a page has already been pushed
436 * into a copy object in order to avoid a redundant page out operation.
437 */
438 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
439 != VM_EXTERNAL_STATE_ABSENT)
440 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
441 == VM_EXTERNAL_STATE_EXISTS)
442 #else /* MACH_PAGEMAP */
443 /*
444 * If the MACH page map optimization is not enabled,
445 * LOOK_FOR() always evaluates to TRUE. The pager will always be
446 * invoked to resolve missing pages in an object, assuming the pager
447 * has been created for the object. In a clustered page operation, the
 448  * absence of a page on backing store cannot be used to terminate
449 * a scan for adjacent pages since that information is available only in
450 * the pager. Hence pages that may not be paged out are potentially
451 * included in a clustered request. The vnode pager is coded to deal
452 * with any combination of absent/present pages in a clustered
453 * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager
454 * will always be invoked to push a dirty page into a copy object assuming
455 * a pager has been created. If the page has already been pushed, the
 456  * pager will ignore the new request.
457 */
458 #define LOOK_FOR(o, f) TRUE
459 #define PAGED_OUT(o, f) FALSE
460 #endif /* MACH_PAGEMAP */
461
462 /*
463 * Recovery actions
464 */
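/*
 * RELEASE_PAGE(m) wakes any threads sleeping on the busy page and,
 * if the page is not already on the active or inactive queue, puts
 * it back on the active queue before dropping the page queues lock.
 */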
465 #define PREPARE_RELEASE_PAGE(m) \
466 MACRO_BEGIN \
467 vm_page_lock_queues(); \
468 MACRO_END
469
470 #define DO_RELEASE_PAGE(m) \
471 MACRO_BEGIN \
472 PAGE_WAKEUP_DONE(m); \
473 if (!m->active && !m->inactive) \
474 vm_page_activate(m); \
475 vm_page_unlock_queues(); \
476 MACRO_END
477
478 #define RELEASE_PAGE(m) \
479 MACRO_BEGIN \
480 PREPARE_RELEASE_PAGE(m); \
481 DO_RELEASE_PAGE(m); \
482 MACRO_END
483
484 #if TRACEFAULTPAGE
485 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
486 #endif
487
488
489
490 #if !VM_FAULT_STATIC_CONFIG
491 if (vm_fault_dirty_handling
492 #if MACH_KDB
493 /*
494 * If there are watchpoints set, then
495 * we don't want to give away write permission
496 * on a read fault. Make the task write fault,
497 * so that the watchpoint code notices the access.
498 */
499 || db_watchpoint_list
500 #endif /* MACH_KDB */
501 ) {
502 /*
503 * If we aren't asking for write permission,
504 * then don't give it away. We're using write
505 * faults to set the dirty bit.
506 */
507 if (!(fault_type & VM_PROT_WRITE))
508 *protection &= ~VM_PROT_WRITE;
509 }
510
511 if (!vm_fault_interruptible)
512 interruptible = THREAD_UNINT;
513 #else /* STATIC_CONFIG */
514 #if MACH_KDB
515 /*
516 * If there are watchpoints set, then
517 * we don't want to give away write permission
518 * on a read fault. Make the task write fault,
519 * so that the watchpoint code notices the access.
520 */
521 if (db_watchpoint_list) {
522 /*
523 * If we aren't asking for write permission,
524 * then don't give it away. We're using write
525 * faults to set the dirty bit.
526 */
527 if (!(fault_type & VM_PROT_WRITE))
528 *protection &= ~VM_PROT_WRITE;
529 }
530
531 #endif /* MACH_KDB */
532 #endif /* STATIC_CONFIG */
533
534 interruptible_state = thread_interrupt_level(interruptible);
535
536 /*
537 * INVARIANTS (through entire routine):
538 *
539 * 1) At all times, we must either have the object
540 * lock or a busy page in some object to prevent
541 * some other thread from trying to bring in
542 * the same page.
543 *
544 * Note that we cannot hold any locks during the
545 * pager access or when waiting for memory, so
546 * we use a busy page then.
547 *
548 * Note also that we aren't as concerned about more than
549 * one thread attempting to memory_object_data_unlock
550 * the same page at once, so we don't hold the page
551 * as busy then, but do record the highest unlock
552 * value so far. [Unlock requests may also be delivered
553 * out of order.]
554 *
555 * 2) To prevent another thread from racing us down the
556 * shadow chain and entering a new page in the top
557 * object before we do, we must keep a busy page in
558 * the top object while following the shadow chain.
559 *
560 * 3) We must increment paging_in_progress on any object
561 * for which we have a busy page
562 *
563 * 4) We leave busy pages on the pageout queues.
564 * If the pageout daemon comes across a busy page,
565 * it will remove the page from the pageout queues.
566 */
567
568 /*
569 * Search for the page at object/offset.
570 */
571
572 object = first_object;
573 offset = first_offset;
574 first_m = VM_PAGE_NULL;
575 access_required = fault_type;
576
577 XPR(XPR_VM_FAULT,
578 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
579 (integer_t)object, offset, fault_type, *protection, 0);
580
581 /*
582 * See whether this page is resident
583 */
584
585 while (TRUE) {
586 #if TRACEFAULTPAGE
587 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
588 #endif
589 if (!object->alive) {
590 vm_fault_cleanup(object, first_m);
591 thread_interrupt_level(interruptible_state);
592 return(VM_FAULT_MEMORY_ERROR);
593 }
594 m = vm_page_lookup(object, offset);
595 #if TRACEFAULTPAGE
596 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
597 #endif
598 if (m != VM_PAGE_NULL) {
599 /*
600 * If the page was pre-paged as part of a
601 * cluster, record the fact.
602 * If we were passed a valid pointer for
 603  *                       * "type_of_fault", then we came from
604 * vm_fault... we'll let it deal with
605 * this condition, since it
606 * needs to see m->clustered to correctly
607 * account the pageins.
608 */
609 if (type_of_fault == NULL && m->clustered) {
610 vm_pagein_cluster_used++;
611 m->clustered = FALSE;
612 }
613
614 /*
615 * If the page is being brought in,
616 * wait for it and then retry.
617 *
618 * A possible optimization: if the page
619 * is known to be resident, we can ignore
620 * pages that are absent (regardless of
621 * whether they're busy).
622 */
623
624 if (m->busy) {
625 #if TRACEFAULTPAGE
626 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
627 #endif
628 wait_result = PAGE_SLEEP(object, m, interruptible);
629 XPR(XPR_VM_FAULT,
630 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
631 (integer_t)object, offset,
632 (integer_t)m, 0, 0);
633 counter(c_vm_fault_page_block_busy_kernel++);
634
635 if (wait_result != THREAD_AWAKENED) {
636 vm_fault_cleanup(object, first_m);
637 thread_interrupt_level(interruptible_state);
638 if (wait_result == THREAD_RESTART)
639 {
640 return(VM_FAULT_RETRY);
641 }
642 else
643 {
644 return(VM_FAULT_INTERRUPTED);
645 }
646 }
647 continue;
648 }
649
650 if (m->encrypted) {
651 /*
652 * ENCRYPTED SWAP:
653 * the user needs access to a page that we
654 * encrypted before paging it out.
655 * Decrypt the page now.
656 * Keep it busy to prevent anyone from
657 * accessing it during the decryption.
658 */
659 m->busy = TRUE;
660 vm_page_decrypt(m, 0);
661 assert(object == m->object);
662 assert(m->busy);
663 PAGE_WAKEUP_DONE(m);
664
665 /*
666 * Retry from the top, in case
667 * something changed while we were
668 * decrypting.
669 */
670 continue;
671 }
672 ASSERT_PAGE_DECRYPTED(m);
673
674 /*
675 * If the page is in error, give up now.
676 */
677
678 if (m->error) {
679 #if TRACEFAULTPAGE
680 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
681 #endif
682 if (error_code)
683 *error_code = m->page_error;
684 VM_PAGE_FREE(m);
685 vm_fault_cleanup(object, first_m);
686 thread_interrupt_level(interruptible_state);
687 return(VM_FAULT_MEMORY_ERROR);
688 }
689
690 /*
691 * If the pager wants us to restart
692 * at the top of the chain,
693 * typically because it has moved the
694 * page to another pager, then do so.
695 */
696
697 if (m->restart) {
698 #if TRACEFAULTPAGE
699 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
700 #endif
701 VM_PAGE_FREE(m);
702 vm_fault_cleanup(object, first_m);
703 thread_interrupt_level(interruptible_state);
704 return(VM_FAULT_RETRY);
705 }
706
707 /*
708 * If the page isn't busy, but is absent,
709 * then it was deemed "unavailable".
710 */
711
712 if (m->absent) {
713 /*
714 * Remove the non-existent page (unless it's
715 * in the top object) and move on down to the
716 * next object (if there is one).
717 */
718 #if TRACEFAULTPAGE
719 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
720 #endif
721
722 next_object = object->shadow;
723 if (next_object == VM_OBJECT_NULL) {
724 vm_page_t real_m;
725
726 assert(!must_be_resident);
727
728 if (object->shadow_severed) {
729 vm_fault_cleanup(
730 object, first_m);
731 thread_interrupt_level(interruptible_state);
732 return VM_FAULT_MEMORY_ERROR;
733 }
734
735 /*
736 * Absent page at bottom of shadow
737 * chain; zero fill the page we left
738 * busy in the first object, and flush
739 * the absent page. But first we
740 * need to allocate a real page.
741 */
742 if (VM_PAGE_THROTTLED() ||
743 (real_m = vm_page_grab())
744 == VM_PAGE_NULL) {
745 vm_fault_cleanup(
746 object, first_m);
747 thread_interrupt_level(
748 interruptible_state);
749 return(
750 VM_FAULT_MEMORY_SHORTAGE);
751 }
752
753 /*
 754  *                               * Are we protecting the system from
 755  *                               * backing store exhaustion?  If so,
 756  *                               * sleep unless we are privileged.
757 */
758
759 if(vm_backing_store_low) {
760 if(!(current_task()->priv_flags
761 & VM_BACKING_STORE_PRIV)) {
762 assert_wait((event_t)
763 &vm_backing_store_low,
764 THREAD_UNINT);
765 vm_fault_cleanup(object,
766 first_m);
767 thread_block(THREAD_CONTINUE_NULL);
768 thread_interrupt_level(
769 interruptible_state);
770 return(VM_FAULT_RETRY);
771 }
772 }
773
774
775 XPR(XPR_VM_FAULT,
776 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
777 (integer_t)object, offset,
778 (integer_t)m,
779 (integer_t)first_object, 0);
780 if (object != first_object) {
781 VM_PAGE_FREE(m);
782 vm_object_paging_end(object);
783 vm_object_unlock(object);
784 object = first_object;
785 offset = first_offset;
786 m = first_m;
787 first_m = VM_PAGE_NULL;
788 vm_object_lock(object);
789 }
790
791 VM_PAGE_FREE(m);
792 assert(real_m->busy);
793 vm_page_insert(real_m, object, offset);
794 m = real_m;
795
796 /*
797 * Drop the lock while zero filling
798 * page. Then break because this
799 * is the page we wanted. Checking
800 * the page lock is a waste of time;
801 * this page was either absent or
802 * newly allocated -- in both cases
803 * it can't be page locked by a pager.
804 */
805 m->no_isync = FALSE;
806
807 if (!no_zero_fill) {
808 vm_object_unlock(object);
809 vm_page_zero_fill(m);
810 vm_object_lock(object);
811
812 if (type_of_fault)
813 *type_of_fault = DBG_ZERO_FILL_FAULT;
814 VM_STAT(zero_fill_count++);
815 }
816 if (bumped_pagein == TRUE) {
817 VM_STAT(pageins--);
818 current_task()->pageins--;
819 }
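/*
 * Place the freshly zero-filled page on the zero-fill queue
 * (for objects larger than 2MB) or on the inactive queue, and
 * advance the page ticket used to age these pages.
 */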
820 vm_page_lock_queues();
821 VM_PAGE_QUEUES_REMOVE(m);
822 m->page_ticket = vm_page_ticket;
823 assert(!m->laundry);
824 assert(m->object != kernel_object);
825 assert(m->pageq.next == NULL &&
826 m->pageq.prev == NULL);
827 if(m->object->size > 0x200000) {
828 m->zero_fill = TRUE;
829 /* depends on the queues lock */
830 vm_zf_count += 1;
831 queue_enter(&vm_page_queue_zf,
832 m, vm_page_t, pageq);
833 } else {
834 queue_enter(
835 &vm_page_queue_inactive,
836 m, vm_page_t, pageq);
837 }
838 vm_page_ticket_roll++;
839 if(vm_page_ticket_roll ==
840 VM_PAGE_TICKETS_IN_ROLL) {
841 vm_page_ticket_roll = 0;
842 if(vm_page_ticket ==
843 VM_PAGE_TICKET_ROLL_IDS)
844 vm_page_ticket= 0;
845 else
846 vm_page_ticket++;
847 }
848 m->inactive = TRUE;
849 vm_page_inactive_count++;
850 vm_page_unlock_queues();
851 break;
852 } else {
853 if (must_be_resident) {
854 vm_object_paging_end(object);
855 } else if (object != first_object) {
856 vm_object_paging_end(object);
857 VM_PAGE_FREE(m);
858 } else {
859 first_m = m;
860 m->absent = FALSE;
861 m->unusual = FALSE;
862 vm_object_absent_release(object);
863 m->busy = TRUE;
864
865 vm_page_lock_queues();
866 VM_PAGE_QUEUES_REMOVE(m);
867 vm_page_unlock_queues();
868 }
869 XPR(XPR_VM_FAULT,
870 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
871 (integer_t)object, offset,
872 (integer_t)next_object,
873 offset+object->shadow_offset,0);
874 offset += object->shadow_offset;
875 hi_offset += object->shadow_offset;
876 lo_offset += object->shadow_offset;
877 access_required = VM_PROT_READ;
878 vm_object_lock(next_object);
879 vm_object_unlock(object);
880 object = next_object;
881 vm_object_paging_begin(object);
882 continue;
883 }
884 }
885
886 if ((m->cleaning)
887 && ((object != first_object) ||
888 (object->copy != VM_OBJECT_NULL))
889 && (fault_type & VM_PROT_WRITE)) {
890 /*
891 * This is a copy-on-write fault that will
892 * cause us to revoke access to this page, but
893 * this page is in the process of being cleaned
894 * in a clustered pageout. We must wait until
895 * the cleaning operation completes before
896 * revoking access to the original page,
897 * otherwise we might attempt to remove a
898 * wired mapping.
899 */
900 #if TRACEFAULTPAGE
901 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
902 #endif
903 XPR(XPR_VM_FAULT,
904 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
905 (integer_t)object, offset,
906 (integer_t)m, 0, 0);
907 /* take an extra ref so that object won't die */
908 assert(object->ref_count > 0);
909 object->ref_count++;
910 vm_object_res_reference(object);
911 vm_fault_cleanup(object, first_m);
912 counter(c_vm_fault_page_block_backoff_kernel++);
913 vm_object_lock(object);
914 assert(object->ref_count > 0);
915 m = vm_page_lookup(object, offset);
916 if (m != VM_PAGE_NULL && m->cleaning) {
917 PAGE_ASSERT_WAIT(m, interruptible);
918 vm_object_unlock(object);
919 wait_result = thread_block(THREAD_CONTINUE_NULL);
920 vm_object_deallocate(object);
921 goto backoff;
922 } else {
923 vm_object_unlock(object);
924 vm_object_deallocate(object);
925 thread_interrupt_level(interruptible_state);
926 return VM_FAULT_RETRY;
927 }
928 }
929
930 /*
931 * If the desired access to this page has
932 * been locked out, request that it be unlocked.
933 */
934
935 if (access_required & m->page_lock) {
936 if ((access_required & m->unlock_request) != access_required) {
937 vm_prot_t new_unlock_request;
938 kern_return_t rc;
939
940 #if TRACEFAULTPAGE
941 dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */
942 #endif
943 if (!object->pager_ready) {
944 XPR(XPR_VM_FAULT,
945 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
946 access_required,
947 (integer_t)object, offset,
948 (integer_t)m, 0);
949 /* take an extra ref */
950 assert(object->ref_count > 0);
951 object->ref_count++;
952 vm_object_res_reference(object);
953 vm_fault_cleanup(object,
954 first_m);
955 counter(c_vm_fault_page_block_backoff_kernel++);
956 vm_object_lock(object);
957 assert(object->ref_count > 0);
958 if (!object->pager_ready) {
959 wait_result = vm_object_assert_wait(
960 object,
961 VM_OBJECT_EVENT_PAGER_READY,
962 interruptible);
963 vm_object_unlock(object);
964 if (wait_result == THREAD_WAITING)
965 wait_result = thread_block(THREAD_CONTINUE_NULL);
966 vm_object_deallocate(object);
967 goto backoff;
968 } else {
969 vm_object_unlock(object);
970 vm_object_deallocate(object);
971 thread_interrupt_level(interruptible_state);
972 return VM_FAULT_RETRY;
973 }
974 }
975
976 new_unlock_request = m->unlock_request =
977 (access_required | m->unlock_request);
978 vm_object_unlock(object);
979 XPR(XPR_VM_FAULT,
980 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
981 (integer_t)object, offset,
982 (integer_t)m, new_unlock_request, 0);
983 if ((rc = memory_object_data_unlock(
984 object->pager,
985 offset + object->paging_offset,
986 PAGE_SIZE,
987 new_unlock_request))
988 != KERN_SUCCESS) {
989 if (vm_fault_debug)
990 printf("vm_fault: memory_object_data_unlock failed\n");
991 vm_object_lock(object);
992 vm_fault_cleanup(object, first_m);
993 thread_interrupt_level(interruptible_state);
994 return((rc == MACH_SEND_INTERRUPTED) ?
995 VM_FAULT_INTERRUPTED :
996 VM_FAULT_MEMORY_ERROR);
997 }
998 vm_object_lock(object);
999 continue;
1000 }
1001
1002 XPR(XPR_VM_FAULT,
1003 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
1004 access_required, (integer_t)object,
1005 offset, (integer_t)m, 0);
1006 /* take an extra ref so object won't die */
1007 assert(object->ref_count > 0);
1008 object->ref_count++;
1009 vm_object_res_reference(object);
1010 vm_fault_cleanup(object, first_m);
1011 counter(c_vm_fault_page_block_backoff_kernel++);
1012 vm_object_lock(object);
1013 assert(object->ref_count > 0);
1014 m = vm_page_lookup(object, offset);
1015 if (m != VM_PAGE_NULL &&
1016 (access_required & m->page_lock) &&
1017 !((access_required & m->unlock_request) != access_required)) {
1018 PAGE_ASSERT_WAIT(m, interruptible);
1019 vm_object_unlock(object);
1020 wait_result = thread_block(THREAD_CONTINUE_NULL);
1021 vm_object_deallocate(object);
1022 goto backoff;
1023 } else {
1024 vm_object_unlock(object);
1025 vm_object_deallocate(object);
1026 thread_interrupt_level(interruptible_state);
1027 return VM_FAULT_RETRY;
1028 }
1029 }
1030 /*
1031 * We mark the page busy and leave it on
1032 * the pageout queues. If the pageout
 1033  *               * daemon comes across it, then it will
1034 * remove the page.
1035 */
1036
1037 #if TRACEFAULTPAGE
1038 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1039 #endif
1040
1041 #if !VM_FAULT_STATIC_CONFIG
1042 if (!software_reference_bits) {
1043 vm_page_lock_queues();
1044 if (m->inactive)
1045 vm_stat.reactivations++;
1046
1047 VM_PAGE_QUEUES_REMOVE(m);
1048 vm_page_unlock_queues();
1049 }
1050 #endif
1051 XPR(XPR_VM_FAULT,
1052 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1053 (integer_t)object, offset, (integer_t)m, 0, 0);
1054 assert(!m->busy);
1055 m->busy = TRUE;
1056 assert(!m->absent);
1057 break;
1058 }
1059
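/*
 * The page is not resident in this object.  Decide whether to ask
 * the pager for it: only if a pager has been created, the existence
 * map (if any) does not say the page is absent from backing store,
 * and the caller is not supplying the data itself.
 */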
1060 look_for_page =
1061 (object->pager_created) &&
1062 LOOK_FOR(object, offset) &&
1063 (!data_supply);
1064
1065 #if TRACEFAULTPAGE
1066 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1067 #endif
1068 if ((look_for_page || (object == first_object))
1069 && !must_be_resident
1070 && !(object->phys_contiguous)) {
1071 /*
1072 * Allocate a new page for this object/offset
1073 * pair.
1074 */
1075
1076 m = vm_page_grab_fictitious();
1077 #if TRACEFAULTPAGE
1078 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1079 #endif
1080 if (m == VM_PAGE_NULL) {
1081 vm_fault_cleanup(object, first_m);
1082 thread_interrupt_level(interruptible_state);
1083 return(VM_FAULT_FICTITIOUS_SHORTAGE);
1084 }
1085 vm_page_insert(m, object, offset);
1086 }
1087
1088 if ((look_for_page && !must_be_resident)) {
1089 kern_return_t rc;
1090
1091 /*
1092 * If the memory manager is not ready, we
1093 * cannot make requests.
1094 */
1095 if (!object->pager_ready) {
1096 #if TRACEFAULTPAGE
1097 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1098 #endif
1099 if(m != VM_PAGE_NULL)
1100 VM_PAGE_FREE(m);
1101 XPR(XPR_VM_FAULT,
1102 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1103 (integer_t)object, offset, 0, 0, 0);
1104 /* take an extra ref so object won't die */
1105 assert(object->ref_count > 0);
1106 object->ref_count++;
1107 vm_object_res_reference(object);
1108 vm_fault_cleanup(object, first_m);
1109 counter(c_vm_fault_page_block_backoff_kernel++);
1110 vm_object_lock(object);
1111 assert(object->ref_count > 0);
1112 if (!object->pager_ready) {
1113 wait_result = vm_object_assert_wait(object,
1114 VM_OBJECT_EVENT_PAGER_READY,
1115 interruptible);
1116 vm_object_unlock(object);
1117 if (wait_result == THREAD_WAITING)
1118 wait_result = thread_block(THREAD_CONTINUE_NULL);
1119 vm_object_deallocate(object);
1120 goto backoff;
1121 } else {
1122 vm_object_unlock(object);
1123 vm_object_deallocate(object);
1124 thread_interrupt_level(interruptible_state);
1125 return VM_FAULT_RETRY;
1126 }
1127 }
1128
1129 if(object->phys_contiguous) {
1130 if(m != VM_PAGE_NULL) {
1131 VM_PAGE_FREE(m);
1132 m = VM_PAGE_NULL;
1133 }
1134 goto no_clustering;
1135 }
1136 if (object->internal) {
1137 /*
1138 * Requests to the default pager
1139 * must reserve a real page in advance,
 1140  *                        * because the pager's data-provided
 1141  *                        * path won't block for pages. IMPORTANT:
1142 * this acts as a throttling mechanism
1143 * for data_requests to the default
1144 * pager.
1145 */
1146
1147 #if TRACEFAULTPAGE
1148 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1149 #endif
1150 if (m->fictitious && !vm_page_convert(m)) {
1151 VM_PAGE_FREE(m);
1152 vm_fault_cleanup(object, first_m);
1153 thread_interrupt_level(interruptible_state);
1154 return(VM_FAULT_MEMORY_SHORTAGE);
1155 }
1156 } else if (object->absent_count >
1157 vm_object_absent_max) {
1158 /*
1159 * If there are too many outstanding page
1160 * requests pending on this object, we
1161 * wait for them to be resolved now.
1162 */
1163
1164 #if TRACEFAULTPAGE
1165 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1166 #endif
1167 if(m != VM_PAGE_NULL)
1168 VM_PAGE_FREE(m);
1169 /* take an extra ref so object won't die */
1170 assert(object->ref_count > 0);
1171 object->ref_count++;
1172 vm_object_res_reference(object);
1173 vm_fault_cleanup(object, first_m);
1174 counter(c_vm_fault_page_block_backoff_kernel++);
1175 vm_object_lock(object);
1176 assert(object->ref_count > 0);
1177 if (object->absent_count > vm_object_absent_max) {
1178 vm_object_absent_assert_wait(object,
1179 interruptible);
1180 vm_object_unlock(object);
1181 wait_result = thread_block(THREAD_CONTINUE_NULL);
1182 vm_object_deallocate(object);
1183 goto backoff;
1184 } else {
1185 vm_object_unlock(object);
1186 vm_object_deallocate(object);
1187 thread_interrupt_level(interruptible_state);
1188 return VM_FAULT_RETRY;
1189 }
1190 }
1191
1192 /*
1193 * Indicate that the page is waiting for data
1194 * from the memory manager.
1195 */
1196
1197 if(m != VM_PAGE_NULL) {
1198
1199 m->list_req_pending = TRUE;
1200 m->absent = TRUE;
1201 m->unusual = TRUE;
1202 object->absent_count++;
1203
1204 }
1205
1206 no_clustering:
1207 cluster_start = offset;
1208 length = PAGE_SIZE;
1209
1210 /*
1211 * lengthen the cluster by the pages in the working set
1212 */
1213 if((map != NULL) &&
1214 (current_task()->dynamic_working_set != 0)) {
1215 cluster_end = cluster_start + length;
 1216                          /* tws values for start and end are just
 1217                           * suggestions. Therefore, as long as
1218 * build_cluster does not use pointers or
1219 * take action based on values that
1220 * could be affected by re-entrance we
1221 * do not need to take the map lock.
1222 */
1223 cluster_end = offset + PAGE_SIZE_64;
1224 tws_build_cluster(
1225 current_task()->dynamic_working_set,
1226 object, &cluster_start,
1227 &cluster_end, 0x40000);
1228 length = cluster_end - cluster_start;
1229 }
1230 #if TRACEFAULTPAGE
1231 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1232 #endif
1233 /*
1234 * We have a busy page, so we can
1235 * release the object lock.
1236 */
1237 vm_object_unlock(object);
1238
1239 /*
1240 * Call the memory manager to retrieve the data.
1241 */
1242
1243 if (type_of_fault)
1244 *type_of_fault = ((int)length << 8) | DBG_PAGEIN_FAULT;
1245 VM_STAT(pageins++);
1246 current_task()->pageins++;
1247 bumped_pagein = TRUE;
1248
1249 /*
1250 * If this object uses a copy_call strategy,
1251 * and we are interested in a copy of this object
1252 * (having gotten here only by following a
1253 * shadow chain), then tell the memory manager
1254 * via a flag added to the desired_access
1255 * parameter, so that it can detect a race
1256 * between our walking down the shadow chain
1257 * and its pushing pages up into a copy of
1258 * the object that it manages.
1259 */
1260
1261 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1262 object != first_object) {
1263 wants_copy_flag = VM_PROT_WANTS_COPY;
1264 } else {
1265 wants_copy_flag = VM_PROT_NONE;
1266 }
1267
1268 XPR(XPR_VM_FAULT,
1269 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1270 (integer_t)object, offset, (integer_t)m,
1271 access_required | wants_copy_flag, 0);
1272
1273 rc = memory_object_data_request(object->pager,
1274 cluster_start + object->paging_offset,
1275 length,
1276 access_required | wants_copy_flag);
1277
1278
1279 #if TRACEFAULTPAGE
1280 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1281 #endif
1282 if (rc != KERN_SUCCESS) {
1283 if (rc != MACH_SEND_INTERRUPTED
1284 && vm_fault_debug)
 1285                                          printf("%s(0x%x, 0x%llx, 0x%llx, 0x%x) failed, rc=%d\n",
1286 "memory_object_data_request",
1287 object->pager,
1288 cluster_start + object->paging_offset,
1289 length, access_required, rc);
1290 /*
1291 * Don't want to leave a busy page around,
1292 * but the data request may have blocked,
1293 * so check if it's still there and busy.
1294 */
1295 if(!object->phys_contiguous) {
1296 vm_object_lock(object);
1297 for (; length; length -= PAGE_SIZE,
1298 cluster_start += PAGE_SIZE_64) {
1299 vm_page_t p;
1300 if ((p = vm_page_lookup(object,
1301 cluster_start))
1302 && p->absent && p->busy
1303 && p != first_m) {
1304 VM_PAGE_FREE(p);
1305 }
1306 }
1307 }
1308 vm_fault_cleanup(object, first_m);
1309 thread_interrupt_level(interruptible_state);
1310 return((rc == MACH_SEND_INTERRUPTED) ?
1311 VM_FAULT_INTERRUPTED :
1312 VM_FAULT_MEMORY_ERROR);
1313 }
1314
1315 vm_object_lock(object);
1316 if ((interruptible != THREAD_UNINT) &&
1317 (current_thread()->state & TH_ABORT)) {
1318 vm_fault_cleanup(object, first_m);
1319 thread_interrupt_level(interruptible_state);
1320 return(VM_FAULT_INTERRUPTED);
1321 }
1322 if (m == VM_PAGE_NULL &&
1323 object->phys_contiguous) {
1324 /*
1325 * No page here means that the object we
1326 * initially looked up was "physically
1327 * contiguous" (i.e. device memory). However,
1328 * with Virtual VRAM, the object might not
1329 * be backed by that device memory anymore,
1330 * so we're done here only if the object is
1331 * still "phys_contiguous".
1332 * Otherwise, if the object is no longer
1333 * "phys_contiguous", we need to retry the
1334 * page fault against the object's new backing
1335 * store (different memory object).
1336 */
1337 break;
1338 }
1339
1340 /*
1341 * Retry with same object/offset, since new data may
1342 * be in a different page (i.e., m is meaningless at
1343 * this point).
1344 */
1345 continue;
1346 }
1347
1348 /*
1349 * The only case in which we get here is if
 1350  * object has no pager (or we are unwiring). If the pager doesn't
1351 * have the page this is handled in the m->absent case above
1352 * (and if you change things here you should look above).
1353 */
1354 #if TRACEFAULTPAGE
1355 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1356 #endif
1357 if (object == first_object)
1358 first_m = m;
1359 else
1360 assert(m == VM_PAGE_NULL);
1361
1362 XPR(XPR_VM_FAULT,
1363 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1364 (integer_t)object, offset, (integer_t)m,
1365 (integer_t)object->shadow, 0);
1366 /*
1367 * Move on to the next object. Lock the next
1368 * object before unlocking the current one.
1369 */
1370 next_object = object->shadow;
1371 if (next_object == VM_OBJECT_NULL) {
1372 assert(!must_be_resident);
1373 /*
1374 * If there's no object left, fill the page
1375 * in the top object with zeros. But first we
1376 * need to allocate a real page.
1377 */
1378
1379 if (object != first_object) {
1380 vm_object_paging_end(object);
1381 vm_object_unlock(object);
1382
1383 object = first_object;
1384 offset = first_offset;
1385 vm_object_lock(object);
1386 }
1387
1388 m = first_m;
1389 assert(m->object == object);
1390 first_m = VM_PAGE_NULL;
1391
1392 if(m == VM_PAGE_NULL) {
1393 m = vm_page_grab();
1394 if (m == VM_PAGE_NULL) {
1395 vm_fault_cleanup(
1396 object, VM_PAGE_NULL);
1397 thread_interrupt_level(
1398 interruptible_state);
1399 return(VM_FAULT_MEMORY_SHORTAGE);
1400 }
1401 vm_page_insert(
1402 m, object, offset);
1403 }
1404
1405 if (object->shadow_severed) {
1406 VM_PAGE_FREE(m);
1407 vm_fault_cleanup(object, VM_PAGE_NULL);
1408 thread_interrupt_level(interruptible_state);
1409 return VM_FAULT_MEMORY_ERROR;
1410 }
1411
1412 /*
 1413  *                       * Are we protecting the system from
 1414  *                       * backing store exhaustion?  If so,
 1415  *                       * sleep unless we are privileged.
1416 */
1417
1418 if(vm_backing_store_low) {
1419 if(!(current_task()->priv_flags
1420 & VM_BACKING_STORE_PRIV)) {
1421 assert_wait((event_t)
1422 &vm_backing_store_low,
1423 THREAD_UNINT);
1424 VM_PAGE_FREE(m);
1425 vm_fault_cleanup(object, VM_PAGE_NULL);
1426 thread_block(THREAD_CONTINUE_NULL);
1427 thread_interrupt_level(
1428 interruptible_state);
1429 return(VM_FAULT_RETRY);
1430 }
1431 }
1432
1433 if (VM_PAGE_THROTTLED() ||
1434 (m->fictitious && !vm_page_convert(m))) {
1435 VM_PAGE_FREE(m);
1436 vm_fault_cleanup(object, VM_PAGE_NULL);
1437 thread_interrupt_level(interruptible_state);
1438 return(VM_FAULT_MEMORY_SHORTAGE);
1439 }
1440 m->no_isync = FALSE;
1441
1442 if (!no_zero_fill) {
1443 vm_object_unlock(object);
1444 vm_page_zero_fill(m);
1445 vm_object_lock(object);
1446
1447 if (type_of_fault)
1448 *type_of_fault = DBG_ZERO_FILL_FAULT;
1449 VM_STAT(zero_fill_count++);
1450 }
1451 if (bumped_pagein == TRUE) {
1452 VM_STAT(pageins--);
1453 current_task()->pageins--;
1454 }
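/*
 * As in the absent-page case above: queue the zero-filled page on
 * the zero-fill or inactive queue and advance the page ticket.
 */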
1455 vm_page_lock_queues();
1456 VM_PAGE_QUEUES_REMOVE(m);
1457 assert(!m->laundry);
1458 assert(m->object != kernel_object);
1459 assert(m->pageq.next == NULL &&
1460 m->pageq.prev == NULL);
1461 if(m->object->size > 0x200000) {
1462 m->zero_fill = TRUE;
1463 /* depends on the queues lock */
1464 vm_zf_count += 1;
1465 queue_enter(&vm_page_queue_zf,
1466 m, vm_page_t, pageq);
1467 } else {
1468 queue_enter(
1469 &vm_page_queue_inactive,
1470 m, vm_page_t, pageq);
1471 }
1472 m->page_ticket = vm_page_ticket;
1473 vm_page_ticket_roll++;
1474 if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1475 vm_page_ticket_roll = 0;
1476 if(vm_page_ticket ==
1477 VM_PAGE_TICKET_ROLL_IDS)
1478 vm_page_ticket= 0;
1479 else
1480 vm_page_ticket++;
1481 }
1482 m->inactive = TRUE;
1483 vm_page_inactive_count++;
1484 vm_page_unlock_queues();
1485 #if 0
1486 pmap_clear_modify(m->phys_page);
1487 #endif
1488 break;
1489 }
1490 else {
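/*
 * There is a shadow object to search.  End this object's paging
 * operation where appropriate, shift the offsets by shadow_offset,
 * downgrade the required access to read, take a paging reference
 * on the shadow, and continue the lookup there.
 */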
1491 if ((object != first_object) || must_be_resident)
1492 vm_object_paging_end(object);
1493 offset += object->shadow_offset;
1494 hi_offset += object->shadow_offset;
1495 lo_offset += object->shadow_offset;
1496 access_required = VM_PROT_READ;
1497 vm_object_lock(next_object);
1498 vm_object_unlock(object);
1499 object = next_object;
1500 vm_object_paging_begin(object);
1501 }
1502 }
1503
1504 /*
1505 * PAGE HAS BEEN FOUND.
1506 *
1507 * This page (m) is:
1508 * busy, so that we can play with it;
1509 * not absent, so that nobody else will fill it;
1510 * possibly eligible for pageout;
1511 *
1512 * The top-level page (first_m) is:
1513 * VM_PAGE_NULL if the page was found in the
1514 * top-level object;
1515 * busy, not absent, and ineligible for pageout.
1516 *
1517 * The current object (object) is locked. A paging
1518 * reference is held for the current and top-level
1519 * objects.
1520 */
1521
1522 #if TRACEFAULTPAGE
1523 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1524 #endif
1525 #if EXTRA_ASSERTIONS
1526 if(m != VM_PAGE_NULL) {
1527 assert(m->busy && !m->absent);
1528 assert((first_m == VM_PAGE_NULL) ||
1529 (first_m->busy && !first_m->absent &&
1530 !first_m->active && !first_m->inactive));
1531 }
1532 #endif /* EXTRA_ASSERTIONS */
1533
1534 /*
1535 * ENCRYPTED SWAP:
1536 * If we found a page, we must have decrypted it before we
1537 * get here...
1538 */
1539 if (m != VM_PAGE_NULL) {
1540 ASSERT_PAGE_DECRYPTED(m);
1541 }
1542
1543 XPR(XPR_VM_FAULT,
1544 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1545 (integer_t)object, offset, (integer_t)m,
1546 (integer_t)first_object, (integer_t)first_m);
1547 /*
1548 * If the page is being written, but isn't
1549 * already owned by the top-level object,
1550 * we have to copy it into a new page owned
1551 * by the top-level object.
1552 */
1553
1554 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1555 /*
1556 * We only really need to copy if we
1557 * want to write it.
1558 */
1559
1560 #if TRACEFAULTPAGE
1561 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1562 #endif
1563 if (fault_type & VM_PROT_WRITE) {
1564 vm_page_t copy_m;
1565
1566 assert(!must_be_resident);
1567
1568 /*
 1569  *                       * Are we protecting the system from
 1570  *                       * backing store exhaustion?  If so,
 1571  *                       * sleep unless we are privileged.
1572 */
1573
1574 if(vm_backing_store_low) {
1575 if(!(current_task()->priv_flags
1576 & VM_BACKING_STORE_PRIV)) {
1577 assert_wait((event_t)
1578 &vm_backing_store_low,
1579 THREAD_UNINT);
1580 RELEASE_PAGE(m);
1581 vm_fault_cleanup(object, first_m);
1582 thread_block(THREAD_CONTINUE_NULL);
1583 thread_interrupt_level(
1584 interruptible_state);
1585 return(VM_FAULT_RETRY);
1586 }
1587 }
1588
1589 /*
1590 * If we try to collapse first_object at this
1591 * point, we may deadlock when we try to get
1592 * the lock on an intermediate object (since we
1593 * have the bottom object locked). We can't
1594 * unlock the bottom object, because the page
1595 * we found may move (by collapse) if we do.
1596 *
1597 * Instead, we first copy the page. Then, when
1598 * we have no more use for the bottom object,
1599 * we unlock it and try to collapse.
1600 *
1601 * Note that we copy the page even if we didn't
1602 * need to... that's the breaks.
1603 */
1604
1605 /*
1606 * Allocate a page for the copy
1607 */
1608 copy_m = vm_page_grab();
1609 if (copy_m == VM_PAGE_NULL) {
1610 RELEASE_PAGE(m);
1611 vm_fault_cleanup(object, first_m);
1612 thread_interrupt_level(interruptible_state);
1613 return(VM_FAULT_MEMORY_SHORTAGE);
1614 }
1615
1616
1617 XPR(XPR_VM_FAULT,
1618 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1619 (integer_t)object, offset,
1620 (integer_t)m, (integer_t)copy_m, 0);
1621 vm_page_copy(m, copy_m);
1622
1623 /*
1624 * If another map is truly sharing this
1625 * page with us, we have to flush all
1626 * uses of the original page, since we
1627 * can't distinguish those which want the
1628 * original from those which need the
1629 * new copy.
1630 *
1631 * XXXO If we know that only one map has
1632 * access to this page, then we could
1633 * avoid the pmap_disconnect() call.
1634 */
1635
1636 vm_page_lock_queues();
1637 assert(!m->cleaning);
1638 pmap_disconnect(m->phys_page);
1639 vm_page_deactivate(m);
1640 copy_m->dirty = TRUE;
1641 /*
1642 * Setting reference here prevents this fault from
1643 * being counted as a (per-thread) reactivate as well
1644 * as a copy-on-write.
1645 */
1646 first_m->reference = TRUE;
1647 vm_page_unlock_queues();
1648
1649 /*
1650 * We no longer need the old page or object.
1651 */
1652
1653 PAGE_WAKEUP_DONE(m);
1654 vm_object_paging_end(object);
1655 vm_object_unlock(object);
1656
1657 if (type_of_fault)
1658 *type_of_fault = DBG_COW_FAULT;
1659 VM_STAT(cow_faults++);
1660 current_task()->cow_faults++;
1661 object = first_object;
1662 offset = first_offset;
1663
1664 vm_object_lock(object);
1665 VM_PAGE_FREE(first_m);
1666 first_m = VM_PAGE_NULL;
1667 assert(copy_m->busy);
1668 vm_page_insert(copy_m, object, offset);
1669 m = copy_m;
1670
1671 /*
1672 * Now that we've gotten the copy out of the
1673 * way, let's try to collapse the top object.
1674 * But we have to play ugly games with
1675 * paging_in_progress to do that...
1676 */
1677
1678 vm_object_paging_end(object);
1679 vm_object_collapse(object, offset);
1680 vm_object_paging_begin(object);
1681
1682 }
1683 else {
1684 *protection &= (~VM_PROT_WRITE);
1685 }
1686 }
1687
1688 /*
1689 * Now check whether the page needs to be pushed into the
1690 * copy object. The use of asymmetric copy on write for
1691 * shared temporary objects means that we may do two copies to
1692 * satisfy the fault; one above to get the page from a
1693 * shadowed object, and one here to push it into the copy.
1694 */
1695
1696 while ((copy_object = first_object->copy) != VM_OBJECT_NULL &&
1697 (m!= VM_PAGE_NULL)) {
1698 vm_object_offset_t copy_offset;
1699 vm_page_t copy_m;
1700
1701 #if TRACEFAULTPAGE
1702 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1703 #endif
1704 /*
1705 * If the page is being written, but hasn't been
1706 * copied to the copy-object, we have to copy it there.
1707 */
1708
1709 if ((fault_type & VM_PROT_WRITE) == 0) {
1710 *protection &= ~VM_PROT_WRITE;
1711 break;
1712 }
1713
1714 /*
1715 * If the page was guaranteed to be resident,
1716 * we must have already performed the copy.
1717 */
1718
1719 if (must_be_resident)
1720 break;
1721
1722 /*
1723 * Try to get the lock on the copy_object.
1724 */
1725 if (!vm_object_lock_try(copy_object)) {
1726 vm_object_unlock(object);
1727
1728 mutex_pause(); /* wait a bit */
1729
1730 vm_object_lock(object);
1731 continue;
1732 }
1733
1734 /*
1735 * Make another reference to the copy-object,
1736 * to keep it from disappearing during the
1737 * copy.
1738 */
1739 assert(copy_object->ref_count > 0);
1740 copy_object->ref_count++;
1741 VM_OBJ_RES_INCR(copy_object);
1742
1743 /*
1744 * Does the page exist in the copy?
1745 */
1746 copy_offset = first_offset - copy_object->shadow_offset;
1747 if (copy_object->size <= copy_offset)
1748 /*
1749 * Copy object doesn't cover this page -- do nothing.
1750 */
1751 ;
1752 else if ((copy_m =
1753 vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1754 /* Page currently exists in the copy object */
1755 if (copy_m->busy) {
1756 /*
1757 * If the page is being brought
1758 * in, wait for it and then retry.
1759 */
1760 RELEASE_PAGE(m);
1761 /* take an extra ref so object won't die */
1762 assert(copy_object->ref_count > 0);
1763 copy_object->ref_count++;
1764 vm_object_res_reference(copy_object);
1765 vm_object_unlock(copy_object);
1766 vm_fault_cleanup(object, first_m);
1767 counter(c_vm_fault_page_block_backoff_kernel++);
1768 vm_object_lock(copy_object);
1769 assert(copy_object->ref_count > 0);
1770 VM_OBJ_RES_DECR(copy_object);
1771 copy_object->ref_count--;
1772 assert(copy_object->ref_count > 0);
1773 copy_m = vm_page_lookup(copy_object, copy_offset);
1774 /*
1775 * ENCRYPTED SWAP:
1776 * it's OK if the "copy_m" page is encrypted,
1777 * because we're not moving it nor handling its
1778 * contents.
1779 */
1780 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1781 PAGE_ASSERT_WAIT(copy_m, interruptible);
1782 vm_object_unlock(copy_object);
1783 wait_result = thread_block(THREAD_CONTINUE_NULL);
1784 vm_object_deallocate(copy_object);
1785 goto backoff;
1786 } else {
1787 vm_object_unlock(copy_object);
1788 vm_object_deallocate(copy_object);
1789 thread_interrupt_level(interruptible_state);
1790 return VM_FAULT_RETRY;
1791 }
1792 }
1793 }
1794 else if (!PAGED_OUT(copy_object, copy_offset)) {
1795 /*
1796 * If PAGED_OUT is TRUE, then the page used to exist
1797 * in the copy-object, and has already been paged out.
1798 * We don't need to repeat this. If PAGED_OUT is
1799 * FALSE, then either we don't know (!pager_created,
1800 * for example) or it hasn't been paged out.
1801 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1802 * We must copy the page to the copy object.
1803 */
1804
1805 /*
 1806  *                       * Are we protecting the system from
 1807  *                       * backing store exhaustion?  If so,
 1808  *                       * sleep unless we are privileged.
1809 */
1810
1811 if(vm_backing_store_low) {
1812 if(!(current_task()->priv_flags
1813 & VM_BACKING_STORE_PRIV)) {
1814 assert_wait((event_t)
1815 &vm_backing_store_low,
1816 THREAD_UNINT);
1817 RELEASE_PAGE(m);
1818 VM_OBJ_RES_DECR(copy_object);
1819 copy_object->ref_count--;
1820 assert(copy_object->ref_count > 0);
1821 vm_object_unlock(copy_object);
1822 vm_fault_cleanup(object, first_m);
1823 thread_block(THREAD_CONTINUE_NULL);
1824 thread_interrupt_level(
1825 interruptible_state);
1826 return(VM_FAULT_RETRY);
1827 }
1828 }
1829
1830 /*
1831 * Allocate a page for the copy
1832 */
1833 copy_m = vm_page_alloc(copy_object, copy_offset);
1834 if (copy_m == VM_PAGE_NULL) {
1835 RELEASE_PAGE(m);
1836 VM_OBJ_RES_DECR(copy_object);
1837 copy_object->ref_count--;
1838 assert(copy_object->ref_count > 0);
1839 vm_object_unlock(copy_object);
1840 vm_fault_cleanup(object, first_m);
1841 thread_interrupt_level(interruptible_state);
1842 return(VM_FAULT_MEMORY_SHORTAGE);
1843 }
1844
1845 /*
1846 * Must copy page into copy-object.
1847 */
1848
1849 vm_page_copy(m, copy_m);
1850
1851 /*
1852 * If the old page was in use by any users
1853 * of the copy-object, it must be removed
1854 * from all pmaps. (We can't know which
1855 * pmaps use it.)
1856 */
1857
1858 vm_page_lock_queues();
1859 assert(!m->cleaning);
1860 pmap_disconnect(m->phys_page);
1861 copy_m->dirty = TRUE;
1862 vm_page_unlock_queues();
1863
1864 /*
1865 * If there's a pager, then immediately
1866 * page out this page, using the "initialize"
1867 * option. Else, we use the copy.
1868 */
1869
1870 if
1871 #if MACH_PAGEMAP
1872 ((!copy_object->pager_created) ||
1873 vm_external_state_get(
1874 copy_object->existence_map, copy_offset)
1875 == VM_EXTERNAL_STATE_ABSENT)
1876 #else
1877 (!copy_object->pager_created)
1878 #endif
1879 {
1880 vm_page_lock_queues();
1881 vm_page_activate(copy_m);
1882 vm_page_unlock_queues();
1883 PAGE_WAKEUP_DONE(copy_m);
1884 }
1885 else {
1886 assert(copy_m->busy == TRUE);
1887
1888 /*
1889 * The page is already ready for pageout:
1890 * not on pageout queues and busy.
1891 * Unlock everything except the
1892 * copy_object itself.
1893 */
1894
1895 vm_object_unlock(object);
1896
1897 /*
1898 * Write the page to the copy-object,
1899 * flushing it from the kernel.
1900 */
1901
1902 vm_pageout_initialize_page(copy_m);
1903
1904 /*
1905 * Since the pageout may have
1906 * temporarily dropped the
1907 * copy_object's lock, we
1908 * check whether we'll have
1909 * to deallocate the hard way.
1910 */
1911
1912 if ((copy_object->shadow != object) ||
1913 (copy_object->ref_count == 1)) {
1914 vm_object_unlock(copy_object);
1915 vm_object_deallocate(copy_object);
1916 vm_object_lock(object);
1917 continue;
1918 }
1919
1920 /*
1921 * Pick back up the old object's
1922 * lock. [It is safe to do so,
1923 * since it must be deeper in the
1924 * object tree.]
1925 */
1926
1927 vm_object_lock(object);
1928 }
1929
1930 /*
1931 * Because we're pushing a page upward
1932 * in the object tree, we must restart
1933 * any faults that are waiting here.
1934 * [Note that this is an expansion of
1935 * PAGE_WAKEUP that uses the THREAD_RESTART
1936 * wait result]. Can't turn off the page's
1937 * busy bit because we're not done with it.
1938 */
1939
1940 if (m->wanted) {
1941 m->wanted = FALSE;
1942 thread_wakeup_with_result((event_t) m,
1943 THREAD_RESTART);
1944 }
1945 }
1946
1947 /*
1948 * The reference count on copy_object must be
1949 * at least 2: one for our extra reference,
1950 * and at least one from the outside world
1951 * (we checked that when we last locked
1952 * copy_object).
1953 */
1954 copy_object->ref_count--;
1955 assert(copy_object->ref_count > 0);
1956 VM_OBJ_RES_DECR(copy_object);
1957 vm_object_unlock(copy_object);
1958
1959 break;
1960 }
1961
1962 *result_page = m;
1963 *top_page = first_m;
1964
1965 XPR(XPR_VM_FAULT,
1966 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1967 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1968 /*
1969 * If the page can be written, assume that it will be.
1970 * [Earlier, we restricted the permission to allow write
1971 * access only if the fault so required, so we don't
1972 * mark read-only data as dirty.]
1973 */
1974
1975
1976 if(m != VM_PAGE_NULL) {
1977 #if !VM_FAULT_STATIC_CONFIG
1978 if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE))
1979 m->dirty = TRUE;
1980 #endif
1981 if (vm_page_deactivate_behind)
1982 vm_fault_deactivate_behind(object, offset, behavior);
1983 } else {
1984 vm_object_unlock(object);
1985 }
1986 thread_interrupt_level(interruptible_state);
1987
1988 #if TRACEFAULTPAGE
1989 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1990 #endif
1991 return(VM_FAULT_SUCCESS);
1992
1993 #if 0
1994 block_and_backoff:
1995 vm_fault_cleanup(object, first_m);
1996
1997 counter(c_vm_fault_page_block_backoff_kernel++);
1998 thread_block(THREAD_CONTINUE_NULL);
1999 #endif
2000
2001 backoff:
2002 thread_interrupt_level(interruptible_state);
2003 if (wait_result == THREAD_INTERRUPTED)
2004 return VM_FAULT_INTERRUPTED;
2005 return VM_FAULT_RETRY;
2006
2007 #undef RELEASE_PAGE
2008 }
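/*
 * Note on the success path above: *result_page comes back busy
 * with its object locked, and *top_page (if any) is a placeholder
 * that the caller must release; the callers below finish with
 * PAGE_WAKEUP_DONE() on the page and vm_fault_cleanup() on the
 * object/top_page pair.
 */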
2009
2010 /*
2011 * Routine: vm_fault_tws_insert
2012 * Purpose:
2013 * Add fault information to the task working set.
2014 * Implementation:
2015 * We always insert the base object/offset pair
2016 * rather than the actual object/offset.
2017 * Assumptions:
2018 * Map and real_map locked.
2019 * Object locked and referenced.
2020 * Returns:
2021 * TRUE if the startup file should be written.
2022 * The object is returned locked and still referenced,
2023 * but the object lock may be dropped temporarily.
2024 */
2025 static boolean_t
2026 vm_fault_tws_insert(
2027 vm_map_t map,
2028 vm_map_t real_map,
2029 vm_map_offset_t vaddr,
2030 vm_object_t object,
2031 vm_object_offset_t offset)
2032 {
2033 tws_hash_line_t line;
2034 task_t task;
2035 kern_return_t kr;
2036 boolean_t result = FALSE;
2037
2038 /* Avoid possible map lock deadlock issues */
2039 if (map == kernel_map || map == kalloc_map ||
2040 real_map == kernel_map || real_map == kalloc_map)
2041 return result;
2042
2043 task = current_task();
2044 if (task->dynamic_working_set != 0) {
2045 vm_object_t base_object;
2046 vm_object_t base_shadow;
2047 vm_object_offset_t base_offset;
2048 base_object = object;
2049 base_offset = offset;
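 /*
 * Walk to the bottom of the shadow chain, accumulating
 * shadow_offset at each level, so the working set records
 * the stable base object/offset rather than a transient
 * copy-on-write shadow.
 */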
2050 while ((base_shadow = base_object->shadow)) {
2051 vm_object_lock(base_shadow);
2052 vm_object_unlock(base_object);
2053 base_offset +=
2054 base_object->shadow_offset;
2055 base_object = base_shadow;
2056 }
2057 kr = tws_lookup(
2058 task->dynamic_working_set,
2059 base_offset, base_object,
2060 &line);
2061 if (kr == KERN_OPERATION_TIMED_OUT){
2062 result = TRUE;
2063 if (base_object != object) {
2064 vm_object_unlock(base_object);
2065 vm_object_lock(object);
2066 }
2067 } else if (kr != KERN_SUCCESS) {
2068 if(base_object != object)
2069 vm_object_reference_locked(base_object);
2070 kr = tws_insert(
2071 task->dynamic_working_set,
2072 base_offset, base_object,
2073 vaddr, real_map);
2074 if(base_object != object) {
2075 vm_object_unlock(base_object);
2076 vm_object_deallocate(base_object);
2077 }
2078 if(kr == KERN_NO_SPACE) {
2079 if (base_object == object)
2080 vm_object_unlock(object);
2081 tws_expand_working_set(
2082 task->dynamic_working_set,
2083 TWS_HASH_LINE_COUNT,
2084 FALSE);
2085 if (base_object == object)
2086 vm_object_lock(object);
2087 } else if(kr == KERN_OPERATION_TIMED_OUT) {
2088 result = TRUE;
2089 }
2090 if(base_object != object)
2091 vm_object_lock(object);
2092 } else if (base_object != object) {
2093 vm_object_unlock(base_object);
2094 vm_object_lock(object);
2095 }
2096 }
2097 return result;
2098 }
2099
2100 /*
2101 * Routine: vm_fault
2102 * Purpose:
2103 * Handle page faults, including pseudo-faults
2104 * used to change the wiring status of pages.
2105 * Returns:
2106 * Explicit continuations have been removed.
2107 * Implementation:
2108 * vm_fault and vm_fault_page save mucho state
2109 * in the moral equivalent of a closure. The state
2110 * structure is allocated when first entering vm_fault
2111 * and deallocated when leaving vm_fault.
2112 */
2113
2114 extern int _map_enter_debug;
2115
2116 kern_return_t
2117 vm_fault(
2118 vm_map_t map,
2119 vm_map_offset_t vaddr,
2120 vm_prot_t fault_type,
2121 boolean_t change_wiring,
2122 int interruptible,
2123 pmap_t caller_pmap,
2124 vm_map_offset_t caller_pmap_addr)
2125 {
2126 vm_map_version_t version; /* Map version for verification */
2127 boolean_t wired; /* Should mapping be wired down? */
2128 vm_object_t object; /* Top-level object */
2129 vm_object_offset_t offset; /* Top-level offset */
2130 vm_prot_t prot; /* Protection for mapping */
2131 vm_behavior_t behavior; /* Expected paging behavior */
2132 vm_map_offset_t lo_offset, hi_offset;
2133 vm_object_t old_copy_object; /* Saved copy object */
2134 vm_page_t result_page; /* Result of vm_fault_page */
2135 vm_page_t top_page; /* Placeholder page */
2136 kern_return_t kr;
2137
2138 register
2139 vm_page_t m; /* Fast access to result_page */
2140 kern_return_t error_code = 0; /* page error reasons */
2141 register
2142 vm_object_t cur_object;
2143 register
2144 vm_object_offset_t cur_offset;
2145 vm_page_t cur_m;
2146 vm_object_t new_object;
2147 int type_of_fault;
2148 vm_map_t real_map = map;
2149 vm_map_t original_map = map;
2150 pmap_t pmap = NULL;
2151 boolean_t interruptible_state;
2152 unsigned int cache_attr;
2153 int write_startup_file = 0;
2154 boolean_t need_activation;
2155 vm_prot_t full_fault_type;
2156
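 /*
 * Fault handling may block (page waits, pager activity), so
 * refuse to handle a fault taken with preemption disabled.
 */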
2157 if (get_preemption_level() != 0)
2158 return (KERN_FAILURE);
2159
2160 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
2161 vaddr,
2162 0,
2163 0,
2164 0,
2165 0);
2166
2167 /* at present we do not fully check for execute permission */
2168 /* we generally treat it as read except in certain device */
2169 /* memory settings */
2170 full_fault_type = fault_type;
2171 if(fault_type & VM_PROT_EXECUTE) {
2172 fault_type &= ~VM_PROT_EXECUTE;
2173 fault_type |= VM_PROT_READ;
2174 }
2175
2176 interruptible_state = thread_interrupt_level(interruptible);
2177
2178 /*
2179 * Assume we will hit a page in the cache;
2180 * otherwise, explicitly override with
2181 * the real fault type once we determine it.
2182 */
2183 type_of_fault = DBG_CACHE_HIT_FAULT;
2184
2185 VM_STAT(faults++);
2186 current_task()->faults++;
2187
2188 RetryFault: ;
2189
2190 /*
2191 * Find the backing store object and offset into
2192 * it to begin the search.
2193 */
2194 map = original_map;
2195 vm_map_lock_read(map);
2196 kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
2197 &object, &offset,
2198 &prot, &wired,
2199 &behavior, &lo_offset, &hi_offset, &real_map);
2200
2201 //if (_map_enter_debug)printf("vm_map_lookup_locked(map=0x%x, addr=0x%llx, prot=%d wired=%d) = %d\n", map, vaddr, prot, wired, kr);
2202
2203 pmap = real_map->pmap;
2204
2205 if (kr != KERN_SUCCESS) {
2206 vm_map_unlock_read(map);
2207 goto done;
2208 }
2209
2210 /*
2211 * If the page is wired, we must fault for the current protection
2212 * value, to avoid further faults.
2213 */
2214
2215 if (wired)
2216 fault_type = prot | VM_PROT_WRITE;
2217
2218 #if VM_FAULT_CLASSIFY
2219 /*
2220 * Temporary data gathering code
2221 */
2222 vm_fault_classify(object, offset, fault_type);
2223 #endif
2224 /*
2225 * Fast fault code. The basic idea is to do as much as
2226 * possible while holding the map lock and object locks.
2227 * Busy pages are not used until the object lock has to
2228 * be dropped to do something (copy, zero fill, pmap enter).
2229 * Similarly, paging references aren't acquired until that
2230 * point, and object references aren't used.
2231 *
2232 * If we can figure out what to do
2233 * (zero fill, copy on write, pmap enter) while holding
2234 * the locks, then it gets done. Otherwise, we give up,
2235 * and use the original fault path (which doesn't hold
2236 * the map lock, and relies on busy pages).
2237 * The give up cases include:
2238 * - Have to talk to pager.
2239 * - Page is busy, absent or in error.
2240 * - Pager has locked out desired access.
2241 * - Fault needs to be restarted.
2242 * - Have to push page into copy object.
2243 *
2244 * The code is an infinite loop that moves one level down
2245 * the shadow chain each time. cur_object and cur_offset
2246 * refer to the current object being examined. object and offset
2247 * are the original object from the map. The loop is at the
2248 * top level if and only if object and cur_object are the same.
2249 *
2250 * Invariants: Map lock is held throughout. Lock is held on
2251 * original object and cur_object (if different) when
2252 * continuing or exiting loop.
2253 *
2254 */
2255
2256
2257 /*
2258 * If this page is to be inserted in a copy delay object
2259 * for writing, and if the object has a copy, then the
2260 * copy delay strategy is implemented in the slow fault path.
2261 */
2262 if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2263 object->copy == VM_OBJECT_NULL ||
2264 (fault_type & VM_PROT_WRITE) == 0) {
2265 cur_object = object;
2266 cur_offset = offset;
2267
2268 while (TRUE) {
2269 m = vm_page_lookup(cur_object, cur_offset);
2270 if (m != VM_PAGE_NULL) {
2271 if (m->busy) {
2272 wait_result_t result;
2273
2274 if (object != cur_object)
2275 vm_object_unlock(object);
2276
2277 vm_map_unlock_read(map);
2278 if (real_map != map)
2279 vm_map_unlock(real_map);
2280
2281 #if !VM_FAULT_STATIC_CONFIG
2282 if (!vm_fault_interruptible)
2283 interruptible = THREAD_UNINT;
2284 #endif
2285 result = PAGE_ASSERT_WAIT(m, interruptible);
2286
2287 vm_object_unlock(cur_object);
2288
2289 if (result == THREAD_WAITING) {
2290 result = thread_block(THREAD_CONTINUE_NULL);
2291
2292 counter(c_vm_fault_page_block_busy_kernel++);
2293 }
2294 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2295 goto RetryFault;
2296
2297 kr = KERN_ABORTED;
2298 goto done;
2299 }
2300 if (m->unusual && (m->error || m->restart || m->private
2301 || m->absent || (fault_type & m->page_lock))) {
2302
2303 /*
2304 * Unusual case. Give up.
2305 */
2306 break;
2307 }
2308
2309 if (m->encrypted) {
2310 /*
2311 * ENCRYPTED SWAP:
2312 * We've soft-faulted (because it's not in the page
2313 * table) on an encrypted page.
2314 * Keep the page "busy" so that no one messes with
2315 * it during the decryption.
2316 * Release the extra locks we're holding, keep only
2317 * the page's VM object lock.
2318 */
2319 m->busy = TRUE;
2320 if (object != cur_object) {
2321 vm_object_unlock(object);
2322 }
2323 vm_map_unlock_read(map);
2324 if (real_map != map)
2325 vm_map_unlock(real_map);
2326
2327 vm_page_decrypt(m, 0);
2328
2329 assert(m->busy);
2330 PAGE_WAKEUP_DONE(m);
2331 vm_object_unlock(m->object);
2332
2333 /*
2334 * Retry from the top, in case anything
2335 * changed while we were decrypting...
2336 */
2337 goto RetryFault;
2338 }
2339 ASSERT_PAGE_DECRYPTED(m);
2340
2341 /*
2342 * Two cases of map in faults:
2343 * - At top level w/o copy object.
2344 * - Read fault anywhere.
2345 * --> must disallow write.
2346 */
2347
2348 if (object == cur_object &&
2349 object->copy == VM_OBJECT_NULL)
2350 goto FastMapInFault;
2351
2352 if ((fault_type & VM_PROT_WRITE) == 0) {
2353 boolean_t sequential;
2354
2355 prot &= ~VM_PROT_WRITE;
2356
2357 /*
2358 * Set up to map the page ...
2359 * mark the page busy, drop
2360 * locks and take a paging reference
2361 * on the object with the page.
2362 */
2363
2364 if (object != cur_object) {
2365 vm_object_unlock(object);
2366 object = cur_object;
2367 }
2368 FastMapInFault:
2369 m->busy = TRUE;
2370
2371 vm_object_paging_begin(object);
2372
2373 FastPmapEnter:
2374 /*
2375 * Check a couple of global reasons to
2376 * be conservative about write access.
2377 * Then do the pmap_enter.
2378 */
2379 #if !VM_FAULT_STATIC_CONFIG
2380 if (vm_fault_dirty_handling
2381 #if MACH_KDB
2382 || db_watchpoint_list
2383 #endif
2384 && (fault_type & VM_PROT_WRITE) == 0)
2385 prot &= ~VM_PROT_WRITE;
2386 #else /* STATIC_CONFIG */
2387 #if MACH_KDB
2388 if (db_watchpoint_list
2389 && (fault_type & VM_PROT_WRITE) == 0)
2390 prot &= ~VM_PROT_WRITE;
2391 #endif /* MACH_KDB */
2392 #endif /* STATIC_CONFIG */
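 /*
 * Pick up the cache attributes (WIMG bits) from the page's
 * object so PMAP_ENTER maps it with the appropriate caching
 * policy (e.g. cache-inhibited for device memory).
 */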
2393 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2394
2395 sequential = FALSE;
2396 need_activation = FALSE;
2397
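 /*
 * no_isync means the page's data has not yet been
 * synchronized with the instruction cache; do that once,
 * on first fault-in, before entering the mapping.
 */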
2398 if (m->no_isync == TRUE) {
2399 m->no_isync = FALSE;
2400 pmap_sync_page_data_phys(m->phys_page);
2401
2402 if ((type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2403 /*
2404 * found it in the cache, but this
2405 * is the first fault-in of the page (no_isync == TRUE)
2406 * so it must have come in as part of
2407 * a cluster... account 1 pagein against it
2408 */
2409 VM_STAT(pageins++);
2410 current_task()->pageins++;
2411 type_of_fault = DBG_PAGEIN_FAULT;
2412 sequential = TRUE;
2413 }
2414 if (m->clustered)
2415 need_activation = TRUE;
2416
2417 } else if (cache_attr != VM_WIMG_DEFAULT) {
2418 pmap_sync_page_attributes_phys(m->phys_page);
2419 }
2420
2421 if(caller_pmap) {
2422 PMAP_ENTER(caller_pmap,
2423 caller_pmap_addr, m,
2424 prot, cache_attr, wired);
2425 } else {
2426 PMAP_ENTER(pmap, vaddr, m,
2427 prot, cache_attr, wired);
2428 }
2429
2430 /*
2431 * Hold queues lock to manipulate
2432 * the page queues. Change wiring
2433 * case is obvious. In soft ref bits
2434 * case activate page only if it fell
2435 * off paging queues, otherwise just
2436 * activate it if it's inactive.
2437 *
2438 * NOTE: original vm_fault code will
2439 * move active page to back of active
2440 * queue. This code doesn't.
2441 */
2442 vm_page_lock_queues();
2443
2444 if (m->clustered) {
2445 vm_pagein_cluster_used++;
2446 m->clustered = FALSE;
2447 }
2448 m->reference = TRUE;
2449
2450 if (change_wiring) {
2451 if (wired)
2452 vm_page_wire(m);
2453 else
2454 vm_page_unwire(m);
2455 }
2456 #if VM_FAULT_STATIC_CONFIG
2457 else {
2458 if ((!m->active && !m->inactive) || ((need_activation == TRUE) && !m->active))
2459 vm_page_activate(m);
2460 }
2461 #else
2462 else if (software_reference_bits) {
2463 if (!m->active && !m->inactive)
2464 vm_page_activate(m);
2465 }
2466 else if (!m->active) {
2467 vm_page_activate(m);
2468 }
2469 #endif
2470 vm_page_unlock_queues();
2471
2472 /*
2473 * That's it, clean up and return.
2474 */
2475 PAGE_WAKEUP_DONE(m);
2476
2477 sequential = (sequential && vm_page_deactivate_behind) ?
2478 vm_fault_deactivate_behind(object, cur_offset, behavior) :
2479 FALSE;
2480
2481 /*
2482 * Add non-sequential pages to the working set.
2483 * The sequential pages will be brought in through
2484 * normal clustering behavior.
2485 */
2486 if (!sequential && !object->private) {
2487 write_startup_file =
2488 vm_fault_tws_insert(map, real_map, vaddr,
2489 object, cur_offset);
2490 }
2491
2492 vm_object_paging_end(object);
2493 vm_object_unlock(object);
2494
2495 vm_map_unlock_read(map);
2496 if(real_map != map)
2497 vm_map_unlock(real_map);
2498
2499 if(write_startup_file)
2500 tws_send_startup_info(current_task());
2501
2502 thread_interrupt_level(interruptible_state);
2503
2504
2505 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2506 vaddr,
2507 type_of_fault & 0xff,
2508 KERN_SUCCESS,
2509 type_of_fault >> 8,
2510 0);
2511
2512 return KERN_SUCCESS;
2513 }
2514
2515 /*
2516 * Copy on write fault. If objects match, then
2517 * object->copy must not be NULL (else control
2518 * would be in previous code block), and we
2519 * have a potential push into the copy object
2520 * with which we won't cope here.
2521 */
2522
2523 if (cur_object == object)
2524 break;
2525 /*
2526 * This is now a shadow based copy on write
2527 * fault -- it requires a copy up the shadow
2528 * chain.
2529 *
2530 * Allocate a page in the original top level
2531 * object. Give up if allocate fails. Also
2532 * need to remember current page, as it's the
2533 * source of the copy.
2534 */
2535 cur_m = m;
2536 m = vm_page_grab();
2537 if (m == VM_PAGE_NULL) {
2538 break;
2539 }
2540 /*
2541 * Now do the copy. Mark the source busy
2542 * and take out paging references on both
2543 * objects.
2544 *
2545 * NOTE: This code holds the map lock across
2546 * the page copy.
2547 */
2548
2549 cur_m->busy = TRUE;
2550 vm_page_copy(cur_m, m);
2551 vm_page_insert(m, object, offset);
2552
2553 vm_object_paging_begin(cur_object);
2554 vm_object_paging_begin(object);
2555
2556 type_of_fault = DBG_COW_FAULT;
2557 VM_STAT(cow_faults++);
2558 current_task()->cow_faults++;
2559
2560 /*
2561 * Now cope with the source page and object.
2562 * If the top object has a ref count of 1
2563 * then no other map can access it, and hence
2564 * it's not necessary to do the pmap_disconnect.
2565 */
2566
2567 vm_page_lock_queues();
2568 vm_page_deactivate(cur_m);
2569 m->dirty = TRUE;
2570 pmap_disconnect(cur_m->phys_page);
2571 vm_page_unlock_queues();
2572
2573 PAGE_WAKEUP_DONE(cur_m);
2574 vm_object_paging_end(cur_object);
2575 vm_object_unlock(cur_object);
2576
2577 /*
2578 * Slight hack: call vm_object_collapse()
2579 * and then reuse the common map-in code.
2580 * Note that the object lock was taken above.
2581 */
2582
2583 vm_object_paging_end(object);
2584 vm_object_collapse(object, offset);
2585 vm_object_paging_begin(object);
2586
2587 goto FastPmapEnter;
2588 }
2589 else {
2590
2591 /*
2592 * No page at cur_object, cur_offset
2593 */
2594
2595 if (cur_object->pager_created) {
2596
2597 /*
2598 * Have to talk to the pager. Give up.
2599 */
2600 break;
2601 }
2602
2603
2604 if (cur_object->shadow == VM_OBJECT_NULL) {
2605
2606 if (cur_object->shadow_severed) {
2607 vm_object_paging_end(object);
2608 vm_object_unlock(object);
2609 vm_map_unlock_read(map);
2610 if(real_map != map)
2611 vm_map_unlock(real_map);
2612
2613 if(write_startup_file)
2614 tws_send_startup_info(
2615 current_task());
2616
2617 thread_interrupt_level(interruptible_state);
2618
2619 return KERN_MEMORY_ERROR;
2620 }
2621
2622 /*
2623 * Zero fill fault. Page gets
2624 * filled in top object. Insert
2625 * page, then drop any lower lock.
2626 * Give up if no page.
2627 */
2628 if (VM_PAGE_THROTTLED()) {
2629 break;
2630 }
2631
2632 /*
2633 * Are we protecting the system from
2634 * backing store exhaustion? If so,
2635 * sleep unless we are privileged.
2636 */
2637 if(vm_backing_store_low) {
2638 if(!(current_task()->priv_flags
2639 & VM_BACKING_STORE_PRIV))
2640 break;
2641 }
2642 m = vm_page_alloc(object, offset);
2643 if (m == VM_PAGE_NULL) {
2644 break;
2645 }
2646 /*
2647 * This is a zero-fill or initial fill
2648 * page fault. As such, we consider it
2649 * undefined with respect to instruction
2650 * execution; i.e., it is the responsibility
2651 * of higher layers to call for an instruction
2652 * sync after changing the contents and before
2653 * sending a program into this area. We
2654 * choose this approach for performance
2655 */
2656
2657 m->no_isync = FALSE;
2658
2659 if (cur_object != object)
2660 vm_object_unlock(cur_object);
2661
2662 vm_object_paging_begin(object);
2663 vm_object_unlock(object);
2664
2665 /*
2666 * Now zero fill the page and map it.
2667 * The page is probably going to
2668 * be written soon, so don't bother
2669 * to clear the modified bit
2670 *
2671 * NOTE: This code holds the map
2672 * lock across the zero fill.
2673 */
2674
2675 if (!map->no_zero_fill) {
2676 vm_page_zero_fill(m);
2677 type_of_fault = DBG_ZERO_FILL_FAULT;
2678 VM_STAT(zero_fill_count++);
2679 }
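 /*
 * Queue the freshly zero-filled page as inactive. Pages of
 * large (> 2MB) objects are tagged zero_fill and put on the
 * dedicated vm_page_queue_zf; the ticket counter appears to
 * batch these pages into groups of VM_PAGE_TICKETS_IN_ROLL
 * so the pageout code can age them in order.
 */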
2680 vm_page_lock_queues();
2681 VM_PAGE_QUEUES_REMOVE(m);
2682
2683 m->page_ticket = vm_page_ticket;
2684 assert(!m->laundry);
2685 assert(m->object != kernel_object);
2686 assert(m->pageq.next == NULL &&
2687 m->pageq.prev == NULL);
2688 if(m->object->size > 0x200000) {
2689 m->zero_fill = TRUE;
2690 /* depends on the queues lock */
2691 vm_zf_count += 1;
2692 queue_enter(&vm_page_queue_zf,
2693 m, vm_page_t, pageq);
2694 } else {
2695 queue_enter(
2696 &vm_page_queue_inactive,
2697 m, vm_page_t, pageq);
2698 }
2699 vm_page_ticket_roll++;
2700 if(vm_page_ticket_roll ==
2701 VM_PAGE_TICKETS_IN_ROLL) {
2702 vm_page_ticket_roll = 0;
2703 if(vm_page_ticket ==
2704 VM_PAGE_TICKET_ROLL_IDS)
2705 vm_page_ticket= 0;
2706 else
2707 vm_page_ticket++;
2708 }
2709
2710 m->inactive = TRUE;
2711 vm_page_inactive_count++;
2712 vm_page_unlock_queues();
2713 vm_object_lock(object);
2714
2715 goto FastPmapEnter;
2716 }
2717
2718 /*
2719 * On to the next level
2720 */
2721
2722 cur_offset += cur_object->shadow_offset;
2723 new_object = cur_object->shadow;
2724 vm_object_lock(new_object);
2725 if (cur_object != object)
2726 vm_object_unlock(cur_object);
2727 cur_object = new_object;
2728
2729 continue;
2730 }
2731 }
2732
2733 /*
2734 * Cleanup from fast fault failure. Drop any object
2735 * lock other than original and drop map lock.
2736 */
2737
2738 if (object != cur_object)
2739 vm_object_unlock(cur_object);
2740 }
2741 vm_map_unlock_read(map);
2742
2743 if(real_map != map)
2744 vm_map_unlock(real_map);
2745
2746 /*
2747 * Make a reference to this object to
2748 * prevent its disposal while we are messing with
2749 * it. Once we have the reference, the map is free
2750 * to be diddled. Since objects reference their
2751 * shadows (and copies), they will stay around as well.
2752 */
2753
2754 assert(object->ref_count > 0);
2755 object->ref_count++;
2756 vm_object_res_reference(object);
2757 vm_object_paging_begin(object);
2758
2759 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2760
2761 if (!object->private) {
2762 write_startup_file =
2763 vm_fault_tws_insert(map, real_map, vaddr, object, offset);
2764 }
2765
2766 kr = vm_fault_page(object, offset, fault_type,
2767 (change_wiring && !wired),
2768 interruptible,
2769 lo_offset, hi_offset, behavior,
2770 &prot, &result_page, &top_page,
2771 &type_of_fault,
2772 &error_code, map->no_zero_fill, FALSE, map, vaddr);
2773
2774 /*
2775 * If we didn't succeed, lose the object reference immediately.
2776 */
2777
2778 if (kr != VM_FAULT_SUCCESS)
2779 vm_object_deallocate(object);
2780
2781 /*
2782 * See why we failed, and take corrective action.
2783 */
2784
2785 switch (kr) {
2786 case VM_FAULT_SUCCESS:
2787 break;
2788 case VM_FAULT_MEMORY_SHORTAGE:
2789 if (vm_page_wait((change_wiring) ?
2790 THREAD_UNINT :
2791 THREAD_ABORTSAFE))
2792 goto RetryFault;
2793 /* fall thru */
2794 case VM_FAULT_INTERRUPTED:
2795 kr = KERN_ABORTED;
2796 goto done;
2797 case VM_FAULT_RETRY:
2798 goto RetryFault;
2799 case VM_FAULT_FICTITIOUS_SHORTAGE:
2800 vm_page_more_fictitious();
2801 goto RetryFault;
2802 case VM_FAULT_MEMORY_ERROR:
2803 if (error_code)
2804 kr = error_code;
2805 else
2806 kr = KERN_MEMORY_ERROR;
2807 goto done;
2808 }
2809
2810 m = result_page;
2811
2812 if(m != VM_PAGE_NULL) {
2813 assert((change_wiring && !wired) ?
2814 (top_page == VM_PAGE_NULL) :
2815 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2816 }
2817
2818 /*
2819 * How to clean up the result of vm_fault_page. This
2820 * happens whether the mapping is entered or not.
2821 */
2822
2823 #define UNLOCK_AND_DEALLOCATE \
2824 MACRO_BEGIN \
2825 vm_fault_cleanup(m->object, top_page); \
2826 vm_object_deallocate(object); \
2827 MACRO_END
2828
2829 /*
2830 * What to do with the resulting page from vm_fault_page
2831 * if it doesn't get entered into the physical map:
2832 */
2833
2834 #define RELEASE_PAGE(m) \
2835 MACRO_BEGIN \
2836 PAGE_WAKEUP_DONE(m); \
2837 vm_page_lock_queues(); \
2838 if (!m->active && !m->inactive) \
2839 vm_page_activate(m); \
2840 vm_page_unlock_queues(); \
2841 MACRO_END
2842
2843 /*
2844 * We must verify that the maps have not changed
2845 * since our last lookup.
2846 */
2847
2848 if(m != VM_PAGE_NULL) {
2849 old_copy_object = m->object->copy;
2850 vm_object_unlock(m->object);
2851 } else {
2852 old_copy_object = VM_OBJECT_NULL;
2853 }
2854 if ((map != original_map) || !vm_map_verify(map, &version)) {
2855 vm_object_t retry_object;
2856 vm_object_offset_t retry_offset;
2857 vm_prot_t retry_prot;
2858
2859 /*
2860 * To avoid trying to write_lock the map while another
2861 * thread has it read_locked (in vm_map_pageable), we
2862 * do not try for write permission. If the page is
2863 * still writable, we will get write permission. If it
2864 * is not, or has been marked needs_copy, we enter the
2865 * mapping without write permission, and will merely
2866 * take another fault.
2867 */
2868 map = original_map;
2869 vm_map_lock_read(map);
2870 kr = vm_map_lookup_locked(&map, vaddr,
2871 fault_type & ~VM_PROT_WRITE, &version,
2872 &retry_object, &retry_offset, &retry_prot,
2873 &wired, &behavior, &lo_offset, &hi_offset,
2874 &real_map);
2875 pmap = real_map->pmap;
2876
2877 if (kr != KERN_SUCCESS) {
2878 vm_map_unlock_read(map);
2879 if(m != VM_PAGE_NULL) {
2880 vm_object_lock(m->object);
2881 RELEASE_PAGE(m);
2882 UNLOCK_AND_DEALLOCATE;
2883 } else {
2884 vm_object_deallocate(object);
2885 }
2886 goto done;
2887 }
2888
2889 vm_object_unlock(retry_object);
2890 if(m != VM_PAGE_NULL) {
2891 vm_object_lock(m->object);
2892 } else {
2893 vm_object_lock(object);
2894 }
2895
2896 if ((retry_object != object) ||
2897 (retry_offset != offset)) {
2898 vm_map_unlock_read(map);
2899 if(real_map != map)
2900 vm_map_unlock(real_map);
2901 if(m != VM_PAGE_NULL) {
2902 RELEASE_PAGE(m);
2903 UNLOCK_AND_DEALLOCATE;
2904 } else {
2905 vm_object_deallocate(object);
2906 }
2907 goto RetryFault;
2908 }
2909
2910 /*
2911 * Check whether the protection has changed or the object
2912 * has been copied while we left the map unlocked.
2913 */
2914 prot &= retry_prot;
2915 if(m != VM_PAGE_NULL) {
2916 vm_object_unlock(m->object);
2917 } else {
2918 vm_object_unlock(object);
2919 }
2920 }
2921 if(m != VM_PAGE_NULL) {
2922 vm_object_lock(m->object);
2923 } else {
2924 vm_object_lock(object);
2925 }
2926
2927 /*
2928 * If the copy object changed while the top-level object
2929 * was unlocked, then we must take away write permission.
2930 */
2931
2932 if(m != VM_PAGE_NULL) {
2933 if (m->object->copy != old_copy_object)
2934 prot &= ~VM_PROT_WRITE;
2935 }
2936
2937 /*
2938 * If we want to wire down this page, but no longer have
2939 * adequate permissions, we must start all over.
2940 */
2941
2942 if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2943 vm_map_verify_done(map, &version);
2944 if(real_map != map)
2945 vm_map_unlock(real_map);
2946 if(m != VM_PAGE_NULL) {
2947 RELEASE_PAGE(m);
2948 UNLOCK_AND_DEALLOCATE;
2949 } else {
2950 vm_object_deallocate(object);
2951 }
2952 goto RetryFault;
2953 }
2954
2955 /*
2956 * Put this page into the physical map.
2957 * We had to do the unlock above because pmap_enter
2958 * may cause other faults. The page may be on
2959 * the pageout queues. If the pageout daemon comes
2960 * across the page, it will remove it from the queues.
2961 */
2962 need_activation = FALSE;
2963
2964 if (m != VM_PAGE_NULL) {
2965 if (m->no_isync == TRUE) {
2966 pmap_sync_page_data_phys(m->phys_page);
2967
2968 if ((type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2969 /*
2970 * found it in the cache, but this
2971 * is the first fault-in of the page (no_isync == TRUE)
2972 * so it must have come in as part of
2973 * a cluster... account 1 pagein against it
2974 */
2975 VM_STAT(pageins++);
2976 current_task()->pageins++;
2977
2978 type_of_fault = DBG_PAGEIN_FAULT;
2979 }
2980 if (m->clustered) {
2981 need_activation = TRUE;
2982 }
2983 m->no_isync = FALSE;
2984 }
2985 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2986
2987 if(caller_pmap) {
2988 PMAP_ENTER(caller_pmap,
2989 caller_pmap_addr, m,
2990 prot, cache_attr, wired);
2991 } else {
2992 PMAP_ENTER(pmap, vaddr, m,
2993 prot, cache_attr, wired);
2994 }
2995
2996 /*
2997 * Add working set information for private objects here.
2998 */
2999 if (m->object->private) {
3000 write_startup_file =
3001 vm_fault_tws_insert(map, real_map, vaddr,
3002 m->object, m->offset);
3003 }
3004 } else {
3005
3006 #ifndef i386
3007 vm_map_entry_t entry;
3008 vm_map_offset_t laddr;
3009 vm_map_offset_t ldelta, hdelta;
3010
3011 /*
3012 * do a pmap block mapping from the physical address
3013 * in the object
3014 */
3015
3016 /* While we do not worry about execution protection in */
3017 /* general, certain pages may have instruction execution */
3018 /* disallowed. We will check here, and if not allowed */
3019 /* to execute, we return with a protection failure. */
3020
3021 if((full_fault_type & VM_PROT_EXECUTE) &&
3022 (!pmap_eligible_for_execute((ppnum_t)
3023 (object->shadow_offset >> 12)))) {
3024
3025 vm_map_verify_done(map, &version);
3026 if(real_map != map)
3027 vm_map_unlock(real_map);
3028 vm_fault_cleanup(object, top_page);
3029 vm_object_deallocate(object);
3030 kr = KERN_PROTECTION_FAILURE;
3031 goto done;
3032 }
3033
3034 if(real_map != map) {
3035 vm_map_unlock(real_map);
3036 }
3037 if (original_map != map) {
3038 vm_map_unlock_read(map);
3039 vm_map_lock_read(original_map);
3040 map = original_map;
3041 }
3042 real_map = map;
3043
3044 laddr = vaddr;
3045 hdelta = 0xFFFFF000;
3046 ldelta = 0xFFFFF000;
3047
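 /*
 * ldelta/hdelta bound how far below and above vaddr the block
 * mapping may extend while staying inside the map entry; they
 * start at 0xFFFFF000 and are clamped in the lookup loop below.
 * pmap_map_block() then maps [vaddr - ldelta, vaddr + hdelta)
 * as 4K pages, hence the ">> 12" conversions. For example, a
 * vaddr 0x3000 into a 0x10000-byte entry gives ldelta 0x3000
 * and hdelta 0xD000, i.e. 0x10 pages starting at the entry's
 * base address.
 */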
3048
3049 while(vm_map_lookup_entry(map, laddr, &entry)) {
3050 if(ldelta > (laddr - entry->vme_start))
3051 ldelta = laddr - entry->vme_start;
3052 if(hdelta > (entry->vme_end - laddr))
3053 hdelta = entry->vme_end - laddr;
3054 if(entry->is_sub_map) {
3055
3056 laddr = (laddr - entry->vme_start)
3057 + entry->offset;
3058 vm_map_lock_read(entry->object.sub_map);
3059 if(map != real_map)
3060 vm_map_unlock_read(map);
3061 if(entry->use_pmap) {
3062 vm_map_unlock_read(real_map);
3063 real_map = entry->object.sub_map;
3064 }
3065 map = entry->object.sub_map;
3066
3067 } else {
3068 break;
3069 }
3070 }
3071
3072 if(vm_map_lookup_entry(map, laddr, &entry) &&
3073 (entry->object.vm_object != NULL) &&
3074 (entry->object.vm_object == object)) {
3075
3076
3077 if(caller_pmap) {
3078 /* Set up a block mapped area */
3079 pmap_map_block(caller_pmap,
3080 (addr64_t)(caller_pmap_addr - ldelta),
3081 (((vm_map_offset_t)
3082 (entry->object.vm_object->shadow_offset))
3083 + entry->offset +
3084 (laddr - entry->vme_start)
3085 - ldelta) >> 12,
3086 ((ldelta + hdelta) >> 12), prot,
3087 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3088 } else {
3089 /* Set up a block mapped area */
3090 pmap_map_block(real_map->pmap,
3091 (addr64_t)(vaddr - ldelta),
3092 (((vm_map_offset_t)
3093 (entry->object.vm_object->shadow_offset))
3094 + entry->offset +
3095 (laddr - entry->vme_start) - ldelta) >> 12,
3096 ((ldelta + hdelta) >> 12), prot,
3097 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3098 }
3099 }
3100 #else
3101 #ifdef notyet
3102 if(caller_pmap) {
3103 pmap_enter(caller_pmap, caller_pmap_addr,
3104 object->shadow_offset>>12, prot, 0, TRUE);
3105 } else {
3106 pmap_enter(pmap, vaddr,
3107 object->shadow_offset>>12, prot, 0, TRUE);
3108 }
3109 /* Map it in */
3110 #endif
3111 #endif
3112
3113 }
3114
3115 /*
3116 * If the page is not wired down and isn't already
3117 * on a pageout queue, then put it where the
3118 * pageout daemon can find it.
3119 */
3120 if(m != VM_PAGE_NULL) {
3121 vm_page_lock_queues();
3122
3123 if (m->clustered) {
3124 vm_pagein_cluster_used++;
3125 m->clustered = FALSE;
3126 }
3127 m->reference = TRUE;
3128
3129 if (change_wiring) {
3130 if (wired)
3131 vm_page_wire(m);
3132 else
3133 vm_page_unwire(m);
3134 }
3135 #if VM_FAULT_STATIC_CONFIG
3136 else {
3137 if ((!m->active && !m->inactive) || ((need_activation == TRUE) && !m->active))
3138 vm_page_activate(m);
3139 }
3140 #else
3141 else if (software_reference_bits) {
3142 if (!m->active && !m->inactive)
3143 vm_page_activate(m);
3144 m->reference = TRUE;
3145 } else {
3146 vm_page_activate(m);
3147 }
3148 #endif
3149 vm_page_unlock_queues();
3150 }
3151
3152 /*
3153 * Unlock everything, and return
3154 */
3155
3156 vm_map_verify_done(map, &version);
3157 if(real_map != map)
3158 vm_map_unlock(real_map);
3159 if(m != VM_PAGE_NULL) {
3160 PAGE_WAKEUP_DONE(m);
3161 UNLOCK_AND_DEALLOCATE;
3162 } else {
3163 vm_fault_cleanup(object, top_page);
3164 vm_object_deallocate(object);
3165 }
3166 kr = KERN_SUCCESS;
3167
3168 #undef UNLOCK_AND_DEALLOCATE
3169 #undef RELEASE_PAGE
3170
3171 done:
3172 if(write_startup_file)
3173 tws_send_startup_info(current_task());
3174
3175 thread_interrupt_level(interruptible_state);
3176
3177 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
3178 vaddr,
3179 type_of_fault & 0xff,
3180 kr,
3181 type_of_fault >> 8,
3182 0);
3183
3184 return(kr);
3185 }
3186
3187 /*
3188 * vm_fault_wire:
3189 *
3190 * Wire down a range of virtual addresses in a map.
3191 */
3192 kern_return_t
3193 vm_fault_wire(
3194 vm_map_t map,
3195 vm_map_entry_t entry,
3196 pmap_t pmap,
3197 vm_map_offset_t pmap_addr)
3198 {
3199
3200 register vm_map_offset_t va;
3201 register vm_map_offset_t end_addr = entry->vme_end;
3202 register kern_return_t rc;
3203
3204 assert(entry->in_transition);
3205
3206 if ((entry->object.vm_object != NULL) &&
3207 !entry->is_sub_map &&
3208 entry->object.vm_object->phys_contiguous) {
3209 return KERN_SUCCESS;
3210 }
3211
3212 /*
3213 * Inform the physical mapping system that the
3214 * range of addresses may not fault, so that
3215 * page tables and such can be locked down as well.
3216 */
3217
3218 pmap_pageable(pmap, pmap_addr,
3219 pmap_addr + (end_addr - entry->vme_start), FALSE);
3220
3221 /*
3222 * We simulate a fault to get the page and enter it
3223 * in the physical map.
3224 */
3225
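 /*
 * vm_fault_wire_fast() handles the common case without a full
 * map lookup; anything unusual (busy, absent, encrypted, a
 * copy object on a write, ...) makes it fail and we fall back
 * to the regular vm_fault() path with change_wiring == TRUE.
 */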
3226 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3227 if ((rc = vm_fault_wire_fast(
3228 map, va, entry, pmap,
3229 pmap_addr + (va - entry->vme_start)
3230 )) != KERN_SUCCESS) {
3231 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3232 (pmap == kernel_pmap) ?
3233 THREAD_UNINT : THREAD_ABORTSAFE,
3234 pmap, pmap_addr + (va - entry->vme_start));
3235 }
3236
3237 if (rc != KERN_SUCCESS) {
3238 struct vm_map_entry tmp_entry = *entry;
3239
3240 /* unwire wired pages */
3241 tmp_entry.vme_end = va;
3242 vm_fault_unwire(map,
3243 &tmp_entry, FALSE, pmap, pmap_addr);
3244
3245 return rc;
3246 }
3247 }
3248 return KERN_SUCCESS;
3249 }
3250
3251 /*
3252 * vm_fault_unwire:
3253 *
3254 * Unwire a range of virtual addresses in a map.
3255 */
3256 void
3257 vm_fault_unwire(
3258 vm_map_t map,
3259 vm_map_entry_t entry,
3260 boolean_t deallocate,
3261 pmap_t pmap,
3262 vm_map_offset_t pmap_addr)
3263 {
3264 register vm_map_offset_t va;
3265 register vm_map_offset_t end_addr = entry->vme_end;
3266 vm_object_t object;
3267
3268 object = (entry->is_sub_map)
3269 ? VM_OBJECT_NULL : entry->object.vm_object;
3270
3271 /*
3272 * Since the pages are wired down, we must be able to
3273 * get their mappings from the physical map system.
3274 */
3275
3276 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3277 pmap_change_wiring(pmap,
3278 pmap_addr + (va - entry->vme_start), FALSE);
3279
3280 if (object == VM_OBJECT_NULL) {
3281 (void) vm_fault(map, va, VM_PROT_NONE,
3282 TRUE, THREAD_UNINT, pmap, pmap_addr);
3283 } else if (object->phys_contiguous) {
3284 continue;
3285 } else {
3286 vm_prot_t prot;
3287 vm_page_t result_page;
3288 vm_page_t top_page;
3289 vm_object_t result_object;
3290 vm_fault_return_t result;
3291
3292 do {
3293 prot = VM_PROT_NONE;
3294
3295 vm_object_lock(object);
3296 vm_object_paging_begin(object);
3297 XPR(XPR_VM_FAULT,
3298 "vm_fault_unwire -> vm_fault_page\n",
3299 0,0,0,0,0);
3300 result = vm_fault_page(object,
3301 entry->offset +
3302 (va - entry->vme_start),
3303 VM_PROT_NONE, TRUE,
3304 THREAD_UNINT,
3305 entry->offset,
3306 entry->offset +
3307 (entry->vme_end
3308 - entry->vme_start),
3309 entry->behavior,
3310 &prot,
3311 &result_page,
3312 &top_page,
3313 (int *)0,
3314 0, map->no_zero_fill,
3315 FALSE, NULL, 0);
3316 } while (result == VM_FAULT_RETRY);
3317
3318 if (result != VM_FAULT_SUCCESS)
3319 panic("vm_fault_unwire: failure");
3320
3321 result_object = result_page->object;
3322 if (deallocate) {
3323 assert(!result_page->fictitious);
3324 pmap_disconnect(result_page->phys_page);
3325 VM_PAGE_FREE(result_page);
3326 } else {
3327 vm_page_lock_queues();
3328 vm_page_unwire(result_page);
3329 vm_page_unlock_queues();
3330 PAGE_WAKEUP_DONE(result_page);
3331 }
3332
3333 vm_fault_cleanup(result_object, top_page);
3334 }
3335 }
3336
3337 /*
3338 * Inform the physical mapping system that the range
3339 * of addresses may fault, so that page tables and
3340 * such may be unwired themselves.
3341 */
3342
3343 pmap_pageable(pmap, pmap_addr,
3344 pmap_addr + (end_addr - entry->vme_start), TRUE);
3345
3346 }
3347
3348 /*
3349 * vm_fault_wire_fast:
3350 *
3351 * Handle common case of a wire down page fault at the given address.
3352 * If successful, the page is inserted into the associated physical map.
3353 * The map entry is passed in to avoid the overhead of a map lookup.
3354 *
3355 * NOTE: the given address should be truncated to the
3356 * proper page address.
3357 *
3358 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3359 * a standard error specifying why the fault is fatal is returned.
3360 *
3361 * The map in question must be referenced, and remains so.
3362 * Caller has a read lock on the map.
3363 *
3364 * This is a stripped version of vm_fault() for wiring pages. Anything
3365 * other than the common case will return KERN_FAILURE, and the caller
3366 * is expected to call vm_fault().
3367 */
3368 kern_return_t
3369 vm_fault_wire_fast(
3370 __unused vm_map_t map,
3371 vm_map_offset_t va,
3372 vm_map_entry_t entry,
3373 pmap_t pmap,
3374 vm_map_offset_t pmap_addr)
3375 {
3376 vm_object_t object;
3377 vm_object_offset_t offset;
3378 register vm_page_t m;
3379 vm_prot_t prot;
3380 thread_t thread = current_thread();
3381 unsigned int cache_attr;
3382
3383 VM_STAT(faults++);
3384
3385 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3386 thread->task->faults++;
3387
3388 /*
3389 * Recovery actions
3390 */
3391
3392 #undef RELEASE_PAGE
3393 #define RELEASE_PAGE(m) { \
3394 PAGE_WAKEUP_DONE(m); \
3395 vm_page_lock_queues(); \
3396 vm_page_unwire(m); \
3397 vm_page_unlock_queues(); \
3398 }
3399
3400
3401 #undef UNLOCK_THINGS
3402 #define UNLOCK_THINGS { \
3403 vm_object_paging_end(object); \
3404 vm_object_unlock(object); \
3405 }
3406
3407 #undef UNLOCK_AND_DEALLOCATE
3408 #define UNLOCK_AND_DEALLOCATE { \
3409 UNLOCK_THINGS; \
3410 vm_object_deallocate(object); \
3411 }
3412 /*
3413 * Give up and have caller do things the hard way.
3414 */
3415
3416 #define GIVE_UP { \
3417 UNLOCK_AND_DEALLOCATE; \
3418 return(KERN_FAILURE); \
3419 }
3420
3421
3422 /*
3423 * If this entry is not directly to a vm_object, bail out.
3424 */
3425 if (entry->is_sub_map)
3426 return(KERN_FAILURE);
3427
3428 /*
3429 * Find the backing store object and offset into it.
3430 */
3431
3432 object = entry->object.vm_object;
3433 offset = (va - entry->vme_start) + entry->offset;
3434 prot = entry->protection;
3435
3436 /*
3437 * Make a reference to this object to prevent its
3438 * disposal while we are messing with it.
3439 */
3440
3441 vm_object_lock(object);
3442 assert(object->ref_count > 0);
3443 object->ref_count++;
3444 vm_object_res_reference(object);
3445 vm_object_paging_begin(object);
3446
3447 /*
3448 * INVARIANTS (through entire routine):
3449 *
3450 * 1) At all times, we must either have the object
3451 * lock or a busy page in some object to prevent
3452 * some other thread from trying to bring in
3453 * the same page.
3454 *
3455 * 2) Once we have a busy page, we must remove it from
3456 * the pageout queues, so that the pageout daemon
3457 * will not grab it away.
3458 *
3459 */
3460
3461 /*
3462 * Look for page in top-level object. If it's not there or
3463 * there's something going on, give up.
3464 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3465 * decrypt the page before wiring it down.
3466 */
3467 m = vm_page_lookup(object, offset);
3468 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3469 (m->unusual && ( m->error || m->restart || m->absent ||
3470 prot & m->page_lock))) {
3471
3472 GIVE_UP;
3473 }
3474 ASSERT_PAGE_DECRYPTED(m);
3475
3476 /*
3477 * Wire the page down now. All bail outs beyond this
3478 * point must unwire the page.
3479 */
3480
3481 vm_page_lock_queues();
3482 vm_page_wire(m);
3483 vm_page_unlock_queues();
3484
3485 /*
3486 * Mark page busy for other threads.
3487 */
3488 assert(!m->busy);
3489 m->busy = TRUE;
3490 assert(!m->absent);
3491
3492 /*
3493 * Give up if the page is being written and there's a copy object
3494 */
3495 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3496 RELEASE_PAGE(m);
3497 GIVE_UP;
3498 }
3499
3500 /*
3501 * Put this page into the physical map.
3502 * We have to unlock the object because pmap_enter
3503 * may cause other faults.
3504 */
3505 if (m->no_isync == TRUE) {
3506 pmap_sync_page_data_phys(m->phys_page);
3507
3508 m->no_isync = FALSE;
3509 }
3510
3511 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3512
3513 PMAP_ENTER(pmap, pmap_addr, m, prot, cache_attr, TRUE);
3514
3515 /*
3516 * Unlock everything, and return
3517 */
3518
3519 PAGE_WAKEUP_DONE(m);
3520 UNLOCK_AND_DEALLOCATE;
3521
3522 return(KERN_SUCCESS);
3523
3524 }
3525
3526 /*
3527 * Routine: vm_fault_copy_cleanup
3528 * Purpose:
3529 * Release a page used by vm_fault_copy.
3530 */
3531
3532 void
3533 vm_fault_copy_cleanup(
3534 vm_page_t page,
3535 vm_page_t top_page)
3536 {
3537 vm_object_t object = page->object;
3538
3539 vm_object_lock(object);
3540 PAGE_WAKEUP_DONE(page);
3541 vm_page_lock_queues();
3542 if (!page->active && !page->inactive)
3543 vm_page_activate(page);
3544 vm_page_unlock_queues();
3545 vm_fault_cleanup(object, top_page);
3546 }
3547
3548 void
3549 vm_fault_copy_dst_cleanup(
3550 vm_page_t page)
3551 {
3552 vm_object_t object;
3553
3554 if (page != VM_PAGE_NULL) {
3555 object = page->object;
3556 vm_object_lock(object);
3557 vm_page_lock_queues();
3558 vm_page_unwire(page);
3559 vm_page_unlock_queues();
3560 vm_object_paging_end(object);
3561 vm_object_unlock(object);
3562 }
3563 }
3564
3565 /*
3566 * Routine: vm_fault_copy
3567 *
3568 * Purpose:
3569 * Copy pages from one virtual memory object to another --
3570 * neither the source nor destination pages need be resident.
3571 *
3572 * Before actually copying a page, the version associated with
3573 * the destination address map will be verified.
3574 *
3575 * In/out conditions:
3576 * The caller must hold a reference, but not a lock, to
3577 * each of the source and destination objects and to the
3578 * destination map.
3579 *
3580 * Results:
3581 * Returns KERN_SUCCESS if no errors were encountered in
3582 * reading or writing the data. Returns KERN_INTERRUPTED if
3583 * the operation was interrupted (only possible if the
3584 * "interruptible" argument is asserted). Other return values
3585 * indicate a permanent error in copying the data.
3586 *
3587 * The actual amount of data copied will be returned in the
3588 * "copy_size" argument. In the event that the destination map
3589 * verification failed, this amount may be less than the amount
3590 * requested.
3591 */
3592 kern_return_t
3593 vm_fault_copy(
3594 vm_object_t src_object,
3595 vm_object_offset_t src_offset,
3596 vm_map_size_t *copy_size, /* INOUT */
3597 vm_object_t dst_object,
3598 vm_object_offset_t dst_offset,
3599 vm_map_t dst_map,
3600 vm_map_version_t *dst_version,
3601 int interruptible)
3602 {
3603 vm_page_t result_page;
3604
3605 vm_page_t src_page;
3606 vm_page_t src_top_page;
3607 vm_prot_t src_prot;
3608
3609 vm_page_t dst_page;
3610 vm_page_t dst_top_page;
3611 vm_prot_t dst_prot;
3612
3613 vm_map_size_t amount_left;
3614 vm_object_t old_copy_object;
3615 kern_return_t error = 0;
3616
3617 vm_map_size_t part_size;
3618
3619 /*
3620 * In order not to confuse the clustered pageins, align
3621 * the different offsets on a page boundary.
3622 */
3623 vm_object_offset_t src_lo_offset = vm_object_trunc_page(src_offset);
3624 vm_object_offset_t dst_lo_offset = vm_object_trunc_page(dst_offset);
3625 vm_object_offset_t src_hi_offset = vm_object_round_page(src_offset + *copy_size);
3626 vm_object_offset_t dst_hi_offset = vm_object_round_page(dst_offset + *copy_size);
3627
3628 #define RETURN(x) \
3629 MACRO_BEGIN \
3630 *copy_size -= amount_left; \
3631 MACRO_RETURN(x); \
3632 MACRO_END
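 /*
 * RETURN() folds the bytes still uncopied (amount_left) into
 * *copy_size, so the caller learns how much was actually
 * copied even on an early exit.
 */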
3633
3634 amount_left = *copy_size;
3635 do { /* while (amount_left > 0) */
3636 /*
3637 * There may be a deadlock if both source and destination
3638 * pages are the same. To avoid this deadlock, the copy must
3639 * start by getting the destination page in order to apply
3640 * COW semantics if any.
3641 */
3642
3643 RetryDestinationFault: ;
3644
3645 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3646
3647 vm_object_lock(dst_object);
3648 vm_object_paging_begin(dst_object);
3649
3650 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3651 switch (vm_fault_page(dst_object,
3652 vm_object_trunc_page(dst_offset),
3653 VM_PROT_WRITE|VM_PROT_READ,
3654 FALSE,
3655 interruptible,
3656 dst_lo_offset,
3657 dst_hi_offset,
3658 VM_BEHAVIOR_SEQUENTIAL,
3659 &dst_prot,
3660 &dst_page,
3661 &dst_top_page,
3662 (int *)0,
3663 &error,
3664 dst_map->no_zero_fill,
3665 FALSE, NULL, 0)) {
3666 case VM_FAULT_SUCCESS:
3667 break;
3668 case VM_FAULT_RETRY:
3669 goto RetryDestinationFault;
3670 case VM_FAULT_MEMORY_SHORTAGE:
3671 if (vm_page_wait(interruptible))
3672 goto RetryDestinationFault;
3673 /* fall thru */
3674 case VM_FAULT_INTERRUPTED:
3675 RETURN(MACH_SEND_INTERRUPTED);
3676 case VM_FAULT_FICTITIOUS_SHORTAGE:
3677 vm_page_more_fictitious();
3678 goto RetryDestinationFault;
3679 case VM_FAULT_MEMORY_ERROR:
3680 if (error)
3681 return (error);
3682 else
3683 return(KERN_MEMORY_ERROR);
3684 }
3685 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3686
3687 old_copy_object = dst_page->object->copy;
3688
3689 /*
3690 * There exists the possibility that the source and
3691 * destination pages are the same. But we can't
3692 * easily determine that now. If they are the
3693 * same, the call to vm_fault_page() for the
3694 * destination page will deadlock. To prevent this we
3695 * wire the page so we can drop busy without having
3696 * the page daemon steal the page. We clean up the
3697 * top page but keep the paging reference on the object
3698 * holding the dest page so it doesn't go away.
3699 */
3700
3701 vm_page_lock_queues();
3702 vm_page_wire(dst_page);
3703 vm_page_unlock_queues();
3704 PAGE_WAKEUP_DONE(dst_page);
3705 vm_object_unlock(dst_page->object);
3706
3707 if (dst_top_page != VM_PAGE_NULL) {
3708 vm_object_lock(dst_object);
3709 VM_PAGE_FREE(dst_top_page);
3710 vm_object_paging_end(dst_object);
3711 vm_object_unlock(dst_object);
3712 }
3713
3714 RetrySourceFault: ;
3715
3716 if (src_object == VM_OBJECT_NULL) {
3717 /*
3718 * No source object. We will just
3719 * zero-fill the page in dst_object.
3720 */
3721 src_page = VM_PAGE_NULL;
3722 result_page = VM_PAGE_NULL;
3723 } else {
3724 vm_object_lock(src_object);
3725 src_page = vm_page_lookup(src_object,
3726 vm_object_trunc_page(src_offset));
3727 if (src_page == dst_page) {
3728 src_prot = dst_prot;
3729 result_page = VM_PAGE_NULL;
3730 } else {
3731 src_prot = VM_PROT_READ;
3732 vm_object_paging_begin(src_object);
3733
3734 XPR(XPR_VM_FAULT,
3735 "vm_fault_copy(2) -> vm_fault_page\n",
3736 0,0,0,0,0);
3737 switch (vm_fault_page(src_object,
3738 vm_object_trunc_page(src_offset),
3739 VM_PROT_READ,
3740 FALSE,
3741 interruptible,
3742 src_lo_offset,
3743 src_hi_offset,
3744 VM_BEHAVIOR_SEQUENTIAL,
3745 &src_prot,
3746 &result_page,
3747 &src_top_page,
3748 (int *)0,
3749 &error,
3750 FALSE,
3751 FALSE, NULL, 0)) {
3752
3753 case VM_FAULT_SUCCESS:
3754 break;
3755 case VM_FAULT_RETRY:
3756 goto RetrySourceFault;
3757 case VM_FAULT_MEMORY_SHORTAGE:
3758 if (vm_page_wait(interruptible))
3759 goto RetrySourceFault;
3760 /* fall thru */
3761 case VM_FAULT_INTERRUPTED:
3762 vm_fault_copy_dst_cleanup(dst_page);
3763 RETURN(MACH_SEND_INTERRUPTED);
3764 case VM_FAULT_FICTITIOUS_SHORTAGE:
3765 vm_page_more_fictitious();
3766 goto RetrySourceFault;
3767 case VM_FAULT_MEMORY_ERROR:
3768 vm_fault_copy_dst_cleanup(dst_page);
3769 if (error)
3770 return (error);
3771 else
3772 return(KERN_MEMORY_ERROR);
3773 }
3774
3775
3776 assert((src_top_page == VM_PAGE_NULL) ==
3777 (result_page->object == src_object));
3778 }
3779 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3780 vm_object_unlock(result_page->object);
3781 }
3782
3783 if (!vm_map_verify(dst_map, dst_version)) {
3784 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3785 vm_fault_copy_cleanup(result_page, src_top_page);
3786 vm_fault_copy_dst_cleanup(dst_page);
3787 break;
3788 }
3789
3790 vm_object_lock(dst_page->object);
3791
3792 if (dst_page->object->copy != old_copy_object) {
3793 vm_object_unlock(dst_page->object);
3794 vm_map_verify_done(dst_map, dst_version);
3795 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3796 vm_fault_copy_cleanup(result_page, src_top_page);
3797 vm_fault_copy_dst_cleanup(dst_page);
3798 break;
3799 }
3800 vm_object_unlock(dst_page->object);
3801
3802 /*
3803 * Copy the page, and note that it is dirty
3804 * immediately.
3805 */
3806
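 /*
 * When the offsets or the residual count are not page aligned,
 * only the overlap of the two partial pages can be copied:
 * part_size = PAGE_SIZE - max(src_po, dst_po), further clamped
 * to amount_left. E.g. with a 4K page, src_po 0x200 and
 * dst_po 0x600 give part_size 0xA00.
 */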
3807 if (!page_aligned(src_offset) ||
3808 !page_aligned(dst_offset) ||
3809 !page_aligned(amount_left)) {
3810
3811 vm_object_offset_t src_po,
3812 dst_po;
3813
3814 src_po = src_offset - vm_object_trunc_page(src_offset);
3815 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3816
3817 if (dst_po > src_po) {
3818 part_size = PAGE_SIZE - dst_po;
3819 } else {
3820 part_size = PAGE_SIZE - src_po;
3821 }
3822 if (part_size > (amount_left)){
3823 part_size = amount_left;
3824 }
3825
3826 if (result_page == VM_PAGE_NULL) {
3827 vm_page_part_zero_fill(dst_page,
3828 dst_po, part_size);
3829 } else {
3830 vm_page_part_copy(result_page, src_po,
3831 dst_page, dst_po, part_size);
3832 if(!dst_page->dirty){
3833 vm_object_lock(dst_object);
3834 dst_page->dirty = TRUE;
3835 vm_object_unlock(dst_page->object);
3836 }
3837
3838 }
3839 } else {
3840 part_size = PAGE_SIZE;
3841
3842 if (result_page == VM_PAGE_NULL)
3843 vm_page_zero_fill(dst_page);
3844 else{
3845 vm_page_copy(result_page, dst_page);
3846 if(!dst_page->dirty){
3847 vm_object_lock(dst_object);
3848 dst_page->dirty = TRUE;
3849 vm_object_unlock(dst_page->object);
3850 }
3851 }
3852
3853 }
3854
3855 /*
3856 * Unlock everything, and return
3857 */
3858
3859 vm_map_verify_done(dst_map, dst_version);
3860
3861 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3862 vm_fault_copy_cleanup(result_page, src_top_page);
3863 vm_fault_copy_dst_cleanup(dst_page);
3864
3865 amount_left -= part_size;
3866 src_offset += part_size;
3867 dst_offset += part_size;
3868 } while (amount_left > 0);
3869
3870 RETURN(KERN_SUCCESS);
3871 #undef RETURN
3872
3873 /*NOTREACHED*/
3874 }
3875
3876 #ifdef notdef
3877
3878 /*
3879 * Routine: vm_fault_page_overwrite
3880 *
3881 * Description:
3882 * A form of vm_fault_page that assumes that the
3883 * resulting page will be overwritten in its entirety,
3884 * making it unnecessary to obtain the correct *contents*
3885 * of the page.
3886 *
3887 * Implementation:
3888 * XXX Untested. Also unused. Eventually, this technology
3889 * could be used in vm_fault_copy() to advantage.
3890 */
3891 vm_fault_return_t
3892 vm_fault_page_overwrite(
3893 register
3894 vm_object_t dst_object,
3895 vm_object_offset_t dst_offset,
3896 vm_page_t *result_page) /* OUT */
3897 {
3898 register
3899 vm_page_t dst_page;
3900 kern_return_t wait_result;
3901
3902 #define interruptible THREAD_UNINT /* XXX */
3903
3904 while (TRUE) {
3905 /*
3906 * Look for a page at this offset
3907 */
3908
3909 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3910 == VM_PAGE_NULL) {
3911 /*
3912 * No page, no problem... just allocate one.
3913 */
3914
3915 dst_page = vm_page_alloc(dst_object, dst_offset);
3916 if (dst_page == VM_PAGE_NULL) {
3917 vm_object_unlock(dst_object);
3918 VM_PAGE_WAIT();
3919 vm_object_lock(dst_object);
3920 continue;
3921 }
3922
3923 /*
3924 * Pretend that the memory manager
3925 * write-protected the page.
3926 *
3927 * Note that we will be asking for write
3928 * permission without asking for the data
3929 * first.
3930 */
3931
3932 dst_page->overwriting = TRUE;
3933 dst_page->page_lock = VM_PROT_WRITE;
3934 dst_page->absent = TRUE;
3935 dst_page->unusual = TRUE;
3936 dst_object->absent_count++;
3937
3938 break;
3939
3940 /*
3941 * When we bail out, we might have to throw
3942 * away the page created here.
3943 */
3944
3945 #define DISCARD_PAGE \
3946 MACRO_BEGIN \
3947 vm_object_lock(dst_object); \
3948 dst_page = vm_page_lookup(dst_object, dst_offset); \
3949 if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3950 VM_PAGE_FREE(dst_page); \
3951 vm_object_unlock(dst_object); \
3952 MACRO_END
3953 }
3954
3955 /*
3956 * If the page is write-protected...
3957 */
3958
3959 if (dst_page->page_lock & VM_PROT_WRITE) {
3960 /*
3961 * ... and an unlock request hasn't been sent
3962 */
3963
3964 if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3965 vm_prot_t u;
3966 kern_return_t rc;
3967
3968 /*
3969 * ... then send one now.
3970 */
3971
3972 if (!dst_object->pager_ready) {
3973 wait_result = vm_object_assert_wait(dst_object,
3974 VM_OBJECT_EVENT_PAGER_READY,
3975 interruptible);
3976 vm_object_unlock(dst_object);
3977 if (wait_result == THREAD_WAITING)
3978 wait_result = thread_block(THREAD_CONTINUE_NULL);
3979 if (wait_result != THREAD_AWAKENED) {
3980 DISCARD_PAGE;
3981 return(VM_FAULT_INTERRUPTED);
3982 }
3983 continue;
3984 }
3985
3986 u = dst_page->unlock_request |= VM_PROT_WRITE;
3987 vm_object_unlock(dst_object);
3988
3989 if ((rc = memory_object_data_unlock(
3990 dst_object->pager,
3991 dst_offset + dst_object->paging_offset,
3992 PAGE_SIZE,
3993 u)) != KERN_SUCCESS) {
3994 if (vm_fault_debug)
3995 printf("vm_object_overwrite: memory_object_data_unlock failed\n");
3996 DISCARD_PAGE;
3997 return((rc == MACH_SEND_INTERRUPTED) ?
3998 VM_FAULT_INTERRUPTED :
3999 VM_FAULT_MEMORY_ERROR);
4000 }
4001 vm_object_lock(dst_object);
4002 continue;
4003 }
4004
4005 /* ... fall through to wait below */
4006 } else {
4007 /*
4008 * If the page isn't being used for other
4009 * purposes, then we're done.
4010 */
4011 if ( ! (dst_page->busy || dst_page->absent ||
4012 dst_page->error || dst_page->restart) )
4013 break;
4014 }
4015
4016 wait_result = PAGE_ASSERT_WAIT(dst_page, interruptible);
4017 vm_object_unlock(dst_object);
4018 if (wait_result == THREAD_WAITING)
4019 wait_result = thread_block(THREAD_CONTINUE_NULL);
4020 if (wait_result != THREAD_AWAKENED) {
4021 DISCARD_PAGE;
4022 return(VM_FAULT_INTERRUPTED);
4023 }
4024 }
4025
4026 *result_page = dst_page;
4027 return(VM_FAULT_SUCCESS);
4028
4029 #undef interruptible
4030 #undef DISCARD_PAGE
4031 }
4032
4033 #endif /* notdef */
4034
4035 #if VM_FAULT_CLASSIFY
4036 /*
4037 * Temporary statistics gathering support.
4038 */
4039
4040 /*
4041 * Statistics arrays:
4042 */
4043 #define VM_FAULT_TYPES_MAX 5
4044 #define VM_FAULT_LEVEL_MAX 8
4045
4046 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4047
4048 #define VM_FAULT_TYPE_ZERO_FILL 0
4049 #define VM_FAULT_TYPE_MAP_IN 1
4050 #define VM_FAULT_TYPE_PAGER 2
4051 #define VM_FAULT_TYPE_COPY 3
4052 #define VM_FAULT_TYPE_OTHER 4
4053
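/*
 * vm_fault_stats[type][level]: "level" is the depth in the
 * shadow chain at which the fault was resolved, clamped to
 * VM_FAULT_LEVEL_MAX.
 */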
4054
4055 void
4056 vm_fault_classify(vm_object_t object,
4057 vm_object_offset_t offset,
4058 vm_prot_t fault_type)
4059 {
4060 int type, level = 0;
4061 vm_page_t m;
4062
4063 while (TRUE) {
4064 m = vm_page_lookup(object, offset);
4065 if (m != VM_PAGE_NULL) {
4066 if (m->busy || m->error || m->restart || m->absent ||
4067 fault_type & m->page_lock) {
4068 type = VM_FAULT_TYPE_OTHER;
4069 break;
4070 }
4071 if (((fault_type & VM_PROT_WRITE) == 0) ||
4072 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4073 type = VM_FAULT_TYPE_MAP_IN;
4074 break;
4075 }
4076 type = VM_FAULT_TYPE_COPY;
4077 break;
4078 }
4079 else {
4080 if (object->pager_created) {
4081 type = VM_FAULT_TYPE_PAGER;
4082 break;
4083 }
4084 if (object->shadow == VM_OBJECT_NULL) {
4085 type = VM_FAULT_TYPE_ZERO_FILL;
4086 break;
4087 }
4088
4089 offset += object->shadow_offset;
4090 object = object->shadow;
4091 level++;
4092 continue;
4093 }
4094 }
4095
4096 if (level > VM_FAULT_LEVEL_MAX)
4097 level = VM_FAULT_LEVEL_MAX;
4098
4099 vm_fault_stats[type][level] += 1;
4100
4101 return;
4102 }
4103
4104 /* cleanup routine to call from debugger */
4105
4106 void
4107 vm_fault_classify_init(void)
4108 {
4109 int type, level;
4110
4111 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4112 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4113 vm_fault_stats[type][level] = 0;
4114 }
4115 }
4116
4117 return;
4118 }
4119 #endif /* VM_FAULT_CLASSIFY */