1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * @OSF_COPYRIGHT@
24 */
25 /*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50 /*
51 */
52 /*
53 * File: vm_fault.c
54 * Author: Avadis Tevanian, Jr., Michael Wayne Young
55 *
56 * Page fault handling module.
57 */
58 #ifdef MACH_BSD
59 /* remove after component interface available */
60 extern int vnode_pager_workaround;
61 #endif
62
63 #include <mach_cluster_stats.h>
64 #include <mach_pagemap.h>
65 #include <mach_kdb.h>
66
67 #include <vm/vm_fault.h>
68 #include <mach/kern_return.h>
69 #include <mach/message.h> /* for error codes */
70 #include <kern/host_statistics.h>
71 #include <kern/counters.h>
72 #include <kern/task.h>
73 #include <kern/thread.h>
74 #include <kern/sched_prim.h>
75 #include <kern/host.h>
76 #include <kern/xpr.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_object.h>
79 #include <vm/vm_page.h>
80 #include <vm/pmap.h>
81 #include <vm/vm_pageout.h>
82 #include <mach/vm_param.h>
83 #include <mach/vm_behavior.h>
84 #include <mach/memory_object.h>
85 /* For memory_object_data_{request,unlock} */
86 #include <kern/mach_param.h>
87 #include <kern/macro_help.h>
88 #include <kern/zalloc.h>
89 #include <kern/misc_protos.h>
90
91 #include <sys/kdebug.h>
92
93 #define VM_FAULT_CLASSIFY 0
94 #define VM_FAULT_STATIC_CONFIG 1
95
96 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
97
98 int vm_object_absent_max = 50;
99
100 int vm_fault_debug = 0;
101 boolean_t vm_page_deactivate_behind = TRUE;
102
103 vm_machine_attribute_val_t mv_cache_sync = MATTR_VAL_CACHE_SYNC;
104
105 #if !VM_FAULT_STATIC_CONFIG
106 boolean_t vm_fault_dirty_handling = FALSE;
107 boolean_t vm_fault_interruptible = FALSE;
108 boolean_t software_reference_bits = TRUE;
109 #endif
110
111 #if MACH_KDB
112 extern struct db_watchpoint *db_watchpoint_list;
113 #endif /* MACH_KDB */
114
115 /* Forward declarations of internal routines. */
116 extern kern_return_t vm_fault_wire_fast(
117 vm_map_t map,
118 vm_offset_t va,
119 vm_map_entry_t entry,
120 pmap_t pmap);
121
122 extern void vm_fault_continue(void);
123
124 extern void vm_fault_copy_cleanup(
125 vm_page_t page,
126 vm_page_t top_page);
127
128 extern void vm_fault_copy_dst_cleanup(
129 vm_page_t page);
130
131 #if VM_FAULT_CLASSIFY
132 extern void vm_fault_classify(vm_object_t object,
133 vm_object_offset_t offset,
134 vm_prot_t fault_type);
135
136 extern void vm_fault_classify_init(void);
137 #endif
138
139 /*
140 * Routine: vm_fault_init
141 * Purpose:
142 * Initialize our private data structures.
143 */
144 void
145 vm_fault_init(void)
146 {
147 }
148
149 /*
150 * Routine: vm_fault_cleanup
151 * Purpose:
152 * Clean up the result of vm_fault_page.
153 * Results:
154 * The paging reference for "object" is released.
155 * "object" is unlocked.
156 * If "top_page" is not null, "top_page" is
157 * freed and the paging reference for the object
158 * containing it is released.
159 *
160 * In/out conditions:
161 * "object" must be locked.
162 */
163 void
164 vm_fault_cleanup(
165 register vm_object_t object,
166 register vm_page_t top_page)
167 {
168 vm_object_paging_end(object);
169 vm_object_unlock(object);
170
171 if (top_page != VM_PAGE_NULL) {
172 object = top_page->object;
173 vm_object_lock(object);
174 VM_PAGE_FREE(top_page);
175 vm_object_paging_end(object);
176 vm_object_unlock(object);
177 }
178 }
179
180 #if MACH_CLUSTER_STATS
181 #define MAXCLUSTERPAGES 16
182 struct {
183 unsigned long pages_in_cluster;
184 unsigned long pages_at_higher_offsets;
185 unsigned long pages_at_lower_offsets;
186 } cluster_stats_in[MAXCLUSTERPAGES];
187 #define CLUSTER_STAT(clause) clause
188 #define CLUSTER_STAT_HIGHER(x) \
189 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
190 #define CLUSTER_STAT_LOWER(x) \
191 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
192 #define CLUSTER_STAT_CLUSTER(x) \
193 ((cluster_stats_in[(x)].pages_in_cluster)++)
194 #else /* MACH_CLUSTER_STATS */
195 #define CLUSTER_STAT(clause)
196 #endif /* MACH_CLUSTER_STATS */
197
198 /* XXX - temporary */
199 boolean_t vm_allow_clustered_pagein = FALSE;
200 int vm_pagein_cluster_used = 0;
201
202 /*
203 * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
204 */
205 int vm_default_ahead = 1; /* Number of pages to prepage ahead */
206 int vm_default_behind = 0; /* Number of pages to prepage behind */
207
208 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
209
210 /*
211 * Routine: vm_fault_page
212 * Purpose:
213 * Find the resident page for the virtual memory
214 * specified by the given virtual memory object
215 * and offset.
216 * Additional arguments:
217 * The required permissions for the page are given
218 * in "fault_type". Desired permissions are included
219 * in "protection". The minimum and maximum valid offsets
220 * within the object for the relevant map entry are
221 * passed in "lo_offset" and "hi_offset" respectively and
222 * the expected page reference pattern is passed in "behavior".
223 * These three parameters are used to determine pagein cluster
224 * limits.
225 *
226 * If the desired page is known to be resident (for
227 * example, because it was previously wired down), asserting
228 * the "must_be_resident" parameter will speed the search.
229 *
230 * If the operation can be interrupted (by thread_abort
231 * or thread_terminate), then the "interruptible"
232 * parameter should be asserted.
233 *
234 * Results:
235 * The page containing the proper data is returned
236 * in "result_page".
237 *
238 * In/out conditions:
239 * The source object must be locked and referenced,
240 * and must donate one paging reference. The reference
241 * is not affected. The paging reference and lock are
242 * consumed.
243 *
244 * If the call succeeds, the object in which "result_page"
245 * resides is left locked and holding a paging reference.
246 * If this is not the original object, a busy page in the
247 * original object is returned in "top_page", to prevent other
248 * callers from pursuing this same data, along with a paging
249 * reference for the original object. The "top_page" should
250 * be destroyed when this guarantee is no longer required.
251 * The "result_page" is also left busy. It is not removed
252 * from the pageout queues.
253 */
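/*
 * Illustrative caller sketch (an assumption for clarity, not part of the
 * original interface documentation).  A typical caller such as vm_fault()
 * locks the object, donates a paging reference, and retries on any result
 * other than VM_FAULT_SUCCESS, roughly:
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);
 *	kr = vm_fault_page(object, offset, fault_type, FALSE,
 *			interruptible, lo_offset, hi_offset, behavior,
 *			&prot, &result_page, &top_page, &type_of_fault,
 *			&error_code, FALSE, FALSE);
 *	if (kr != VM_FAULT_SUCCESS)
 *		... back off and retry, or report the error ...
 *	else {
 *		... use result_page, which is left busy ...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *	}
 */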
254
255 vm_fault_return_t
256 vm_fault_page(
257 /* Arguments: */
258 vm_object_t first_object, /* Object to begin search */
259 vm_object_offset_t first_offset, /* Offset into object */
260 vm_prot_t fault_type, /* What access is requested */
261 boolean_t must_be_resident,/* Must page be resident? */
262 int interruptible, /* how may fault be interrupted? */
263 vm_object_offset_t lo_offset, /* Map entry start */
264 vm_object_offset_t hi_offset, /* Map entry end */
265 vm_behavior_t behavior, /* Page reference behavior */
266 /* Modifies in place: */
267 vm_prot_t *protection, /* Protection for mapping */
268 /* Returns: */
269 vm_page_t *result_page, /* Page found, if successful */
270 vm_page_t *top_page, /* Page in top object, if
271 * not result_page. */
272 int *type_of_fault, /* if non-null, fill in with type of fault
273 * COW, zero-fill, etc... returned in trace point */
274 /* More arguments: */
275 kern_return_t *error_code, /* code if page is in error */
276 boolean_t no_zero_fill, /* don't zero fill absent pages */
277 boolean_t data_supply) /* treat as data_supply if
278 * it is a write fault and a full
279 * page is provided */
280 {
281 register
282 vm_page_t m;
283 register
284 vm_object_t object;
285 register
286 vm_object_offset_t offset;
287 vm_page_t first_m;
288 vm_object_t next_object;
289 vm_object_t copy_object;
290 boolean_t look_for_page;
291 vm_prot_t access_required = fault_type;
292 vm_prot_t wants_copy_flag;
293 vm_size_t cluster_size, length;
294 vm_object_offset_t cluster_offset;
295 vm_object_offset_t cluster_start, cluster_end, paging_offset;
296 vm_object_offset_t align_offset;
297 CLUSTER_STAT(int pages_at_higher_offsets;)
298 CLUSTER_STAT(int pages_at_lower_offsets;)
299 kern_return_t wait_result;
300 thread_t cur_thread;
301 boolean_t interruptible_state;
302
303 #ifdef MACH_BSD
304 kern_return_t vnode_pager_data_request(ipc_port_t,
305 ipc_port_t, vm_object_offset_t, vm_size_t, vm_prot_t);
306 #endif
307
308 #if MACH_PAGEMAP
309 /*
310 * MACH page map - an optional optimization where a bit map is maintained
311 * by the VM subsystem for internal objects to indicate which pages of
312 * the object currently reside on backing store. This existence map
313 * duplicates information maintained by the vnode pager. It is
314 * created at the time of the first pageout against the object, i.e.
315 * at the same time the pager for the object is created. The optimization
316 * is designed to eliminate pager interaction overhead, if it is
317 * 'known' that the page does not exist on backing store.
318 *
319 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
320 * either marked as paged out in the existence map for the object or no
321 * existence map exists for the object. LOOK_FOR() is one of the
322 * criteria in the decision to invoke the pager. It is also used as one
323 * of the criteria to terminate the scan for adjacent pages in a clustered
324 * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for
325 * permanent objects. Note also that if the pager for an internal object
326 * has not been created, the pager is not invoked regardless of the value
327 * of LOOK_FOR() and that clustered pagein scans are only done on an object
328 * for which a pager has been created.
329 *
330 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
331 * is marked as paged out in the existence map for the object.
332 * PAGED_OUT() is used to determine if a page has already been pushed
333 * into a copy object in order to avoid a redundant page out operation.
334 */
335 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
336 != VM_EXTERNAL_STATE_ABSENT)
337 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
338 == VM_EXTERNAL_STATE_EXISTS)
339 #else /* MACH_PAGEMAP */
340 /*
341 * If the MACH page map optimization is not enabled,
342 * LOOK_FOR() always evaluates to TRUE. The pager will always be
343 * invoked to resolve missing pages in an object, assuming the pager
344 * has been created for the object. In a clustered page operation, the
345 * absence of a page on backing store cannot be used to terminate
346 * a scan for adjacent pages since that information is available only in
347 * the pager. Hence pages that may not be paged out are potentially
348 * included in a clustered request. The vnode pager is coded to deal
349 * with any combination of absent/present pages in a clustered
350 * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager
351 * will always be invoked to push a dirty page into a copy object assuming
352 * a pager has been created. If the page has already been pushed, the
353 * pager will ignore the new request.
354 */
355 #define LOOK_FOR(o, f) TRUE
356 #define PAGED_OUT(o, f) FALSE
357 #endif /* MACH_PAGEMAP */
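/*
 * Summary of how these macros are consumed below (a sketch, assuming the
 * MACH_PAGEMAP configuration; the non-pagemap case degenerates to
 * "always ask the pager"):
 *
 *	existence map state		LOOK_FOR()	PAGED_OUT()
 *	VM_EXTERNAL_STATE_EXISTS	TRUE		TRUE
 *	VM_EXTERNAL_STATE_UNKNOWN	TRUE		FALSE
 *	VM_EXTERNAL_STATE_ABSENT	FALSE		FALSE
 *
 *	look_for_page = object->pager_created &&
 *			LOOK_FOR(object, offset) && !data_supply;
 */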
358
359 /*
360 * Recovery actions
361 */
362 #define PREPARE_RELEASE_PAGE(m) \
363 MACRO_BEGIN \
364 vm_page_lock_queues(); \
365 MACRO_END
366
367 #define DO_RELEASE_PAGE(m) \
368 MACRO_BEGIN \
369 PAGE_WAKEUP_DONE(m); \
370 if (!m->active && !m->inactive) \
371 vm_page_activate(m); \
372 vm_page_unlock_queues(); \
373 MACRO_END
374
375 #define RELEASE_PAGE(m) \
376 MACRO_BEGIN \
377 PREPARE_RELEASE_PAGE(m); \
378 DO_RELEASE_PAGE(m); \
379 MACRO_END
380
381 #if TRACEFAULTPAGE
382 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
383 #endif
384
385
386
387 #if !VM_FAULT_STATIC_CONFIG
388 if (vm_fault_dirty_handling
389 #if MACH_KDB
390 /*
391 * If there are watchpoints set, then
392 * we don't want to give away write permission
393 * on a read fault. Make the task write fault,
394 * so that the watchpoint code notices the access.
395 */
396 || db_watchpoint_list
397 #endif /* MACH_KDB */
398 ) {
399 /*
400 * If we aren't asking for write permission,
401 * then don't give it away. We're using write
402 * faults to set the dirty bit.
403 */
404 if (!(fault_type & VM_PROT_WRITE))
405 *protection &= ~VM_PROT_WRITE;
406 }
407
408 if (!vm_fault_interruptible)
409 interruptible = THREAD_UNINT;
410 #else /* STATIC_CONFIG */
411 #if MACH_KDB
412 /*
413 * If there are watchpoints set, then
414 * we don't want to give away write permission
415 * on a read fault. Make the task write fault,
416 * so that the watchpoint code notices the access.
417 */
418 if (db_watchpoint_list) {
419 /*
420 * If we aren't asking for write permission,
421 * then don't give it away. We're using write
422 * faults to set the dirty bit.
423 */
424 if (!(fault_type & VM_PROT_WRITE))
425 *protection &= ~VM_PROT_WRITE;
426 }
427
428 #endif /* MACH_KDB */
429 #endif /* STATIC_CONFIG */
430
431 cur_thread = current_thread();
432
433 interruptible_state = cur_thread->interruptible;
434 if (interruptible == THREAD_UNINT)
435 cur_thread->interruptible = FALSE;
436
437 /*
438 * INVARIANTS (through entire routine):
439 *
440 * 1) At all times, we must either have the object
441 * lock or a busy page in some object to prevent
442 * some other thread from trying to bring in
443 * the same page.
444 *
445 * Note that we cannot hold any locks during the
446 * pager access or when waiting for memory, so
447 * we use a busy page then.
448 *
449 * Note also that we aren't as concerned about more than
450 * one thread attempting to memory_object_data_unlock
451 * the same page at once, so we don't hold the page
452 * as busy then, but do record the highest unlock
453 * value so far. [Unlock requests may also be delivered
454 * out of order.]
455 *
456 * 2) To prevent another thread from racing us down the
457 * shadow chain and entering a new page in the top
458 * object before we do, we must keep a busy page in
459 * the top object while following the shadow chain.
460 *
461 * 3) We must increment paging_in_progress on any object
462 * for which we have a busy page
463 *
464 * 4) We leave busy pages on the pageout queues.
465 * If the pageout daemon comes across a busy page,
466 * it will remove the page from the pageout queues.
467 */
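/*
 * A minimal sketch of invariant (1) as it recurs below (illustrative
 * only; the real paths add error handling, statistics and backoff):
 *
 *	m->busy = TRUE;			(claim the page slot)
 *	vm_object_unlock(object);	(safe: the busy page holds our place)
 *	... pager interaction or memory wait ...
 *	vm_object_lock(object);
 *	PAGE_WAKEUP_DONE(m);		(clear busy, wake any waiters)
 */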
468
469 /*
470 * Search for the page at object/offset.
471 */
472
473 object = first_object;
474 offset = first_offset;
475 first_m = VM_PAGE_NULL;
476 access_required = fault_type;
477
478 XPR(XPR_VM_FAULT,
479 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
480 (integer_t)object, offset, fault_type, *protection, 0);
481
482 /*
483 * See whether this page is resident
484 */
485
486 while (TRUE) {
487 #if TRACEFAULTPAGE
488 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
489 #endif
490 if (!object->alive) {
491 vm_fault_cleanup(object, first_m);
492 cur_thread->interruptible = interruptible_state;
493 return(VM_FAULT_MEMORY_ERROR);
494 }
495 m = vm_page_lookup(object, offset);
496 #if TRACEFAULTPAGE
497 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
498 #endif
499 if (m != VM_PAGE_NULL) {
500 /*
501 * If the page was pre-paged as part of a
502 * cluster, record the fact.
503 */
504 if (m->clustered) {
505 vm_pagein_cluster_used++;
506 m->clustered = FALSE;
507 }
508
509 /*
510 * If the page is being brought in,
511 * wait for it and then retry.
512 *
513 * A possible optimization: if the page
514 * is known to be resident, we can ignore
515 * pages that are absent (regardless of
516 * whether they're busy).
517 */
518
519 if (m->busy) {
520 #if TRACEFAULTPAGE
521 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
522 #endif
523 PAGE_ASSERT_WAIT(m, interruptible);
524 vm_object_unlock(object);
525 XPR(XPR_VM_FAULT,
526 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
527 (integer_t)object, offset,
528 (integer_t)m, 0, 0);
529 counter(c_vm_fault_page_block_busy_kernel++);
530 wait_result = thread_block((void (*)(void))0);
531
532 vm_object_lock(object);
533 if (wait_result != THREAD_AWAKENED) {
534 vm_fault_cleanup(object, first_m);
535 cur_thread->interruptible = interruptible_state;
536 if (wait_result == THREAD_RESTART)
537 {
538 return(VM_FAULT_RETRY);
539 }
540 else
541 {
542 return(VM_FAULT_INTERRUPTED);
543 }
544 }
545 continue;
546 }
547
548 /*
549 * If the page is in error, give up now.
550 */
551
552 if (m->error) {
553 #if TRACEFAULTPAGE
554 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
555 #endif
556 if (error_code)
557 *error_code = m->page_error;
558 VM_PAGE_FREE(m);
559 vm_fault_cleanup(object, first_m);
560 cur_thread->interruptible = interruptible_state;
561 return(VM_FAULT_MEMORY_ERROR);
562 }
563
564 /*
565 * If the pager wants us to restart
566 * at the top of the chain,
567 * typically because it has moved the
568 * page to another pager, then do so.
569 */
570
571 if (m->restart) {
572 #if TRACEFAULTPAGE
573 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
574 #endif
575 VM_PAGE_FREE(m);
576 vm_fault_cleanup(object, first_m);
577 cur_thread->interruptible = interruptible_state;
578 return(VM_FAULT_RETRY);
579 }
580
581 /*
582 * If the page isn't busy, but is absent,
583 * then it was deemed "unavailable".
584 */
585
586 if (m->absent) {
587 /*
588 * Remove the non-existent page (unless it's
589 * in the top object) and move on down to the
590 * next object (if there is one).
591 */
592 #if TRACEFAULTPAGE
593 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
594 #endif
595
596 next_object = object->shadow;
597 if (next_object == VM_OBJECT_NULL) {
598 vm_page_t real_m;
599
600 assert(!must_be_resident);
601
602 if (object->shadow_severed) {
603 vm_fault_cleanup(
604 object, first_m);
605 cur_thread->interruptible = interruptible_state;
606 return VM_FAULT_MEMORY_ERROR;
607 }
608
609 /*
610 * Absent page at bottom of shadow
611 * chain; zero fill the page we left
612 * busy in the first object, and flush
613 * the absent page. But first we
614 * need to allocate a real page.
615 */
616 if (VM_PAGE_THROTTLED() ||
617 (real_m = vm_page_grab()) == VM_PAGE_NULL) {
618 vm_fault_cleanup(object, first_m);
619 cur_thread->interruptible = interruptible_state;
620 return(VM_FAULT_MEMORY_SHORTAGE);
621 }
622
623 XPR(XPR_VM_FAULT,
624 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
625 (integer_t)object, offset,
626 (integer_t)m,
627 (integer_t)first_object, 0);
628 if (object != first_object) {
629 VM_PAGE_FREE(m);
630 vm_object_paging_end(object);
631 vm_object_unlock(object);
632 object = first_object;
633 offset = first_offset;
634 m = first_m;
635 first_m = VM_PAGE_NULL;
636 vm_object_lock(object);
637 }
638
639 VM_PAGE_FREE(m);
640 assert(real_m->busy);
641 vm_page_insert(real_m, object, offset);
642 m = real_m;
643
644 /*
645 * Drop the lock while zero filling
646 * page. Then break because this
647 * is the page we wanted. Checking
648 * the page lock is a waste of time;
649 * this page was either absent or
650 * newly allocated -- in both cases
651 * it can't be page locked by a pager.
652 */
653 if (!no_zero_fill) {
654 vm_object_unlock(object);
655 vm_page_zero_fill(m);
656 if (type_of_fault)
657 *type_of_fault = DBG_ZERO_FILL_FAULT;
658 VM_STAT(zero_fill_count++);
659 vm_object_lock(object);
660 }
661 pmap_clear_modify(m->phys_addr);
662 vm_page_lock_queues();
663 VM_PAGE_QUEUES_REMOVE(m);
664 queue_enter(&vm_page_queue_inactive,
665 m, vm_page_t, pageq);
666 m->inactive = TRUE;
667 vm_page_inactive_count++;
668 vm_page_unlock_queues();
669 break;
670 } else {
671 if (must_be_resident) {
672 vm_object_paging_end(object);
673 } else if (object != first_object) {
674 vm_object_paging_end(object);
675 VM_PAGE_FREE(m);
676 } else {
677 first_m = m;
678 m->absent = FALSE;
679 m->unusual = FALSE;
680 vm_object_absent_release(object);
681 m->busy = TRUE;
682
683 vm_page_lock_queues();
684 VM_PAGE_QUEUES_REMOVE(m);
685 vm_page_unlock_queues();
686 }
687 XPR(XPR_VM_FAULT,
688 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
689 (integer_t)object, offset,
690 (integer_t)next_object,
691 offset+object->shadow_offset,0);
692 offset += object->shadow_offset;
693 hi_offset += object->shadow_offset;
694 lo_offset += object->shadow_offset;
695 access_required = VM_PROT_READ;
696 vm_object_lock(next_object);
697 vm_object_unlock(object);
698 object = next_object;
699 vm_object_paging_begin(object);
700 continue;
701 }
702 }
703
704 if ((m->cleaning)
705 && ((object != first_object) ||
706 (object->copy != VM_OBJECT_NULL))
707 && (fault_type & VM_PROT_WRITE)) {
708 /*
709 * This is a copy-on-write fault that will
710 * cause us to revoke access to this page, but
711 * this page is in the process of being cleaned
712 * in a clustered pageout. We must wait until
713 * the cleaning operation completes before
714 * revoking access to the original page,
715 * otherwise we might attempt to remove a
716 * wired mapping.
717 */
718 #if TRACEFAULTPAGE
719 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
720 #endif
721 XPR(XPR_VM_FAULT,
722 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
723 (integer_t)object, offset,
724 (integer_t)m, 0, 0);
725 /* take an extra ref so that object won't die */
726 assert(object->ref_count > 0);
727 object->ref_count++;
728 vm_object_res_reference(object);
729 vm_fault_cleanup(object, first_m);
730 counter(c_vm_fault_page_block_backoff_kernel++);
731 vm_object_lock(object);
732 assert(object->ref_count > 0);
733 m = vm_page_lookup(object, offset);
734 if (m != VM_PAGE_NULL && m->cleaning) {
735 PAGE_ASSERT_WAIT(m, interruptible);
736 vm_object_unlock(object);
737 wait_result = thread_block((void (*)(void)) 0);
738 vm_object_deallocate(object);
739 goto backoff;
740 } else {
741 vm_object_unlock(object);
742 vm_object_deallocate(object);
743 cur_thread->interruptible = interruptible_state;
744 return VM_FAULT_RETRY;
745 }
746 }
747
748 /*
749 * If the desired access to this page has
750 * been locked out, request that it be unlocked.
751 */
752
753 if (access_required & m->page_lock) {
754 if ((access_required & m->unlock_request) != access_required) {
755 vm_prot_t new_unlock_request;
756 kern_return_t rc;
757
758 #if TRACEFAULTPAGE
759 dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */
760 #endif
761 if (!object->pager_ready) {
762 XPR(XPR_VM_FAULT,
763 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
764 access_required,
765 (integer_t)object, offset,
766 (integer_t)m, 0);
767 /* take an extra ref */
768 assert(object->ref_count > 0);
769 object->ref_count++;
770 vm_object_res_reference(object);
771 vm_fault_cleanup(object,
772 first_m);
773 counter(c_vm_fault_page_block_backoff_kernel++);
774 vm_object_lock(object);
775 assert(object->ref_count > 0);
776 if (!object->pager_ready) {
777 vm_object_assert_wait(
778 object,
779 VM_OBJECT_EVENT_PAGER_READY,
780 interruptible);
781 vm_object_unlock(object);
782 wait_result = thread_block((void (*)(void))0);
783 vm_object_deallocate(object);
784 goto backoff;
785 } else {
786 vm_object_unlock(object);
787 vm_object_deallocate(object);
788 cur_thread->interruptible = interruptible_state;
789 return VM_FAULT_RETRY;
790 }
791 }
792
793 new_unlock_request = m->unlock_request =
794 (access_required | m->unlock_request);
795 vm_object_unlock(object);
796 XPR(XPR_VM_FAULT,
797 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
798 (integer_t)object, offset,
799 (integer_t)m, new_unlock_request, 0);
800 if ((rc = memory_object_data_unlock(
801 object->pager,
802 object->pager_request,
803 offset + object->paging_offset,
804 PAGE_SIZE,
805 new_unlock_request))
806 != KERN_SUCCESS) {
807 if (vm_fault_debug)
808 printf("vm_fault: memory_object_data_unlock failed\n");
809 vm_object_lock(object);
810 vm_fault_cleanup(object, first_m);
811 cur_thread->interruptible = interruptible_state;
812 return((rc == MACH_SEND_INTERRUPTED) ?
813 VM_FAULT_INTERRUPTED :
814 VM_FAULT_MEMORY_ERROR);
815 }
816 vm_object_lock(object);
817 continue;
818 }
819
820 XPR(XPR_VM_FAULT,
821 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
822 access_required, (integer_t)object,
823 offset, (integer_t)m, 0);
824 /* take an extra ref so object won't die */
825 assert(object->ref_count > 0);
826 object->ref_count++;
827 vm_object_res_reference(object);
828 vm_fault_cleanup(object, first_m);
829 counter(c_vm_fault_page_block_backoff_kernel++);
830 vm_object_lock(object);
831 assert(object->ref_count > 0);
832 m = vm_page_lookup(object, offset);
833 if (m != VM_PAGE_NULL &&
834 (access_required & m->page_lock) &&
835 !((access_required & m->unlock_request) != access_required)) {
836 PAGE_ASSERT_WAIT(m, interruptible);
837 vm_object_unlock(object);
838 wait_result = thread_block((void (*)(void)) 0);
839 vm_object_deallocate(object);
840 goto backoff;
841 } else {
842 vm_object_unlock(object);
843 vm_object_deallocate(object);
844 cur_thread->interruptible = interruptible_state;
845 return VM_FAULT_RETRY;
846 }
847 }
848 /*
849 * We mark the page busy and leave it on
850 * the pageout queues. If the pageout
851 * daemon comes across it, then it will
852 * remove the page.
853 */
854
855 #if TRACEFAULTPAGE
856 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
857 #endif
858
859 #if !VM_FAULT_STATIC_CONFIG
860 if (!software_reference_bits) {
861 vm_page_lock_queues();
862 if (m->inactive)
863 vm_stat.reactivations++;
864
865 VM_PAGE_QUEUES_REMOVE(m);
866 vm_page_unlock_queues();
867 }
868 #endif
869 XPR(XPR_VM_FAULT,
870 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
871 (integer_t)object, offset, (integer_t)m, 0, 0);
872 assert(!m->busy);
873 m->busy = TRUE;
874 assert(!m->absent);
875 break;
876 }
877
878 look_for_page =
879 (object->pager_created) &&
880 LOOK_FOR(object, offset) &&
881 (!data_supply);
882
883 #if TRACEFAULTPAGE
884 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
885 #endif
886 if ((look_for_page || (object == first_object))
887 && !must_be_resident) {
888 /*
889 * Allocate a new page for this object/offset
890 * pair.
891 */
892
893 m = vm_page_grab_fictitious();
894 #if TRACEFAULTPAGE
895 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
896 #endif
897 if (m == VM_PAGE_NULL) {
898 vm_fault_cleanup(object, first_m);
899 cur_thread->interruptible = interruptible_state;
900 return(VM_FAULT_FICTITIOUS_SHORTAGE);
901 }
902 vm_page_insert(m, object, offset);
903 }
904
905 if (look_for_page && !must_be_resident) {
906 kern_return_t rc;
907
908 /*
909 * If the memory manager is not ready, we
910 * cannot make requests.
911 */
912 if (!object->pager_ready) {
913 #if TRACEFAULTPAGE
914 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
915 #endif
916 VM_PAGE_FREE(m);
917 XPR(XPR_VM_FAULT,
918 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
919 (integer_t)object, offset, 0, 0, 0);
920 /* take an extra ref so object won't die */
921 assert(object->ref_count > 0);
922 object->ref_count++;
923 vm_object_res_reference(object);
924 vm_fault_cleanup(object, first_m);
925 counter(c_vm_fault_page_block_backoff_kernel++);
926 vm_object_lock(object);
927 assert(object->ref_count > 0);
928 if (!object->pager_ready) {
929 vm_object_assert_wait(object,
930 VM_OBJECT_EVENT_PAGER_READY,
931 interruptible);
932 vm_object_unlock(object);
933 wait_result = thread_block((void (*)(void))0);
934 vm_object_deallocate(object);
935 goto backoff;
936 } else {
937 vm_object_unlock(object);
938 vm_object_deallocate(object);
939 cur_thread->interruptible = interruptible_state;
940 return VM_FAULT_RETRY;
941 }
942 }
943
944 if (object->internal) {
945 /*
946 * Requests to the default pager
947 * must reserve a real page in advance,
948 * because the pager's data_provided call
949 * won't block for pages. IMPORTANT:
950 * this acts as a throttling mechanism
951 * for data_requests to the default
952 * pager.
953 */
954
955 #if TRACEFAULTPAGE
956 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
957 #endif
958 if (m->fictitious && !vm_page_convert(m)) {
959 VM_PAGE_FREE(m);
960 vm_fault_cleanup(object, first_m);
961 cur_thread->interruptible = interruptible_state;
962 return(VM_FAULT_MEMORY_SHORTAGE);
963 }
964 } else if (object->absent_count >
965 vm_object_absent_max) {
966 /*
967 * If there are too many outstanding page
968 * requests pending on this object, we
969 * wait for them to be resolved now.
970 */
971
972 #if TRACEFAULTPAGE
973 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
974 #endif
975 VM_PAGE_FREE(m);
976 /* take an extra ref so object won't die */
977 assert(object->ref_count > 0);
978 object->ref_count++;
979 vm_object_res_reference(object);
980 vm_fault_cleanup(object, first_m);
981 counter(c_vm_fault_page_block_backoff_kernel++);
982 vm_object_lock(object);
983 assert(object->ref_count > 0);
984 if (object->absent_count > vm_object_absent_max) {
985 vm_object_absent_assert_wait(object,
986 interruptible);
987 vm_object_unlock(object);
988 wait_result = thread_block((void (*)(void))0);
989 vm_object_deallocate(object);
990 goto backoff;
991 } else {
992 vm_object_unlock(object);
993 vm_object_deallocate(object);
994 cur_thread->interruptible = interruptible_state;
995 return VM_FAULT_RETRY;
996 }
997 }
998
999 /*
1000 * Indicate that the page is waiting for data
1001 * from the memory manager.
1002 */
1003
1004 m->list_req_pending = TRUE;
1005 m->absent = TRUE;
1006 m->unusual = TRUE;
1007 object->absent_count++;
1008
1009 cluster_start = offset;
1010 length = PAGE_SIZE;
1011 cluster_size = object->cluster_size;
1012
1013 /*
1014 * Skip clustered pagein if it is globally disabled
1015 * or random page reference behavior is expected
1016 * for the address range containing the faulting
1017 * address or the object paging block size is
1018 * equal to the page size.
1019 */
1020 if (!vm_allow_clustered_pagein ||
1021 behavior == VM_BEHAVIOR_RANDOM ||
1022 cluster_size == PAGE_SIZE) {
1023 cluster_start = trunc_page_64(cluster_start);
1024 goto no_clustering;
1025 }
1026
1027 assert(offset >= lo_offset);
1028 assert(offset < hi_offset);
1029 assert(ALIGNED(object->paging_offset));
1030 assert(cluster_size >= PAGE_SIZE);
1031
1032 #if TRACEFAULTPAGE
1033 dbgTrace(0xBEEF0011, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1034 #endif
1035 /*
1036 * Decide whether to scan ahead or behind for
1037 * additional pages contiguous to the faulted
1038 * page in the same paging block. The decision
1039 * is based on system wide globals and the
1040 * expected page reference behavior of the
1041 * address range containing the faulting address.
1042 * First calculate some constants.
1043 */
1044 paging_offset = offset + object->paging_offset;
1045 cluster_offset = paging_offset & (cluster_size - 1);
1046 align_offset = paging_offset&(PAGE_SIZE_64-1);
1047 if (align_offset != 0) {
1048 cluster_offset = trunc_page_64(cluster_offset);
1049 }
1050
1051 #define SPANS_CLUSTER(x) ((((x) - align_offset) & (vm_object_offset_t)(cluster_size - 1)) == 0)
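/*
 * Worked example (purely illustrative numbers): with a 4K page size and
 * a 64K object cluster_size, a fault at paging_offset 0x2B000 gives
 *
 *	cluster_offset = 0x2B000 & 0xFFFF = 0xB000
 *	align_offset   = 0x2B000 & 0x0FFF = 0
 *
 * so the enclosing cluster is [0x20000, 0x30000), and SPANS_CLUSTER(x)
 * is TRUE exactly when x is a 64K multiple, which is how the scans
 * below stop at cluster boundaries.
 */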
1052
1053 /*
1054 * Backward scan only if reverse sequential
1055 * behavior has been specified
1056 */
1057 CLUSTER_STAT(pages_at_lower_offsets = 0;)
1058 if (((vm_default_behind != 0 &&
1059 behavior == VM_BEHAVIOR_DEFAULT) ||
1060 behavior == VM_BEHAVIOR_RSEQNTL) && offset) {
1061 vm_object_offset_t cluster_bot;
1062
1063 /*
1064 * Calculate lower search boundary.
1065 * Exclude pages that span a cluster boundary.
1066 * Clip to start of map entry.
1067 * For default page reference behavior, scan
1068 * default pages behind.
1069 */
1070 cluster_bot = (offset > cluster_offset) ?
1071 offset - cluster_offset : offset;
1072 if (align_offset != 0) {
1073 if ((cluster_bot < offset) &&
1074 SPANS_CLUSTER(cluster_bot)) {
1075 cluster_bot += PAGE_SIZE_64;
1076 }
1077 }
1078 if (behavior == VM_BEHAVIOR_DEFAULT) {
1079 vm_object_offset_t
1080 bot = (vm_object_offset_t)
1081 (vm_default_behind * PAGE_SIZE);
1082
1083 if (cluster_bot < (offset - bot))
1084 cluster_bot = offset - bot;
1085 }
1086 if (lo_offset > cluster_bot)
1087 cluster_bot = lo_offset;
1088
1089 for ( cluster_start = offset - PAGE_SIZE_64;
1090 (cluster_start >= cluster_bot) &&
1091 (cluster_start !=
1092 (align_offset - PAGE_SIZE_64));
1093 cluster_start -= PAGE_SIZE_64) {
1094 assert(cluster_size > PAGE_SIZE_64);
1095 retry_cluster_backw:
1096 if (!LOOK_FOR(object, cluster_start) ||
1097 vm_page_lookup(object, cluster_start)
1098 != VM_PAGE_NULL) {
1099 break;
1100 }
1101 if (object->internal) {
1102 /*
1103 * need to acquire a real page in
1104 * advance because this acts as
1105 * a throttling mechanism for
1106 * data_requests to the default
1107 * pager. If this fails, give up
1108 * trying to find any more pages
1109 * in the cluster and send off the
1110 * request for what we already have.
1111 */
1112 if ((m = vm_page_grab())
1113 == VM_PAGE_NULL) {
1114 cluster_start += PAGE_SIZE_64;
1115 cluster_end = offset + PAGE_SIZE_64;
1116 goto give_up;
1117 }
1118 } else if ((m = vm_page_grab_fictitious())
1119 == VM_PAGE_NULL) {
1120 vm_object_unlock(object);
1121 vm_page_more_fictitious();
1122 vm_object_lock(object);
1123 goto retry_cluster_backw;
1124 }
1125 m->absent = TRUE;
1126 m->unusual = TRUE;
1127 m->clustered = TRUE;
1128 m->list_req_pending = TRUE;
1129
1130 vm_page_insert(m, object, cluster_start);
1131 CLUSTER_STAT(pages_at_lower_offsets++;)
1132 object->absent_count++;
1133 }
1134 cluster_start += PAGE_SIZE_64;
1135 assert(cluster_start >= cluster_bot);
1136 }
1137 assert(cluster_start <= offset);
1138
1139 /*
1140 * Forward scan if default or sequential behavior
1141 * specified
1142 */
1143 CLUSTER_STAT(pages_at_higher_offsets = 0;)
1144 if ((behavior == VM_BEHAVIOR_DEFAULT &&
1145 vm_default_ahead != 0) ||
1146 behavior == VM_BEHAVIOR_SEQUENTIAL) {
1147 vm_object_offset_t cluster_top;
1148
1149 /*
1150 * Calculate upper search boundary.
1151 * Exclude pages that span a cluster boundary.
1152 * Clip to end of map entry.
1153 * For default page reference behavior, scan
1154 * default pages ahead.
1155 */
1156 cluster_top = (offset + cluster_size) -
1157 cluster_offset;
1158 if (align_offset != 0) {
1159 if ((cluster_top > (offset + PAGE_SIZE_64)) &&
1160 SPANS_CLUSTER(cluster_top)) {
1161 cluster_top -= PAGE_SIZE_64;
1162 }
1163 }
1164 if (behavior == VM_BEHAVIOR_DEFAULT) {
1165 vm_object_offset_t top = (vm_object_offset_t)
1166 ((vm_default_ahead*PAGE_SIZE)+PAGE_SIZE);
1167
1168 if (cluster_top > (offset + top))
1169 cluster_top = offset + top;
1170 }
1171 if (cluster_top > hi_offset)
1172 cluster_top = hi_offset;
1173
1174 for (cluster_end = offset + PAGE_SIZE_64;
1175 cluster_end < cluster_top;
1176 cluster_end += PAGE_SIZE_64) {
1177 assert(cluster_size > PAGE_SIZE);
1178 retry_cluster_forw:
1179 if (!LOOK_FOR(object, cluster_end) ||
1180 vm_page_lookup(object, cluster_end)
1181 != VM_PAGE_NULL) {
1182 break;
1183 }
1184 if (object->internal) {
1185 /*
1186 * need to acquire a real page in
1187 * advance because this acts as
1188 * a throttling mechanism for
1189 * data_requests to the default
1190 * pager. If this fails, give up
1191 * trying to find any more pages
1192 * in the cluster and send off the
1193 * request for what we already have.
1194 */
1195 if ((m = vm_page_grab())
1196 == VM_PAGE_NULL) {
1197 break;
1198 }
1199 } else if ((m = vm_page_grab_fictitious())
1200 == VM_PAGE_NULL) {
1201 vm_object_unlock(object);
1202 vm_page_more_fictitious();
1203 vm_object_lock(object);
1204 goto retry_cluster_forw;
1205 }
1206 m->absent = TRUE;
1207 m->unusual = TRUE;
1208 m->clustered = TRUE;
1209 m->list_req_pending = TRUE;
1210
1211 vm_page_insert(m, object, cluster_end);
1212 CLUSTER_STAT(pages_at_higher_offsets++;)
1213 object->absent_count++;
1214 }
1215 assert(cluster_end <= cluster_top);
1216 }
1217 else {
1218 cluster_end = offset + PAGE_SIZE_64;
1219 }
1220 give_up:
1221 assert(cluster_end >= offset + PAGE_SIZE_64);
1222 length = cluster_end - cluster_start;
1223
1224 #if MACH_CLUSTER_STATS
1225 CLUSTER_STAT_HIGHER(pages_at_higher_offsets);
1226 CLUSTER_STAT_LOWER(pages_at_lower_offsets);
1227 CLUSTER_STAT_CLUSTER(length/PAGE_SIZE);
1228 #endif /* MACH_CLUSTER_STATS */
1229
1230 no_clustering:
1231 #if TRACEFAULTPAGE
1232 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1233 #endif
1234 /*
1235 * We have a busy page, so we can
1236 * release the object lock.
1237 */
1238 vm_object_unlock(object);
1239
1240 /*
1241 * Call the memory manager to retrieve the data.
1242 */
1243
1244 if (type_of_fault)
1245 *type_of_fault = DBG_PAGEIN_FAULT;
1246 VM_STAT(pageins++);
1247 current_task()->pageins++;
1248
1249 /*
1250 * If this object uses a copy_call strategy,
1251 * and we are interested in a copy of this object
1252 * (having gotten here only by following a
1253 * shadow chain), then tell the memory manager
1254 * via a flag added to the desired_access
1255 * parameter, so that it can detect a race
1256 * between our walking down the shadow chain
1257 * and its pushing pages up into a copy of
1258 * the object that it manages.
1259 */
1260
1261 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1262 object != first_object) {
1263 wants_copy_flag = VM_PROT_WANTS_COPY;
1264 } else {
1265 wants_copy_flag = VM_PROT_NONE;
1266 }
1267
1268 XPR(XPR_VM_FAULT,
1269 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1270 (integer_t)object, offset, (integer_t)m,
1271 access_required | wants_copy_flag, 0);
1272
1273 #ifdef MACH_BSD
1274 if (((rpc_subsystem_t)pager_mux_hash_lookup(object->pager)) ==
1275 ((rpc_subsystem_t) &vnode_pager_workaround)) {
1276 rc = vnode_pager_data_request(object->pager,
1277 object->pager_request,
1278 cluster_start + object->paging_offset,
1279 length,
1280 access_required | wants_copy_flag);
1281 } else {
1282 rc = memory_object_data_request(object->pager,
1283 object->pager_request,
1284 cluster_start + object->paging_offset,
1285 length,
1286 access_required | wants_copy_flag);
1287 }
1288 #else
1289 rc = memory_object_data_request(object->pager,
1290 object->pager_request,
1291 cluster_start + object->paging_offset,
1292 length,
1293 access_required | wants_copy_flag);
1294
1295 #endif
1296
1297 #if TRACEFAULTPAGE
1298 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1299 #endif
1300 if (rc != KERN_SUCCESS) {
1301 if (rc != MACH_SEND_INTERRUPTED
1302 && vm_fault_debug)
1303 printf("%s(0x%x, 0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d, object=0x%x\n",
1304 "memory_object_data_request",
1305 object->pager,
1306 object->pager_request,
1307 cluster_start + object->paging_offset,
1308 length, access_required,
1309 rc, object);
1310 /*
1311 * Don't want to leave a busy page around,
1312 * but the data request may have blocked,
1313 * so check if it's still there and busy.
1314 */
1315 vm_object_lock(object);
1316 for (; length;
1317 length -= PAGE_SIZE,
1318 cluster_start += PAGE_SIZE_64) {
1319 vm_page_t p;
1320 if ((p = vm_page_lookup(object,
1321 cluster_start))
1322 && p->absent && p->busy
1323 && p != first_m) {
1324 VM_PAGE_FREE(p);
1325 }
1326 }
1327 vm_fault_cleanup(object, first_m);
1328 cur_thread->interruptible = interruptible_state;
1329 return((rc == MACH_SEND_INTERRUPTED) ?
1330 VM_FAULT_INTERRUPTED :
1331 VM_FAULT_MEMORY_ERROR);
1332 }
1333
1334 /*
1335 * Retry with same object/offset, since new data may
1336 * be in a different page (i.e., m is meaningless at
1337 * this point).
1338 */
1339 vm_object_lock(object);
1340 if ((interruptible != THREAD_UNINT) &&
1341 (current_thread()->state & TH_ABORT)) {
1342 vm_fault_cleanup(object, first_m);
1343 cur_thread->interruptible = interruptible_state;
1344 return(VM_FAULT_INTERRUPTED);
1345 }
1346 continue;
1347 }
1348
1349 /*
1350 * The only case in which we get here is if
1351 * object has no pager (or unwiring). If the pager doesn't
1352 * have the page this is handled in the m->absent case above
1353 * (and if you change things here you should look above).
1354 */
1355 #if TRACEFAULTPAGE
1356 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1357 #endif
1358 if (object == first_object)
1359 first_m = m;
1360 else
1361 assert(m == VM_PAGE_NULL);
1362
1363 XPR(XPR_VM_FAULT,
1364 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1365 (integer_t)object, offset, (integer_t)m,
1366 (integer_t)object->shadow, 0);
1367 /*
1368 * Move on to the next object. Lock the next
1369 * object before unlocking the current one.
1370 */
1371 next_object = object->shadow;
1372 if (next_object == VM_OBJECT_NULL) {
1373 assert(!must_be_resident);
1374 /*
1375 * If there's no object left, fill the page
1376 * in the top object with zeros. But first we
1377 * need to allocate a real page.
1378 */
1379
1380 if (object != first_object) {
1381 vm_object_paging_end(object);
1382 vm_object_unlock(object);
1383
1384 object = first_object;
1385 offset = first_offset;
1386 vm_object_lock(object);
1387 }
1388
1389 m = first_m;
1390 assert(m->object == object);
1391 first_m = VM_PAGE_NULL;
1392
1393 if (object->shadow_severed) {
1394 VM_PAGE_FREE(m);
1395 vm_fault_cleanup(object, VM_PAGE_NULL);
1396 cur_thread->interruptible = interruptible_state;
1397 return VM_FAULT_MEMORY_ERROR;
1398 }
1399
1400 if (VM_PAGE_THROTTLED() ||
1401 (m->fictitious && !vm_page_convert(m))) {
1402 VM_PAGE_FREE(m);
1403 vm_fault_cleanup(object, VM_PAGE_NULL);
1404 cur_thread->interruptible = interruptible_state;
1405 return(VM_FAULT_MEMORY_SHORTAGE);
1406 }
1407
1408 if (!no_zero_fill) {
1409 vm_object_unlock(object);
1410 vm_page_zero_fill(m);
1411 if (type_of_fault)
1412 *type_of_fault = DBG_ZERO_FILL_FAULT;
1413 VM_STAT(zero_fill_count++);
1414 vm_object_lock(object);
1415 }
1416 vm_page_lock_queues();
1417 VM_PAGE_QUEUES_REMOVE(m);
1418 queue_enter(&vm_page_queue_inactive,
1419 m, vm_page_t, pageq);
1420 m->inactive = TRUE;
1421 vm_page_inactive_count++;
1422 vm_page_unlock_queues();
1423 pmap_clear_modify(m->phys_addr);
1424 break;
1425 }
1426 else {
1427 if ((object != first_object) || must_be_resident)
1428 vm_object_paging_end(object);
1429 offset += object->shadow_offset;
1430 hi_offset += object->shadow_offset;
1431 lo_offset += object->shadow_offset;
1432 access_required = VM_PROT_READ;
1433 vm_object_lock(next_object);
1434 vm_object_unlock(object);
1435 object = next_object;
1436 vm_object_paging_begin(object);
1437 }
1438 }
1439
1440 /*
1441 * PAGE HAS BEEN FOUND.
1442 *
1443 * This page (m) is:
1444 * busy, so that we can play with it;
1445 * not absent, so that nobody else will fill it;
1446 * possibly eligible for pageout;
1447 *
1448 * The top-level page (first_m) is:
1449 * VM_PAGE_NULL if the page was found in the
1450 * top-level object;
1451 * busy, not absent, and ineligible for pageout.
1452 *
1453 * The current object (object) is locked. A paging
1454 * reference is held for the current and top-level
1455 * objects.
1456 */
1457
1458 #if TRACEFAULTPAGE
1459 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1460 #endif
1461 #if EXTRA_ASSERTIONS
1462 assert(m->busy && !m->absent);
1463 assert((first_m == VM_PAGE_NULL) ||
1464 (first_m->busy && !first_m->absent &&
1465 !first_m->active && !first_m->inactive));
1466 #endif /* EXTRA_ASSERTIONS */
1467
1468 XPR(XPR_VM_FAULT,
1469 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1470 (integer_t)object, offset, (integer_t)m,
1471 (integer_t)first_object, (integer_t)first_m);
1472 /*
1473 * If the page is being written, but isn't
1474 * already owned by the top-level object,
1475 * we have to copy it into a new page owned
1476 * by the top-level object.
1477 */
1478
1479 if (object != first_object) {
1480 /*
1481 * We only really need to copy if we
1482 * want to write it.
1483 */
1484
1485 #if TRACEFAULTPAGE
1486 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1487 #endif
1488 if (fault_type & VM_PROT_WRITE) {
1489 vm_page_t copy_m;
1490
1491 assert(!must_be_resident);
1492
1493 /*
1494 * If we try to collapse first_object at this
1495 * point, we may deadlock when we try to get
1496 * the lock on an intermediate object (since we
1497 * have the bottom object locked). We can't
1498 * unlock the bottom object, because the page
1499 * we found may move (by collapse) if we do.
1500 *
1501 * Instead, we first copy the page. Then, when
1502 * we have no more use for the bottom object,
1503 * we unlock it and try to collapse.
1504 *
1505 * Note that we copy the page even if we didn't
1506 * need to... that's the breaks.
1507 */
1508
1509 /*
1510 * Allocate a page for the copy
1511 */
1512 copy_m = vm_page_grab();
1513 if (copy_m == VM_PAGE_NULL) {
1514 RELEASE_PAGE(m);
1515 vm_fault_cleanup(object, first_m);
1516 cur_thread->interruptible = interruptible_state;
1517 return(VM_FAULT_MEMORY_SHORTAGE);
1518 }
1519
1520
1521 XPR(XPR_VM_FAULT,
1522 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1523 (integer_t)object, offset,
1524 (integer_t)m, (integer_t)copy_m, 0);
1525 vm_page_copy(m, copy_m);
1526
1527 /*
1528 * If another map is truly sharing this
1529 * page with us, we have to flush all
1530 * uses of the original page, since we
1531 * can't distinguish those which want the
1532 * original from those which need the
1533 * new copy.
1534 *
1535 * XXXO If we know that only one map has
1536 * access to this page, then we could
1537 * avoid the pmap_page_protect() call.
1538 */
1539
1540 vm_page_lock_queues();
1541 assert(!m->cleaning);
1542 pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1543 vm_page_deactivate(m);
1544 copy_m->dirty = TRUE;
1545 /*
1546 * Setting reference here prevents this fault from
1547 * being counted as a (per-thread) reactivate as well
1548 * as a copy-on-write.
1549 */
1550 first_m->reference = TRUE;
1551 vm_page_unlock_queues();
1552
1553 /*
1554 * We no longer need the old page or object.
1555 */
1556
1557 PAGE_WAKEUP_DONE(m);
1558 vm_object_paging_end(object);
1559 vm_object_unlock(object);
1560
1561 if (type_of_fault)
1562 *type_of_fault = DBG_COW_FAULT;
1563 VM_STAT(cow_faults++);
1564 current_task()->cow_faults++;
1565 object = first_object;
1566 offset = first_offset;
1567
1568 vm_object_lock(object);
1569 VM_PAGE_FREE(first_m);
1570 first_m = VM_PAGE_NULL;
1571 assert(copy_m->busy);
1572 vm_page_insert(copy_m, object, offset);
1573 m = copy_m;
1574
1575 /*
1576 * Now that we've gotten the copy out of the
1577 * way, let's try to collapse the top object.
1578 * But we have to play ugly games with
1579 * paging_in_progress to do that...
1580 */
1581
1582 vm_object_paging_end(object);
1583 vm_object_collapse(object);
1584 vm_object_paging_begin(object);
1585
1586 }
1587 else {
1588 *protection &= (~VM_PROT_WRITE);
1589 }
1590 }
1591
1592 /*
1593 * Now check whether the page needs to be pushed into the
1594 * copy object. The use of asymmetric copy on write for
1595 * shared temporary objects means that we may do two copies to
1596 * satisfy the fault; one above to get the page from a
1597 * shadowed object, and one here to push it into the copy.
1598 */
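/*
 * Concrete scenario (an illustration, not from the original comment):
 * first_object has a live delayed copy C (first_object->copy == C).
 * A write fault on a page that C has not yet captured must push the
 * page's current contents into C before the faulting task is allowed
 * to modify it; otherwise the copy would observe post-modification data.
 */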
1599
1600 while (first_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
1601 (copy_object = first_object->copy) != VM_OBJECT_NULL) {
1602 vm_object_offset_t copy_offset;
1603 vm_page_t copy_m;
1604
1605 #if TRACEFAULTPAGE
1606 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1607 #endif
1608 /*
1609 * If the page is being written, but hasn't been
1610 * copied to the copy-object, we have to copy it there.
1611 */
1612
1613 if ((fault_type & VM_PROT_WRITE) == 0) {
1614 *protection &= ~VM_PROT_WRITE;
1615 break;
1616 }
1617
1618 /*
1619 * If the page was guaranteed to be resident,
1620 * we must have already performed the copy.
1621 */
1622
1623 if (must_be_resident)
1624 break;
1625
1626 /*
1627 * Try to get the lock on the copy_object.
1628 */
1629 if (!vm_object_lock_try(copy_object)) {
1630 vm_object_unlock(object);
1631
1632 mutex_pause(); /* wait a bit */
1633
1634 vm_object_lock(object);
1635 continue;
1636 }
1637
1638 /*
1639 * Make another reference to the copy-object,
1640 * to keep it from disappearing during the
1641 * copy.
1642 */
1643 assert(copy_object->ref_count > 0);
1644 copy_object->ref_count++;
1645 VM_OBJ_RES_INCR(copy_object);
1646
1647 /*
1648 * Does the page exist in the copy?
1649 */
1650 copy_offset = first_offset - copy_object->shadow_offset;
1651 if (copy_object->size <= copy_offset)
1652 /*
1653 * Copy object doesn't cover this page -- do nothing.
1654 */
1655 ;
1656 else if ((copy_m =
1657 vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1658 /* Page currently exists in the copy object */
1659 if (copy_m->busy) {
1660 /*
1661 * If the page is being brought
1662 * in, wait for it and then retry.
1663 */
1664 RELEASE_PAGE(m);
1665 /* take an extra ref so object won't die */
1666 assert(copy_object->ref_count > 0);
1667 copy_object->ref_count++;
1668 vm_object_res_reference(copy_object);
1669 vm_object_unlock(copy_object);
1670 vm_fault_cleanup(object, first_m);
1671 counter(c_vm_fault_page_block_backoff_kernel++);
1672 vm_object_lock(copy_object);
1673 assert(copy_object->ref_count > 0);
1674 VM_OBJ_RES_DECR(copy_object);
1675 copy_object->ref_count--;
1676 assert(copy_object->ref_count > 0);
1677 copy_m = vm_page_lookup(copy_object, copy_offset);
1678 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1679 PAGE_ASSERT_WAIT(copy_m, interruptible);
1680 vm_object_unlock(copy_object);
1681 wait_result = thread_block((void (*)(void))0);
1682 vm_object_deallocate(copy_object);
1683 goto backoff;
1684 } else {
1685 vm_object_unlock(copy_object);
1686 vm_object_deallocate(copy_object);
1687 cur_thread->interruptible = interruptible_state;
1688 return VM_FAULT_RETRY;
1689 }
1690 }
1691 }
1692 else if (!PAGED_OUT(copy_object, copy_offset)) {
1693 /*
1694 * If PAGED_OUT is TRUE, then the page used to exist
1695 * in the copy-object, and has already been paged out.
1696 * We don't need to repeat this. If PAGED_OUT is
1697 * FALSE, then either we don't know (!pager_created,
1698 * for example) or it hasn't been paged out.
1699 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1700 * We must copy the page to the copy object.
1701 */
1702
1703 /*
1704 * Allocate a page for the copy
1705 */
1706 copy_m = vm_page_alloc(copy_object, copy_offset);
1707 if (copy_m == VM_PAGE_NULL) {
1708 RELEASE_PAGE(m);
1709 VM_OBJ_RES_DECR(copy_object);
1710 copy_object->ref_count--;
1711 assert(copy_object->ref_count > 0);
1712 vm_object_unlock(copy_object);
1713 vm_fault_cleanup(object, first_m);
1714 cur_thread->interruptible = interruptible_state;
1715 return(VM_FAULT_MEMORY_SHORTAGE);
1716 }
1717
1718 /*
1719 * Must copy page into copy-object.
1720 */
1721
1722 vm_page_copy(m, copy_m);
1723
1724 /*
1725 * If the old page was in use by any users
1726 * of the copy-object, it must be removed
1727 * from all pmaps. (We can't know which
1728 * pmaps use it.)
1729 */
1730
1731 vm_page_lock_queues();
1732 assert(!m->cleaning);
1733 pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1734 copy_m->dirty = TRUE;
1735 vm_page_unlock_queues();
1736
1737 /*
1738 * If there's a pager, then immediately
1739 * page out this page, using the "initialize"
1740 * option. Else, we use the copy.
1741 */
1742
1743 if
1744 #if MACH_PAGEMAP
1745 ((!copy_object->pager_created) ||
1746 vm_external_state_get(
1747 copy_object->existence_map, copy_offset)
1748 == VM_EXTERNAL_STATE_ABSENT)
1749 #else
1750 (!copy_object->pager_created)
1751 #endif
1752 {
1753 vm_page_lock_queues();
1754 vm_page_activate(copy_m);
1755 vm_page_unlock_queues();
1756 PAGE_WAKEUP_DONE(copy_m);
1757 }
1758 else {
1759 assert(copy_m->busy == TRUE);
1760
1761 /*
1762 * The page is already ready for pageout:
1763 * not on pageout queues and busy.
1764 * Unlock everything except the
1765 * copy_object itself.
1766 */
1767
1768 vm_object_unlock(object);
1769
1770 /*
1771 * Write the page to the copy-object,
1772 * flushing it from the kernel.
1773 */
1774
1775 vm_pageout_initialize_page(copy_m);
1776
1777 /*
1778 * Since the pageout may have
1779 * temporarily dropped the
1780 * copy_object's lock, we
1781 * check whether we'll have
1782 * to deallocate the hard way.
1783 */
1784
1785 if ((copy_object->shadow != object) ||
1786 (copy_object->ref_count == 1)) {
1787 vm_object_unlock(copy_object);
1788 vm_object_deallocate(copy_object);
1789 vm_object_lock(object);
1790 continue;
1791 }
1792
1793 /*
1794 * Pick back up the old object's
1795 * lock. [It is safe to do so,
1796 * since it must be deeper in the
1797 * object tree.]
1798 */
1799
1800 vm_object_lock(object);
1801 }
1802
1803 /*
1804 * Because we're pushing a page upward
1805 * in the object tree, we must restart
1806 * any faults that are waiting here.
1807 * [Note that this is an expansion of
1808 * PAGE_WAKEUP that uses the THREAD_RESTART
1809 * wait result]. Can't turn off the page's
1810 * busy bit because we're not done with it.
1811 */
1812
1813 if (m->wanted) {
1814 m->wanted = FALSE;
1815 thread_wakeup_with_result((event_t) m,
1816 THREAD_RESTART);
1817 }
1818 }
1819
1820 /*
1821 * The reference count on copy_object must be
1822 * at least 2: one for our extra reference,
1823 * and at least one from the outside world
1824 * (we checked that when we last locked
1825 * copy_object).
1826 */
1827 copy_object->ref_count--;
1828 assert(copy_object->ref_count > 0);
1829 VM_OBJ_RES_DECR(copy_object);
1830 vm_object_unlock(copy_object);
1831
1832 break;
1833 }
1834
1835 *result_page = m;
1836 *top_page = first_m;
1837
1838 XPR(XPR_VM_FAULT,
1839 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1840 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1841 /*
1842 * If the page can be written, assume that it will be.
1843 * [Earlier, we restrict the permission to allow write
1844 * access only if the fault so required, so we don't
1845 * mark read-only data as dirty.]
1846 */
1847
1848 #if !VM_FAULT_STATIC_CONFIG
1849 if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE))
1850 m->dirty = TRUE;
1851 #endif
1852 #if TRACEFAULTPAGE
1853 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind); /* (TEST/DEBUG) */
1854 #endif
1855 if (vm_page_deactivate_behind) {
1856 if (offset && /* don't underflow */
1857 (object->last_alloc == (offset - PAGE_SIZE_64))) {
1858 m = vm_page_lookup(object, object->last_alloc);
1859 if ((m != VM_PAGE_NULL) && !m->busy) {
1860 vm_page_lock_queues();
1861 vm_page_deactivate(m);
1862 vm_page_unlock_queues();
1863 }
1864 #if TRACEFAULTPAGE
1865 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1866 #endif
1867 }
1868 object->last_alloc = offset;
1869 }
1870 #if TRACEFAULTPAGE
1871 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1872 #endif
1873 cur_thread->interruptible = interruptible_state;
1874 return(VM_FAULT_SUCCESS);
1875
1876 #if 0
1877 block_and_backoff:
1878 vm_fault_cleanup(object, first_m);
1879
1880 counter(c_vm_fault_page_block_backoff_kernel++);
1881 thread_block((void (*)(void))0);
1882 #endif
1883
1884 backoff:
1885 cur_thread->interruptible = interruptible_state;
1886 if (wait_result == THREAD_INTERRUPTED)
1887 return VM_FAULT_INTERRUPTED;
1888 return VM_FAULT_RETRY;
1889
1890 #undef RELEASE_PAGE
1891 }
1892
1893 /*
1894 * Routine: vm_fault
1895 * Purpose:
1896 * Handle page faults, including pseudo-faults
1897 * used to change the wiring status of pages.
1898 * Returns:
1899 * Explicit continuations have been removed.
1900 * Implementation:
1901 * vm_fault and vm_fault_page save mucho state
1902 * in the moral equivalent of a closure. The state
1903 * structure is allocated when first entering vm_fault
1904 * and deallocated when leaving vm_fault.
1905 */
1906
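 /*
 * Illustrative sketch only (not part of this file): a machine-
 * dependent trap handler typically ends up calling vm_fault()
 * roughly like this, where "fault_addr" and "is_write" are
 * hypothetical names supplied by the trap code:
 *
 *	kr = vm_fault(current_map(),
 *		      trunc_page(fault_addr),
 *		      is_write ? (VM_PROT_READ | VM_PROT_WRITE)
 *			       : VM_PROT_READ,
 *		      FALSE,		-- not a wiring change
 *		      THREAD_ABORTSAFE);
 */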
1907 kern_return_t
1908 vm_fault(
1909 vm_map_t map,
1910 vm_offset_t vaddr,
1911 vm_prot_t fault_type,
1912 boolean_t change_wiring,
1913 int interruptible)
1914 {
1915 vm_map_version_t version; /* Map version for verification */
1916 boolean_t wired; /* Should mapping be wired down? */
1917 vm_object_t object; /* Top-level object */
1918 vm_object_offset_t offset; /* Top-level offset */
1919 vm_prot_t prot; /* Protection for mapping */
1920 vm_behavior_t behavior; /* Expected paging behavior */
1921 vm_object_offset_t lo_offset, hi_offset;
1922 vm_object_t old_copy_object; /* Saved copy object */
1923 vm_page_t result_page; /* Result of vm_fault_page */
1924 vm_page_t top_page; /* Placeholder page */
1925 kern_return_t kr;
1926
1927 register
1928 vm_page_t m; /* Fast access to result_page */
1929 kern_return_t error_code; /* page error reasons */
1930 register
1931 vm_object_t cur_object;
1932 register
1933 vm_object_offset_t cur_offset;
1934 vm_page_t cur_m;
1935 vm_object_t new_object;
1936 int type_of_fault;
1937 vm_map_t pmap_map = map;
1938 vm_map_t original_map = map;
1939 pmap_t pmap = NULL;
1940 boolean_t funnel_set = FALSE;
1941 funnel_t *curflock;
1942 thread_t cur_thread;
1943 boolean_t interruptible_state;
1944
1945
1946 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
1947 vaddr,
1948 0,
1949 0,
1950 0,
1951 0);
1952
1953 cur_thread = current_thread();
1954
1955 interruptible_state = cur_thread->interruptible;
1956 if (interruptible == THREAD_UNINT)
1957 cur_thread->interruptible = FALSE;
1958
1959 /*
1960 * Assume we will hit a page in the cache;
1961 * otherwise, explicitly override with
1962 * the real fault type once we determine it.
1963 */
1964 type_of_fault = DBG_CACHE_HIT_FAULT;
1965
1966 VM_STAT(faults++);
1967 current_task()->faults++;
1968
1969 /*
1970 * Drop the funnel if it is already held; restore it while returning.
1971 */
1972 if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) {
1973 funnel_set = TRUE;
1974 curflock = cur_thread->funnel_lock;
1975 thread_funnel_set( curflock , FALSE);
1976 }
1977
1978 RetryFault: ;
1979
1980 /*
1981 * Find the backing store object and offset into
1982 * it to begin the search.
1983 */
1984 map = original_map;
1985 vm_map_lock_read(map);
1986 kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
1987 &object, &offset,
1988 &prot, &wired,
1989 &behavior, &lo_offset, &hi_offset, &pmap_map);
1990
1991 pmap = pmap_map->pmap;
1992
1993 if (kr != KERN_SUCCESS) {
1994 vm_map_unlock_read(map);
1995 goto done;
1996 }
1997
1998 /*
1999 * If the page is wired, we must fault for the current protection
2000 * value, to avoid further faults.
2001 */
2002
2003 if (wired)
2004 fault_type = prot | VM_PROT_WRITE;
2005
2006 #if VM_FAULT_CLASSIFY
2007 /*
2008 * Temporary data gathering code
2009 */
2010 vm_fault_classify(object, offset, fault_type);
2011 #endif
2012 /*
2013 * Fast fault code. The basic idea is to do as much as
2014 * possible while holding the map lock and object locks.
2015 * Busy pages are not used until the object lock has to
2016 * be dropped to do something (copy, zero fill, pmap enter).
2017 * Similarly, paging references aren't acquired until that
2018 * point, and object references aren't used.
2019 *
2020 * If we can figure out what to do
2021 * (zero fill, copy on write, pmap enter) while holding
2022 * the locks, then it gets done. Otherwise, we give up,
2023 * and use the original fault path (which doesn't hold
2024 * the map lock, and relies on busy pages).
2025 * The give up cases include:
2026 * - Have to talk to pager.
2027 * - Page is busy, absent or in error.
2028 * - Pager has locked out desired access.
2029 * - Fault needs to be restarted.
2030 * - Have to push page into copy object.
2031 *
2032 * The code is an infinite loop that moves one level down
2033 * the shadow chain each time. cur_object and cur_offset
2034 * refer to the current object being examined. object and offset
2035 * are the original object from the map. The loop is at the
2036 * top level if and only if object and cur_object are the same.
2037 *
2038 * Invariants: Map lock is held throughout. Lock is held on
2039 * original object and cur_object (if different) when
2040 * continuing or exiting loop.
2041 *
2042 */
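 /*
 * In outline (a sketch only; the real logic follows below):
 *
 *	cur_object = object; cur_offset = offset;
 *	loop:
 *	    m = vm_page_lookup(cur_object, cur_offset);
 *	    if (m is resident and usable)
 *		map it in (FastMapInFault / FastPmapEnter), or do the
 *		copy-on-write push when cur_object != object;
 *	    else if (cur_object has a pager)
 *		give up and take the slow path;
 *	    else if (cur_object has no shadow)
 *		zero fill a new page in the top-level object;
 *	    else
 *		descend one level down the shadow chain and loop.
 */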
2043
2044
2045 /*
2046 * If this page is to be inserted in a copy delay object
2047 * for writing, and if the object has a copy, then the
2048 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
2049 */
2050 if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2051 object->copy == VM_OBJECT_NULL ||
2052 (fault_type & VM_PROT_WRITE) == 0) {
2053 cur_object = object;
2054 cur_offset = offset;
2055
2056 while (TRUE) {
2057 m = vm_page_lookup(cur_object, cur_offset);
2058 if (m != VM_PAGE_NULL) {
2059 if (m->busy)
2060 break;
2061
2062 if (m->unusual && (m->error || m->restart ||
2063 m->absent || (fault_type & m->page_lock))) {
2064
2065 /*
2066 * Unusual case. Give up.
2067 */
2068 break;
2069 }
2070
2071 /*
2072 * Two cases of map-in faults:
2073 * - At top level w/o copy object.
2074 * - Read fault anywhere.
2075 * --> must disallow write.
2076 */
2077
2078 if (object == cur_object &&
2079 object->copy == VM_OBJECT_NULL)
2080 goto FastMapInFault;
2081
2082 if ((fault_type & VM_PROT_WRITE) == 0) {
2083
2084 prot &= ~VM_PROT_WRITE;
2085
2086 /*
2087 * Set up to map the page ...
2088 * mark the page busy, drop
2089 * locks and take a paging reference
2090 * on the object with the page.
2091 */
2092
2093 if (object != cur_object) {
2094 vm_object_unlock(object);
2095 object = cur_object;
2096 }
2097 FastMapInFault:
2098 m->busy = TRUE;
2099
2100 vm_object_paging_begin(object);
2101 vm_object_unlock(object);
2102
2103 FastPmapEnter:
2104 /*
2105 * Check a couple of global reasons to
2106 * be conservative about write access.
2107 * Then do the pmap_enter.
2108 */
2109 #if !VM_FAULT_STATIC_CONFIG
2110 if (vm_fault_dirty_handling
2111 #if MACH_KDB
2112 || db_watchpoint_list
2113 #endif
2114 && (fault_type & VM_PROT_WRITE) == 0)
2115 prot &= ~VM_PROT_WRITE;
2116 #else /* STATIC_CONFIG */
2117 #if MACH_KDB
2118 if (db_watchpoint_list
2119 && (fault_type & VM_PROT_WRITE) == 0)
2120 prot &= ~VM_PROT_WRITE;
2121 #endif /* MACH_KDB */
2122 #endif /* STATIC_CONFIG */
2123 PMAP_ENTER(pmap, vaddr, m, prot, wired);
2124 pmap_attribute(pmap,
2125 vaddr,
2126 PAGE_SIZE,
2127 MATTR_CACHE,
2128 &mv_cache_sync);
2129
2130 if (m->clustered) {
2131 vm_pagein_cluster_used++;
2132 m->clustered = FALSE;
2133
2134 }
2135 /*
2136 * Grab the object lock to manipulate
2137 * the page queues. The change-wiring
2138 * case is obvious. In the software
2139 * reference bits case, activate the page
2140 * only if it fell off the paging queues,
2141 * otherwise just activate it if it's inactive.
2142 *
2143 * NOTE: the original vm_fault code will
2144 * move an active page to the back of the
2145 * active queue. This code doesn't.
2146 */
2147 vm_object_lock(object);
2148 vm_page_lock_queues();
2149
2150 m->reference = TRUE;
2151
2152 if (change_wiring) {
2153 if (wired)
2154 vm_page_wire(m);
2155 else
2156 vm_page_unwire(m);
2157 }
2158 #if VM_FAULT_STATIC_CONFIG
2159 else {
2160 if (!m->active && !m->inactive)
2161 vm_page_activate(m);
2162 }
2163 #else
2164 else if (software_reference_bits) {
2165 if (!m->active && !m->inactive)
2166 vm_page_activate(m);
2167 }
2168 else if (!m->active) {
2169 vm_page_activate(m);
2170 }
2171 #endif
2172 vm_page_unlock_queues();
2173
2174 /*
2175 * That's it, clean up and return.
2176 */
2177 PAGE_WAKEUP_DONE(m);
2178 vm_object_paging_end(object);
2179 vm_object_unlock(object);
2180 vm_map_unlock_read(map);
2181 if(pmap_map != map)
2182 vm_map_unlock(pmap_map);
2183
2184 if (funnel_set) {
2185 thread_funnel_set( curflock, TRUE);
2186 funnel_set = FALSE;
2187 }
2188 cur_thread->interruptible = interruptible_state;
2189
2190 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2191 vaddr,
2192 type_of_fault,
2193 KERN_SUCCESS,
2194 0,
2195 0);
2196 return KERN_SUCCESS;
2197 }
2198
2199 /*
2200 * Copy on write fault. If objects match, then
2201 * object->copy must not be NULL (else control
2202 * would be in previous code block), and we
2203 * have a potential push into the copy object
2204 * with which we won't cope here.
2205 */
2206
2207 if (cur_object == object)
2208 break;
2209
2210 /*
2211 * This is now a shadow based copy on write
2212 * fault -- it requires a copy up the shadow
2213 * chain.
2214 *
2215 * Allocate a page in the original top level
2216 * object. Give up if allocate fails. Also
2217 * need to remember current page, as it's the
2218 * source of the copy.
2219 */
2220 cur_m = m;
2221 m = vm_page_grab();
2222 if (m == VM_PAGE_NULL) {
2223 break;
2224 }
2225
2226 /*
2227 * Now do the copy. Mark the source busy
2228 * and take out paging references on both
2229 * objects.
2230 *
2231 * NOTE: This code holds the map lock across
2232 * the page copy.
2233 */
2234
2235 cur_m->busy = TRUE;
2236 vm_page_copy(cur_m, m);
2237 vm_page_insert(m, object, offset);
2238
2239 vm_object_paging_begin(cur_object);
2240 vm_object_paging_begin(object);
2241
2242 type_of_fault = DBG_COW_FAULT;
2243 VM_STAT(cow_faults++);
2244 current_task()->cow_faults++;
2245
2246 /*
2247 * Now cope with the source page and object
2248 * If the top object has a ref count of 1
2249 * then no other map can access it, and hence
2250 * it's not necessary to do the pmap_page_protect.
2251 */
2252
2253
2254 vm_page_lock_queues();
2255 vm_page_deactivate(cur_m);
2256 m->dirty = TRUE;
2257 pmap_page_protect(cur_m->phys_addr,
2258 VM_PROT_NONE);
2259 vm_page_unlock_queues();
2260
2261 PAGE_WAKEUP_DONE(cur_m);
2262 vm_object_paging_end(cur_object);
2263 vm_object_unlock(cur_object);
2264
2265 /*
2266 * Slight hack: call vm_object_collapse()
2267 * and then reuse the common map-in code.
2268 * Note that the object lock was taken above.
2269 */
2270
2271 vm_object_paging_end(object);
2272 vm_object_collapse(object);
2273 vm_object_paging_begin(object);
2274 vm_object_unlock(object);
2275
2276 goto FastPmapEnter;
2277 }
2278 else {
2279
2280 /*
2281 * No page at cur_object, cur_offset
2282 */
2283
2284 if (cur_object->pager_created) {
2285
2286 /*
2287 * Have to talk to the pager. Give up.
2288 */
2289
2290 break;
2291 }
2292
2293
2294 if (cur_object->shadow == VM_OBJECT_NULL) {
2295
2296 if (cur_object->shadow_severed) {
2297 vm_object_paging_end(object);
2298 vm_object_unlock(object);
2299 vm_map_unlock_read(map);
2300 if(pmap_map != map)
2301 vm_map_unlock(pmap_map);
2302
2303 if (funnel_set) {
2304 thread_funnel_set( curflock, TRUE);
2305 funnel_set = FALSE;
2306 }
2307 cur_thread->interruptible = interruptible_state;
2308
2309 return VM_FAULT_MEMORY_ERROR;
2310 }
2311
2312 /*
2313 * Zero fill fault. Page gets
2314 * filled in top object. Insert
2315 * page, then drop any lower lock.
2316 * Give up if no page.
2317 */
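 /*
 * The check below gives up on the fast zero-fill path once the
 * free page count drops below vm_page_free_target minus one
 * quarter of the gap down to vm_page_free_min.  With purely
 * hypothetical numbers, free_target = 2000 and free_min = 400
 * give a cutoff of 2000 - (1600 >> 2) = 1600 free pages.
 */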
2318 if ((vm_page_free_target -
2319 ((vm_page_free_target-vm_page_free_min)>>2))
2320 > vm_page_free_count) {
2321 break;
2322 }
2323 m = vm_page_alloc(object, offset);
2324 if (m == VM_PAGE_NULL) {
2325 break;
2326 }
2327
2328 if (cur_object != object)
2329 vm_object_unlock(cur_object);
2330
2331 vm_object_paging_begin(object);
2332 vm_object_unlock(object);
2333
2334 /*
2335 * Now zero fill page and map it.
2336 * the page is probably going to
2337 * be written soon, so don't bother
2338 * to clear the modified bit
2339 *
2340 * NOTE: This code holds the map
2341 * lock across the zero fill.
2342 */
2343
2344 if (!map->no_zero_fill) {
2345 vm_page_zero_fill(m);
2346 type_of_fault = DBG_ZERO_FILL_FAULT;
2347 VM_STAT(zero_fill_count++);
2348 }
2349 vm_page_lock_queues();
2350 VM_PAGE_QUEUES_REMOVE(m);
2351 queue_enter(&vm_page_queue_inactive,
2352 m, vm_page_t, pageq);
2353 m->inactive = TRUE;
2354 vm_page_inactive_count++;
2355 vm_page_unlock_queues();
2356 goto FastPmapEnter;
2357 }
2358
2359 /*
2360 * On to the next level
2361 */
2362
2363 cur_offset += cur_object->shadow_offset;
2364 new_object = cur_object->shadow;
2365 vm_object_lock(new_object);
2366 if (cur_object != object)
2367 vm_object_unlock(cur_object);
2368 cur_object = new_object;
2369
2370 continue;
2371 }
2372 }
2373
2374 /*
2375 * Cleanup from fast fault failure. Drop any object
2376 * lock other than original and drop map lock.
2377 */
2378
2379 if (object != cur_object)
2380 vm_object_unlock(cur_object);
2381 }
2382 vm_map_unlock_read(map);
2383 if(pmap_map != map)
2384 vm_map_unlock(pmap_map);
2385
2386 /*
2387 * Make a reference to this object to
2388 * prevent its disposal while we are messing with
2389 * it. Once we have the reference, the map is free
2390 * to be diddled. Since objects reference their
2391 * shadows (and copies), they will stay around as well.
2392 */
2393
2394 assert(object->ref_count > 0);
2395 object->ref_count++;
2396 vm_object_res_reference(object);
2397 vm_object_paging_begin(object);
2398
2399 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2400 kr = vm_fault_page(object, offset, fault_type,
2401 (change_wiring && !wired),
2402 interruptible,
2403 lo_offset, hi_offset, behavior,
2404 &prot, &result_page, &top_page,
2405 &type_of_fault,
2406 &error_code, map->no_zero_fill, FALSE);
2407
2408 /*
2409 * If we didn't succeed, lose the object reference immediately.
2410 */
2411
2412 if (kr != VM_FAULT_SUCCESS)
2413 vm_object_deallocate(object);
2414
2415 /*
2416 * See why we failed, and take corrective action.
2417 */
2418
2419 switch (kr) {
2420 case VM_FAULT_SUCCESS:
2421 break;
2422 case VM_FAULT_MEMORY_SHORTAGE:
2423 if (vm_page_wait((change_wiring) ?
2424 THREAD_UNINT :
2425 THREAD_ABORTSAFE))
2426 goto RetryFault;
2427 /* fall thru */
2428 case VM_FAULT_INTERRUPTED:
2429 kr = KERN_ABORTED;
2430 goto done;
2431 case VM_FAULT_RETRY:
2432 goto RetryFault;
2433 case VM_FAULT_FICTITIOUS_SHORTAGE:
2434 vm_page_more_fictitious();
2435 goto RetryFault;
2436 case VM_FAULT_MEMORY_ERROR:
2437 if (error_code)
2438 kr = error_code;
2439 else
2440 kr = KERN_MEMORY_ERROR;
2441 goto done;
2442 }
2443
2444 m = result_page;
2445
2446 assert((change_wiring && !wired) ?
2447 (top_page == VM_PAGE_NULL) :
2448 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2449
2450 /*
2451 * How to clean up the result of vm_fault_page. This
2452 * happens whether the mapping is entered or not.
2453 */
2454
2455 #define UNLOCK_AND_DEALLOCATE \
2456 MACRO_BEGIN \
2457 vm_fault_cleanup(m->object, top_page); \
2458 vm_object_deallocate(object); \
2459 MACRO_END
2460
2461 /*
2462 * What to do with the resulting page from vm_fault_page
2463 * if it doesn't get entered into the physical map:
2464 */
2465
2466 #define RELEASE_PAGE(m) \
2467 MACRO_BEGIN \
2468 PAGE_WAKEUP_DONE(m); \
2469 vm_page_lock_queues(); \
2470 if (!m->active && !m->inactive) \
2471 vm_page_activate(m); \
2472 vm_page_unlock_queues(); \
2473 MACRO_END
2474
2475 /*
2476 * We must verify that the maps have not changed
2477 * since our last lookup.
2478 */
2479
2480 old_copy_object = m->object->copy;
2481
2482 vm_object_unlock(m->object);
2483 if ((map != original_map) || !vm_map_verify(map, &version)) {
2484 vm_object_t retry_object;
2485 vm_object_offset_t retry_offset;
2486 vm_prot_t retry_prot;
2487
2488 /*
2489 * To avoid trying to write_lock the map while another
2490 * thread has it read_locked (in vm_map_pageable), we
2491 * do not try for write permission. If the page is
2492 * still writable, we will get write permission. If it
2493 * is not, or has been marked needs_copy, we enter the
2494 * mapping without write permission, and will merely
2495 * take another fault.
2496 */
2497 map = original_map;
2498 vm_map_lock_read(map);
2499 kr = vm_map_lookup_locked(&map, vaddr,
2500 fault_type & ~VM_PROT_WRITE, &version,
2501 &retry_object, &retry_offset, &retry_prot,
2502 &wired, &behavior, &lo_offset, &hi_offset,
2503 &pmap_map);
2504 pmap = pmap_map->pmap;
2505
2506 if (kr != KERN_SUCCESS) {
2507 vm_map_unlock_read(map);
2508 vm_object_lock(m->object);
2509 RELEASE_PAGE(m);
2510 UNLOCK_AND_DEALLOCATE;
2511 goto done;
2512 }
2513
2514 vm_object_unlock(retry_object);
2515 vm_object_lock(m->object);
2516
2517 if ((retry_object != object) ||
2518 (retry_offset != offset)) {
2519 vm_map_unlock_read(map);
2520 if(pmap_map != map)
2521 vm_map_unlock(pmap_map);
2522 RELEASE_PAGE(m);
2523 UNLOCK_AND_DEALLOCATE;
2524 goto RetryFault;
2525 }
2526
2527 /*
2528 * Check whether the protection has changed or the object
2529 * has been copied while we left the map unlocked.
2530 */
2531 prot &= retry_prot;
2532 vm_object_unlock(m->object);
2533 }
2534 vm_object_lock(m->object);
2535
2536 /*
2537 * If the copy object changed while the top-level object
2538 * was unlocked, then we must take away write permission.
2539 */
2540
2541 if (m->object->copy != old_copy_object)
2542 prot &= ~VM_PROT_WRITE;
2543
2544 /*
2545 * If we want to wire down this page, but no longer have
2546 * adequate permissions, we must start all over.
2547 */
2548
2549 if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2550 vm_map_verify_done(map, &version);
2551 if(pmap_map != map)
2552 vm_map_unlock(pmap_map);
2553 RELEASE_PAGE(m);
2554 UNLOCK_AND_DEALLOCATE;
2555 goto RetryFault;
2556 }
2557
2558 /*
2559 * It's critically important that a wired-down page be faulted
2560 * only once in each map for which it is wired.
2561 */
2562 vm_object_unlock(m->object);
2563
2564 /*
2565 * Put this page into the physical map.
2566 * We had to do the unlock above because pmap_enter
2567 * may cause other faults. The page may be on
2568 * the pageout queues. If the pageout daemon comes
2569 * across the page, it will remove it from the queues.
2570 */
2571 PMAP_ENTER(pmap, vaddr, m, prot, wired);
2572
2573 /* Sync I & D caches for new mapping */
2574 pmap_attribute(pmap,
2575 vaddr,
2576 PAGE_SIZE,
2577 MATTR_CACHE,
2578 &mv_cache_sync);
2579
2580 /*
2581 * If the page is not wired down and isn't already
2582 * on a pageout queue, then put it where the
2583 * pageout daemon can find it.
2584 */
2585 vm_object_lock(m->object);
2586 vm_page_lock_queues();
2587 if (change_wiring) {
2588 if (wired)
2589 vm_page_wire(m);
2590 else
2591 vm_page_unwire(m);
2592 }
2593 #if VM_FAULT_STATIC_CONFIG
2594 else {
2595 if (!m->active && !m->inactive)
2596 vm_page_activate(m);
2597 m->reference = TRUE;
2598 }
2599 #else
2600 else if (software_reference_bits) {
2601 if (!m->active && !m->inactive)
2602 vm_page_activate(m);
2603 m->reference = TRUE;
2604 } else {
2605 vm_page_activate(m);
2606 }
2607 #endif
2608 vm_page_unlock_queues();
2609
2610 /*
2611 * Unlock everything, and return
2612 */
2613
2614 vm_map_verify_done(map, &version);
2615 if(pmap_map != map)
2616 vm_map_unlock(pmap_map);
2617 PAGE_WAKEUP_DONE(m);
2618 kr = KERN_SUCCESS;
2619 UNLOCK_AND_DEALLOCATE;
2620
2621 #undef UNLOCK_AND_DEALLOCATE
2622 #undef RELEASE_PAGE
2623
2624 done:
2625 if (funnel_set) {
2626 thread_funnel_set( curflock, TRUE);
2627 funnel_set = FALSE;
2628 }
2629 cur_thread->interruptible = interruptible_state;
2630
2631 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2632 vaddr,
2633 type_of_fault,
2634 kr,
2635 0,
2636 0);
2637 return(kr);
2638 }
2639
2640 /*
2641 * vm_fault_wire:
2642 *
2643 * Wire down a range of virtual addresses in a map.
2644 */
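 /*
 * Each page in [entry->vme_start, entry->vme_end) is tried with the
 * stripped-down vm_fault_wire_fast() first, falling back to a full
 * vm_fault() with change_wiring == TRUE.  If any page fails, the
 * already-wired prefix is unwound via vm_fault_unwire() before the
 * error is returned.
 */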
2645 kern_return_t
2646 vm_fault_wire(
2647 vm_map_t map,
2648 vm_map_entry_t entry,
2649 pmap_t pmap)
2650 {
2651
2652 register vm_offset_t va;
2653 register vm_offset_t end_addr = entry->vme_end;
2654 register kern_return_t rc;
2655
2656 assert(entry->in_transition);
2657
2658 /*
2659 * Inform the physical mapping system that the
2660 * range of addresses may not fault, so that
2661 * page tables and such can be locked down as well.
2662 */
2663
2664 pmap_pageable(pmap, entry->vme_start, end_addr, FALSE);
2665
2666 /*
2667 * We simulate a fault to get the page and enter it
2668 * in the physical map.
2669 */
2670
2671 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2672 if ((rc = vm_fault_wire_fast(
2673 map, va, entry, pmap)) != KERN_SUCCESS) {
2674 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
2675 (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE);
2676 }
2677
2678 if (rc != KERN_SUCCESS) {
2679 struct vm_map_entry tmp_entry = *entry;
2680
2681 /* unwire wired pages */
2682 tmp_entry.vme_end = va;
2683 vm_fault_unwire(map, &tmp_entry, FALSE, pmap);
2684
2685 return rc;
2686 }
2687 }
2688 return KERN_SUCCESS;
2689 }
2690
2691 /*
2692 * vm_fault_unwire:
2693 *
2694 * Unwire a range of virtual addresses in a map.
2695 */
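 /*
 * For each page the pmap wiring is cleared.  Submap entries (no
 * direct VM object) just take a regular vm_fault(); object-backed
 * entries re-find the page with an uninterruptible vm_fault_page()
 * so it can be unwired, or freed outright when "deallocate" is set.
 */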
2696 void
2697 vm_fault_unwire(
2698 vm_map_t map,
2699 vm_map_entry_t entry,
2700 boolean_t deallocate,
2701 pmap_t pmap)
2702 {
2703 register vm_offset_t va;
2704 register vm_offset_t end_addr = entry->vme_end;
2705 vm_object_t object;
2706
2707 object = (entry->is_sub_map)
2708 ? VM_OBJECT_NULL : entry->object.vm_object;
2709
2710 /*
2711 * Since the pages are wired down, we must be able to
2712 * get their mappings from the physical map system.
2713 */
2714
2715 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2716 pmap_change_wiring(pmap, va, FALSE);
2717
2718 if (object == VM_OBJECT_NULL) {
2719 (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT);
2720 } else {
2721 vm_prot_t prot;
2722 vm_page_t result_page;
2723 vm_page_t top_page;
2724 vm_object_t result_object;
2725 vm_fault_return_t result;
2726
2727 do {
2728 prot = VM_PROT_NONE;
2729
2730 vm_object_lock(object);
2731 vm_object_paging_begin(object);
2732 XPR(XPR_VM_FAULT,
2733 "vm_fault_unwire -> vm_fault_page\n",
2734 0,0,0,0,0);
2735 result = vm_fault_page(object,
2736 entry->offset +
2737 (va - entry->vme_start),
2738 VM_PROT_NONE, TRUE,
2739 THREAD_UNINT,
2740 entry->offset,
2741 entry->offset +
2742 (entry->vme_end
2743 - entry->vme_start),
2744 entry->behavior,
2745 &prot,
2746 &result_page,
2747 &top_page,
2748 (int *)0,
2749 0, map->no_zero_fill,
2750 FALSE);
2751 } while (result == VM_FAULT_RETRY);
2752
2753 if (result != VM_FAULT_SUCCESS)
2754 panic("vm_fault_unwire: failure");
2755
2756 result_object = result_page->object;
2757 if (deallocate) {
2758 assert(!result_page->fictitious);
2759 pmap_page_protect(result_page->phys_addr,
2760 VM_PROT_NONE);
2761 VM_PAGE_FREE(result_page);
2762 } else {
2763 vm_page_lock_queues();
2764 vm_page_unwire(result_page);
2765 vm_page_unlock_queues();
2766 PAGE_WAKEUP_DONE(result_page);
2767 }
2768
2769 vm_fault_cleanup(result_object, top_page);
2770 }
2771 }
2772
2773 /*
2774 * Inform the physical mapping system that the range
2775 * of addresses may fault, so that page tables and
2776 * such may be unwired themselves.
2777 */
2778
2779 pmap_pageable(pmap, entry->vme_start, end_addr, TRUE);
2780
2781 }
2782
2783 /*
2784 * vm_fault_wire_fast:
2785 *
2786 * Handle common case of a wire down page fault at the given address.
2787 * If successful, the page is inserted into the associated physical map.
2788 * The map entry is passed in to avoid the overhead of a map lookup.
2789 *
2790 * NOTE: the given address should be truncated to the
2791 * proper page address.
2792 *
2793 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
2794 * a standard error specifying why the fault is fatal is returned.
2795 *
2796 * The map in question must be referenced, and remains so.
2797 * Caller has a read lock on the map.
2798 *
2799 * This is a stripped version of vm_fault() for wiring pages. Anything
2800 * other than the common case will return KERN_FAILURE, and the caller
2801 * is expected to call vm_fault().
2802 */
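 /*
 * Called from vm_fault_wire() above once per page.  The fast path
 * succeeds only when the page is already resident in the top-level
 * object, not busy, and not a write into an object that has a copy;
 * anything else returns KERN_FAILURE so the caller can fall back
 * to vm_fault().
 */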
2803 kern_return_t
2804 vm_fault_wire_fast(
2805 vm_map_t map,
2806 vm_offset_t va,
2807 vm_map_entry_t entry,
2808 pmap_t pmap)
2809 {
2810 vm_object_t object;
2811 vm_object_offset_t offset;
2812 register vm_page_t m;
2813 vm_prot_t prot;
2814 thread_act_t thr_act;
2815
2816 VM_STAT(faults++);
2817
2818 if((thr_act=current_act()) && (thr_act->task != TASK_NULL))
2819 thr_act->task->faults++;
2820
2821 /*
2822 * Recovery actions
2823 */
2824
2825 #undef RELEASE_PAGE
2826 #define RELEASE_PAGE(m) { \
2827 PAGE_WAKEUP_DONE(m); \
2828 vm_page_lock_queues(); \
2829 vm_page_unwire(m); \
2830 vm_page_unlock_queues(); \
2831 }
2832
2833
2834 #undef UNLOCK_THINGS
2835 #define UNLOCK_THINGS { \
2836 object->paging_in_progress--; \
2837 vm_object_unlock(object); \
2838 }
2839
2840 #undef UNLOCK_AND_DEALLOCATE
2841 #define UNLOCK_AND_DEALLOCATE { \
2842 UNLOCK_THINGS; \
2843 vm_object_deallocate(object); \
2844 }
2845 /*
2846 * Give up and have caller do things the hard way.
2847 */
2848
2849 #define GIVE_UP { \
2850 UNLOCK_AND_DEALLOCATE; \
2851 return(KERN_FAILURE); \
2852 }
2853
2854
2855 /*
2856 * If this entry is not directly to a vm_object, bail out.
2857 */
2858 if (entry->is_sub_map)
2859 return(KERN_FAILURE);
2860
2861 /*
2862 * Find the backing store object and offset into it.
2863 */
2864
2865 object = entry->object.vm_object;
2866 offset = (va - entry->vme_start) + entry->offset;
2867 prot = entry->protection;
2868
2869 /*
2870 * Make a reference to this object to prevent its
2871 * disposal while we are messing with it.
2872 */
2873
2874 vm_object_lock(object);
2875 assert(object->ref_count > 0);
2876 object->ref_count++;
2877 vm_object_res_reference(object);
2878 object->paging_in_progress++;
2879
2880 /*
2881 * INVARIANTS (through entire routine):
2882 *
2883 * 1) At all times, we must either have the object
2884 * lock or a busy page in some object to prevent
2885 * some other thread from trying to bring in
2886 * the same page.
2887 *
2888 * 2) Once we have a busy page, we must remove it from
2889 * the pageout queues, so that the pageout daemon
2890 * will not grab it away.
2891 *
2892 */
2893
2894 /*
2895 * Look for page in top-level object. If it's not there or
2896 * there's something going on, give up.
2897 */
2898 m = vm_page_lookup(object, offset);
2899 if ((m == VM_PAGE_NULL) || (m->busy) ||
2900 (m->unusual && ( m->error || m->restart || m->absent ||
2901 prot & m->page_lock))) {
2902
2903 GIVE_UP;
2904 }
2905
2906 /*
2907 * Wire the page down now. All bail outs beyond this
2908 * point must unwire the page.
2909 */
2910
2911 vm_page_lock_queues();
2912 vm_page_wire(m);
2913 vm_page_unlock_queues();
2914
2915 /*
2916 * Mark page busy for other threads.
2917 */
2918 assert(!m->busy);
2919 m->busy = TRUE;
2920 assert(!m->absent);
2921
2922 /*
2923 * Give up if the page is being written and there's a copy object
2924 */
2925 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
2926 RELEASE_PAGE(m);
2927 GIVE_UP;
2928 }
2929
2930 /*
2931 * Put this page into the physical map.
2932 * We have to unlock the object because pmap_enter
2933 * may cause other faults.
2934 */
2935 vm_object_unlock(object);
2936
2937 PMAP_ENTER(pmap, va, m, prot, TRUE);
2938 /* Sync I & D caches for new mapping */
2939 pmap_attribute(pmap,
2940 va,
2941 PAGE_SIZE,
2942 MATTR_CACHE,
2943 &mv_cache_sync);
2944
2945 /*
2946 * Must relock object so that paging_in_progress can be cleared.
2947 */
2948 vm_object_lock(object);
2949
2950 /*
2951 * Unlock everything, and return
2952 */
2953
2954 PAGE_WAKEUP_DONE(m);
2955 UNLOCK_AND_DEALLOCATE;
2956
2957 return(KERN_SUCCESS);
2958
2959 }
2960
2961 /*
2962 * Routine: vm_fault_copy_cleanup
2963 * Purpose:
2964 * Release a page used by vm_fault_copy.
2965 */
2966
2967 void
2968 vm_fault_copy_cleanup(
2969 vm_page_t page,
2970 vm_page_t top_page)
2971 {
2972 vm_object_t object = page->object;
2973
2974 vm_object_lock(object);
2975 PAGE_WAKEUP_DONE(page);
2976 vm_page_lock_queues();
2977 if (!page->active && !page->inactive)
2978 vm_page_activate(page);
2979 vm_page_unlock_queues();
2980 vm_fault_cleanup(object, top_page);
2981 }
2982
2983 void
2984 vm_fault_copy_dst_cleanup(
2985 vm_page_t page)
2986 {
2987 vm_object_t object;
2988
2989 if (page != VM_PAGE_NULL) {
2990 object = page->object;
2991 vm_object_lock(object);
2992 vm_page_lock_queues();
2993 vm_page_unwire(page);
2994 vm_page_unlock_queues();
2995 vm_object_paging_end(object);
2996 vm_object_unlock(object);
2997 }
2998 }
2999
3000 /*
3001 * Routine: vm_fault_copy
3002 *
3003 * Purpose:
3004 * Copy pages from one virtual memory object to another --
3005 * neither the source nor destination pages need be resident.
3006 *
3007 * Before actually copying a page, the version associated with
3008 * the destination address map will be verified.
3009 *
3010 * In/out conditions:
3011 * The caller must hold a reference, but not a lock, to
3012 * each of the source and destination objects and to the
3013 * destination map.
3014 *
3015 * Results:
3016 * Returns KERN_SUCCESS if no errors were encountered in
3017 * reading or writing the data. Returns KERN_INTERRUPTED if
3018 * the operation was interrupted (only possible if the
3019 * "interruptible" argument is asserted). Other return values
3020 * indicate a permanent error in copying the data.
3021 *
3022 * The actual amount of data copied will be returned in the
3023 * "copy_size" argument. In the event that the destination map
3024 * verification failed, this amount may be less than the amount
3025 * requested.
3026 */
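 /*
 * Illustrative sketch only (not part of this file): a caller that
 * already holds references on both objects and on dst_map, with a
 * map version from a prior lookup, might invoke this roughly as
 * follows (all names below are hypothetical):
 *
 *	vm_size_t	copy_size = requested_len;
 *
 *	kr = vm_fault_copy(src_object, src_offset, &copy_size,
 *			   dst_object, dst_offset,
 *			   dst_map, &dst_version,
 *			   THREAD_ABORTSAFE);
 *	-- on return, copy_size holds the bytes actually copied.
 */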
3027 kern_return_t
3028 vm_fault_copy(
3029 vm_object_t src_object,
3030 vm_object_offset_t src_offset,
3031 vm_size_t *src_size, /* INOUT */
3032 vm_object_t dst_object,
3033 vm_object_offset_t dst_offset,
3034 vm_map_t dst_map,
3035 vm_map_version_t *dst_version,
3036 int interruptible)
3037 {
3038 vm_page_t result_page;
3039
3040 vm_page_t src_page;
3041 vm_page_t src_top_page;
3042 vm_prot_t src_prot;
3043
3044 vm_page_t dst_page;
3045 vm_page_t dst_top_page;
3046 vm_prot_t dst_prot;
3047
3048 vm_size_t amount_left;
3049 vm_object_t old_copy_object;
3050 kern_return_t error = 0;
3051
3052 vm_size_t part_size;
3053
3054 /*
3055 * In order not to confuse the clustered pageins, align
3056 * the different offsets on a page boundary.
3057 */
3058 vm_object_offset_t src_lo_offset = trunc_page_64(src_offset);
3059 vm_object_offset_t dst_lo_offset = trunc_page_64(dst_offset);
3060 vm_object_offset_t src_hi_offset = round_page_64(src_offset + *src_size);
3061 vm_object_offset_t dst_hi_offset = round_page_64(dst_offset + *src_size);
3062
3063 #define RETURN(x) \
3064 MACRO_BEGIN \
3065 *src_size -= amount_left; \
3066 MACRO_RETURN(x); \
3067 MACRO_END
3068
3069 amount_left = *src_size;
3070 do { /* while (amount_left > 0) */
3071 /*
3072 * There may be a deadlock if both source and destination
3073 * pages are the same. To avoid this deadlock, the copy must
3074 * start by getting the destination page in order to apply
3075 * COW semantics if any.
3076 */
3077
3078 RetryDestinationFault: ;
3079
3080 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3081
3082 vm_object_lock(dst_object);
3083 vm_object_paging_begin(dst_object);
3084
3085 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3086 switch (vm_fault_page(dst_object,
3087 trunc_page_64(dst_offset),
3088 VM_PROT_WRITE|VM_PROT_READ,
3089 FALSE,
3090 interruptible,
3091 dst_lo_offset,
3092 dst_hi_offset,
3093 VM_BEHAVIOR_SEQUENTIAL,
3094 &dst_prot,
3095 &dst_page,
3096 &dst_top_page,
3097 (int *)0,
3098 &error,
3099 dst_map->no_zero_fill,
3100 FALSE)) {
3101 case VM_FAULT_SUCCESS:
3102 break;
3103 case VM_FAULT_RETRY:
3104 goto RetryDestinationFault;
3105 case VM_FAULT_MEMORY_SHORTAGE:
3106 if (vm_page_wait(interruptible))
3107 goto RetryDestinationFault;
3108 /* fall thru */
3109 case VM_FAULT_INTERRUPTED:
3110 RETURN(MACH_SEND_INTERRUPTED);
3111 case VM_FAULT_FICTITIOUS_SHORTAGE:
3112 vm_page_more_fictitious();
3113 goto RetryDestinationFault;
3114 case VM_FAULT_MEMORY_ERROR:
3115 if (error)
3116 return (error);
3117 else
3118 return(KERN_MEMORY_ERROR);
3119 }
3120 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3121
3122 old_copy_object = dst_page->object->copy;
3123
3124 /*
3125 * There exists the possibility that the source and
3126 * destination pages are the same. But we can't
3127 * easily determine that now. If they are the
3128 * same, the call to vm_fault_page() for the
3129 * destination page will deadlock. To prevent this we
3130 * wire the page so we can drop busy without having
3131 * the page daemon steal the page. We clean up the
3132 * top page but keep the paging reference on the object
3133 * holding the dest page so it doesn't go away.
3134 */
3135
3136 vm_page_lock_queues();
3137 vm_page_wire(dst_page);
3138 vm_page_unlock_queues();
3139 PAGE_WAKEUP_DONE(dst_page);
3140 vm_object_unlock(dst_page->object);
3141
3142 if (dst_top_page != VM_PAGE_NULL) {
3143 vm_object_lock(dst_object);
3144 VM_PAGE_FREE(dst_top_page);
3145 vm_object_paging_end(dst_object);
3146 vm_object_unlock(dst_object);
3147 }
3148
3149 RetrySourceFault: ;
3150
3151 if (src_object == VM_OBJECT_NULL) {
3152 /*
3153 * No source object. We will just
3154 * zero-fill the page in dst_object.
3155 */
3156 src_page = VM_PAGE_NULL;
3157 result_page = VM_PAGE_NULL;
3158 } else {
3159 vm_object_lock(src_object);
3160 src_page = vm_page_lookup(src_object,
3161 trunc_page_64(src_offset));
3162 if (src_page == dst_page) {
3163 src_prot = dst_prot;
3164 result_page = VM_PAGE_NULL;
3165 } else {
3166 src_prot = VM_PROT_READ;
3167 vm_object_paging_begin(src_object);
3168
3169 XPR(XPR_VM_FAULT,
3170 "vm_fault_copy(2) -> vm_fault_page\n",
3171 0,0,0,0,0);
3172 switch (vm_fault_page(src_object,
3173 trunc_page_64(src_offset),
3174 VM_PROT_READ,
3175 FALSE,
3176 interruptible,
3177 src_lo_offset,
3178 src_hi_offset,
3179 VM_BEHAVIOR_SEQUENTIAL,
3180 &src_prot,
3181 &result_page,
3182 &src_top_page,
3183 (int *)0,
3184 &error,
3185 FALSE,
3186 FALSE)) {
3187
3188 case VM_FAULT_SUCCESS:
3189 break;
3190 case VM_FAULT_RETRY:
3191 goto RetrySourceFault;
3192 case VM_FAULT_MEMORY_SHORTAGE:
3193 if (vm_page_wait(interruptible))
3194 goto RetrySourceFault;
3195 /* fall thru */
3196 case VM_FAULT_INTERRUPTED:
3197 vm_fault_copy_dst_cleanup(dst_page);
3198 RETURN(MACH_SEND_INTERRUPTED);
3199 case VM_FAULT_FICTITIOUS_SHORTAGE:
3200 vm_page_more_fictitious();
3201 goto RetrySourceFault;
3202 case VM_FAULT_MEMORY_ERROR:
3203 vm_fault_copy_dst_cleanup(dst_page);
3204 if (error)
3205 return (error);
3206 else
3207 return(KERN_MEMORY_ERROR);
3208 }
3209
3210
3211 assert((src_top_page == VM_PAGE_NULL) ==
3212 (result_page->object == src_object));
3213 }
3214 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3215 vm_object_unlock(result_page->object);
3216 }
3217
3218 if (!vm_map_verify(dst_map, dst_version)) {
3219 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3220 vm_fault_copy_cleanup(result_page, src_top_page);
3221 vm_fault_copy_dst_cleanup(dst_page);
3222 break;
3223 }
3224
3225 vm_object_lock(dst_page->object);
3226
3227 if (dst_page->object->copy != old_copy_object) {
3228 vm_object_unlock(dst_page->object);
3229 vm_map_verify_done(dst_map, dst_version);
3230 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3231 vm_fault_copy_cleanup(result_page, src_top_page);
3232 vm_fault_copy_dst_cleanup(dst_page);
3233 break;
3234 }
3235 vm_object_unlock(dst_page->object);
3236
3237 /*
3238 * Copy the page, and note that it is dirty
3239 * immediately.
3240 */
3241
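 /*
 * Worked example with hypothetical offsets: on a 4K page, if
 * src_offset ends in 0x300 and dst_offset ends in 0x600, then
 * src_po = 0x300 and dst_po = 0x600; since dst_po > src_po, the
 * chunk is part_size = PAGE_SIZE - dst_po = 0xa00 bytes, further
 * clipped to amount_left if that is smaller.
 */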
3242 if (!page_aligned(src_offset) ||
3243 !page_aligned(dst_offset) ||
3244 !page_aligned(amount_left)) {
3245
3246 vm_object_offset_t src_po,
3247 dst_po;
3248
3249 src_po = src_offset - trunc_page_64(src_offset);
3250 dst_po = dst_offset - trunc_page_64(dst_offset);
3251
3252 if (dst_po > src_po) {
3253 part_size = PAGE_SIZE - dst_po;
3254 } else {
3255 part_size = PAGE_SIZE - src_po;
3256 }
3257 if (part_size > (amount_left)){
3258 part_size = amount_left;
3259 }
3260
3261 if (result_page == VM_PAGE_NULL) {
3262 vm_page_part_zero_fill(dst_page,
3263 dst_po, part_size);
3264 } else {
3265 vm_page_part_copy(result_page, src_po,
3266 dst_page, dst_po, part_size);
3267 if(!dst_page->dirty){
3268 vm_object_lock(dst_object);
3269 dst_page->dirty = TRUE;
3270 vm_object_unlock(dst_object);
3271 }
3272
3273 }
3274 } else {
3275 part_size = PAGE_SIZE;
3276
3277 if (result_page == VM_PAGE_NULL)
3278 vm_page_zero_fill(dst_page);
3279 else{
3280 vm_page_copy(result_page, dst_page);
3281 if(!dst_page->dirty){
3282 vm_object_lock(dst_object);
3283 dst_page->dirty = TRUE;
3284 vm_object_unlock(dst_object);
3285 }
3286 }
3287
3288 }
3289
3290 /*
3291 * Unlock everything, and return
3292 */
3293
3294 vm_map_verify_done(dst_map, dst_version);
3295
3296 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3297 vm_fault_copy_cleanup(result_page, src_top_page);
3298 vm_fault_copy_dst_cleanup(dst_page);
3299
3300 amount_left -= part_size;
3301 src_offset += part_size;
3302 dst_offset += part_size;
3303 } while (amount_left > 0);
3304
3305 RETURN(KERN_SUCCESS);
3306 #undef RETURN
3307
3308 /*NOTREACHED*/
3309 }
3310
3311 #ifdef notdef
3312
3313 /*
3314 * Routine: vm_fault_page_overwrite
3315 *
3316 * Description:
3317 * A form of vm_fault_page that assumes that the
3318 * resulting page will be overwritten in its entirety,
3319 * making it unnecessary to obtain the correct *contents*
3320 * of the page.
3321 *
3322 * Implementation:
3323 * XXX Untested. Also unused. Eventually, this technology
3324 * could be used in vm_fault_copy() to advantage.
3325 */
3326 vm_fault_return_t
3327 vm_fault_page_overwrite(
3328 register
3329 vm_object_t dst_object,
3330 vm_object_offset_t dst_offset,
3331 vm_page_t *result_page) /* OUT */
3332 {
3333 register
3334 vm_page_t dst_page;
3335 kern_return_t wait_result;
3336
3337 #define interruptible THREAD_UNINT /* XXX */
3338
3339 while (TRUE) {
3340 /*
3341 * Look for a page at this offset
3342 */
3343
3344 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3345 == VM_PAGE_NULL) {
3346 /*
3347 * No page, no problem... just allocate one.
3348 */
3349
3350 dst_page = vm_page_alloc(dst_object, dst_offset);
3351 if (dst_page == VM_PAGE_NULL) {
3352 vm_object_unlock(dst_object);
3353 VM_PAGE_WAIT();
3354 vm_object_lock(dst_object);
3355 continue;
3356 }
3357
3358 /*
3359 * Pretend that the memory manager
3360 * write-protected the page.
3361 *
3362 * Note that we will be asking for write
3363 * permission without asking for the data
3364 * first.
3365 */
3366
3367 dst_page->overwriting = TRUE;
3368 dst_page->page_lock = VM_PROT_WRITE;
3369 dst_page->absent = TRUE;
3370 dst_page->unusual = TRUE;
3371 dst_object->absent_count++;
3372
3373 break;
3374
3375 /*
3376 * When we bail out, we might have to throw
3377 * away the page created here.
3378 */
3379
3380 #define DISCARD_PAGE \
3381 MACRO_BEGIN \
3382 vm_object_lock(dst_object); \
3383 dst_page = vm_page_lookup(dst_object, dst_offset); \
3384 if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3385 VM_PAGE_FREE(dst_page); \
3386 vm_object_unlock(dst_object); \
3387 MACRO_END
3388 }
3389
3390 /*
3391 * If the page is write-protected...
3392 */
3393
3394 if (dst_page->page_lock & VM_PROT_WRITE) {
3395 /*
3396 * ... and an unlock request hasn't been sent
3397 */
3398
3399 if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3400 vm_prot_t u;
3401 kern_return_t rc;
3402
3403 /*
3404 * ... then send one now.
3405 */
3406
3407 if (!dst_object->pager_ready) {
3408 vm_object_assert_wait(dst_object,
3409 VM_OBJECT_EVENT_PAGER_READY,
3410 interruptible);
3411 vm_object_unlock(dst_object);
3412 wait_result = thread_block((void (*)(void))0);
3413 if (wait_result != THREAD_AWAKENED) {
3414 DISCARD_PAGE;
3415 return(VM_FAULT_INTERRUPTED);
3416 }
3417 continue;
3418 }
3419
3420 u = dst_page->unlock_request |= VM_PROT_WRITE;
3421 vm_object_unlock(dst_object);
3422
3423 if ((rc = memory_object_data_unlock(
3424 dst_object->pager,
3425 dst_object->pager_request,
3426 dst_offset + dst_object->paging_offset,
3427 PAGE_SIZE,
3428 u)) != KERN_SUCCESS) {
3429 if (vm_fault_debug)
3430 printf("vm_fault_page_overwrite: memory_object_data_unlock failed\n");
3431 DISCARD_PAGE;
3432 return((rc == MACH_SEND_INTERRUPTED) ?
3433 VM_FAULT_INTERRUPTED :
3434 VM_FAULT_MEMORY_ERROR);
3435 }
3436 vm_object_lock(dst_object);
3437 continue;
3438 }
3439
3440 /* ... fall through to wait below */
3441 } else {
3442 /*
3443 * If the page isn't being used for other
3444 * purposes, then we're done.
3445 */
3446 if ( ! (dst_page->busy || dst_page->absent ||
3447 dst_page->error || dst_page->restart) )
3448 break;
3449 }
3450
3451 PAGE_ASSERT_WAIT(dst_page, interruptible);
3452 vm_object_unlock(dst_object);
3453 wait_result = thread_block((void (*)(void))0);
3454 if (wait_result != THREAD_AWAKENED) {
3455 DISCARD_PAGE;
3456 return(VM_FAULT_INTERRUPTED);
3457 }
3458 }
3459
3460 *result_page = dst_page;
3461 return(VM_FAULT_SUCCESS);
3462
3463 #undef interruptible
3464 #undef DISCARD_PAGE
3465 }
3466
3467 #endif /* notdef */
3468
3469 #if VM_FAULT_CLASSIFY
3470 /*
3471 * Temporary statistics gathering support.
3472 */
3473
3474 /*
3475 * Statistics arrays:
3476 */
3477 #define VM_FAULT_TYPES_MAX 5
3478 #define VM_FAULT_LEVEL_MAX 8
3479
3480 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
3481
3482 #define VM_FAULT_TYPE_ZERO_FILL 0
3483 #define VM_FAULT_TYPE_MAP_IN 1
3484 #define VM_FAULT_TYPE_PAGER 2
3485 #define VM_FAULT_TYPE_COPY 3
3486 #define VM_FAULT_TYPE_OTHER 4
3487
3488
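 /*
 * Mirrors the shadow-chain walk of the fault path: each fault is
 * bucketed by how it would resolve (map-in, copy-on-write, pager,
 * zero fill, or other) and by the shadow-chain depth at which that
 * resolution happens.
 */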
3489 void
3490 vm_fault_classify(vm_object_t object,
3491 vm_object_offset_t offset,
3492 vm_prot_t fault_type)
3493 {
3494 int type, level = 0;
3495 vm_page_t m;
3496
3497 while (TRUE) {
3498 m = vm_page_lookup(object, offset);
3499 if (m != VM_PAGE_NULL) {
3500 if (m->busy || m->error || m->restart || m->absent ||
3501 fault_type & m->page_lock) {
3502 type = VM_FAULT_TYPE_OTHER;
3503 break;
3504 }
3505 if (((fault_type & VM_PROT_WRITE) == 0) ||
3506 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
3507 type = VM_FAULT_TYPE_MAP_IN;
3508 break;
3509 }
3510 type = VM_FAULT_TYPE_COPY;
3511 break;
3512 }
3513 else {
3514 if (object->pager_created) {
3515 type = VM_FAULT_TYPE_PAGER;
3516 break;
3517 }
3518 if (object->shadow == VM_OBJECT_NULL) {
3519 type = VM_FAULT_TYPE_ZERO_FILL;
3520 break;
3521 }
3522
3523 offset += object->shadow_offset;
3524 object = object->shadow;
3525 level++;
3526 continue;
3527 }
3528 }
3529
3530 if (level > VM_FAULT_LEVEL_MAX)
3531 level = VM_FAULT_LEVEL_MAX;
3532
3533 vm_fault_stats[type][level] += 1;
3534
3535 return;
3536 }
3537
3538 /* cleanup routine to call from debugger */
3539
3540 void
3541 vm_fault_classify_init(void)
3542 {
3543 int type, level;
3544
3545 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
3546 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
3547 vm_fault_stats[type][level] = 0;
3548 }
3549 }
3550
3551 return;
3552 }
3553 #endif /* VM_FAULT_CLASSIFY */