X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/765c9de3b4af7c2078d16a03812ae2c7c2b24938..060df5ea7c632b1ac8cc8aac1fb59758165c2084:/osfmk/vm/vm_fault.c diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index a1200a69f..a36714b57 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ @@ -55,19 +61,22 @@ * * Page fault handling module. */ -#ifdef MACH_BSD -/* remove after component interface available */ -extern int vnode_pager_workaround; -extern int device_pager_workaround; -#endif #include #include #include +#include -#include +#include #include #include /* for error codes */ +#include +#include +#include + /* For memory_object_data_{request,unlock} */ +#include + +#include #include #include #include @@ -75,52 +84,78 @@ extern int device_pager_workaround; #include #include #include +#include +#include +#include +#include + #include -#include -#include + +#include #include #include #include +#include #include #include -#include -#include -#include - /* For memory_object_data_{request,unlock} */ -#include -#include -#include -#include +#include +#include +#include +#include /* Needed by some vm_page.h macros */ #include #define VM_FAULT_CLASSIFY 0 -#define VM_FAULT_STATIC_CONFIG 1 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ -int vm_object_absent_max = 50; +int vm_object_pagein_throttle = 16; -int vm_fault_debug = 0; -boolean_t vm_page_deactivate_behind = TRUE; +/* + * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which + * kicks in when swap space runs out. 
64-bit programs have massive address spaces and can leak enormous amounts + * of memory if they're buggy and can run the system completely out of swap space. If this happens, we + * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps + * keep the UI active so that the user has a chance to kill the offending task before the system + * completely hangs. + * + * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied + * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold + * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a + * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again. + */ +boolean_t thread_is_io_throttled(void); -#if !VM_FAULT_STATIC_CONFIG -boolean_t vm_fault_dirty_handling = FALSE; -boolean_t vm_fault_interruptible = FALSE; -boolean_t software_reference_bits = TRUE; -#endif +uint64_t vm_hard_throttle_threshold; + +extern unsigned int dp_pages_free, dp_pages_reserve; + +#define NEED_TO_HARD_THROTTLE_THIS_TASK() (((dp_pages_free + dp_pages_reserve < 2000) && \ + (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \ + (current_task() != kernel_task) && IP_VALID(memory_manager_default)) || \ + (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \ + (get_task_resident_size(current_task()) > vm_hard_throttle_threshold))) + + +#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ + + +extern int cs_debug; #if MACH_KDB extern struct db_watchpoint *db_watchpoint_list; #endif /* MACH_KDB */ +boolean_t current_thread_aborted(void); + /* Forward declarations of internal routines. 
*/ extern kern_return_t vm_fault_wire_fast( vm_map_t map, - vm_offset_t va, + vm_map_offset_t va, vm_map_entry_t entry, - pmap_t pmap); + pmap_t pmap, + vm_map_offset_t pmap_addr); extern void vm_fault_continue(void); @@ -139,6 +174,18 @@ extern void vm_fault_classify(vm_object_t object, extern void vm_fault_classify_init(void); #endif +unsigned long vm_pmap_enter_blocked = 0; + +unsigned long vm_cs_validates = 0; +unsigned long vm_cs_revalidates = 0; +unsigned long vm_cs_query_modified = 0; +unsigned long vm_cs_validated_dirtied = 0; +#if CONFIG_ENFORCE_SIGNED_CODE +int cs_enforcement_disable=0; +#else +static const int cs_enforcement_disable=1; +#endif + /* * Routine: vm_fault_init * Purpose: @@ -147,6 +194,24 @@ extern void vm_fault_classify_init(void); void vm_fault_init(void) { +#if !SECURE_KERNEL +#if CONFIG_ENFORCE_SIGNED_CODE + PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, + sizeof (cs_enforcement_disable)); +#endif + PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug)); +#endif + + /* + * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is + * computed as a percentage of available memory, and the percentage used is scaled inversely with + * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems + * and reduce the value down to 10% for very large memory configurations. This helps give us a + * definition of a memory hog that makes more sense relative to the amount of ram in the machine. + * The formula here simply uses the number of gigabytes of ram to adjust the percentage. 
+ */ + + vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100; } /* @@ -172,11 +237,12 @@ vm_fault_cleanup( vm_object_unlock(object); if (top_page != VM_PAGE_NULL) { - object = top_page->object; - vm_object_lock(object); - VM_PAGE_FREE(top_page); - vm_object_paging_end(object); - vm_object_unlock(object); + object = top_page->object; + + vm_object_lock(object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(object); + vm_object_unlock(object); } } @@ -198,17 +264,451 @@ struct { #define CLUSTER_STAT(clause) #endif /* MACH_CLUSTER_STATS */ -/* XXX - temporary */ -boolean_t vm_allow_clustered_pagein = FALSE; -int vm_pagein_cluster_used = 0; +#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) + +boolean_t vm_page_deactivate_behind = TRUE; /* - * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior + * default sizes given VM_BEHAVIOR_DEFAULT reference behavior */ -int vm_default_ahead = 1; /* Number of pages to prepage ahead */ -int vm_default_behind = 0; /* Number of pages to prepage behind */ +#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128 +#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */ + /* we use it to size an array on the stack */ + +int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW; + +#define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) + +/* + * vm_fault_is_sequential + * + * Determine if sequential access is in progress + * in accordance with the behavior specified. + * Update state to indicate current access pattern. 
+ * + * object must have at least the shared lock held + */ +static +void +vm_fault_is_sequential( + vm_object_t object, + vm_object_offset_t offset, + vm_behavior_t behavior) +{ + vm_object_offset_t last_alloc; + int sequential; + int orig_sequential; + + last_alloc = object->last_alloc; + sequential = object->sequential; + orig_sequential = sequential; + + switch (behavior) { + case VM_BEHAVIOR_RANDOM: + /* + * reset indicator of sequential behavior + */ + sequential = 0; + break; + + case VM_BEHAVIOR_SEQUENTIAL: + if (offset && last_alloc == offset - PAGE_SIZE_64) { + /* + * advance indicator of sequential behavior + */ + if (sequential < MAX_SEQUENTIAL_RUN) + sequential += PAGE_SIZE; + } else { + /* + * reset indicator of sequential behavior + */ + sequential = 0; + } + break; + + case VM_BEHAVIOR_RSEQNTL: + if (last_alloc && last_alloc == offset + PAGE_SIZE_64) { + /* + * advance indicator of sequential behavior + */ + if (sequential > -MAX_SEQUENTIAL_RUN) + sequential -= PAGE_SIZE; + } else { + /* + * reset indicator of sequential behavior + */ + sequential = 0; + } + break; + + case VM_BEHAVIOR_DEFAULT: + default: + if (offset && last_alloc == (offset - PAGE_SIZE_64)) { + /* + * advance indicator of sequential behavior + */ + if (sequential < 0) + sequential = 0; + if (sequential < MAX_SEQUENTIAL_RUN) + sequential += PAGE_SIZE; + + } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) { + /* + * advance indicator of sequential behavior + */ + if (sequential > 0) + sequential = 0; + if (sequential > -MAX_SEQUENTIAL_RUN) + sequential -= PAGE_SIZE; + } else { + /* + * reset indicator of sequential behavior + */ + sequential = 0; + } + break; + } + if (sequential != orig_sequential) { + if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) { + /* + * if someone else has already updated object->sequential + * don't bother trying to update it or object->last_alloc + */ + return; + } + } + /* + * I'd like to do this with a 
OSCompareAndSwap64, but that + * doesn't exist for PPC... however, it shouldn't matter + * that much... last_alloc is maintained so that we can determine + * if a sequential access pattern is taking place... if only + * one thread is banging on this object, no problem with the unprotected + * update... if 2 or more threads are banging away, we run the risk of + * someone seeing a mangled update... however, in the face of multiple + * accesses, no sequential access pattern can develop anyway, so we + * haven't lost any real info. + */ + object->last_alloc = offset; +} + + +int vm_page_deactivate_behind_count = 0; + +/* + * vm_fault_deactivate_behind + * + * Determine if sequential access is in progress + * in accordance with the behavior specified. If + * so, compute a potential page to deactivate and + * deactivate it. + * + * object must be locked. + * + * return TRUE if we actually deactivate a page + */ +static +boolean_t +vm_fault_deactivate_behind( + vm_object_t object, + vm_object_offset_t offset, + vm_behavior_t behavior) +{ + int n; + int pages_in_run = 0; + int max_pages_in_run = 0; + int sequential_run; + int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; + vm_object_offset_t run_offset = 0; + vm_object_offset_t pg_offset = 0; + vm_page_t m; + vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; + + pages_in_run = 0; +#if TRACEFAULTPAGE + dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ +#endif + + if (object == kernel_object || vm_page_deactivate_behind == FALSE) { + /* + * Do not deactivate pages from the kernel object: they + * are not intended to become pageable. 
+ * or we've disabled the deactivate behind mechanism + */ + return FALSE; + } + if ((sequential_run = object->sequential)) { + if (sequential_run < 0) { + sequential_behavior = VM_BEHAVIOR_RSEQNTL; + sequential_run = 0 - sequential_run; + } else { + sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; + } + } + switch (behavior) { + case VM_BEHAVIOR_RANDOM: + break; + case VM_BEHAVIOR_SEQUENTIAL: + if (sequential_run >= (int)PAGE_SIZE) { + run_offset = 0 - PAGE_SIZE_64; + max_pages_in_run = 1; + } + break; + case VM_BEHAVIOR_RSEQNTL: + if (sequential_run >= (int)PAGE_SIZE) { + run_offset = PAGE_SIZE_64; + max_pages_in_run = 1; + } + break; + case VM_BEHAVIOR_DEFAULT: + default: + { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64; + + /* + * determine if the run of sequential accesses has been + * long enough on an object with default access behavior + * to consider it for deactivation + */ + if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) { + /* + * the comparisons between offset and behind are done + * in this kind of odd fashion in order to prevent wrap around + * at the end points + */ + if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { + if (offset >= behind) { + run_offset = 0 - behind; + pg_offset = PAGE_SIZE_64; + max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; + } + } else { + if (offset < -behind) { + run_offset = behind; + pg_offset = 0 - PAGE_SIZE_64; + max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; + } + } + } + break; + } + } + for (n = 0; n < max_pages_in_run; n++) { + m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); + + if (m && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { + page_run[pages_in_run++] = m; + pmap_clear_reference(m->phys_page); + } + } + if (pages_in_run) { + vm_page_lockspin_queues(); + + for (n = 0; n < pages_in_run; n++) { + + m = page_run[n]; + + vm_page_deactivate_internal(m, FALSE); 
+ vm_page_deactivate_behind_count++; +#if TRACEFAULTPAGE + dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ +#endif + } + vm_page_unlock_queues(); + + return TRUE; + } + return FALSE; +} + + +static boolean_t +vm_page_throttled(void) +{ + clock_sec_t elapsed_sec; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + + thread_t thread = current_thread(); + + if (thread->options & TH_OPT_VMPRIV) + return (FALSE); + + thread->t_page_creation_count++; + + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + return (TRUE); + + if (vm_page_free_count < vm_page_throttle_limit && + thread->t_page_creation_count > vm_page_creation_throttle) { + + clock_get_system_microtime(&tv_sec, &tv_usec); + + elapsed_sec = tv_sec - thread->t_page_creation_time; + + if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) { + + if (elapsed_sec >= 60) { + /* + * we'll reset our stats to give a well behaved app + * that was unlucky enough to accumulate a bunch of pages + * over a long period of time a chance to get out of + * the throttled state... we reset the counter and timestamp + * so that if it stays under the rate limit for the next second + * it will be back in our good graces... if it exceeds it, it + * will remain in the throttled state + */ + thread->t_page_creation_time = tv_sec; + thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5; + } + ++vm_page_throttle_count; + + return (TRUE); + } + thread->t_page_creation_time = tv_sec; + thread->t_page_creation_count = 0; + } + return (FALSE); +} + + +/* + * check for various conditions that would + * prevent us from creating a ZF page... 
+ * cleanup is based on being called from vm_fault_page + * + * object must be locked + * object == m->object + */ +static vm_fault_return_t +vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state) +{ + if (object->shadow_severed || + VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { + /* + * Either: + * 1. the shadow chain was severed, + * 2. the purgeable object is volatile or empty and is marked + * to fault on access while volatile. + * Just have to return an error at this point + */ + if (m != VM_PAGE_NULL) + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_MEMORY_ERROR); + } + if (vm_backing_store_low) { + /* + * are we protecting the system from + * backing store exhaustion. If so + * sleep unless we are privileged. + */ + if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { + + if (m != VM_PAGE_NULL) + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + + assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); + + thread_block(THREAD_CONTINUE_NULL); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); + } + } + if (vm_page_throttled()) { + /* + * we're throttling zero-fills... 
+ * treat this as if we couldn't grab a page + */ + if (m != VM_PAGE_NULL) + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { + delay(HARD_THROTTLE_DELAY); + + if (current_thread_aborted()) { + thread_interrupt_level(interruptible_state); + return VM_FAULT_INTERRUPTED; + } + } + + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_MEMORY_SHORTAGE); + } + return (VM_FAULT_SUCCESS); +} + + +/* + * do the work to zero fill a page and + * inject it into the correct paging queue + * + * m->object must be locked + * page queue lock must NOT be held + */ +static int +vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) +{ + int my_fault = DBG_ZERO_FILL_FAULT; + + /* + * This is a zero-fill page fault... + * + * Checking the page lock is a waste of + * time; this page was absent, so + * it can't be page locked by a pager. + * + * we also consider it undefined + * with respect to instruction + * execution. i.e. it is the responsibility + * of higher layers to call for an instruction + * sync after changing the contents and before + * sending a program into this area. 
We + * choose this approach for performance + */ + m->pmapped = TRUE; + + m->cs_validated = FALSE; + m->cs_tainted = FALSE; + + if (no_zero_fill == TRUE) + my_fault = DBG_NZF_PAGE_FAULT; + else { + vm_page_zero_fill(m); + + VM_STAT_INCR(zero_fill_count); + DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); + } + assert(!m->laundry); + assert(m->object != kernel_object); + //assert(m->pageq.next == NULL && m->pageq.prev == NULL); + + if (!IP_VALID(memory_manager_default) && + (m->object->purgable == VM_PURGABLE_DENY || + m->object->purgable == VM_PURGABLE_NONVOLATILE || + m->object->purgable == VM_PURGABLE_VOLATILE )) { + vm_page_lockspin_queues(); + + queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); + m->throttled = TRUE; + vm_page_throttled_count++; + + vm_page_unlock_queues(); + } else { + if (current_thread()->t_page_creation_count > vm_page_creation_throttle) { + m->zero_fill = TRUE; + VM_ZF_COUNT_INCR(); + } + } + return (my_fault); +} -#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) /* * Routine: vm_fault_page @@ -219,12 +719,10 @@ int vm_default_behind = 0; /* Number of pages to prepage behind */ * Additional arguments: * The required permissions for the page is given * in "fault_type". Desired permissions are included - * in "protection". The minimum and maximum valid offsets - * within the object for the relevant map entry are - * passed in "lo_offset" and "hi_offset" respectively and - * the expected page reference pattern is passed in "behavior". - * These three parameters are used to determine pagein cluster - * limits. + * in "protection". + * fault_info is passed along to determine pagein cluster + * limits... it contains the expected reference pattern, + * cluster size if available, etc... 
* * If the desired page is known to be resident (for * example, because it was previously wired down), asserting @@ -253,7 +751,14 @@ int vm_default_behind = 0; /* Number of pages to prepage behind */ * be destroyed when this guarantee is no longer required. * The "result_page" is also left busy. It is not removed * from the pageout queues. + * Special Case: + * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the + * fault succeeded but there's no VM page (i.e. the VM object + * does not actually hold VM pages, but device memory or + * large pages). The object is still locked and we still hold a + * paging_in_progress reference. */ +unsigned int vm_fault_page_blocked_access = 0; vm_fault_return_t vm_fault_page( @@ -262,10 +767,6 @@ vm_fault_page( vm_object_offset_t first_offset, /* Offset into object */ vm_prot_t fault_type, /* What access is requested */ boolean_t must_be_resident,/* Must page be resident? */ - int interruptible, /* how may fault be interrupted? */ - vm_object_offset_t lo_offset, /* Map entry start */ - vm_object_offset_t hi_offset, /* Map entry end */ - vm_behavior_t behavior, /* Page reference behavior */ /* Modifies in place: */ vm_prot_t *protection, /* Protection for mapping */ /* Returns: */ @@ -277,17 +778,17 @@ vm_fault_page( /* More arguments: */ kern_return_t *error_code, /* code if page is in error */ boolean_t no_zero_fill, /* don't zero fill absent pages */ +#if MACH_PAGEMAP boolean_t data_supply, /* treat as data_supply if * it is a write fault and a full * page is provided */ - vm_map_t map, - vm_offset_t vaddr) +#else + __unused boolean_t data_supply, +#endif + vm_object_fault_info_t fault_info) { - register vm_page_t m; - register vm_object_t object; - register vm_object_offset_t offset; vm_page_t first_m; vm_object_t next_object; @@ -295,19 +796,17 @@ vm_fault_page( boolean_t look_for_page; vm_prot_t access_required = fault_type; vm_prot_t wants_copy_flag; - vm_size_t cluster_size, length; - vm_object_offset_t 
cluster_offset; - vm_object_offset_t cluster_start, cluster_end, paging_offset; - vm_object_offset_t align_offset; CLUSTER_STAT(int pages_at_higher_offsets;) CLUSTER_STAT(int pages_at_lower_offsets;) - kern_return_t wait_result; - thread_t cur_thread; + kern_return_t wait_result; boolean_t interruptible_state; - boolean_t bumped_pagein = FALSE; + vm_fault_return_t error; + int my_fault; + uint32_t try_failed_count; + int interruptible; /* how may fault be interrupted? */ + memory_object_t pager; + vm_fault_return_t retval; - -#if MACH_PAGEMAP /* * MACH page map - an optional optimization where a bit map is maintained * by the VM subsystem for internal objects to indicate which pages of @@ -318,15 +817,15 @@ vm_fault_page( * is designed to eliminate pager interaction overhead, if it is * 'known' that the page does not exist on backing store. * - * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is + * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is * either marked as paged out in the existence map for the object or no - * existence map exists for the object. LOOK_FOR() is one of the + * existence map exists for the object. MUST_ASK_PAGER() is one of the * criteria in the decision to invoke the pager. It is also used as one * of the criteria to terminate the scan for adjacent pages in a clustered - * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for + * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for * permanent objects. Note also that if the pager for an internal object * has not been created, the pager is not invoked regardless of the value - * of LOOK_FOR() and that clustered pagein scans are only done on an object + * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object * for which a pager has been created. 
* * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset @@ -334,50 +833,28 @@ vm_fault_page( * PAGED_OUT() is used to determine if a page has already been pushed * into a copy object in order to avoid a redundant page out operation. */ -#define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \ +#if MACH_PAGEMAP +#define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \ != VM_EXTERNAL_STATE_ABSENT) #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \ == VM_EXTERNAL_STATE_EXISTS) -#else /* MACH_PAGEMAP */ -/* - * If the MACH page map optimization is not enabled, - * LOOK_FOR() always evaluates to TRUE. The pager will always be - * invoked to resolve missing pages in an object, assuming the pager - * has been created for the object. In a clustered page operation, the - * absence of a page on backing backing store cannot be used to terminate - * a scan for adjacent pages since that information is available only in - * the pager. Hence pages that may not be paged out are potentially - * included in a clustered request. The vnode pager is coded to deal - * with any combination of absent/present pages in a clustered - * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager - * will always be invoked to push a dirty page into a copy object assuming - * a pager has been created. If the page has already been pushed, the - * pager will ingore the new request. 
- */ -#define LOOK_FOR(o, f) TRUE -#define PAGED_OUT(o, f) FALSE -#endif /* MACH_PAGEMAP */ +#else +#define MUST_ASK_PAGER(o, f) (TRUE) +#define PAGED_OUT(o, f) (FALSE) +#endif /* * Recovery actions */ -#define PREPARE_RELEASE_PAGE(m) \ - MACRO_BEGIN \ - vm_page_lock_queues(); \ - MACRO_END - -#define DO_RELEASE_PAGE(m) \ - MACRO_BEGIN \ - PAGE_WAKEUP_DONE(m); \ - if (!m->active && !m->inactive) \ - vm_page_activate(m); \ - vm_page_unlock_queues(); \ - MACRO_END - #define RELEASE_PAGE(m) \ MACRO_BEGIN \ - PREPARE_RELEASE_PAGE(m); \ - DO_RELEASE_PAGE(m); \ + PAGE_WAKEUP_DONE(m); \ + if (!m->active && !m->inactive && !m->throttled) { \ + vm_page_lockspin_queues(); \ + if (!m->active && !m->inactive && !m->throttled) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + } \ MACRO_END #if TRACEFAULTPAGE @@ -385,31 +862,6 @@ vm_fault_page( #endif - -#if !VM_FAULT_STATIC_CONFIG - if (vm_fault_dirty_handling -#if MACH_KDB - /* - * If there are watchpoints set, then - * we don't want to give away write permission - * on a read fault. Make the task write fault, - * so that the watchpoint code notices the access. - */ - || db_watchpoint_list -#endif /* MACH_KDB */ - ) { - /* - * If we aren't asking for write permission, - * then don't give it away. We're using write - * faults to set the dirty bit. 
- */ - if (!(fault_type & VM_PROT_WRITE)) - *protection &= ~VM_PROT_WRITE; - } - - if (!vm_fault_interruptible) - interruptible = THREAD_UNINT; -#else /* STATIC_CONFIG */ #if MACH_KDB /* * If there are watchpoints set, then @@ -426,15 +878,10 @@ vm_fault_page( if (!(fault_type & VM_PROT_WRITE)) *protection &= ~VM_PROT_WRITE; } - #endif /* MACH_KDB */ -#endif /* STATIC_CONFIG */ - - cur_thread = current_thread(); - interruptible_state = cur_thread->interruptible; - if (interruptible == THREAD_UNINT) - cur_thread->interruptible = FALSE; + interruptible = fault_info->interruptible; + interruptible_state = thread_interrupt_level(interruptible); /* * INVARIANTS (through entire routine): @@ -448,145 +895,187 @@ vm_fault_page( * pager access or when waiting for memory, so * we use a busy page then. * - * Note also that we aren't as concerned about more than - * one thread attempting to memory_object_data_unlock - * the same page at once, so we don't hold the page - * as busy then, but do record the highest unlock - * value so far. [Unlock requests may also be delivered - * out of order.] - * * 2) To prevent another thread from racing us down the * shadow chain and entering a new page in the top * object before we do, we must keep a busy page in * the top object while following the shadow chain. * * 3) We must increment paging_in_progress on any object - * for which we have a busy page + * for which we have a busy page before dropping + * the object lock * * 4) We leave busy pages on the pageout queues. * If the pageout daemon comes across a busy page, * it will remove the page from the pageout queues. */ - /* - * Search for the page at object/offset. 
- */ - object = first_object; offset = first_offset; first_m = VM_PAGE_NULL; access_required = fault_type; + XPR(XPR_VM_FAULT, "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n", - (integer_t)object, offset, fault_type, *protection, 0); + object, offset, fault_type, *protection, 0); /* - * See whether this page is resident + * default type of fault */ + my_fault = DBG_CACHE_HIT_FAULT; while (TRUE) { #if TRACEFAULTPAGE dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ #endif if (!object->alive) { + /* + * object is no longer valid + * clean up and return error + */ vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_ERROR); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_MEMORY_ERROR); } - m = vm_page_lookup(object, offset); -#if TRACEFAULTPAGE - dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ -#endif - if (m != VM_PAGE_NULL) { + + if (!object->pager_created && object->phys_contiguous) { /* - * If the page was pre-paged as part of a - * cluster, record the fact. + * A physically-contiguous object without a pager: + * must be a "large page" object. We do not deal + * with VM pages for this object. */ - if (m->clustered) { - vm_pagein_cluster_used++; - m->clustered = FALSE; - } + m = VM_PAGE_NULL; + goto phys_contig_object; + } + if (object->blocked_access) { /* - * If the page is being brought in, - * wait for it and then retry. - * - * A possible optimization: if the page - * is known to be resident, we can ignore - * pages that are absent (regardless of - * whether they're busy). + * Access to this VM object has been blocked. + * Replace our "paging_in_progress" reference with + * a "activity_in_progress" reference and wait for + * access to be unblocked. 
*/ + vm_object_activity_begin(object); + vm_object_paging_end(object); + while (object->blocked_access) { + vm_object_sleep(object, + VM_OBJECT_EVENT_UNBLOCKED, + THREAD_UNINT); + } + vm_fault_page_blocked_access++; + vm_object_paging_begin(object); + vm_object_activity_end(object); + } + + /* + * See whether the page at 'offset' is resident + */ + m = vm_page_lookup(object, offset); +#if TRACEFAULTPAGE + dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ +#endif + if (m != VM_PAGE_NULL) { if (m->busy) { + /* + * The page is being brought in, + * wait for it and then retry. + * + * A possible optimization: if the page + * is known to be resident, we can ignore + * pages that are absent (regardless of + * whether they're busy). + */ #if TRACEFAULTPAGE dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - PAGE_ASSERT_WAIT(m, interruptible); - vm_object_unlock(object); + wait_result = PAGE_SLEEP(object, m, interruptible); XPR(XPR_VM_FAULT, "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", - (integer_t)object, offset, - (integer_t)m, 0, 0); + object, offset, + m, 0, 0); counter(c_vm_fault_page_block_busy_kernel++); - wait_result = thread_block((void (*)(void))0); - vm_object_lock(object); if (wait_result != THREAD_AWAKENED) { vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; + thread_interrupt_level(interruptible_state); + if (wait_result == THREAD_RESTART) - { - return(VM_FAULT_RETRY); - } + return (VM_FAULT_RETRY); else - { - return(VM_FAULT_INTERRUPTED); - } + return (VM_FAULT_INTERRUPTED); } continue; } - /* - * If the page is in error, give up now. - */ + if (m->phys_page == vm_page_guard_addr) { + /* + * Guard page: off limits ! + */ + if (fault_type == VM_PROT_NONE) { + /* + * The fault is not requesting any + * access to the guard page, so it must + * be just to wire or unwire it. + * Let's pretend it succeeded... 
+ */ + m->busy = TRUE; + *result_page = m; + assert(first_m == VM_PAGE_NULL); + *top_page = first_m; + if (type_of_fault) + *type_of_fault = DBG_GUARD_FAULT; + return VM_FAULT_SUCCESS; + } else { + /* + * The fault requests access to the + * guard page: let's deny that ! + */ + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + return VM_FAULT_MEMORY_ERROR; + } + } if (m->error) { + /* + * The page is in error, give up now. + */ #if TRACEFAULTPAGE dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */ #endif if (error_code) - *error_code = m->page_error; + *error_code = KERN_MEMORY_ERROR; VM_PAGE_FREE(m); - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_ERROR); - } - /* - * If the pager wants us to restart - * at the top of the chain, - * typically because it has moved the - * page to another pager, then do so. - */ + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + return (VM_FAULT_MEMORY_ERROR); + } if (m->restart) { + /* + * The pager wants us to restart + * at the top of the chain, + * typically because it has moved the + * page to another pager, then do so. + */ #if TRACEFAULTPAGE dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif VM_PAGE_FREE(m); - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_RETRY); - } - /* - * If the page isn't busy, but is absent, - * then it was deemed "unavailable". - */ + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + return (VM_FAULT_RETRY); + } if (m->absent) { - /* + /* + * The page isn't busy, but is absent, + * therefore it's deemed "unavailable". + * * Remove the non-existent page (unless it's * in the top object) and move on down to the * next object (if there is one). 
@@ -594,137 +1083,118 @@ vm_fault_page( #if TRACEFAULTPAGE dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */ #endif - next_object = object->shadow; - if (next_object == VM_OBJECT_NULL) { - vm_page_t real_m; - - assert(!must_be_resident); - - if (object->shadow_severed) { - vm_fault_cleanup( - object, first_m); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_MEMORY_ERROR; - } + if (next_object == VM_OBJECT_NULL) { /* * Absent page at bottom of shadow * chain; zero fill the page we left - * busy in the first object, and flush - * the absent page. But first we - * need to allocate a real page. + * busy in the first object, and free + * the absent page. */ - if (VM_PAGE_THROTTLED() || - (real_m = vm_page_grab()) == VM_PAGE_NULL) { - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_SHORTAGE); - } + assert(!must_be_resident); + + /* + * check for any conditions that prevent + * us from creating a new zero-fill page + * vm_fault_check will do all of the + * fault cleanup in the case of an error condition + * including resetting the thread_interrupt_level + */ + error = vm_fault_check(object, m, first_m, interruptible_state); + + if (error != VM_FAULT_SUCCESS) + return (error); XPR(XPR_VM_FAULT, - "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n", - (integer_t)object, offset, - (integer_t)m, - (integer_t)first_object, 0); + "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n", + object, offset, + m, + first_object, 0); + if (object != first_object) { + /* + * free the absent page we just found + */ VM_PAGE_FREE(m); + + /* + * drop reference and lock on current object + */ vm_object_paging_end(object); vm_object_unlock(object); - object = first_object; - offset = first_offset; + + /* + * grab the original page we + * 'soldered' in place and + * retake lock on 'first_object' + */ m = first_m; first_m = VM_PAGE_NULL; - 
vm_object_lock(object); - } - VM_PAGE_FREE(m); - assert(real_m->busy); - vm_page_insert(real_m, object, offset); - m = real_m; + object = first_object; + offset = first_offset; + vm_object_lock(object); + } else { + /* + * we're going to use the absent page we just found + * so convert it to a 'busy' page + */ + m->absent = FALSE; + m->busy = TRUE; + } /* - * Drop the lock while zero filling - * page. Then break because this - * is the page we wanted. Checking - * the page lock is a waste of time; - * this page was either absent or - * newly allocated -- in both cases - * it can't be page locked by a pager. + * zero-fill the page and put it on + * the correct paging queue */ - m->no_isync = FALSE; + my_fault = vm_fault_zero_page(m, no_zero_fill); - if (!no_zero_fill) { - vm_object_unlock(object); - vm_page_zero_fill(m); - if (type_of_fault) - *type_of_fault = DBG_ZERO_FILL_FAULT; - VM_STAT(zero_fill_count++); - - if (bumped_pagein == TRUE) { - VM_STAT(pageins--); - current_task()->pageins--; - } - vm_object_lock(object); - } - pmap_clear_modify(m->phys_addr); - vm_page_lock_queues(); - VM_PAGE_QUEUES_REMOVE(m); - m->page_ticket = vm_page_ticket; - vm_page_ticket_roll++; - if(vm_page_ticket_roll == - VM_PAGE_TICKETS_IN_ROLL) { - vm_page_ticket_roll = 0; - if(vm_page_ticket == - VM_PAGE_TICKET_ROLL_IDS) - vm_page_ticket= 0; - else - vm_page_ticket++; - } - queue_enter(&vm_page_queue_inactive, - m, vm_page_t, pageq); - m->inactive = TRUE; - vm_page_inactive_count++; - vm_page_unlock_queues(); + if (fault_info->mark_zf_absent && no_zero_fill == TRUE) + m->absent = TRUE; break; } else { - if (must_be_resident) { + if (must_be_resident) vm_object_paging_end(object); - } else if (object != first_object) { + else if (object != first_object) { vm_object_paging_end(object); VM_PAGE_FREE(m); } else { first_m = m; m->absent = FALSE; - m->unusual = FALSE; - vm_object_absent_release(object); m->busy = TRUE; - vm_page_lock_queues(); + vm_page_lockspin_queues(); 
VM_PAGE_QUEUES_REMOVE(m); vm_page_unlock_queues(); } XPR(XPR_VM_FAULT, "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n", - (integer_t)object, offset, - (integer_t)next_object, + object, offset, + next_object, offset+object->shadow_offset,0); + offset += object->shadow_offset; - hi_offset += object->shadow_offset; - lo_offset += object->shadow_offset; + fault_info->lo_offset += object->shadow_offset; + fault_info->hi_offset += object->shadow_offset; access_required = VM_PROT_READ; + vm_object_lock(next_object); vm_object_unlock(object); object = next_object; vm_object_paging_begin(object); + + /* + * reset to default type of fault + */ + my_fault = DBG_CACHE_HIT_FAULT; + continue; } } - if ((m->cleaning) - && ((object != first_object) || - (object->copy != VM_OBJECT_NULL)) - && (fault_type & VM_PROT_WRITE)) { + && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) + && (fault_type & VM_PROT_WRITE)) { /* * This is a copy-on-write fault that will * cause us to revoke access to this page, but @@ -740,189 +1210,147 @@ vm_fault_page( #endif XPR(XPR_VM_FAULT, "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n", - (integer_t)object, offset, - (integer_t)m, 0, 0); - /* take an extra ref so that object won't die */ - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); + object, offset, + m, 0, 0); + /* + * take an extra ref so that object won't die + */ + vm_object_reference_locked(object); + vm_fault_cleanup(object, first_m); + counter(c_vm_fault_page_block_backoff_kernel++); vm_object_lock(object); assert(object->ref_count > 0); + m = vm_page_lookup(object, offset); + if (m != VM_PAGE_NULL && m->cleaning) { PAGE_ASSERT_WAIT(m, interruptible); + vm_object_unlock(object); - wait_result = thread_block((void (*)(void)) 0); + wait_result = thread_block(THREAD_CONTINUE_NULL); vm_object_deallocate(object); + goto backoff; } else { vm_object_unlock(object); + vm_object_deallocate(object); - 
cur_thread->interruptible = interruptible_state; - return VM_FAULT_RETRY; + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); } } + if (type_of_fault == NULL && m->speculative && + !(fault_info != NULL && fault_info->stealth)) { + /* + * If we were passed a non-NULL pointer for + * "type_of_fault", than we came from + * vm_fault... we'll let it deal with + * this condition, since it + * needs to see m->speculative to correctly + * account the pageins, otherwise... + * take it off the speculative queue, we'll + * let the caller of vm_fault_page deal + * with getting it onto the correct queue + * + * If the caller specified in fault_info that + * it wants a "stealth" fault, we also leave + * the page in the speculative queue. + */ + vm_page_lockspin_queues(); + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + } - /* - * If the desired access to this page has - * been locked out, request that it be unlocked. - */ - - if (access_required & m->page_lock) { - if ((access_required & m->unlock_request) != access_required) { - vm_prot_t new_unlock_request; - kern_return_t rc; - -#if TRACEFAULTPAGE - dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */ -#endif - if (!object->pager_ready) { - XPR(XPR_VM_FAULT, - "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n", - access_required, - (integer_t)object, offset, - (integer_t)m, 0); - /* take an extra ref */ - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); - vm_fault_cleanup(object, - first_m); - counter(c_vm_fault_page_block_backoff_kernel++); - vm_object_lock(object); - assert(object->ref_count > 0); - if (!object->pager_ready) { - vm_object_assert_wait( - object, - VM_OBJECT_EVENT_PAGER_READY, - interruptible); - vm_object_unlock(object); - wait_result = thread_block((void (*)(void))0); - vm_object_deallocate(object); - goto backoff; - } else { - vm_object_unlock(object); - 
vm_object_deallocate(object); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_RETRY; - } - } + if (m->encrypted) { + /* + * ENCRYPTED SWAP: + * the user needs access to a page that we + * encrypted before paging it out. + * Decrypt the page now. + * Keep it busy to prevent anyone from + * accessing it during the decryption. + */ + m->busy = TRUE; + vm_page_decrypt(m, 0); + assert(object == m->object); + assert(m->busy); + PAGE_WAKEUP_DONE(m); - new_unlock_request = m->unlock_request = - (access_required | m->unlock_request); - vm_object_unlock(object); - XPR(XPR_VM_FAULT, - "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n", - (integer_t)object, offset, - (integer_t)m, new_unlock_request, 0); - if ((rc = memory_object_data_unlock( - object->pager, - offset + object->paging_offset, - PAGE_SIZE, - new_unlock_request)) - != KERN_SUCCESS) { - if (vm_fault_debug) - printf("vm_fault: memory_object_data_unlock failed\n"); - vm_object_lock(object); - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return((rc == MACH_SEND_INTERRUPTED) ? - VM_FAULT_INTERRUPTED : - VM_FAULT_MEMORY_ERROR); - } - vm_object_lock(object); - continue; - } + /* + * Retry from the top, in case + * something changed while we were + * decrypting. 
+ */ + continue; + } + ASSERT_PAGE_DECRYPTED(m); - XPR(XPR_VM_FAULT, - "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n", - access_required, (integer_t)object, - offset, (integer_t)m, 0); - /* take an extra ref so object won't die */ - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); - vm_fault_cleanup(object, first_m); - counter(c_vm_fault_page_block_backoff_kernel++); - vm_object_lock(object); - assert(object->ref_count > 0); - m = vm_page_lookup(object, offset); - if (m != VM_PAGE_NULL && - (access_required & m->page_lock) && - !((access_required & m->unlock_request) != access_required)) { - PAGE_ASSERT_WAIT(m, interruptible); - vm_object_unlock(object); - wait_result = thread_block((void (*)(void)) 0); - vm_object_deallocate(object); - goto backoff; - } else { - vm_object_unlock(object); - vm_object_deallocate(object); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_RETRY; - } + if (m->object->code_signed) { + /* + * CODE SIGNING: + * We just paged in a page from a signed + * memory object but we don't need to + * validate it now. We'll validate it if + * when it gets mapped into a user address + * space for the first time or when the page + * gets copied to another object as a result + * of a copy-on-write. + */ } + /* - * We mark the page busy and leave it on - * the pageout queues. If the pageout - * deamon comes across it, then it will - * remove the page. + * We mark the page busy and leave it on + * the pageout queues. 
If the pageout + * deamon comes across it, then it will + * remove the page from the queue, but not the object */ - #if TRACEFAULTPAGE dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - -#if !VM_FAULT_STATIC_CONFIG - if (!software_reference_bits) { - vm_page_lock_queues(); - if (m->inactive) - vm_stat.reactivations++; - - VM_PAGE_QUEUES_REMOVE(m); - vm_page_unlock_queues(); - } -#endif XPR(XPR_VM_FAULT, "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n", - (integer_t)object, offset, (integer_t)m, 0, 0); + object, offset, m, 0, 0); assert(!m->busy); - m->busy = TRUE; assert(!m->absent); + + m->busy = TRUE; break; } + - look_for_page = - (object->pager_created) && - LOOK_FOR(object, offset) && - (!data_supply); - + /* + * we get here when there is no page present in the object at + * the offset we're interested in... we'll allocate a page + * at this point if the pager associated with + * this object can provide the data or we're the top object... + * object is locked; m == NULL + */ + look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply); + #if TRACEFAULTPAGE dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ #endif - if ((look_for_page || (object == first_object)) - && !must_be_resident - && !(object->phys_contiguous)) { + if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) { /* - * Allocate a new page for this object/offset - * pair. 
+ * Allocate a new page for this object/offset pair */ - - m = vm_page_grab_fictitious(); + m = vm_page_grab(); #if TRACEFAULTPAGE dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ #endif if (m == VM_PAGE_NULL) { + vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_FICTITIOUS_SHORTAGE); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_MEMORY_SHORTAGE); } vm_page_insert(m, object, offset); } - - if ((look_for_page && !must_be_resident)) { + if (look_for_page && !must_be_resident) { kern_return_t rc; /* @@ -933,481 +1361,210 @@ vm_fault_page( #if TRACEFAULTPAGE dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - if(m != VM_PAGE_NULL) - VM_PAGE_FREE(m); + if (m != VM_PAGE_NULL) + VM_PAGE_FREE(m); + XPR(XPR_VM_FAULT, "vm_f_page: ready wait obj 0x%X, offset 0x%X\n", - (integer_t)object, offset, 0, 0, 0); - /* take an extra ref so object won't die */ - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); + object, offset, 0, 0, 0); + + /* + * take an extra ref so object won't die + */ + vm_object_reference_locked(object); vm_fault_cleanup(object, first_m); counter(c_vm_fault_page_block_backoff_kernel++); + vm_object_lock(object); assert(object->ref_count > 0); + if (!object->pager_ready) { - vm_object_assert_wait(object, - VM_OBJECT_EVENT_PAGER_READY, - interruptible); + wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); + vm_object_unlock(object); - wait_result = thread_block((void (*)(void))0); + if (wait_result == THREAD_WAITING) + wait_result = thread_block(THREAD_CONTINUE_NULL); vm_object_deallocate(object); + goto backoff; } else { vm_object_unlock(object); vm_object_deallocate(object); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_RETRY; - } - } + thread_interrupt_level(interruptible_state); - if(object->phys_contiguous) { - if(m != 
VM_PAGE_NULL) { - VM_PAGE_FREE(m); - m = VM_PAGE_NULL; + return (VM_FAULT_RETRY); } - goto no_clustering; } - if (object->internal) { + if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) { /* - * Requests to the default pager - * must reserve a real page in advance, - * because the pager's data-provided - * won't block for pages. IMPORTANT: - * this acts as a throttling mechanism - * for data_requests to the default - * pager. + * If there are too many outstanding page + * requests pending on this external object, we + * wait for them to be resolved now. */ - #if TRACEFAULTPAGE - dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ + dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - if (m->fictitious && !vm_page_convert(m)) { + if (m != VM_PAGE_NULL) VM_PAGE_FREE(m); - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_SHORTAGE); - } - } else if (object->absent_count > - vm_object_absent_max) { /* - * If there are too many outstanding page - * requests pending on this object, we - * wait for them to be resolved now. 
+ * take an extra ref so object won't die */ + vm_object_reference_locked(object); -#if TRACEFAULTPAGE - dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ -#endif - if(m != VM_PAGE_NULL) - VM_PAGE_FREE(m); - /* take an extra ref so object won't die */ - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); vm_fault_cleanup(object, first_m); + counter(c_vm_fault_page_block_backoff_kernel++); + vm_object_lock(object); assert(object->ref_count > 0); - if (object->absent_count > vm_object_absent_max) { - vm_object_absent_assert_wait(object, - interruptible); + + if (object->paging_in_progress > vm_object_pagein_throttle) { + vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible); + vm_object_unlock(object); - wait_result = thread_block((void (*)(void))0); + wait_result = thread_block(THREAD_CONTINUE_NULL); vm_object_deallocate(object); + goto backoff; } else { vm_object_unlock(object); vm_object_deallocate(object); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_RETRY; + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); } } - - /* - * Indicate that the page is waiting for data - * from the memory manager. - */ - - if(m != VM_PAGE_NULL) { - - m->list_req_pending = TRUE; + if (m != VM_PAGE_NULL) { + /* + * Indicate that the page is waiting for data + * from the memory manager. + */ + m->list_req_pending = TRUE; m->absent = TRUE; - m->unusual = TRUE; - object->absent_count++; - - } - - cluster_start = offset; - length = PAGE_SIZE; - cluster_size = object->cluster_size; - - /* - * Skip clustered pagein if it is globally disabled - * or random page reference behavior is expected - * for the address range containing the faulting - * address or the object paging block size is - * equal to the page size. 
- */ - if (!vm_allow_clustered_pagein || - behavior == VM_BEHAVIOR_RANDOM || - m == VM_PAGE_NULL || - cluster_size == PAGE_SIZE) { - cluster_start = trunc_page_64(cluster_start); - goto no_clustering; } - assert(offset >= lo_offset); - assert(offset < hi_offset); - assert(ALIGNED(object->paging_offset)); - assert(cluster_size >= PAGE_SIZE); - #if TRACEFAULTPAGE - dbgTrace(0xBEEF0011, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ + dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - /* - * Decide whether to scan ahead or behind for - * additional pages contiguous to the faulted - * page in the same paging block. The decision - * is based on system wide globals and the - * expected page reference behavior of the - * address range contained the faulting address. - * First calculate some constants. - */ - paging_offset = offset + object->paging_offset; - cluster_offset = paging_offset & (cluster_size - 1); - align_offset = paging_offset&(PAGE_SIZE_64-1); - if (align_offset != 0) { - cluster_offset = trunc_page_64(cluster_offset); - } - -#define SPANS_CLUSTER(x) ((((x) - align_offset) & (vm_object_offset_t)(cluster_size - 1)) == 0) - - /* - * Backward scan only if reverse sequential - * behavior has been specified - */ - CLUSTER_STAT(pages_at_lower_offsets = 0;) - if (((vm_default_behind != 0 && - behavior == VM_BEHAVIOR_DEFAULT) || - behavior == VM_BEHAVIOR_RSEQNTL) && offset) { - vm_object_offset_t cluster_bot; - - /* - * Calculate lower search boundary. - * Exclude pages that span a cluster boundary. - * Clip to start of map entry. - * For default page reference behavior, scan - * default pages behind. - */ - cluster_bot = (offset > cluster_offset) ? 
- offset - cluster_offset : offset; - if (align_offset != 0) { - if ((cluster_bot < offset) && - SPANS_CLUSTER(cluster_bot)) { - cluster_bot += PAGE_SIZE_64; - } - } - if (behavior == VM_BEHAVIOR_DEFAULT) { - vm_object_offset_t - bot = (vm_object_offset_t) - (vm_default_behind * PAGE_SIZE); - - if (cluster_bot < (offset - bot)) - cluster_bot = offset - bot; - } - if (lo_offset > cluster_bot) - cluster_bot = lo_offset; - - for ( cluster_start = offset - PAGE_SIZE_64; - (cluster_start >= cluster_bot) && - (cluster_start != - (align_offset - PAGE_SIZE_64)); - cluster_start -= PAGE_SIZE_64) { - assert(cluster_size > PAGE_SIZE_64); -retry_cluster_backw: - if (!LOOK_FOR(object, cluster_start) || - vm_page_lookup(object, cluster_start) - != VM_PAGE_NULL) { - break; - } - if (object->internal) { - /* - * need to acquire a real page in - * advance because this acts as - * a throttling mechanism for - * data_requests to the default - * pager. If this fails, give up - * trying to find any more pages - * in the cluster and send off the - * request for what we already have. - */ - if ((m = vm_page_grab()) - == VM_PAGE_NULL) { - cluster_start += PAGE_SIZE_64; - cluster_end = offset + PAGE_SIZE_64; - goto give_up; - } - } else if ((m = vm_page_grab_fictitious()) - == VM_PAGE_NULL) { - vm_object_unlock(object); - vm_page_more_fictitious(); - vm_object_lock(object); - goto retry_cluster_backw; - } - m->absent = TRUE; - m->unusual = TRUE; - m->clustered = TRUE; - m->list_req_pending = TRUE; - - vm_page_insert(m, object, cluster_start); - CLUSTER_STAT(pages_at_lower_offsets++;) - object->absent_count++; - } - cluster_start += PAGE_SIZE_64; - assert(cluster_start >= cluster_bot); - } - assert(cluster_start <= offset); /* - * Forward scan if default or sequential behavior - * specified + * It's possible someone called vm_object_destroy while we weren't + * holding the object lock. If that has happened, then bail out + * here. 
*/ - CLUSTER_STAT(pages_at_higher_offsets = 0;) - if ((behavior == VM_BEHAVIOR_DEFAULT && - vm_default_ahead != 0) || - behavior == VM_BEHAVIOR_SEQUENTIAL) { - vm_object_offset_t cluster_top; - - /* - * Calculate upper search boundary. - * Exclude pages that span a cluster boundary. - * Clip to end of map entry. - * For default page reference behavior, scan - * default pages ahead. - */ - cluster_top = (offset + cluster_size) - - cluster_offset; - if (align_offset != 0) { - if ((cluster_top > (offset + PAGE_SIZE_64)) && - SPANS_CLUSTER(cluster_top)) { - cluster_top -= PAGE_SIZE_64; - } - } - if (behavior == VM_BEHAVIOR_DEFAULT) { - vm_object_offset_t top = (vm_object_offset_t) - ((vm_default_ahead*PAGE_SIZE)+PAGE_SIZE); - - if (cluster_top > (offset + top)) - cluster_top = offset + top; - } - if (cluster_top > hi_offset) - cluster_top = hi_offset; - - for (cluster_end = offset + PAGE_SIZE_64; - cluster_end < cluster_top; - cluster_end += PAGE_SIZE_64) { - assert(cluster_size > PAGE_SIZE); -retry_cluster_forw: - if (!LOOK_FOR(object, cluster_end) || - vm_page_lookup(object, cluster_end) - != VM_PAGE_NULL) { - break; - } - if (object->internal) { - /* - * need to acquire a real page in - * advance because this acts as - * a throttling mechanism for - * data_requests to the default - * pager. If this fails, give up - * trying to find any more pages - * in the cluster and send off the - * request for what we already have. 
- */ - if ((m = vm_page_grab()) - == VM_PAGE_NULL) { - break; - } - } else if ((m = vm_page_grab_fictitious()) - == VM_PAGE_NULL) { - vm_object_unlock(object); - vm_page_more_fictitious(); - vm_object_lock(object); - goto retry_cluster_forw; - } - m->absent = TRUE; - m->unusual = TRUE; - m->clustered = TRUE; - m->list_req_pending = TRUE; - - vm_page_insert(m, object, cluster_end); - CLUSTER_STAT(pages_at_higher_offsets++;) - object->absent_count++; - } - assert(cluster_end <= cluster_top); - } - else { - cluster_end = offset + PAGE_SIZE_64; - } -give_up: - assert(cluster_end >= offset + PAGE_SIZE_64); - length = cluster_end - cluster_start; -#if MACH_CLUSTER_STATS - CLUSTER_STAT_HIGHER(pages_at_higher_offsets); - CLUSTER_STAT_LOWER(pages_at_lower_offsets); - CLUSTER_STAT_CLUSTER(length/PAGE_SIZE); -#endif /* MACH_CLUSTER_STATS */ + pager = object->pager; -no_clustering: - /* - * lengthen the cluster by the pages in the working set - */ - if((map != NULL) && - (current_task()->dynamic_working_set != 0)) { - cluster_end = cluster_start + length; - /* tws values for start and end are just a - * suggestions. Therefore, as long as - * build_cluster does not use pointers or - * take action based on values that - * could be affected by re-entrance we - * do not need to take the map lock. - */ - tws_build_cluster((tws_hash_t) - current_task()->dynamic_working_set, - object, &cluster_start, - &cluster_end, 0x16000); - length = cluster_end - cluster_start; + if (pager == MEMORY_OBJECT_NULL) { + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + return VM_FAULT_MEMORY_ERROR; } -#if TRACEFAULTPAGE - dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ -#endif - /* - * We have a busy page, so we can - * release the object lock. - */ - vm_object_unlock(object); /* - * Call the memory manager to retrieve the data. + * We have an absent page in place for the faulting offset, + * so we can release the object lock. 
*/ - if (type_of_fault) - *type_of_fault = DBG_PAGEIN_FAULT; - VM_STAT(pageins++); - current_task()->pageins++; - bumped_pagein = TRUE; + vm_object_unlock(object); /* - * If this object uses a copy_call strategy, - * and we are interested in a copy of this object - * (having gotten here only by following a - * shadow chain), then tell the memory manager - * via a flag added to the desired_access - * parameter, so that it can detect a race - * between our walking down the shadow chain - * and its pushing pages up into a copy of - * the object that it manages. + * If this object uses a copy_call strategy, + * and we are interested in a copy of this object + * (having gotten here only by following a + * shadow chain), then tell the memory manager + * via a flag added to the desired_access + * parameter, so that it can detect a race + * between our walking down the shadow chain + * and its pushing pages up into a copy of + * the object that it manages. */ - - if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && - object != first_object) { + if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) wants_copy_flag = VM_PROT_WANTS_COPY; - } else { + else wants_copy_flag = VM_PROT_NONE; - } XPR(XPR_VM_FAULT, "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n", - (integer_t)object, offset, (integer_t)m, + object, offset, m, access_required | wants_copy_flag, 0); - rc = memory_object_data_request(object->pager, - cluster_start + object->paging_offset, - length, - access_required | wants_copy_flag); - + /* + * Call the memory manager to retrieve the data. 
+ */ + rc = memory_object_data_request( + pager, + offset + object->paging_offset, + PAGE_SIZE, + access_required | wants_copy_flag, + (memory_object_fault_info_t)fault_info); #if TRACEFAULTPAGE dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ #endif + vm_object_lock(object); + if (rc != KERN_SUCCESS) { - if (rc != MACH_SEND_INTERRUPTED - && vm_fault_debug) - printf("%s(0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d\n", - "memory_object_data_request", - object->pager, - cluster_start + object->paging_offset, - length, access_required, rc); - /* - * Don't want to leave a busy page around, - * but the data request may have blocked, - * so check if it's still there and busy. - */ - if(!object->phys_contiguous) { - vm_object_lock(object); - for (; length; length -= PAGE_SIZE, - cluster_start += PAGE_SIZE_64) { - vm_page_t p; - if ((p = vm_page_lookup(object, - cluster_start)) - && p->absent && p->busy - && p != first_m) { - VM_PAGE_FREE(p); - } - } - } + vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return((rc == MACH_SEND_INTERRUPTED) ? + thread_interrupt_level(interruptible_state); + + return ((rc == MACH_SEND_INTERRUPTED) ? 
VM_FAULT_INTERRUPTED : VM_FAULT_MEMORY_ERROR); } else { -#ifdef notdefcdy - tws_hash_line_t line; - task_t task; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + + clock_get_system_microtime(&tv_sec, &tv_usec); + current_thread()->t_page_creation_time = tv_sec; + current_thread()->t_page_creation_count = 0; + } + if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) { - task = current_task(); - - if((map != NULL) && - (task->dynamic_working_set != 0)) { - if(tws_lookup - ((tws_hash_t) - task->dynamic_working_set, - offset, object, - &line) == KERN_SUCCESS) { - tws_line_signal((tws_hash_t) - task->dynamic_working_set, - map, line, vaddr); - } - } -#endif + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_INTERRUPTED); } - + if (m == VM_PAGE_NULL && object->phys_contiguous) { + /* + * No page here means that the object we + * initially looked up was "physically + * contiguous" (i.e. device memory). However, + * with Virtual VRAM, the object might not + * be backed by that device memory anymore, + * so we're done here only if the object is + * still "phys_contiguous". + * Otherwise, if the object is no longer + * "phys_contiguous", we need to retry the + * page fault against the object's new backing + * store (different memory object). + */ + phys_contig_object: + goto done; + } + /* + * potentially a pagein fault + * if we make it through the state checks + * above, then we'll count it as such + */ + my_fault = DBG_PAGEIN_FAULT; + /* * Retry with same object/offset, since new data may * be in a different page (i.e., m is meaningless at * this point). 
*/ - vm_object_lock(object); - if ((interruptible != THREAD_UNINT) && - (current_thread()->state & TH_ABORT)) { - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_INTERRUPTED); - } - if(m == VM_PAGE_NULL) - break; continue; } /* - * The only case in which we get here is if - * object has no pager (or unwiring). If the pager doesn't - * have the page this is handled in the m->absent case above - * (and if you change things here you should look above). + * We get here if the object has no pager, or an existence map + * exists and indicates the page isn't present on the pager + * or we're unwiring a page. If a pager exists, but there + * is no existence map, then the m->absent case above handles + * the ZF case when the pager can't provide the page */ #if TRACEFAULTPAGE dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ @@ -1419,20 +1576,17 @@ no_clustering: XPR(XPR_VM_FAULT, "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n", - (integer_t)object, offset, (integer_t)m, - (integer_t)object->shadow, 0); - /* - * Move on to the next object. Lock the next - * object before unlocking the current one. - */ + object, offset, m, + object->shadow, 0); + next_object = object->shadow; + if (next_object == VM_OBJECT_NULL) { - assert(!must_be_resident); /* - * If there's no object left, fill the page - * in the top object with zeros. But first we - * need to allocate a real page. + * we've hit the bottom of the shadow chain, + * fill the page in the top object with zeros. 
*/ + assert(!must_be_resident); if (object != first_object) { vm_object_paging_end(object); @@ -1442,69 +1596,55 @@ no_clustering: offset = first_offset; vm_object_lock(object); } - m = first_m; assert(m->object == object); first_m = VM_PAGE_NULL; - if (object->shadow_severed) { - VM_PAGE_FREE(m); - vm_fault_cleanup(object, VM_PAGE_NULL); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_MEMORY_ERROR; - } + /* + * check for any conditions that prevent + * us from creating a new zero-fill page + * vm_fault_check will do all of the + * fault cleanup in the case of an error condition + * including resetting the thread_interrupt_level + */ + error = vm_fault_check(object, m, first_m, interruptible_state); - if (VM_PAGE_THROTTLED() || - (m->fictitious && !vm_page_convert(m))) { - VM_PAGE_FREE(m); - vm_fault_cleanup(object, VM_PAGE_NULL); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_SHORTAGE); - } - m->no_isync = FALSE; + if (error != VM_FAULT_SUCCESS) + return (error); - if (!no_zero_fill) { - vm_object_unlock(object); - vm_page_zero_fill(m); - if (type_of_fault) - *type_of_fault = DBG_ZERO_FILL_FAULT; - VM_STAT(zero_fill_count++); - - if (bumped_pagein == TRUE) { - VM_STAT(pageins--); - current_task()->pageins--; + if (m == VM_PAGE_NULL) { + m = vm_page_grab(); + + if (m == VM_PAGE_NULL) { + vm_fault_cleanup(object, VM_PAGE_NULL); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_MEMORY_SHORTAGE); } - vm_object_lock(object); + vm_page_insert(m, object, offset); } - vm_page_lock_queues(); - VM_PAGE_QUEUES_REMOVE(m); - m->page_ticket = vm_page_ticket; - vm_page_ticket_roll++; - if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) { - vm_page_ticket_roll = 0; - if(vm_page_ticket == - VM_PAGE_TICKET_ROLL_IDS) - vm_page_ticket= 0; - else - vm_page_ticket++; - } - queue_enter(&vm_page_queue_inactive, - m, vm_page_t, pageq); - m->inactive = TRUE; - vm_page_inactive_count++; - vm_page_unlock_queues(); - 
pmap_clear_modify(m->phys_addr); + my_fault = vm_fault_zero_page(m, no_zero_fill); + + if (fault_info->mark_zf_absent && no_zero_fill == TRUE) + m->absent = TRUE; break; - } - else { + + } else { + /* + * Move on to the next object. Lock the next + * object before unlocking the current one. + */ if ((object != first_object) || must_be_resident) vm_object_paging_end(object); + offset += object->shadow_offset; - hi_offset += object->shadow_offset; - lo_offset += object->shadow_offset; + fault_info->lo_offset += object->shadow_offset; + fault_info->hi_offset += object->shadow_offset; access_required = VM_PROT_READ; + vm_object_lock(next_object); vm_object_unlock(object); + object = next_object; vm_object_paging_begin(object); } @@ -1532,149 +1672,171 @@ no_clustering: dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ #endif #if EXTRA_ASSERTIONS - if(m != VM_PAGE_NULL) { - assert(m->busy && !m->absent); - assert((first_m == VM_PAGE_NULL) || - (first_m->busy && !first_m->absent && - !first_m->active && !first_m->inactive)); - } + assert(m->busy && !m->absent); + assert((first_m == VM_PAGE_NULL) || + (first_m->busy && !first_m->absent && + !first_m->active && !first_m->inactive)); #endif /* EXTRA_ASSERTIONS */ - XPR(XPR_VM_FAULT, - "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n", - (integer_t)object, offset, (integer_t)m, - (integer_t)first_object, (integer_t)first_m); /* - * If the page is being written, but isn't - * already owned by the top-level object, - * we have to copy it into a new page owned - * by the top-level object. + * ENCRYPTED SWAP: + * If we found a page, we must have decrypted it before we + * get here... */ + ASSERT_PAGE_DECRYPTED(m); - if ((object != first_object) && (m != VM_PAGE_NULL)) { - /* - * We only really need to copy if we - * want to write it. 
- */ + XPR(XPR_VM_FAULT, + "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n", + object, offset, m, + first_object, first_m); + + /* + * If the page is being written, but isn't + * already owned by the top-level object, + * we have to copy it into a new page owned + * by the top-level object. + */ + if (object != first_object) { #if TRACEFAULTPAGE - dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ + dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ #endif if (fault_type & VM_PROT_WRITE) { vm_page_t copy_m; + /* + * We only really need to copy if we + * want to write it. + */ assert(!must_be_resident); /* - * If we try to collapse first_object at this - * point, we may deadlock when we try to get - * the lock on an intermediate object (since we - * have the bottom object locked). We can't - * unlock the bottom object, because the page - * we found may move (by collapse) if we do. + * are we protecting the system from + * backing store exhaustion. If so + * sleep unless we are privileged. + */ + if (vm_backing_store_low) { + if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { + + RELEASE_PAGE(m); + vm_fault_cleanup(object, first_m); + + assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); + + thread_block(THREAD_CONTINUE_NULL); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); + } + } + /* + * If we try to collapse first_object at this + * point, we may deadlock when we try to get + * the lock on an intermediate object (since we + * have the bottom object locked). We can't + * unlock the bottom object, because the page + * we found may move (by collapse) if we do. * - * Instead, we first copy the page. Then, when - * we have no more use for the bottom object, - * we unlock it and try to collapse. + * Instead, we first copy the page. 
Then, when + * we have no more use for the bottom object, + * we unlock it and try to collapse. * - * Note that we copy the page even if we didn't - * need to... that's the breaks. + * Note that we copy the page even if we didn't + * need to... that's the breaks. */ /* - * Allocate a page for the copy + * Allocate a page for the copy */ copy_m = vm_page_grab(); + if (copy_m == VM_PAGE_NULL) { RELEASE_PAGE(m); - vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_SHORTAGE); - } + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + return (VM_FAULT_MEMORY_SHORTAGE); + } XPR(XPR_VM_FAULT, "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n", - (integer_t)object, offset, - (integer_t)m, (integer_t)copy_m, 0); + object, offset, + m, copy_m, 0); + vm_page_copy(m, copy_m); /* - * If another map is truly sharing this - * page with us, we have to flush all - * uses of the original page, since we - * can't distinguish those which want the - * original from those which need the - * new copy. + * If another map is truly sharing this + * page with us, we have to flush all + * uses of the original page, since we + * can't distinguish those which want the + * original from those which need the + * new copy. * - * XXXO If we know that only one map has - * access to this page, then we could - * avoid the pmap_page_protect() call. + * XXXO If we know that only one map has + * access to this page, then we could + * avoid the pmap_disconnect() call. */ + if (m->pmapped) + pmap_disconnect(m->phys_page); - vm_page_lock_queues(); assert(!m->cleaning); - pmap_page_protect(m->phys_addr, VM_PROT_NONE); - vm_page_deactivate(m); - copy_m->dirty = TRUE; - /* - * Setting reference here prevents this fault from - * being counted as a (per-thread) reactivate as well - * as a copy-on-write. 
- */ - first_m->reference = TRUE; - vm_page_unlock_queues(); /* - * We no longer need the old page or object. + * We no longer need the old page or object. */ - PAGE_WAKEUP_DONE(m); vm_object_paging_end(object); vm_object_unlock(object); - if (type_of_fault) - *type_of_fault = DBG_COW_FAULT; - VM_STAT(cow_faults++); + my_fault = DBG_COW_FAULT; + VM_STAT_INCR(cow_faults); + DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); current_task()->cow_faults++; + object = first_object; offset = first_offset; vm_object_lock(object); + /* + * get rid of the place holder + * page that we soldered in earlier + */ VM_PAGE_FREE(first_m); first_m = VM_PAGE_NULL; + + /* + * and replace it with the + * page we just copied into + */ assert(copy_m->busy); vm_page_insert(copy_m, object, offset); - m = copy_m; + copy_m->dirty = TRUE; + m = copy_m; /* - * Now that we've gotten the copy out of the - * way, let's try to collapse the top object. - * But we have to play ugly games with - * paging_in_progress to do that... + * Now that we've gotten the copy out of the + * way, let's try to collapse the top object. + * But we have to play ugly games with + * paging_in_progress to do that... */ - vm_object_paging_end(object); - vm_object_collapse(object); + vm_object_collapse(object, offset, TRUE); vm_object_paging_begin(object); - } - else { + } else *protection &= (~VM_PROT_WRITE); - } } - /* - * Now check whether the page needs to be pushed into the - * copy object. The use of asymmetric copy on write for - * shared temporary objects means that we may do two copies to - * satisfy the fault; one above to get the page from a - * shadowed object, and one here to push it into the copy. + * Now check whether the page needs to be pushed into the + * copy object. The use of asymmetric copy on write for + * shared temporary objects means that we may do two copies to + * satisfy the fault; one above to get the page from a + * shadowed object, and one here to push it into the copy. 
*/ + try_failed_count = 0; - while (first_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && - (copy_object = first_object->copy) != VM_OBJECT_NULL && - (m!= VM_PAGE_NULL)) { + while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { vm_object_offset_t copy_offset; vm_page_t copy_m; @@ -1682,86 +1844,99 @@ no_clustering: dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */ #endif /* - * If the page is being written, but hasn't been - * copied to the copy-object, we have to copy it there. + * If the page is being written, but hasn't been + * copied to the copy-object, we have to copy it there. */ - if ((fault_type & VM_PROT_WRITE) == 0) { *protection &= ~VM_PROT_WRITE; break; } /* - * If the page was guaranteed to be resident, - * we must have already performed the copy. + * If the page was guaranteed to be resident, + * we must have already performed the copy. */ - if (must_be_resident) break; /* - * Try to get the lock on the copy_object. + * Try to get the lock on the copy_object. */ if (!vm_object_lock_try(copy_object)) { - vm_object_unlock(object); - mutex_pause(); /* wait a bit */ + vm_object_unlock(object); + try_failed_count++; + mutex_pause(try_failed_count); /* wait a bit */ vm_object_lock(object); + continue; } + try_failed_count = 0; /* - * Make another reference to the copy-object, - * to keep it from disappearing during the - * copy. + * Make another reference to the copy-object, + * to keep it from disappearing during the + * copy. */ - assert(copy_object->ref_count > 0); - copy_object->ref_count++; - VM_OBJ_RES_INCR(copy_object); + vm_object_reference_locked(copy_object); /* - * Does the page exist in the copy? + * Does the page exist in the copy? */ copy_offset = first_offset - copy_object->shadow_offset; + if (copy_object->size <= copy_offset) /* * Copy object doesn't cover this page -- do nothing. 
*/ ; - else if ((copy_m = - vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { - /* Page currently exists in the copy object */ + else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { + /* + * Page currently exists in the copy object + */ if (copy_m->busy) { /* - * If the page is being brought - * in, wait for it and then retry. + * If the page is being brought + * in, wait for it and then retry. */ RELEASE_PAGE(m); - /* take an extra ref so object won't die */ - assert(copy_object->ref_count > 0); - copy_object->ref_count++; - vm_object_res_reference(copy_object); + + /* + * take an extra ref so object won't die + */ + vm_object_reference_locked(copy_object); vm_object_unlock(copy_object); vm_fault_cleanup(object, first_m); counter(c_vm_fault_page_block_backoff_kernel++); + vm_object_lock(copy_object); assert(copy_object->ref_count > 0); VM_OBJ_RES_DECR(copy_object); + vm_object_lock_assert_exclusive(copy_object); copy_object->ref_count--; assert(copy_object->ref_count > 0); copy_m = vm_page_lookup(copy_object, copy_offset); + /* + * ENCRYPTED SWAP: + * it's OK if the "copy_m" page is encrypted, + * because we're not moving it nor handling its + * contents. + */ if (copy_m != VM_PAGE_NULL && copy_m->busy) { PAGE_ASSERT_WAIT(copy_m, interruptible); + vm_object_unlock(copy_object); - wait_result = thread_block((void (*)(void))0); + wait_result = thread_block(THREAD_CONTINUE_NULL); vm_object_deallocate(copy_object); + goto backoff; } else { vm_object_unlock(copy_object); vm_object_deallocate(copy_object); - cur_thread->interruptible = interruptible_state; - return VM_FAULT_RETRY; + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); } } } @@ -1776,317 +1951,742 @@ no_clustering: * We must copy the page to the copy object. */ + if (vm_backing_store_low) { + /* + * we are protecting the system from + * backing store exhaustion. If so + * sleep unless we are privileged. 
+ */ + if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { + assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); + + RELEASE_PAGE(m); + VM_OBJ_RES_DECR(copy_object); + vm_object_lock_assert_exclusive(copy_object); + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + + vm_object_unlock(copy_object); + vm_fault_cleanup(object, first_m); + thread_block(THREAD_CONTINUE_NULL); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); + } + } /* - * Allocate a page for the copy + * Allocate a page for the copy */ copy_m = vm_page_alloc(copy_object, copy_offset); + if (copy_m == VM_PAGE_NULL) { RELEASE_PAGE(m); + VM_OBJ_RES_DECR(copy_object); + vm_object_lock_assert_exclusive(copy_object); copy_object->ref_count--; assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); vm_fault_cleanup(object, first_m); - cur_thread->interruptible = interruptible_state; - return(VM_FAULT_MEMORY_SHORTAGE); - } + thread_interrupt_level(interruptible_state); + return (VM_FAULT_MEMORY_SHORTAGE); + } /* - * Must copy page into copy-object. + * Must copy page into copy-object. */ - vm_page_copy(m, copy_m); /* - * If the old page was in use by any users - * of the copy-object, it must be removed - * from all pmaps. (We can't know which - * pmaps use it.) + * If the old page was in use by any users + * of the copy-object, it must be removed + * from all pmaps. (We can't know which + * pmaps use it.) */ - - vm_page_lock_queues(); - assert(!m->cleaning); - pmap_page_protect(m->phys_addr, VM_PROT_NONE); - copy_m->dirty = TRUE; - vm_page_unlock_queues(); + if (m->pmapped) + pmap_disconnect(m->phys_page); /* - * If there's a pager, then immediately - * page out this page, using the "initialize" - * option. Else, we use the copy. + * If there's a pager, then immediately + * page out this page, using the "initialize" + * option. Else, we use the copy. 
*/ - - if -#if MACH_PAGEMAP - ((!copy_object->pager_created) || - vm_external_state_get( - copy_object->existence_map, copy_offset) - == VM_EXTERNAL_STATE_ABSENT) -#else - (!copy_object->pager_created) + if ((!copy_object->pager_created) +#if MACH_PAGEMAP + || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT #endif - { - vm_page_lock_queues(); + ) { + + vm_page_lockspin_queues(); + assert(!m->cleaning); vm_page_activate(copy_m); vm_page_unlock_queues(); + + copy_m->dirty = TRUE; PAGE_WAKEUP_DONE(copy_m); } else { assert(copy_m->busy == TRUE); + assert(!m->cleaning); /* - * The page is already ready for pageout: - * not on pageout queues and busy. - * Unlock everything except the - * copy_object itself. + * dirty is protected by the object lock */ + copy_m->dirty = TRUE; + /* + * The page is already ready for pageout: + * not on pageout queues and busy. + * Unlock everything except the + * copy_object itself. + */ vm_object_unlock(object); /* - * Write the page to the copy-object, - * flushing it from the kernel. + * Write the page to the copy-object, + * flushing it from the kernel. */ - vm_pageout_initialize_page(copy_m); /* - * Since the pageout may have - * temporarily dropped the - * copy_object's lock, we - * check whether we'll have - * to deallocate the hard way. + * Since the pageout may have + * temporarily dropped the + * copy_object's lock, we + * check whether we'll have + * to deallocate the hard way. */ - - if ((copy_object->shadow != object) || - (copy_object->ref_count == 1)) { + if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { vm_object_unlock(copy_object); vm_object_deallocate(copy_object); vm_object_lock(object); + continue; } - /* - * Pick back up the old object's - * lock. [It is safe to do so, - * since it must be deeper in the - * object tree.] + * Pick back up the old object's + * lock. [It is safe to do so, + * since it must be deeper in the + * object tree.] 
*/ - vm_object_lock(object); } - /* - * Because we're pushing a page upward - * in the object tree, we must restart - * any faults that are waiting here. - * [Note that this is an expansion of - * PAGE_WAKEUP that uses the THREAD_RESTART - * wait result]. Can't turn off the page's - * busy bit because we're not done with it. + * Because we're pushing a page upward + * in the object tree, we must restart + * any faults that are waiting here. + * [Note that this is an expansion of + * PAGE_WAKEUP that uses the THREAD_RESTART + * wait result]. Can't turn off the page's + * busy bit because we're not done with it. */ - if (m->wanted) { m->wanted = FALSE; - thread_wakeup_with_result((event_t) m, - THREAD_RESTART); + thread_wakeup_with_result((event_t) m, THREAD_RESTART); } } - /* - * The reference count on copy_object must be - * at least 2: one for our extra reference, - * and at least one from the outside world - * (we checked that when we last locked - * copy_object). + * The reference count on copy_object must be + * at least 2: one for our extra reference, + * and at least one from the outside world + * (we checked that when we last locked + * copy_object). */ + vm_object_lock_assert_exclusive(copy_object); copy_object->ref_count--; assert(copy_object->ref_count > 0); + VM_OBJ_RES_DECR(copy_object); vm_object_unlock(copy_object); break; } +done: *result_page = m; *top_page = first_m; XPR(XPR_VM_FAULT, "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n", - (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0); - /* - * If the page can be written, assume that it will be. - * [Earlier, we restrict the permission to allow write - * access only if the fault so required, so we don't - * mark read-only data as dirty.] 
- */ + object, offset, m, first_m, 0); -#if !VM_FAULT_STATIC_CONFIG - if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE) && - (m != VM_PAGE_NULL)) { - m->dirty = TRUE; - } -#endif -#if TRACEFAULTPAGE - dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind); /* (TEST/DEBUG) */ -#endif - if (vm_page_deactivate_behind) { - if (offset && /* don't underflow */ - (object->last_alloc == (offset - PAGE_SIZE_64))) { - m = vm_page_lookup(object, object->last_alloc); - if ((m != VM_PAGE_NULL) && !m->busy) { - vm_page_lock_queues(); - vm_page_deactivate(m); - vm_page_unlock_queues(); + if (m != VM_PAGE_NULL) { + retval = VM_FAULT_SUCCESS; + if (my_fault == DBG_PAGEIN_FAULT) { + + VM_STAT_INCR(pageins); + DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL); + DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL); + current_task()->pageins++; + + if (m->object->internal) { + DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); + my_fault = DBG_PAGEIND_FAULT; + } else { + DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); + my_fault = DBG_PAGEINV_FAULT; } -#if TRACEFAULTPAGE - dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ -#endif + + /* + * evaluate access pattern and update state + * vm_fault_deactivate_behind depends on the + * state being up to date + */ + vm_fault_is_sequential(object, offset, fault_info->behavior); + + vm_fault_deactivate_behind(object, offset, fault_info->behavior); } - object->last_alloc = offset; + if (type_of_fault) + *type_of_fault = my_fault; + } else { + retval = VM_FAULT_SUCCESS_NO_VM_PAGE; + assert(first_m == VM_PAGE_NULL); + assert(object == first_object); } + + thread_interrupt_level(interruptible_state); + #if TRACEFAULTPAGE dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ #endif - cur_thread->interruptible = interruptible_state; - if(*result_page == VM_PAGE_NULL) { - vm_object_unlock(object); - } - return(VM_FAULT_SUCCESS); + return retval; -#if 0 - 
block_and_backoff: - vm_fault_cleanup(object, first_m); - - counter(c_vm_fault_page_block_backoff_kernel++); - thread_block((void (*)(void))0); -#endif +backoff: + thread_interrupt_level(interruptible_state); - backoff: - cur_thread->interruptible = interruptible_state; if (wait_result == THREAD_INTERRUPTED) - return VM_FAULT_INTERRUPTED; - return VM_FAULT_RETRY; + return (VM_FAULT_INTERRUPTED); + return (VM_FAULT_RETRY); #undef RELEASE_PAGE } + + /* - * Routine: vm_fault - * Purpose: - * Handle page faults, including pseudo-faults - * used to change the wiring status of pages. - * Returns: - * Explicit continuations have been removed. - * Implementation: - * vm_fault and vm_fault_page save mucho state - * in the moral equivalent of a closure. The state - * structure is allocated when first entering vm_fault - * and deallocated when leaving vm_fault. + * CODE SIGNING: + * When soft faulting a page, we have to validate the page if: + * 1. the page is being mapped in user space + * 2. the page hasn't already been found to be "tainted" + * 3. the page belongs to a code-signed object + * 4. the page has not been validated yet or has been mapped for write. */ +#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \ + ((pmap) != kernel_pmap /*1*/ && \ + !(page)->cs_tainted /*2*/ && \ + (page)->object->code_signed /*3*/ && \ + (!(page)->cs_validated || (page)->wpmapped /*4*/)) + +/* + * page queue lock must NOT be held + * m->object must be locked + * + * NOTE: m->object could be locked "shared" only if we are called + * from vm_fault() as part of a soft fault. If so, we must be + * careful not to modify the VM object in any way that is not + * legal under a shared lock... 
+ */ +unsigned long cs_enter_tainted_rejected = 0; +unsigned long cs_enter_tainted_accepted = 0; kern_return_t -vm_fault( - vm_map_t map, - vm_offset_t vaddr, - vm_prot_t fault_type, - boolean_t change_wiring, - int interruptible) +vm_fault_enter(vm_page_t m, + pmap_t pmap, + vm_map_offset_t vaddr, + vm_prot_t prot, + boolean_t wired, + boolean_t change_wiring, + boolean_t no_cache, + int *type_of_fault) { - vm_map_version_t version; /* Map version for verificiation */ - boolean_t wired; /* Should mapping be wired down? */ - vm_object_t object; /* Top-level object */ - vm_object_offset_t offset; /* Top-level offset */ - vm_prot_t prot; /* Protection for mapping */ - vm_behavior_t behavior; /* Expected paging behavior */ - vm_object_offset_t lo_offset, hi_offset; - vm_object_t old_copy_object; /* Saved copy object */ - vm_page_t result_page; /* Result of vm_fault_page */ - vm_page_t top_page; /* Placeholder page */ - kern_return_t kr; - - register - vm_page_t m; /* Fast access to result_page */ - kern_return_t error_code; /* page error reasons */ - register - vm_object_t cur_object; - register - vm_object_offset_t cur_offset; - vm_page_t cur_m; - vm_object_t new_object; - int type_of_fault; - vm_map_t pmap_map = map; - vm_map_t original_map = map; - pmap_t pmap = NULL; - boolean_t funnel_set = FALSE; - funnel_t *curflock; - thread_t cur_thread; - boolean_t interruptible_state; + unsigned int cache_attr; + kern_return_t kr, pe_result; + boolean_t previously_pmapped = m->pmapped; + boolean_t must_disconnect = 0; + boolean_t map_is_switched, map_is_switch_protected; + vm_object_lock_assert_held(m->object); +#if DEBUG + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); +#endif /* DEBUG */ + + if (m->phys_page == vm_page_guard_addr) { + assert(m->fictitious); + return KERN_SUCCESS; + } - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START, - vaddr, - 0, - 0, - 0, - 0); + cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; - 
cur_thread = current_thread(); + if (m->pmapped == FALSE) { + /* + * This is the first time this page is being + * mapped in an address space (pmapped == FALSE). + * + * Part of that page may still be in the data cache + * and not flushed to memory. In case we end up + * accessing that page via the instruction cache, + * we need to ensure that the 2 caches are in sync. + */ + pmap_sync_page_data_phys(m->phys_page); + + if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) { + /* + * found it in the cache, but this + * is the first fault-in of the page (m->pmapped == FALSE) + * so it must have come in as part of + * a cluster... account 1 pagein against it + */ + VM_STAT_INCR(pageins); + DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL); - interruptible_state = cur_thread->interruptible; - if (interruptible == THREAD_UNINT) - cur_thread->interruptible = FALSE; + if (m->object->internal) { + DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); + *type_of_fault = DBG_PAGEIND_FAULT; + } else { + DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); + *type_of_fault = DBG_PAGEINV_FAULT; + } - /* - * assume we will hit a page in the cache - * otherwise, explicitly override with - * the real fault type once we determine it - */ - type_of_fault = DBG_CACHE_HIT_FAULT; + current_task()->pageins++; + } + VM_PAGE_CONSUME_CLUSTERED(m); - VM_STAT(faults++); - current_task()->faults++; + } else if (cache_attr != VM_WIMG_DEFAULT) + pmap_sync_page_attributes_phys(m->phys_page); - /* - * drop funnel if it is already held. 
Then restore while returning - */ - if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) { - funnel_set = TRUE; - curflock = cur_thread->funnel_lock; - thread_funnel_set( curflock , FALSE); + if (*type_of_fault != DBG_COW_FAULT) { + DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL); + + if (pmap == kernel_pmap) { + DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL); + } } - - RetryFault: ; - /* - * Find the backing store object and offset into - * it to begin the search. - */ - map = original_map; - vm_map_lock_read(map); - kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version, - &object, &offset, - &prot, &wired, - &behavior, &lo_offset, &hi_offset, &pmap_map); + /* Validate code signature if necessary. */ + if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) { + vm_object_lock_assert_exclusive(m->object); - pmap = pmap_map->pmap; + if (m->cs_validated) { + vm_cs_revalidates++; + } - if (kr != KERN_SUCCESS) { - vm_map_unlock_read(map); - goto done; + /* VM map is locked, so 1 ref will remain on VM object - + * so no harm if vm_page_validate_cs drops the object lock */ + vm_page_validate_cs(m); } - /* - * If the page is wired, we must fault for the current protection - * value, to avoid further faults. +#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/) + + map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) && + (pmap == vm_map_pmap(current_thread()->map))); + map_is_switch_protected = current_thread()->map->switch_protect; + + /* If the map is switched, and is switch-protected, we must protect + * some pages from being write-faulted: immutable pages because by + * definition they may not be written, and executable pages because that + * would provide a way to inject unsigned code. + * If the page is immutable, we can simply return. However, we can't + * immediately determine whether a page is executable anywhere. But, + * we can disconnect it everywhere and remove the executable protection + * from the current map. 
We do that below right before we do the + * PMAP_ENTER. */ + if(!cs_enforcement_disable && map_is_switched && + map_is_switch_protected && page_immutable(m, prot) && + (prot & VM_PROT_WRITE)) + { + return KERN_CODESIGN_ERROR; + } - if (wired) - fault_type = prot | VM_PROT_WRITE; + /* A page could be tainted, or pose a risk of being tainted later. + * Check whether the receiving process wants it, and make it feel + * the consequences (that happens in cs_invalid_page()). + * For CS Enforcement, two other conditions will + * cause that page to be tainted as well: + * - pmapping an unsigned page executable - this means unsigned code; + * - writeable mapping of a validated page - the content of that page + * can be changed without the kernel noticing, therefore unsigned + * code can be created + */ + if (m->cs_tainted || + ( !cs_enforcement_disable && + (/* The page is unsigned and wants to be executable */ + (!m->cs_validated && (prot & VM_PROT_EXECUTE)) || + /* The page should be immutable, but is in danger of being modified + * This is the case where we want policy from the code directory - + * is the page immutable or not? For now we have to assume that + * code pages will be immutable, data pages not. + * We'll assume a page is a code page if it has a code directory + * and we fault for execution. + * That is good enough since if we faulted the code page for + * writing in another map before, it is wpmapped; if we fault + * it for writing in this map later it will also be faulted for executing + * at the same time; and if we fault for writing in another map + * later, we will disconnect it from this pmap so we'll notice + * the change. + */ + (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped)) + )) + ) + { + /* We will have a tainted page. Have to handle the special case + * of a switched map now. If the map is not switched, standard + * procedure applies - call cs_invalid_page(). + * If the map is switched, the real owner is invalid already. 
+ * There is no point in invalidating the switching process since + * it will not be executing from the map. So we don't call + * cs_invalid_page() in that case. */ + boolean_t reject_page; + if(map_is_switched) { + assert(pmap==vm_map_pmap(current_thread()->map)); + assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE)); + reject_page = FALSE; + } else { + reject_page = cs_invalid_page((addr64_t) vaddr); + } + + if (reject_page) { + /* reject the tainted page: abort the page fault */ + kr = KERN_CODESIGN_ERROR; + cs_enter_tainted_rejected++; + } else { + /* proceed with the tainted page */ + kr = KERN_SUCCESS; + /* Page might have been tainted before or not; now it + * definitively is. If the page wasn't tainted, we must + * disconnect it from all pmaps later. */ + must_disconnect = !m->cs_tainted; + m->cs_tainted = TRUE; + cs_enter_tainted_accepted++; + } + if (cs_debug || kr != KERN_SUCCESS) { + printf("CODESIGNING: vm_fault_enter(0x%llx): " + "page %p obj %p off 0x%llx *** INVALID PAGE ***\n", + (long long)vaddr, m, m->object, m->offset); + } + + } else { + /* proceed with the valid page */ + kr = KERN_SUCCESS; + } -#if VM_FAULT_CLASSIFY - /* - * Temporary data gathering code + /* If we have a KERN_SUCCESS from the previous checks, we either have + * a good page, or a tainted page that has been accepted by the process. + * In both cases the page will be entered into the pmap. + * If the page is writeable, we need to disconnect it from other pmaps + * now so those processes can take note. + */ + if (kr == KERN_SUCCESS) { + /* + * NOTE: we may only hold the vm_object lock SHARED + * at this point, but the update of pmapped is ok + * since this is the ONLY bit updated behind the SHARED + * lock... however, we need to figure out how to do an atomic + * update on a bit field to make this less fragile... 
right + * now I don't know how to coerce 'C' to give me the offset info + * that's needed for an AtomicCompareAndSwap + */ + m->pmapped = TRUE; + if (prot & VM_PROT_WRITE) { + vm_object_lock_assert_exclusive(m->object); + m->wpmapped = TRUE; + if(must_disconnect) { + /* We can only get here + * because of the CSE logic */ + assert(cs_enforcement_disable == FALSE); + pmap_disconnect(m->phys_page); + /* If we are faulting for a write, we can clear + * the execute bit - that will ensure the page is + * checked again before being executable, which + * protects against a map switch. + * This only happens the first time the page + * gets tainted, so we won't get stuck here + * to make an already writeable page executable. */ + prot &= ~VM_PROT_EXECUTE; + } + } + + /* Prevent a deadlock by not + * holding the object lock if we need to wait for a page in + * pmap_enter() - */ + PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, cache_attr, + wired, PMAP_OPTIONS_NOWAIT, pe_result); + + if(pe_result == KERN_RESOURCE_SHORTAGE) { + /* The nonblocking version of pmap_enter did not succeed. + * Use the blocking version instead. Requires marking + * the page busy and unlocking the object */ + boolean_t was_busy = m->busy; + m->busy = TRUE; + vm_object_unlock(m->object); + + PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired); + + /* Take the object lock again. */ + vm_object_lock(m->object); + + /* If the page was busy, someone else will wake it up. + * Otherwise, we have to do it now. */ + assert(m->busy); + if(!was_busy) { + PAGE_WAKEUP_DONE(m); + } + vm_pmap_enter_blocked++; + } + } + + /* + * Hold queues lock to manipulate + * the page queues. Change wiring + * case is obvious. 
+ */ + if (change_wiring) { + vm_page_lockspin_queues(); + + if (wired) { + if (kr == KERN_SUCCESS) { + vm_page_wire(m); + } + } else { + vm_page_unwire(m, TRUE); + } + vm_page_unlock_queues(); + + } else { + if (kr != KERN_SUCCESS) { + vm_page_lockspin_queues(); + vm_page_deactivate(m); + vm_page_unlock_queues(); + } else { + if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) { + + if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) { + struct vpl *lq; + uint32_t lid; + + /* + * we got a local queue to stuff this new page on... + * it's safe to manipulate local and local_id at this point + * since we're behind an exclusive object lock and the + * page is not on any global queue. + * + * we'll use the current cpu number to select the queue + * note that we don't need to disable preemption... we're + * going to be behind the local queue's lock to do the real + * work + */ + lid = cpu_number(); + + lq = &vm_page_local_q[lid].vpl_un.vpl; + + VPL_LOCK(&lq->vpl_lock); + + queue_enter(&lq->vpl_queue, m, vm_page_t, pageq); + m->local = TRUE; + m->local_id = lid; + lq->vpl_count++; + + VPL_UNLOCK(&lq->vpl_lock); + + if (lq->vpl_count > vm_page_local_q_soft_limit) { + /* + * we're beyond the soft limit for the local queue + * vm_page_reactivate_local will 'try' to take + * the global page queue lock... if it can't that's + * ok... we'll let the queue continue to grow up + * to the hard limit... at that point we'll wait + * for the lock... 
once we've got the lock, we'll + * transfer all of the pages from the local queue + * to the global active queue + */ + vm_page_reactivate_local(lid, FALSE, FALSE); + } + return kr; + } + + vm_page_lockspin_queues(); + /* + * test again now that we hold the page queue lock + */ + if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m)) { + + /* + * If this is a no_cache mapping and the page has never been + * mapped before or was previously a no_cache page, then we + * want to leave pages in the speculative state so that they + * can be readily recycled if free memory runs low. Otherwise + * the page is activated as normal. + */ + + if (no_cache && (!previously_pmapped || m->no_cache)) { + m->no_cache = TRUE; + + if (m->active || m->inactive) + VM_PAGE_QUEUES_REMOVE(m); + + if (!m->speculative) + vm_page_speculate(m, TRUE); + + } else if (!m->active && !m->inactive) + vm_page_activate(m); + + } + + vm_page_unlock_queues(); + } + } + } + return kr; +} + + +/* + * Routine: vm_fault + * Purpose: + * Handle page faults, including pseudo-faults + * used to change the wiring status of pages. + * Returns: + * Explicit continuations have been removed. + * Implementation: + * vm_fault and vm_fault_page save mucho state + * in the moral equivalent of a closure. The state + * structure is allocated when first entering vm_fault + * and deallocated when leaving vm_fault. + */ + +extern int _map_enter_debug; + +unsigned long vm_fault_collapse_total = 0; +unsigned long vm_fault_collapse_skipped = 0; + +kern_return_t +vm_fault( + vm_map_t map, + vm_map_offset_t vaddr, + vm_prot_t fault_type, + boolean_t change_wiring, + int interruptible, + pmap_t caller_pmap, + vm_map_offset_t caller_pmap_addr) +{ + vm_map_version_t version; /* Map version for verificiation */ + boolean_t wired; /* Should mapping be wired down? 
*/ + vm_object_t object; /* Top-level object */ + vm_object_offset_t offset; /* Top-level offset */ + vm_prot_t prot; /* Protection for mapping */ + vm_object_t old_copy_object; /* Saved copy object */ + vm_page_t result_page; /* Result of vm_fault_page */ + vm_page_t top_page; /* Placeholder page */ + kern_return_t kr; + + vm_page_t m; /* Fast access to result_page */ + kern_return_t error_code; + vm_object_t cur_object; + vm_object_offset_t cur_offset; + vm_page_t cur_m; + vm_object_t new_object; + int type_of_fault; + pmap_t pmap; + boolean_t interruptible_state; + vm_map_t real_map = map; + vm_map_t original_map = map; + vm_prot_t original_fault_type; + struct vm_object_fault_info fault_info; + boolean_t need_collapse = FALSE; + int object_lock_type = 0; + int cur_object_lock_type; + vm_object_t top_object = VM_OBJECT_NULL; + + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, + (int)((uint64_t)vaddr >> 32), + (int)vaddr, + 0, + 0, + 0); + + if (get_preemption_level() != 0) { + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, + (int)((uint64_t)vaddr >> 32), + (int)vaddr, + KERN_FAILURE, + 0, + 0); + + return (KERN_FAILURE); + } + + interruptible_state = thread_interrupt_level(interruptible); + + VM_STAT_INCR(faults); + current_task()->faults++; + original_fault_type = fault_type; + + if (fault_type & VM_PROT_WRITE) + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + else + object_lock_type = OBJECT_LOCK_SHARED; + + cur_object_lock_type = OBJECT_LOCK_SHARED; + +RetryFault: + /* + * assume we will hit a page in the cache + * otherwise, explicitly override with + * the real fault type once we determine it + */ + type_of_fault = DBG_CACHE_HIT_FAULT; + + /* + * Find the backing store object and offset into + * it to begin the search. 
+ */ + fault_type = original_fault_type; + map = original_map; + vm_map_lock_read(map); + + kr = vm_map_lookup_locked(&map, vaddr, fault_type, + object_lock_type, &version, + &object, &offset, &prot, &wired, + &fault_info, + &real_map); + + if (kr != KERN_SUCCESS) { + vm_map_unlock_read(map); + goto done; + } + pmap = real_map->pmap; + fault_info.interruptible = interruptible; + fault_info.stealth = FALSE; + fault_info.mark_zf_absent = FALSE; + + /* + * If the page is wired, we must fault for the current protection + * value, to avoid further faults. + */ + if (wired) { + fault_type = prot | VM_PROT_WRITE; + /* + * since we're treating this fault as a 'write' + * we must hold the top object lock exclusively + */ + if (object_lock_type == OBJECT_LOCK_SHARED) { + + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade, so explictly + * take the lock exclusively + */ + vm_object_lock(object); + } + } + } + +#if VM_FAULT_CLASSIFY + /* + * Temporary data gathering code */ vm_fault_classify(object, offset, fault_type); #endif @@ -2124,31 +2724,237 @@ vm_fault( /* - * If this page is to be inserted in a copy delay object - * for writing, and if the object has a copy, then the - * copy delay strategy is implemented in the slow fault page. + * If this page is to be inserted in a copy delay object + * for writing, and if the object has a copy, then the + * copy delay strategy is implemented in the slow fault page. 
*/ - if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY || - object->copy == VM_OBJECT_NULL || - (fault_type & VM_PROT_WRITE) == 0) { + if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && + object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) + goto handle_copy_delay; + cur_object = object; cur_offset = offset; while (TRUE) { + if (!cur_object->pager_created && + cur_object->phys_contiguous) /* superpage */ + break; + + if (cur_object->blocked_access) { + /* + * Access to this VM object has been blocked. + * Let the slow path handle it. + */ + break; + } + m = vm_page_lookup(cur_object, cur_offset); + if (m != VM_PAGE_NULL) { - if (m->busy) - break; + if (m->busy) { + wait_result_t result; - if (m->unusual && (m->error || m->restart || m->private - || m->absent || (fault_type & m->page_lock))) { + /* + * in order to do the PAGE_ASSERT_WAIT, we must + * have object that 'm' belongs to locked exclusively + */ + if (object != cur_object) { + vm_object_unlock(object); + + if (cur_object_lock_type == OBJECT_LOCK_SHARED) { + + cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(cur_object) == FALSE) { + /* + * couldn't upgrade so go do a full retry + * immediately since we've already dropped + * the top object lock associated with this page + * and the current one got dropped due to the + * failed upgrade... 
the state is no longer valid + */ + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + goto RetryFault; + } + } + } else if (object_lock_type == OBJECT_LOCK_SHARED) { + + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade, so explictly take the lock + * exclusively and go relookup the page since we + * will have dropped the object lock and + * a different thread could have inserted + * a page at this offset + * no need for a full retry since we're + * at the top level of the object chain + */ + vm_object_lock(object); + + continue; + } + } + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); - /* - * Unusual case. Give up. + result = PAGE_ASSERT_WAIT(m, interruptible); + + vm_object_unlock(cur_object); + + if (result == THREAD_WAITING) { + result = thread_block(THREAD_CONTINUE_NULL); + + counter(c_vm_fault_page_block_busy_kernel++); + } + if (result == THREAD_AWAKENED || result == THREAD_RESTART) + goto RetryFault; + + kr = KERN_ABORTED; + goto done; + } + if (m->phys_page == vm_page_guard_addr) { + /* + * Guard page: let the slow path deal with it */ break; } + if (m->unusual && (m->error || m->restart || m->private || m->absent)) { + /* + * Unusual case... let the slow path deal with it + */ + break; + } + if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) { + if (object != cur_object) + vm_object_unlock(object); + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + vm_object_unlock(cur_object); + kr = KERN_MEMORY_ERROR; + goto done; + } + + if (m->encrypted) { + /* + * ENCRYPTED SWAP: + * We've soft-faulted (because it's not in the page + * table) on an encrypted page. + * Keep the page "busy" so that no one messes with + * it during the decryption. + * Release the extra locks we're holding, keep only + * the page's VM object lock. 
+ * + * in order to set 'busy' on 'm', we must + * have object that 'm' belongs to locked exclusively + */ + if (object != cur_object) { + vm_object_unlock(object); + + if (cur_object_lock_type == OBJECT_LOCK_SHARED) { + + cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(cur_object) == FALSE) { + /* + * couldn't upgrade so go do a full retry + * immediately since we've already dropped + * the top object lock associated with this page + * and the current one got dropped due to the + * failed upgrade... the state is no longer valid + */ + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + goto RetryFault; + } + } + } else if (object_lock_type == OBJECT_LOCK_SHARED) { + + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade, so explictly take the lock + * exclusively and go relookup the page since we + * will have dropped the object lock and + * a different thread could have inserted + * a page at this offset + * no need for a full retry since we're + * at the top level of the object chain + */ + vm_object_lock(object); + + continue; + } + } + m->busy = TRUE; + + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + vm_page_decrypt(m, 0); + + assert(m->busy); + PAGE_WAKEUP_DONE(m); + + vm_object_unlock(cur_object); + /* + * Retry from the top, in case anything + * changed while we were decrypting... + */ + goto RetryFault; + } + ASSERT_PAGE_DECRYPTED(m); + + if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) { + /* + * We might need to validate this page + * against its code signature, so we + * want to hold the VM object exclusively. 
+ */ + if (object != cur_object) { + if (cur_object_lock_type == OBJECT_LOCK_SHARED) { + vm_object_unlock(object); + vm_object_unlock(cur_object); + + cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + goto RetryFault; + } + + } else if (object_lock_type == OBJECT_LOCK_SHARED) { + + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade, so explictly take the lock + * exclusively and go relookup the page since we + * will have dropped the object lock and + * a different thread could have inserted + * a page at this offset + * no need for a full retry since we're + * at the top level of the object chain + */ + vm_object_lock(object); + + continue; + } + } + } /* * Two cases of map in faults: * - At top level w/o copy object. @@ -2156,713 +2962,828 @@ vm_fault( * --> must disallow write. */ - if (object == cur_object && - object->copy == VM_OBJECT_NULL) - goto FastMapInFault; + if (object == cur_object && object->copy == VM_OBJECT_NULL) { + if ((fault_type & VM_PROT_WRITE) == 0) { + /* + * This is not a "write" fault, so we + * might not have taken the object lock + * exclusively and we might not be able + * to update the "wpmapped" bit in + * vm_fault_enter(). + * Let's just grant read access to + * the page for now and we'll + * soft-fault again if we need write + * access later... + */ + prot &= ~VM_PROT_WRITE; + } + goto FastPmapEnter; + } if ((fault_type & VM_PROT_WRITE) == 0) { prot &= ~VM_PROT_WRITE; - /* - * Set up to map the page ... - * mark the page busy, drop - * locks and take a paging reference - * on the object with the page. - */ - if (object != cur_object) { - vm_object_unlock(object); + /* + * We still need to hold the top object + * lock here to prevent a race between + * a read fault (taking only "shared" + * locks) and a write fault (taking + * an "exclusive" lock on the top + * object. 
+ * Otherwise, as soon as we release the + * top lock, the write fault could + * proceed and actually complete before + * the read fault, and the copied page's + * translation could then be overwritten + * by the read fault's translation for + * the original page. + * + * Let's just record what the top object + * is and we'll release it later. + */ + top_object = object; + + /* + * switch to the object that has the new page + */ object = cur_object; + object_lock_type = cur_object_lock_type; } -FastMapInFault: - m->busy = TRUE; - - vm_object_paging_begin(object); - vm_object_unlock(object); - FastPmapEnter: /* - * Check a couple of global reasons to - * be conservative about write access. - * Then do the pmap_enter. + * prepare for the pmap_enter... + * object and map are both locked + * m contains valid data + * object == m->object + * cur_object == NULL or it's been unlocked + * no paging references on either object or cur_object */ -#if !VM_FAULT_STATIC_CONFIG - if (vm_fault_dirty_handling -#if MACH_KDB - || db_watchpoint_list -#endif - && (fault_type & VM_PROT_WRITE) == 0) - prot &= ~VM_PROT_WRITE; -#else /* STATIC_CONFIG */ #if MACH_KDB - if (db_watchpoint_list - && (fault_type & VM_PROT_WRITE) == 0) + if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0) prot &= ~VM_PROT_WRITE; -#endif /* MACH_KDB */ -#endif /* STATIC_CONFIG */ - if (m->no_isync == TRUE) - pmap_sync_caches_phys(m->phys_addr); - - PMAP_ENTER(pmap, vaddr, m, prot, wired); - { - tws_hash_line_t line; - task_t task; - - task = current_task(); - if((map != NULL) && - (task->dynamic_working_set != 0)) { - if(tws_lookup - ((tws_hash_t) - task->dynamic_working_set, - cur_offset, object, - &line) != KERN_SUCCESS) { - if(tws_insert((tws_hash_t) - task->dynamic_working_set, - m->offset, m->object, - vaddr, pmap_map) - == KERN_NO_SPACE) { - tws_expand_working_set( - task->dynamic_working_set, - TWS_HASH_LINE_COUNT); - } - } - } +#endif + if (caller_pmap) { + kr = vm_fault_enter(m, + caller_pmap, + 
caller_pmap_addr, + prot, + wired, + change_wiring, + fault_info.no_cache, + &type_of_fault); + } else { + kr = vm_fault_enter(m, + pmap, + vaddr, + prot, + wired, + change_wiring, + fault_info.no_cache, + &type_of_fault); } - /* - * Grab the object lock to manipulate - * the page queues. Change wiring - * case is obvious. In soft ref bits - * case activate page only if it fell - * off paging queues, otherwise just - * activate it if it's inactive. - * - * NOTE: original vm_fault code will - * move active page to back of active - * queue. This code doesn't. - */ - vm_object_lock(object); - vm_page_lock_queues(); - if (m->clustered) { - vm_pagein_cluster_used++; - m->clustered = FALSE; + if (top_object != VM_OBJECT_NULL) { + /* + * It's safe to drop the top object + * now that we've done our + * vm_fault_enter(). Any other fault + * in progress for that virtual + * address will either find our page + * and translation or put in a new page + * and translation. + */ + vm_object_unlock(top_object); + top_object = VM_OBJECT_NULL; } - /* - * we did the isync above (if needed)... 
we're clearing - * the flag here to avoid holding a lock - * while calling pmap functions, however - * we need hold the object lock before - * we can modify the flag - */ - m->no_isync = FALSE; - m->reference = TRUE; - if (change_wiring) { - if (wired) - vm_page_wire(m); - else - vm_page_unwire(m); - } -#if VM_FAULT_STATIC_CONFIG - else { - if (!m->active && !m->inactive) - vm_page_activate(m); - } -#else - else if (software_reference_bits) { - if (!m->active && !m->inactive) - vm_page_activate(m); - } - else if (!m->active) { - vm_page_activate(m); - } -#endif - vm_page_unlock_queues(); + if (need_collapse == TRUE) + vm_object_collapse(object, offset, TRUE); + if (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT) { + /* + * evaluate access pattern and update state + * vm_fault_deactivate_behind depends on the + * state being up to date + */ + vm_fault_is_sequential(object, cur_offset, fault_info.behavior); + + vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior); + } /* - * That's it, clean up and return. + * That's it, clean up and return. */ - PAGE_WAKEUP_DONE(m); - vm_object_paging_end(object); + if (m->busy) + PAGE_WAKEUP_DONE(m); + vm_object_unlock(object); - vm_map_unlock_read(map); - if(pmap_map != map) - vm_map_unlock(pmap_map); - if (funnel_set) { - thread_funnel_set( curflock, TRUE); - funnel_set = FALSE; - } - cur_thread->interruptible = interruptible_state; + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END, - vaddr, - type_of_fault, - KERN_SUCCESS, - 0, - 0); - return KERN_SUCCESS; + goto done; } - /* - * Copy on write fault. If objects match, then - * object->copy must not be NULL (else control - * would be in previous code block), and we - * have a potential push into the copy object - * with which we won't cope here. 
+ * COPY ON WRITE FAULT */ + assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); - if (cur_object == object) + if (vm_page_throttled()) { + /* + * drop all of our locks... + * wait until the free queue is + * pumped back up and then + * redrive the fault + */ + if (object != cur_object) + vm_object_unlock(cur_object); + vm_object_unlock(object); + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + delay(HARD_THROTTLE_DELAY); + + if (!current_thread_aborted() && vm_page_wait((change_wiring) ? + THREAD_UNINT : + THREAD_ABORTSAFE)) + goto RetryFault; + kr = KERN_ABORTED; + goto done; + } + /* + * If objects match, then + * object->copy must not be NULL (else control + * would be in previous code block), and we + * have a potential push into the copy object + * with which we can't cope with here. + */ + if (cur_object == object) { + /* + * must take the slow path to + * deal with the copy push + */ break; - + } /* - * This is now a shadow based copy on write - * fault -- it requires a copy up the shadow - * chain. + * This is now a shadow based copy on write + * fault -- it requires a copy up the shadow + * chain. + * + * Allocate a page in the original top level + * object. Give up if allocate fails. Also + * need to remember current page, as it's the + * source of the copy. * - * Allocate a page in the original top level - * object. Give up if allocate fails. Also - * need to remember current page, as it's the - * source of the copy. + * at this point we hold locks on both + * object and cur_object... no need to take + * paging refs or mark pages BUSY since + * we don't drop either object lock until + * the page has been copied and inserted */ cur_m = m; m = vm_page_grab(); + if (m == VM_PAGE_NULL) { + /* + * no free page currently available... + * must take the slow path + */ break; } - /* - * Now do the copy. Mark the source busy - * and take out paging references on both - * objects. 
+ * Now do the copy. Mark the source page busy... * * NOTE: This code holds the map lock across * the page copy. */ - - cur_m->busy = TRUE; vm_page_copy(cur_m, m); vm_page_insert(m, object, offset); - - vm_object_paging_begin(cur_object); - vm_object_paging_begin(object); - - type_of_fault = DBG_COW_FAULT; - VM_STAT(cow_faults++); - current_task()->cow_faults++; + m->dirty = TRUE; /* - * Now cope with the source page and object - * If the top object has a ref count of 1 - * then no other map can access it, and hence - * it's not necessary to do the pmap_page_protect. + * Now cope with the source page and object */ - - - vm_page_lock_queues(); - vm_page_deactivate(cur_m); - m->dirty = TRUE; - pmap_page_protect(cur_m->phys_addr, - VM_PROT_NONE); - vm_page_unlock_queues(); - - PAGE_WAKEUP_DONE(cur_m); - vm_object_paging_end(cur_object); + if (object->ref_count > 1 && cur_m->pmapped) + pmap_disconnect(cur_m->phys_page); + + need_collapse = TRUE; + + if (!cur_object->internal && + cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) { + /* + * The object from which we've just + * copied a page is most probably backed + * by a vnode. We don't want to waste too + * much time trying to collapse the VM objects + * and create a bottleneck when several tasks + * map the same file. + */ + if (cur_object->copy == object) { + /* + * Shared mapping or no COW yet. + * We can never collapse a copy + * object into its backing object. + */ + need_collapse = FALSE; + } else if (cur_object->copy == object->shadow && + object->shadow->resident_page_count == 0) { + /* + * Shared mapping after a COW occurred. + */ + need_collapse = FALSE; + } + } vm_object_unlock(cur_object); - /* - * Slight hack to call vm_object collapse - * and then reuse common map in code. - * note that the object lock was taken above. 
- */ - - vm_object_paging_end(object); - vm_object_collapse(object); - vm_object_paging_begin(object); - vm_object_unlock(object); + if (need_collapse == FALSE) + vm_fault_collapse_skipped++; + vm_fault_collapse_total++; + + type_of_fault = DBG_COW_FAULT; + VM_STAT_INCR(cow_faults); + DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); + current_task()->cow_faults++; goto FastPmapEnter; - } - else { + } else { /* - * No page at cur_object, cur_offset + * No page at cur_object, cur_offset... m == NULL */ - if (cur_object->pager_created) { - + if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) { + /* + * May have to talk to a pager... + * take the slow path. + */ + break; + } /* - * Have to talk to the pager. Give up. + * existence map present and indicates + * that the pager doesn't have this page */ - - break; } - - if (cur_object->shadow == VM_OBJECT_NULL) { - - if (cur_object->shadow_severed) { - vm_object_paging_end(object); + /* + * Zero fill fault. Page gets + * inserted into the original object. + */ + if (cur_object->shadow_severed || + VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) + { + if (object != cur_object) + vm_object_unlock(cur_object); vm_object_unlock(object); - vm_map_unlock_read(map); - if(pmap_map != map) - vm_map_unlock(pmap_map); - if (funnel_set) { - thread_funnel_set( curflock, TRUE); - funnel_set = FALSE; - } - cur_thread->interruptible = interruptible_state; + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); - return VM_FAULT_MEMORY_ERROR; + kr = KERN_MEMORY_ERROR; + goto done; } + if (vm_page_throttled()) { + /* + * drop all of our locks... 
+ * wait until the free queue is + * pumped back up and then + * redrive the fault + */ + if (object != cur_object) + vm_object_unlock(cur_object); + vm_object_unlock(object); + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + delay(HARD_THROTTLE_DELAY); + + if (!current_thread_aborted() && vm_page_wait((change_wiring) ? + THREAD_UNINT : + THREAD_ABORTSAFE)) + goto RetryFault; + kr = KERN_ABORTED; + goto done; + } + if (vm_backing_store_low) { + /* + * we are protecting the system from + * backing store exhaustion... + * must take the slow path if we're + * not privileged + */ + if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) + break; + } + if (cur_object != object) { + vm_object_unlock(cur_object); - /* - * Zero fill fault. Page gets - * filled in top object. Insert - * page, then drop any lower lock. - * Give up if no page. - */ - if ((vm_page_free_target - - ((vm_page_free_target-vm_page_free_min)>>2)) - > vm_page_free_count) { - break; + cur_object = object; + } + if (object_lock_type == OBJECT_LOCK_SHARED) { + + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade so do a full retry on the fault + * since we dropped the object lock which + * could allow another thread to insert + * a page at this offset + */ + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + goto RetryFault; + } } m = vm_page_alloc(object, offset); + if (m == VM_PAGE_NULL) { + /* + * no free page currently available... + * must take the slow path + */ break; } - /* - * This is a zero-fill or initial fill - * page fault. As such, we consider it - * undefined with respect to instruction - * execution. i.e. it is the responsibility - * of higher layers to call for an instruction - * sync after changing the contents and before - * sending a program into this area. 
We - * choose this approach for performance - */ - - m->no_isync = FALSE; - - if (cur_object != object) - vm_object_unlock(cur_object); - - vm_object_paging_begin(object); - vm_object_unlock(object); /* - * Now zero fill page and map it. - * the page is probably going to - * be written soon, so don't bother - * to clear the modified bit + * Now zero fill page... + * the page is probably going to + * be written soon, so don't bother + * to clear the modified bit * - * NOTE: This code holds the map - * lock across the zero fill. + * NOTE: This code holds the map + * lock across the zero fill. */ + type_of_fault = vm_fault_zero_page(m, map->no_zero_fill); - if (!map->no_zero_fill) { - vm_page_zero_fill(m); - type_of_fault = DBG_ZERO_FILL_FAULT; - VM_STAT(zero_fill_count++); - } - vm_page_lock_queues(); - VM_PAGE_QUEUES_REMOVE(m); - - m->page_ticket = vm_page_ticket; - vm_page_ticket_roll++; - if(vm_page_ticket_roll == - VM_PAGE_TICKETS_IN_ROLL) { - vm_page_ticket_roll = 0; - if(vm_page_ticket == - VM_PAGE_TICKET_ROLL_IDS) - vm_page_ticket= 0; - else - vm_page_ticket++; - } - - queue_enter(&vm_page_queue_inactive, - m, vm_page_t, pageq); - m->inactive = TRUE; - vm_page_inactive_count++; - vm_page_unlock_queues(); goto FastPmapEnter; } - /* - * On to the next level + * On to the next level in the shadow chain */ - cur_offset += cur_object->shadow_offset; new_object = cur_object->shadow; - vm_object_lock(new_object); + + /* + * take the new_object's lock with the indicated state + */ + if (cur_object_lock_type == OBJECT_LOCK_SHARED) + vm_object_lock_shared(new_object); + else + vm_object_lock(new_object); + if (cur_object != object) vm_object_unlock(cur_object); + cur_object = new_object; continue; } } - /* - * Cleanup from fast fault failure. Drop any object - * lock other than original and drop map lock. + * Cleanup from fast fault failure. Drop any object + * lock other than original and drop map lock. 
*/ - if (object != cur_object) vm_object_unlock(cur_object); + + /* + * must own the object lock exclusively at this point + */ + if (object_lock_type == OBJECT_LOCK_SHARED) { + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade, so explictly + * take the lock exclusively + * no need to retry the fault at this + * point since "vm_fault_page" will + * completely re-evaluate the state + */ + vm_object_lock(object); + } } + +handle_copy_delay: vm_map_unlock_read(map); - if(pmap_map != map) - vm_map_unlock(pmap_map); + if (real_map != map) + vm_map_unlock(real_map); /* - * Make a reference to this object to - * prevent its disposal while we are messing with - * it. Once we have the reference, the map is free - * to be diddled. Since objects reference their - * shadows (and copies), they will stay around as well. + * Make a reference to this object to + * prevent its disposal while we are messing with + * it. Once we have the reference, the map is free + * to be diddled. Since objects reference their + * shadows (and copies), they will stay around as well. */ - - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); + vm_object_reference_locked(object); vm_object_paging_begin(object); XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0); + + error_code = 0; + kr = vm_fault_page(object, offset, fault_type, (change_wiring && !wired), - interruptible, - lo_offset, hi_offset, behavior, &prot, &result_page, &top_page, &type_of_fault, - &error_code, map->no_zero_fill, FALSE, map, vaddr); + &error_code, map->no_zero_fill, + FALSE, &fault_info); /* - * If we didn't succeed, lose the object reference immediately. + * if kr != VM_FAULT_SUCCESS, then the paging reference + * has been dropped and the object unlocked... 
the ref_count + * is still held + * + * if kr == VM_FAULT_SUCCESS, then the paging reference + * is still held along with the ref_count on the original object + * + * the object is returned locked with a paging reference + * + * if top_page != NULL, then it's BUSY and the + * object it belongs to has a paging reference + * but is returned unlocked */ - - if (kr != VM_FAULT_SUCCESS) + if (kr != VM_FAULT_SUCCESS && + kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { + /* + * we didn't succeed, lose the object reference immediately. + */ vm_object_deallocate(object); - /* - * See why we failed, and take corrective action. - */ - - switch (kr) { - case VM_FAULT_SUCCESS: - break; + /* + * See why we failed, and take corrective action. + */ + switch (kr) { case VM_FAULT_MEMORY_SHORTAGE: if (vm_page_wait((change_wiring) ? THREAD_UNINT : THREAD_ABORTSAFE)) goto RetryFault; - /* fall thru */ + /* + * fall thru + */ case VM_FAULT_INTERRUPTED: kr = KERN_ABORTED; goto done; case VM_FAULT_RETRY: goto RetryFault; - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - goto RetryFault; case VM_FAULT_MEMORY_ERROR: if (error_code) kr = error_code; else kr = KERN_MEMORY_ERROR; goto done; + default: + panic("vm_fault: unexpected error 0x%x from " + "vm_fault_page()\n", kr); + } } - m = result_page; - if(m != VM_PAGE_NULL) { + if (m != VM_PAGE_NULL) { assert((change_wiring && !wired) ? (top_page == VM_PAGE_NULL) : ((top_page == VM_PAGE_NULL) == (m->object == object))); } /* - * How to clean up the result of vm_fault_page. This - * happens whether the mapping is entered or not. 
- */ - -#define UNLOCK_AND_DEALLOCATE \ - MACRO_BEGIN \ - vm_fault_cleanup(m->object, top_page); \ - vm_object_deallocate(object); \ - MACRO_END - - /* - * What to do with the resulting page from vm_fault_page - * if it doesn't get entered into the physical map: + * What to do with the resulting page from vm_fault_page + * if it doesn't get entered into the physical map: */ - #define RELEASE_PAGE(m) \ MACRO_BEGIN \ PAGE_WAKEUP_DONE(m); \ - vm_page_lock_queues(); \ - if (!m->active && !m->inactive) \ - vm_page_activate(m); \ - vm_page_unlock_queues(); \ + if (!m->active && !m->inactive && !m->throttled) { \ + vm_page_lockspin_queues(); \ + if (!m->active && !m->inactive && !m->throttled) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + } \ MACRO_END /* - * We must verify that the maps have not changed - * since our last lookup. + * We must verify that the maps have not changed + * since our last lookup. */ - - if(m != VM_PAGE_NULL) { + if (m != VM_PAGE_NULL) { old_copy_object = m->object->copy; - vm_object_unlock(m->object); } else { old_copy_object = VM_OBJECT_NULL; + vm_object_unlock(object); } + + /* + * no object locks are held at this point + */ if ((map != original_map) || !vm_map_verify(map, &version)) { vm_object_t retry_object; vm_object_offset_t retry_offset; vm_prot_t retry_prot; /* - * To avoid trying to write_lock the map while another - * thread has it read_locked (in vm_map_pageable), we - * do not try for write permission. If the page is - * still writable, we will get write permission. If it - * is not, or has been marked needs_copy, we enter the - * mapping without write permission, and will merely - * take another fault. + * To avoid trying to write_lock the map while another + * thread has it read_locked (in vm_map_pageable), we + * do not try for write permission. If the page is + * still writable, we will get write permission. 
If it + * is not, or has been marked needs_copy, we enter the + * mapping without write permission, and will merely + * take another fault. */ map = original_map; vm_map_lock_read(map); + kr = vm_map_lookup_locked(&map, vaddr, - fault_type & ~VM_PROT_WRITE, &version, - &retry_object, &retry_offset, &retry_prot, - &wired, &behavior, &lo_offset, &hi_offset, - &pmap_map); - pmap = pmap_map->pmap; + fault_type & ~VM_PROT_WRITE, + OBJECT_LOCK_EXCLUSIVE, &version, + &retry_object, &retry_offset, &retry_prot, + &wired, + &fault_info, + &real_map); + pmap = real_map->pmap; if (kr != KERN_SUCCESS) { vm_map_unlock_read(map); - if(m != VM_PAGE_NULL) { + + if (m != VM_PAGE_NULL) { + /* + * retake the lock so that + * we can drop the paging reference + * in vm_fault_cleanup and do the + * PAGE_WAKEUP_DONE in RELEASE_PAGE + */ vm_object_lock(m->object); + RELEASE_PAGE(m); - UNLOCK_AND_DEALLOCATE; + + vm_fault_cleanup(m->object, top_page); } else { - vm_object_deallocate(object); + /* + * retake the lock so that + * we can drop the paging reference + * in vm_fault_cleanup + */ + vm_object_lock(object); + + vm_fault_cleanup(object, top_page); } + vm_object_deallocate(object); + goto done; } - vm_object_unlock(retry_object); - if(m != VM_PAGE_NULL) { - vm_object_lock(m->object); - } else { - vm_object_lock(object); - } - if ((retry_object != object) || - (retry_offset != offset)) { + if ((retry_object != object) || (retry_offset != offset)) { + vm_map_unlock_read(map); - if(pmap_map != map) - vm_map_unlock(pmap_map); - if(m != VM_PAGE_NULL) { + if (real_map != map) + vm_map_unlock(real_map); + + if (m != VM_PAGE_NULL) { + /* + * retake the lock so that + * we can drop the paging reference + * in vm_fault_cleanup and do the + * PAGE_WAKEUP_DONE in RELEASE_PAGE + */ + vm_object_lock(m->object); + RELEASE_PAGE(m); - UNLOCK_AND_DEALLOCATE; + + vm_fault_cleanup(m->object, top_page); } else { - vm_object_deallocate(object); + /* + * retake the lock so that + * we can drop the paging 
reference + * in vm_fault_cleanup + */ + vm_object_lock(object); + + vm_fault_cleanup(object, top_page); } + vm_object_deallocate(object); + goto RetryFault; } - /* - * Check whether the protection has changed or the object - * has been copied while we left the map unlocked. + * Check whether the protection has changed or the object + * has been copied while we left the map unlocked. */ prot &= retry_prot; - if(m != VM_PAGE_NULL) { - vm_object_unlock(m->object); - } else { - vm_object_unlock(object); - } } - if(m != VM_PAGE_NULL) { + if (m != VM_PAGE_NULL) { vm_object_lock(m->object); - } else { - vm_object_lock(object); - } - /* - * If the copy object changed while the top-level object - * was unlocked, then we must take away write permission. - */ - - if(m != VM_PAGE_NULL) { - if (m->object->copy != old_copy_object) + if (m->object->copy != old_copy_object) { + /* + * The copy object changed while the top-level object + * was unlocked, so take away write permission. + */ prot &= ~VM_PROT_WRITE; - } + } + } else + vm_object_lock(object); /* - * If we want to wire down this page, but no longer have - * adequate permissions, we must start all over. + * If we want to wire down this page, but no longer have + * adequate permissions, we must start all over. */ + if (wired && (fault_type != (prot | VM_PROT_WRITE))) { - if (wired && (fault_type != (prot|VM_PROT_WRITE))) { vm_map_verify_done(map, &version); - if(pmap_map != map) - vm_map_unlock(pmap_map); - if(m != VM_PAGE_NULL) { + if (real_map != map) + vm_map_unlock(real_map); + + if (m != VM_PAGE_NULL) { RELEASE_PAGE(m); - UNLOCK_AND_DEALLOCATE; - } else { - vm_object_deallocate(object); - } + + vm_fault_cleanup(m->object, top_page); + } else + vm_fault_cleanup(object, top_page); + + vm_object_deallocate(object); + goto RetryFault; } - - /* - * Put this page into the physical map. - * We had to do the unlock above because pmap_enter - * may cause other faults. The page may be on - * the pageout queues. 
If the pageout daemon comes - * across the page, it will remove it from the queues. - */ if (m != VM_PAGE_NULL) { - if (m->no_isync == TRUE) { - pmap_sync_caches_phys(m->phys_addr); - - m->no_isync = FALSE; - } - vm_object_unlock(m->object); - - PMAP_ENTER(pmap, vaddr, m, prot, wired); - { - tws_hash_line_t line; - task_t task; - - task = current_task(); - if((map != NULL) && - (task->dynamic_working_set != 0)) { - if(tws_lookup - ((tws_hash_t) - task->dynamic_working_set, - m->offset, m->object, - &line) != KERN_SUCCESS) { - tws_insert((tws_hash_t) - task->dynamic_working_set, - m->offset, m->object, - vaddr, pmap_map); - if(tws_insert((tws_hash_t) - task->dynamic_working_set, - m->offset, m->object, - vaddr, pmap_map) - == KERN_NO_SPACE) { - tws_expand_working_set( - task->dynamic_working_set, - TWS_HASH_LINE_COUNT); - } - } - } + /* + * Put this page into the physical map. + * We had to do the unlock above because pmap_enter + * may cause other faults. The page may be on + * the pageout queues. If the pageout daemon comes + * across the page, it will remove it from the queues. 
+ */ + if (caller_pmap) { + kr = vm_fault_enter(m, + caller_pmap, + caller_pmap_addr, + prot, + wired, + change_wiring, + fault_info.no_cache, + &type_of_fault); + } else { + kr = vm_fault_enter(m, + pmap, + vaddr, + prot, + wired, + change_wiring, + fault_info.no_cache, + &type_of_fault); + } + if (kr != KERN_SUCCESS) { + /* abort this page fault */ + vm_map_verify_done(map, &version); + if (real_map != map) + vm_map_unlock(real_map); + PAGE_WAKEUP_DONE(m); + vm_fault_cleanup(m->object, top_page); + vm_object_deallocate(object); + goto done; } } else { -/* if __ppc__ not working until figure out phys copy on block maps */ -#ifdef notdefcdy - int memattr; - struct phys_entry *pp; + vm_map_entry_t entry; + vm_map_offset_t laddr; + vm_map_offset_t ldelta, hdelta; + /* * do a pmap block mapping from the physical address * in the object */ - if(pp = pmap_find_physentry( - (vm_offset_t)object->shadow_offset)) { - memattr = ((pp->pte1 & 0x00000078) >> 3); - } else { - memattr = PTE_WIMG_UNCACHED_COHERENT_GUARDED; - } - - pmap_map_block(pmap, vaddr, - (vm_offset_t)object->shadow_offset, - object->size, prot, - memattr, 0); /* Set up a block mapped area */ -//#else - vm_offset_t off; - for (off = 0; off < object->size; off += page_size) { - pmap_enter(pmap, vaddr + off, - object->shadow_offset + off, prot, TRUE); - /* Map it in */ - } -#endif - } +#ifdef ppc + /* While we do not worry about execution protection in */ + /* general, certian pages may have instruction execution */ + /* disallowed. We will check here, and if not allowed */ + /* to execute, we return with a protection failure. */ - /* - * If the page is not wired down and isn't already - * on a pageout queue, then put it where the - * pageout daemon can find it. 
- */ - if(m != VM_PAGE_NULL) { - vm_object_lock(m->object); - vm_page_lock_queues(); + if ((fault_type & VM_PROT_EXECUTE) && + (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) { - if (change_wiring) { - if (wired) - vm_page_wire(m); - else - vm_page_unwire(m); + vm_map_verify_done(map, &version); + + if (real_map != map) + vm_map_unlock(real_map); + + vm_fault_cleanup(object, top_page); + vm_object_deallocate(object); + + kr = KERN_PROTECTION_FAILURE; + goto done; } -#if VM_FAULT_STATIC_CONFIG - else { - if (!m->active && !m->inactive) - vm_page_activate(m); - m->reference = TRUE; +#endif /* ppc */ + + if (real_map != map) + vm_map_unlock(real_map); + + if (original_map != map) { + vm_map_unlock_read(map); + vm_map_lock_read(original_map); + map = original_map; } -#else - else if (software_reference_bits) { - if (!m->active && !m->inactive) - vm_page_activate(m); - m->reference = TRUE; - } else { - vm_page_activate(m); + real_map = map; + + laddr = vaddr; + hdelta = 0xFFFFF000; + ldelta = 0xFFFFF000; + + while (vm_map_lookup_entry(map, laddr, &entry)) { + if (ldelta > (laddr - entry->vme_start)) + ldelta = laddr - entry->vme_start; + if (hdelta > (entry->vme_end - laddr)) + hdelta = entry->vme_end - laddr; + if (entry->is_sub_map) { + + laddr = (laddr - entry->vme_start) + + entry->offset; + vm_map_lock_read(entry->object.sub_map); + + if (map != real_map) + vm_map_unlock_read(map); + if (entry->use_pmap) { + vm_map_unlock_read(real_map); + real_map = entry->object.sub_map; + } + map = entry->object.sub_map; + + } else { + break; + } + } + + if (vm_map_lookup_entry(map, laddr, &entry) && + (entry->object.vm_object != NULL) && + (entry->object.vm_object == object)) { + + int superpage = (!object->pager_created && object->phys_contiguous)? 
VM_MEM_SUPERPAGE : 0; + if (caller_pmap) { + /* + * Set up a block mapped area + */ + assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); + pmap_map_block(caller_pmap, + (addr64_t)(caller_pmap_addr - ldelta), + (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) + + entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), + (uint32_t)((ldelta + hdelta) >> 12), prot, + (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); + } else { + /* + * Set up a block mapped area + */ + assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); + pmap_map_block(real_map->pmap, + (addr64_t)(vaddr - ldelta), + (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) + + entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), + (uint32_t)((ldelta + hdelta) >> 12), prot, + (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); + } } -#endif - vm_page_unlock_queues(); } /* - * Unlock everything, and return + * Unlock everything, and return */ - vm_map_verify_done(map, &version); - if(pmap_map != map) - vm_map_unlock(pmap_map); - if(m != VM_PAGE_NULL) { + if (real_map != map) + vm_map_unlock(real_map); + + if (m != VM_PAGE_NULL) { PAGE_WAKEUP_DONE(m); - UNLOCK_AND_DEALLOCATE; - } else { - vm_fault_cleanup(object, top_page); - vm_object_deallocate(object); - } - kr = KERN_SUCCESS; -#undef UNLOCK_AND_DEALLOCATE + vm_fault_cleanup(m->object, top_page); + } else + vm_fault_cleanup(object, top_page); + + vm_object_deallocate(object); + #undef RELEASE_PAGE - done: - if (funnel_set) { - thread_funnel_set( curflock, TRUE); - funnel_set = FALSE; - } - cur_thread->interruptible = interruptible_state; + kr = KERN_SUCCESS; +done: + thread_interrupt_level(interruptible_state); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END, - vaddr, - type_of_fault, + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, + (int)((uint64_t)vaddr >> 32), + (int)vaddr, kr, - 0, + type_of_fault, 
0); - return(kr); + + return (kr); } /* @@ -2874,22 +3795,30 @@ kern_return_t vm_fault_wire( vm_map_t map, vm_map_entry_t entry, - pmap_t pmap) + pmap_t pmap, + vm_map_offset_t pmap_addr) { - register vm_offset_t va; - register vm_offset_t end_addr = entry->vme_end; + register vm_map_offset_t va; + register vm_map_offset_t end_addr = entry->vme_end; register kern_return_t rc; assert(entry->in_transition); + if ((entry->object.vm_object != NULL) && + !entry->is_sub_map && + entry->object.vm_object->phys_contiguous) { + return KERN_SUCCESS; + } + /* * Inform the physical mapping system that the * range of addresses may not fault, so that * page tables and such can be locked down as well. */ - pmap_pageable(pmap, entry->vme_start, end_addr, FALSE); + pmap_pageable(pmap, pmap_addr, + pmap_addr + (end_addr - entry->vme_start), FALSE); /* * We simulate a fault to get the page and enter it @@ -2898,9 +3827,14 @@ vm_fault_wire( for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { if ((rc = vm_fault_wire_fast( - map, va, entry, pmap)) != KERN_SUCCESS) { + map, va, entry, pmap, + pmap_addr + (va - entry->vme_start) + )) != KERN_SUCCESS) { rc = vm_fault(map, va, VM_PROT_NONE, TRUE, - (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE); + (pmap == kernel_pmap) ? 
+ THREAD_UNINT : THREAD_ABORTSAFE, + pmap, pmap_addr + (va - entry->vme_start)); + DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL); } if (rc != KERN_SUCCESS) { @@ -2908,7 +3842,8 @@ vm_fault_wire( /* unwire wired pages */ tmp_entry.vme_end = va; - vm_fault_unwire(map, &tmp_entry, FALSE, pmap); + vm_fault_unwire(map, + &tmp_entry, FALSE, pmap, pmap_addr); return rc; } @@ -2926,25 +3861,49 @@ vm_fault_unwire( vm_map_t map, vm_map_entry_t entry, boolean_t deallocate, - pmap_t pmap) + pmap_t pmap, + vm_map_offset_t pmap_addr) { - register vm_offset_t va; - register vm_offset_t end_addr = entry->vme_end; + register vm_map_offset_t va; + register vm_map_offset_t end_addr = entry->vme_end; vm_object_t object; + struct vm_object_fault_info fault_info; object = (entry->is_sub_map) ? VM_OBJECT_NULL : entry->object.vm_object; + /* + * If it's marked phys_contiguous, then vm_fault_wire() didn't actually + * do anything since such memory is wired by default. So we don't have + * anything to undo here. + */ + + if (object != VM_OBJECT_NULL && object->phys_contiguous) + return; + + fault_info.interruptible = THREAD_UNINT; + fault_info.behavior = entry->behavior; + fault_info.user_tag = entry->alias; + fault_info.lo_offset = entry->offset; + fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; + fault_info.no_cache = entry->no_cache; + fault_info.stealth = TRUE; + fault_info.mark_zf_absent = FALSE; + /* * Since the pages are wired down, we must be able to * get their mappings from the physical map system. 
*/ for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { - pmap_change_wiring(pmap, va, FALSE); if (object == VM_OBJECT_NULL) { - (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT); + if (pmap) { + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + } + (void) vm_fault(map, va, VM_PROT_NONE, + TRUE, THREAD_UNINT, pmap, pmap_addr); } else { vm_prot_t prot; vm_page_t result_page; @@ -2952,6 +3911,14 @@ vm_fault_unwire( vm_object_t result_object; vm_fault_return_t result; + if (end_addr - va > (vm_size_t) -1) { + /* 32-bit overflow */ + fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE); + } else { + fault_info.cluster_size = (vm_size_t) (end_addr - va); + assert(fault_info.cluster_size == end_addr - va); + } + do { prot = VM_PROT_NONE; @@ -2960,40 +3927,56 @@ vm_fault_unwire( XPR(XPR_VM_FAULT, "vm_fault_unwire -> vm_fault_page\n", 0,0,0,0,0); - result = vm_fault_page(object, - entry->offset + - (va - entry->vme_start), - VM_PROT_NONE, TRUE, - THREAD_UNINT, - entry->offset, - entry->offset + - (entry->vme_end - - entry->vme_start), - entry->behavior, - &prot, - &result_page, - &top_page, - (int *)0, - 0, map->no_zero_fill, - FALSE, NULL, 0); + result = vm_fault_page( + object, + entry->offset + (va - entry->vme_start), + VM_PROT_NONE, TRUE, + &prot, &result_page, &top_page, + (int *)0, + NULL, map->no_zero_fill, + FALSE, &fault_info); } while (result == VM_FAULT_RETRY); + /* + * If this was a mapping to a file on a device that has been forcibly + * unmounted, then we won't get a page back from vm_fault_page(). Just + * move on to the next one in case the remaining pages are mapped from + * different objects. During a forced unmount, the object is terminated + * so the alive flag will be false if this happens. A forced unmount will + * will occur when an external disk is unplugged before the user does an + * eject, so we don't want to panic in that situation. 
+ */ + + if (result == VM_FAULT_MEMORY_ERROR && !object->alive) + continue; + if (result != VM_FAULT_SUCCESS) panic("vm_fault_unwire: failure"); result_object = result_page->object; + + if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) { + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + } if (deallocate) { - assert(!result_page->fictitious); - pmap_page_protect(result_page->phys_addr, - VM_PROT_NONE); + assert(result_page->phys_page != + vm_page_fictitious_addr); + pmap_disconnect(result_page->phys_page); VM_PAGE_FREE(result_page); } else { - vm_page_lock_queues(); - vm_page_unwire(result_page); - vm_page_unlock_queues(); + if (VM_PAGE_WIRED(result_page)) { + vm_page_lockspin_queues(); + vm_page_unwire(result_page, TRUE); + vm_page_unlock_queues(); + } + if(entry->zero_wired_pages) { + pmap_zero_page(result_page->phys_page); + entry->zero_wired_pages = FALSE; + } + PAGE_WAKEUP_DONE(result_page); } - vm_fault_cleanup(result_object, top_page); } } @@ -3004,7 +3987,8 @@ vm_fault_unwire( * such may be unwired themselves. 
*/ - pmap_pageable(pmap, entry->vme_start, end_addr, TRUE); + pmap_pageable(pmap, pmap_addr, + pmap_addr + (end_addr - entry->vme_start), TRUE); } @@ -3030,21 +4014,24 @@ vm_fault_unwire( */ kern_return_t vm_fault_wire_fast( - vm_map_t map, - vm_offset_t va, + __unused vm_map_t map, + vm_map_offset_t va, vm_map_entry_t entry, - pmap_t pmap) + pmap_t pmap, + vm_map_offset_t pmap_addr) { vm_object_t object; vm_object_offset_t offset; register vm_page_t m; vm_prot_t prot; - thread_act_t thr_act; + thread_t thread = current_thread(); + int type_of_fault; + kern_return_t kr; - VM_STAT(faults++); + VM_STAT_INCR(faults); - if((thr_act=current_act()) && (thr_act->task != TASK_NULL)) - thr_act->task->faults++; + if (thread != THREAD_NULL && thread->task != TASK_NULL) + thread->task->faults++; /* * Recovery actions @@ -3053,16 +4040,16 @@ vm_fault_wire_fast( #undef RELEASE_PAGE #define RELEASE_PAGE(m) { \ PAGE_WAKEUP_DONE(m); \ - vm_page_lock_queues(); \ - vm_page_unwire(m); \ + vm_page_lockspin_queues(); \ + vm_page_unwire(m, TRUE); \ vm_page_unlock_queues(); \ } #undef UNLOCK_THINGS #define UNLOCK_THINGS { \ - object->paging_in_progress--; \ - vm_object_unlock(object); \ + vm_object_paging_end(object); \ + vm_object_unlock(object); \ } #undef UNLOCK_AND_DEALLOCATE @@ -3100,10 +4087,8 @@ vm_fault_wire_fast( */ vm_object_lock(object); - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); - object->paging_in_progress++; + vm_object_reference_locked(object); + vm_object_paging_begin(object); /* * INVARIANTS (through entire routine): @@ -3122,21 +4107,33 @@ vm_fault_wire_fast( /* * Look for page in top-level object. If it's not there or * there's something going on, give up. + * ENCRYPTED SWAP: use the slow fault path, since we'll need to + * decrypt the page before wiring it down. 
*/ m = vm_page_lookup(object, offset); - if ((m == VM_PAGE_NULL) || (m->busy) || - (m->unusual && ( m->error || m->restart || m->absent || - prot & m->page_lock))) { + if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) || + (m->unusual && ( m->error || m->restart || m->absent))) { GIVE_UP; } + ASSERT_PAGE_DECRYPTED(m); + + if (m->fictitious && + m->phys_page == vm_page_guard_addr) { + /* + * Guard pages are fictitious pages and are never + * entered into a pmap, so let's say it's been wired... + */ + kr = KERN_SUCCESS; + goto done; + } /* * Wire the page down now. All bail outs beyond this * point must unwire the page. */ - vm_page_lock_queues(); + vm_page_lockspin_queues(); vm_page_wire(m); vm_page_unlock_queues(); @@ -3157,23 +4154,18 @@ vm_fault_wire_fast( /* * Put this page into the physical map. - * We have to unlock the object because pmap_enter - * may cause other faults. - */ - if (m->no_isync == TRUE) { - pmap_sync_caches_phys(m->phys_addr); - - m->no_isync = FALSE; - } - vm_object_unlock(object); - - PMAP_ENTER(pmap, va, m, prot, TRUE); - - /* - * Must relock object so that paging_in_progress can be cleared. 
*/ - vm_object_lock(object); - + type_of_fault = DBG_CACHE_HIT_FAULT; + kr = vm_fault_enter(m, + pmap, + pmap_addr, + prot, + TRUE, + FALSE, + FALSE, + &type_of_fault); + +done: /* * Unlock everything, and return */ @@ -3181,7 +4173,7 @@ vm_fault_wire_fast( PAGE_WAKEUP_DONE(m); UNLOCK_AND_DEALLOCATE; - return(KERN_SUCCESS); + return kr; } @@ -3200,10 +4192,12 @@ vm_fault_copy_cleanup( vm_object_lock(object); PAGE_WAKEUP_DONE(page); - vm_page_lock_queues(); - if (!page->active && !page->inactive) - vm_page_activate(page); - vm_page_unlock_queues(); + if (!page->active && !page->inactive && !page->throttled) { + vm_page_lockspin_queues(); + if (!page->active && !page->inactive && !page->throttled) + vm_page_activate(page); + vm_page_unlock_queues(); + } vm_fault_cleanup(object, top_page); } @@ -3216,8 +4210,8 @@ vm_fault_copy_dst_cleanup( if (page != VM_PAGE_NULL) { object = page->object; vm_object_lock(object); - vm_page_lock_queues(); - vm_page_unwire(page); + vm_page_lockspin_queues(); + vm_page_unwire(page, TRUE); vm_page_unlock_queues(); vm_object_paging_end(object); vm_object_unlock(object); @@ -3255,7 +4249,7 @@ kern_return_t vm_fault_copy( vm_object_t src_object, vm_object_offset_t src_offset, - vm_size_t *src_size, /* INOUT */ + vm_map_size_t *copy_size, /* INOUT */ vm_object_t dst_object, vm_object_offset_t dst_offset, vm_map_t dst_map, @@ -3272,28 +4266,46 @@ vm_fault_copy( vm_page_t dst_top_page; vm_prot_t dst_prot; - vm_size_t amount_left; + vm_map_size_t amount_left; vm_object_t old_copy_object; kern_return_t error = 0; + vm_fault_return_t result; - vm_size_t part_size; + vm_map_size_t part_size; + struct vm_object_fault_info fault_info_src; + struct vm_object_fault_info fault_info_dst; /* * In order not to confuse the clustered pageins, align * the different offsets on a page boundary. 
*/ - vm_object_offset_t src_lo_offset = trunc_page_64(src_offset); - vm_object_offset_t dst_lo_offset = trunc_page_64(dst_offset); - vm_object_offset_t src_hi_offset = round_page_64(src_offset + *src_size); - vm_object_offset_t dst_hi_offset = round_page_64(dst_offset + *src_size); #define RETURN(x) \ MACRO_BEGIN \ - *src_size -= amount_left; \ + *copy_size -= amount_left; \ MACRO_RETURN(x); \ MACRO_END - amount_left = *src_size; + amount_left = *copy_size; + + fault_info_src.interruptible = interruptible; + fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; + fault_info_src.user_tag = 0; + fault_info_src.lo_offset = vm_object_trunc_page(src_offset); + fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; + fault_info_src.no_cache = FALSE; + fault_info_src.stealth = TRUE; + fault_info_src.mark_zf_absent = FALSE; + + fault_info_dst.interruptible = interruptible; + fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; + fault_info_dst.user_tag = 0; + fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); + fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; + fault_info_dst.no_cache = FALSE; + fault_info_dst.stealth = TRUE; + fault_info_dst.mark_zf_absent = FALSE; + do { /* while (amount_left > 0) */ /* * There may be a deadlock if both source and destination @@ -3309,22 +4321,25 @@ vm_fault_copy( vm_object_lock(dst_object); vm_object_paging_begin(dst_object); + if (amount_left > (vm_size_t) -1) { + /* 32-bit overflow */ + fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE); + } else { + fault_info_dst.cluster_size = (vm_size_t) amount_left; + assert(fault_info_dst.cluster_size == amount_left); + } + XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0); - switch (vm_fault_page(dst_object, - trunc_page_64(dst_offset), - VM_PROT_WRITE|VM_PROT_READ, - FALSE, - interruptible, - dst_lo_offset, - dst_hi_offset, - VM_BEHAVIOR_SEQUENTIAL, - &dst_prot, - &dst_page, - &dst_top_page, - (int *)0, - &error, - dst_map->no_zero_fill, - 
FALSE, NULL, 0)) { + result = vm_fault_page(dst_object, + vm_object_trunc_page(dst_offset), + VM_PROT_WRITE|VM_PROT_READ, + FALSE, + &dst_prot, &dst_page, &dst_top_page, + (int *)0, + &error, + dst_map->no_zero_fill, + FALSE, &fault_info_dst); + switch (result) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: @@ -3335,14 +4350,19 @@ vm_fault_copy( /* fall thru */ case VM_FAULT_INTERRUPTED: RETURN(MACH_SEND_INTERRUPTED); - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - goto RetryDestinationFault; + case VM_FAULT_SUCCESS_NO_VM_PAGE: + /* success but no VM page: fail the copy */ + vm_object_paging_end(dst_object); + vm_object_unlock(dst_object); + /*FALLTHROUGH*/ case VM_FAULT_MEMORY_ERROR: if (error) return (error); else return(KERN_MEMORY_ERROR); + default: + panic("vm_fault_copy: unexpected error 0x%x from " + "vm_fault_page()\n", result); } assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); @@ -3360,7 +4380,7 @@ vm_fault_copy( * holding the dest page so it doesn't go away. 
*/ - vm_page_lock_queues(); + vm_page_lockspin_queues(); vm_page_wire(dst_page); vm_page_unlock_queues(); PAGE_WAKEUP_DONE(dst_page); @@ -3385,7 +4405,7 @@ vm_fault_copy( } else { vm_object_lock(src_object); src_page = vm_page_lookup(src_object, - trunc_page_64(src_offset)); + vm_object_trunc_page(src_offset)); if (src_page == dst_page) { src_prot = dst_prot; result_page = VM_PAGE_NULL; @@ -3393,25 +4413,27 @@ vm_fault_copy( src_prot = VM_PROT_READ; vm_object_paging_begin(src_object); + if (amount_left > (vm_size_t) -1) { + /* 32-bit overflow */ + fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE); + } else { + fault_info_src.cluster_size = (vm_size_t) amount_left; + assert(fault_info_src.cluster_size == amount_left); + } + XPR(XPR_VM_FAULT, "vm_fault_copy(2) -> vm_fault_page\n", 0,0,0,0,0); - switch (vm_fault_page(src_object, - trunc_page_64(src_offset), - VM_PROT_READ, - FALSE, - interruptible, - src_lo_offset, - src_hi_offset, - VM_BEHAVIOR_SEQUENTIAL, - &src_prot, - &result_page, - &src_top_page, - (int *)0, - &error, - FALSE, - FALSE, NULL, 0)) { - + result = vm_fault_page( + src_object, + vm_object_trunc_page(src_offset), + VM_PROT_READ, FALSE, + &src_prot, + &result_page, &src_top_page, + (int *)0, &error, FALSE, + FALSE, &fault_info_src); + + switch (result) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: @@ -3423,15 +4445,21 @@ vm_fault_copy( case VM_FAULT_INTERRUPTED: vm_fault_copy_dst_cleanup(dst_page); RETURN(MACH_SEND_INTERRUPTED); - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - goto RetrySourceFault; + case VM_FAULT_SUCCESS_NO_VM_PAGE: + /* success but no VM page: fail */ + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + /*FALLTHROUGH*/ case VM_FAULT_MEMORY_ERROR: vm_fault_copy_dst_cleanup(dst_page); if (error) return (error); else return(KERN_MEMORY_ERROR); + default: + panic("vm_fault_copy(2): unexpected " + "error 0x%x from " + "vm_fault_page()\n", result); } @@ -3473,8 +4501,8 @@ vm_fault_copy( 
vm_object_offset_t src_po, dst_po; - src_po = src_offset - trunc_page_64(src_offset); - dst_po = dst_offset - trunc_page_64(dst_offset); + src_po = src_offset - vm_object_trunc_page(src_offset); + dst_po = dst_offset - vm_object_trunc_page(dst_offset); if (dst_po > src_po) { part_size = PAGE_SIZE - dst_po; @@ -3486,11 +4514,20 @@ vm_fault_copy( } if (result_page == VM_PAGE_NULL) { + assert((vm_offset_t) dst_po == dst_po); + assert((vm_size_t) part_size == part_size); vm_page_part_zero_fill(dst_page, - dst_po, part_size); + (vm_offset_t) dst_po, + (vm_size_t) part_size); } else { - vm_page_part_copy(result_page, src_po, - dst_page, dst_po, part_size); + assert((vm_offset_t) src_po == src_po); + assert((vm_offset_t) dst_po == dst_po); + assert((vm_size_t) part_size == part_size); + vm_page_part_copy(result_page, + (vm_offset_t) src_po, + dst_page, + (vm_offset_t) dst_po, + (vm_size_t)part_size); if(!dst_page->dirty){ vm_object_lock(dst_object); dst_page->dirty = TRUE; @@ -3535,163 +4572,6 @@ vm_fault_copy( /*NOTREACHED*/ } -#ifdef notdef - -/* - * Routine: vm_fault_page_overwrite - * - * Description: - * A form of vm_fault_page that assumes that the - * resulting page will be overwritten in its entirety, - * making it unnecessary to obtain the correct *contents* - * of the page. - * - * Implementation: - * XXX Untested. Also unused. Eventually, this technology - * could be used in vm_fault_copy() to advantage. - */ -vm_fault_return_t -vm_fault_page_overwrite( - register - vm_object_t dst_object, - vm_object_offset_t dst_offset, - vm_page_t *result_page) /* OUT */ -{ - register - vm_page_t dst_page; - kern_return_t wait_result; - -#define interruptible THREAD_UNINT /* XXX */ - - while (TRUE) { - /* - * Look for a page at this offset - */ - - while ((dst_page = vm_page_lookup(dst_object, dst_offset)) - == VM_PAGE_NULL) { - /* - * No page, no problem... just allocate one. 
- */ - - dst_page = vm_page_alloc(dst_object, dst_offset); - if (dst_page == VM_PAGE_NULL) { - vm_object_unlock(dst_object); - VM_PAGE_WAIT(); - vm_object_lock(dst_object); - continue; - } - - /* - * Pretend that the memory manager - * write-protected the page. - * - * Note that we will be asking for write - * permission without asking for the data - * first. - */ - - dst_page->overwriting = TRUE; - dst_page->page_lock = VM_PROT_WRITE; - dst_page->absent = TRUE; - dst_page->unusual = TRUE; - dst_object->absent_count++; - - break; - - /* - * When we bail out, we might have to throw - * away the page created here. - */ - -#define DISCARD_PAGE \ - MACRO_BEGIN \ - vm_object_lock(dst_object); \ - dst_page = vm_page_lookup(dst_object, dst_offset); \ - if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \ - VM_PAGE_FREE(dst_page); \ - vm_object_unlock(dst_object); \ - MACRO_END - } - - /* - * If the page is write-protected... - */ - - if (dst_page->page_lock & VM_PROT_WRITE) { - /* - * ... and an unlock request hasn't been sent - */ - - if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) { - vm_prot_t u; - kern_return_t rc; - - /* - * ... then send one now. - */ - - if (!dst_object->pager_ready) { - vm_object_assert_wait(dst_object, - VM_OBJECT_EVENT_PAGER_READY, - interruptible); - vm_object_unlock(dst_object); - wait_result = thread_block((void (*)(void))0); - if (wait_result != THREAD_AWAKENED) { - DISCARD_PAGE; - return(VM_FAULT_INTERRUPTED); - } - continue; - } - - u = dst_page->unlock_request |= VM_PROT_WRITE; - vm_object_unlock(dst_object); - - if ((rc = memory_object_data_unlock( - dst_object->pager, - dst_offset + dst_object->paging_offset, - PAGE_SIZE, - u)) != KERN_SUCCESS) { - if (vm_fault_debug) - printf("vm_object_overwrite: memory_object_data_unlock failed\n"); - DISCARD_PAGE; - return((rc == MACH_SEND_INTERRUPTED) ? - VM_FAULT_INTERRUPTED : - VM_FAULT_MEMORY_ERROR); - } - vm_object_lock(dst_object); - continue; - } - - /* ... 
fall through to wait below */ - } else { - /* - * If the page isn't being used for other - * purposes, then we're done. - */ - if ( ! (dst_page->busy || dst_page->absent || - dst_page->error || dst_page->restart) ) - break; - } - - PAGE_ASSERT_WAIT(dst_page, interruptible); - vm_object_unlock(dst_object); - wait_result = thread_block((void (*)(void))0); - if (wait_result != THREAD_AWAKENED) { - DISCARD_PAGE; - return(VM_FAULT_INTERRUPTED); - } - } - - *result_page = dst_page; - return(VM_FAULT_SUCCESS); - -#undef interruptible -#undef DISCARD_PAGE -} - -#endif /* notdef */ - #if VM_FAULT_CLASSIFY /* * Temporary statistics gathering support. @@ -3723,8 +4603,7 @@ vm_fault_classify(vm_object_t object, while (TRUE) { m = vm_page_lookup(object, offset); if (m != VM_PAGE_NULL) { - if (m->busy || m->error || m->restart || m->absent || - fault_type & m->page_lock) { + if (m->busy || m->error || m->restart || m->absent) { type = VM_FAULT_TYPE_OTHER; break; } @@ -3777,3 +4656,186 @@ vm_fault_classify_init(void) return; } #endif /* VM_FAULT_CLASSIFY */ + + +extern int cs_validation; + +void +vm_page_validate_cs_mapped( + vm_page_t page, + const void *kaddr) +{ + vm_object_t object; + vm_object_offset_t offset; + kern_return_t kr; + memory_object_t pager; + void *blobs; + boolean_t validated, tainted; + + assert(page->busy); + vm_object_lock_assert_exclusive(page->object); + + if (!cs_validation) { + return; + } + + if (page->wpmapped && !page->cs_tainted) { + /* + * This page was mapped for "write" access sometime in the + * past and could still be modifiable in the future. + * Consider it tainted. + * [ If the page was already found to be "tainted", no + * need to re-validate. 
] + */ + page->cs_validated = TRUE; + page->cs_tainted = TRUE; + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); + } + vm_cs_validated_dirtied++; + } + + if (page->cs_validated) { + return; + } + + vm_cs_validates++; + + object = page->object; + assert(object->code_signed); + offset = page->offset; + + if (!object->alive || object->terminating || object->pager == NULL) { + /* + * The object is terminating and we don't have its pager + * so we can't validate the data... + */ + return; + } + /* + * Since we get here to validate a page that was brought in by + * the pager, we know that this pager is all setup and ready + * by now. + */ + assert(!object->internal); + assert(object->pager != NULL); + assert(object->pager_ready); + + pager = object->pager; + assert(object->paging_in_progress); + kr = vnode_pager_get_object_cs_blobs(pager, &blobs); + if (kr != KERN_SUCCESS) { + blobs = NULL; + } + + /* verify the SHA1 hash for this page */ + validated = cs_validate_page(blobs, + offset + object->paging_offset, + (const void *)kaddr, + &tainted); + + page->cs_validated = validated; + if (validated) { + page->cs_tainted = tainted; + } +} + +void +vm_page_validate_cs( + vm_page_t page) +{ + vm_object_t object; + vm_object_offset_t offset; + vm_map_offset_t koffset; + vm_map_size_t ksize; + vm_offset_t kaddr; + kern_return_t kr; + boolean_t busy_page; + + vm_object_lock_assert_held(page->object); + + if (!cs_validation) { + return; + } + + if (page->wpmapped && !page->cs_tainted) { + vm_object_lock_assert_exclusive(page->object); + + /* + * This page was mapped for "write" access sometime in the + * past and could still be modifiable in the future. + * Consider it tainted. + * [ If the page was already found to be "tainted", no + * need to re-validate. 
] + */ + page->cs_validated = TRUE; + page->cs_tainted = TRUE; + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); + } + vm_cs_validated_dirtied++; + } + + if (page->cs_validated) { + return; + } + + vm_object_lock_assert_exclusive(page->object); + + object = page->object; + assert(object->code_signed); + offset = page->offset; + + busy_page = page->busy; + if (!busy_page) { + /* keep page busy while we map (and unlock) the VM object */ + page->busy = TRUE; + } + + /* + * Take a paging reference on the VM object + * to protect it from collapse or bypass, + * and keep it from disappearing too. + */ + vm_object_paging_begin(object); + + /* map the page in the kernel address space */ + koffset = 0; + ksize = PAGE_SIZE_64; + kr = vm_paging_map_object(&koffset, + page, + object, + offset, + &ksize, + VM_PROT_READ, + FALSE); /* can't unlock object ! */ + if (kr != KERN_SUCCESS) { + panic("vm_page_validate_cs: could not map page: 0x%x\n", kr); + } + kaddr = CAST_DOWN(vm_offset_t, koffset); + + /* validate the mapped page */ + vm_page_validate_cs_mapped(page, (const void *) kaddr); + + assert(page->busy); + assert(object == page->object); + vm_object_lock_assert_exclusive(object); + + if (!busy_page) { + PAGE_WAKEUP_DONE(page); + } + if (koffset != 0) { + /* unmap the map from the kernel address space */ + vm_paging_unmap_object(object, koffset, koffset + ksize); + koffset = 0; + ksize = 0; + kaddr = 0; + } + vm_object_paging_end(object); +}