+ kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
+ fault_page_size, fault_phys_offset, m,
+ &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
+ }
+
+ return kr;
+}
+
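+/*
+ * vm_pre_fault: pre-populate the translation for "vaddr" in the current map
+ * so that a subsequent access does not take a soft fault.
+ *
+ * Usage sketch (illustrative):
+ *
+ *	vm_pre_fault(buf_addr, VM_PROT_READ | VM_PROT_WRITE);
+ *
+ * where "buf_addr" is a hypothetical address in current_map().  If the page
+ * is already resident and mapped, pmap_find_phys() returns a non-zero frame
+ * and the vm_fault() call is skipped.
+ */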
+void
+vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
+{
+ if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
+ vm_fault(current_map(), /* map */
+ vaddr, /* vaddr */
+ prot, /* fault_type */
+ FALSE, /* change_wiring */
+ VM_KERN_MEMORY_NONE, /* tag - not wiring */
+ THREAD_UNINT, /* interruptible */
+ NULL, /* caller_pmap */
+ 0 /* caller_pmap_addr */);
+ }
+}
+
+
+/*
+ * Routine: vm_fault
+ * Purpose:
+ * Handle page faults, including pseudo-faults
+ * used to change the wiring status of pages.
+ * Returns:
+ * Explicit continuations have been removed.
+ * Implementation:
+ * vm_fault and vm_fault_page save mucho state
+ * in the moral equivalent of a closure. The state
+ * structure is allocated when first entering vm_fault
+ * and deallocated when leaving vm_fault.
+ */
+
+extern uint64_t get_current_unique_pid(void);
+
+unsigned long vm_fault_collapse_total = 0;
+unsigned long vm_fault_collapse_skipped = 0;
+
+
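+/*
+ * vm_fault_external: fault entry point exported to callers outside the VM
+ * subsystem.  When the caller is wiring (change_wiring), vm_tag_bt() derives
+ * a VM tag from the kernel backtrace so that the wired memory is attributed
+ * to that caller; non-wiring faults pass VM_KERN_MEMORY_NONE.
+ */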
+kern_return_t
+vm_fault_external(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t fault_type,
+ boolean_t change_wiring,
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr)
+{
+ return vm_fault_internal(map, vaddr, fault_type, change_wiring,
+ change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
+ interruptible, caller_pmap, caller_pmap_addr,
+ NULL);
+}
+
+kern_return_t
+vm_fault(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t fault_type,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr)
+{
+ return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
+ interruptible, caller_pmap, caller_pmap_addr,
+ NULL);
+}
+
+static boolean_t
+current_proc_is_privileged(void)
+{
+ return csproc_get_platform_binary(current_proc());
+}
+
+uint64_t vm_copied_on_read = 0;
+
+/*
+ * Cleanup after a vm_fault_enter.
+ * At this point, the fault should either have failed (kr != KERN_SUCCESS)
+ * or the page should be in the pmap and on the correct paging queue.
+ *
+ * Precondition:
+ * map must be locked shared.
+ * m_object must be locked.
+ * If top_object != VM_OBJECT_NULL, it must be locked.
+ * real_map must be locked.
+ *
+ * Postcondition:
+ * map will be unlocked
+ * m_object will be unlocked
+ * top_object will be unlocked
+ * If real_map != map, it will be unlocked
+ */
+static void
+vm_fault_complete(
+ vm_map_t map,
+ vm_map_t real_map,
+ vm_object_t object,
+ vm_object_t m_object,
+ vm_page_t m,
+ vm_map_offset_t offset,
+ vm_map_offset_t trace_real_vaddr,
+ vm_object_fault_info_t fault_info,
+ vm_prot_t caller_prot,
+#if CONFIG_DTRACE
+ vm_map_offset_t real_vaddr,
+#else
+ __unused vm_map_offset_t real_vaddr,
+#endif /* CONFIG_DTRACE */
+ int type_of_fault,
+ boolean_t need_retry,
+ kern_return_t kr,
+ ppnum_t *physpage_p,
+ vm_prot_t prot,
+ vm_object_t top_object,
+ boolean_t need_collapse,
+ vm_map_offset_t cur_offset,
+ vm_prot_t fault_type,
+ vm_object_t *written_on_object,
+ memory_object_t *written_on_pager,
+ vm_object_offset_t *written_on_offset)
+{
+ int event_code = 0;
+ vm_map_lock_assert_shared(map);
+ vm_object_lock_assert_held(m_object);
+ if (top_object != VM_OBJECT_NULL) {
+ vm_object_lock_assert_held(top_object);
+ }
+ vm_map_lock_assert_held(real_map);
+
+ if (m_object->internal) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
+ } else if (m_object->object_is_shared_cache) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
+ } else {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
+ if (need_retry == FALSE) {
+ KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
+ }
+ DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
+ if (kr == KERN_SUCCESS &&
+ physpage_p != NULL) {
+ /* for vm_map_wire_and_extract() */
+ *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
+ if (prot & VM_PROT_WRITE) {
+ vm_object_lock_assert_exclusive(m_object);
+ m->vmp_dirty = TRUE;
+ }
+ }
+
+ if (top_object != VM_OBJECT_NULL) {
+ /*
+ * It's safe to drop the top object
+ * now that we've done our
+ * vm_fault_enter(). Any other fault
+ * in progress for that virtual
+ * address will either find our page
+ * and translation or put in a new page
+ * and translation.
+ */
+ vm_object_unlock(top_object);
+ top_object = VM_OBJECT_NULL;
+ }
+
+ if (need_collapse == TRUE) {
+ vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
+ }
+
+ if (need_retry == FALSE &&
+ (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
+ /*
+ * evaluate access pattern and update state
+ * vm_fault_deactivate_behind depends on the
+ * state being up to date
+ */
+ vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
+
+ vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
+ }
+ /*
+ * That's it, clean up and return.
+ */
+ if (m->vmp_busy) {
+ vm_object_lock_assert_exclusive(m_object);
+ PAGE_WAKEUP_DONE(m);
+ }
+
+ if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
+ vm_object_paging_begin(m_object);
+
+ assert(*written_on_object == VM_OBJECT_NULL);
+ *written_on_object = m_object;
+ *written_on_pager = m_object->pager;
+ *written_on_offset = m_object->paging_offset + m->vmp_offset;
+ }
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+}
+
+static inline int
+vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
+{
+ if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
+ return DBG_COR_FAULT;
+ }
+ return type_of_fault;
+}
+
+kern_return_t
+vm_fault_internal(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t caller_prot,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr,
+ ppnum_t *physpage_p)
+{
+ vm_map_version_t version; /* Map version for verification */
+ boolean_t wired; /* Should mapping be wired down? */
+ vm_object_t object; /* Top-level object */
+ vm_object_offset_t offset; /* Top-level offset */
+ vm_prot_t prot; /* Protection for mapping */
+ vm_object_t old_copy_object; /* Saved copy object */
+ vm_page_t result_page; /* Result of vm_fault_page */
+ vm_page_t top_page; /* Placeholder page */
+ kern_return_t kr;
+
+ vm_page_t m; /* Fast access to result_page */
+ kern_return_t error_code;
+ vm_object_t cur_object;
+ vm_object_t m_object = NULL;
+ vm_object_offset_t cur_offset;
+ vm_page_t cur_m;
+ vm_object_t new_object;
+ int type_of_fault;
+ pmap_t pmap;
+ wait_interrupt_t interruptible_state;
+ vm_map_t real_map = map;
+ vm_map_t original_map = map;
+ bool object_locks_dropped = FALSE;
+ vm_prot_t fault_type;
+ vm_prot_t original_fault_type;
+ struct vm_object_fault_info fault_info = {};
+ bool need_collapse = FALSE;
+ boolean_t need_retry = FALSE;
+ boolean_t *need_retry_ptr = NULL;
+ uint8_t object_lock_type = 0;
+ uint8_t cur_object_lock_type;
+ vm_object_t top_object = VM_OBJECT_NULL;
+ vm_object_t written_on_object = VM_OBJECT_NULL;
+ memory_object_t written_on_pager = NULL;
+ vm_object_offset_t written_on_offset = 0;
+ int throttle_delay;
+ int compressed_count_delta;
+ uint8_t grab_options;
+ bool need_copy;
+ bool need_copy_on_read;
+ vm_map_offset_t trace_vaddr;
+ vm_map_offset_t trace_real_vaddr;
+ vm_map_size_t fault_page_size;
+ vm_map_size_t fault_page_mask;
+ vm_map_offset_t fault_phys_offset;
+ vm_map_offset_t real_vaddr;
+ bool resilient_media_retry = FALSE;
+ vm_object_t resilient_media_object = VM_OBJECT_NULL;
+ vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
+ bool page_needs_data_sync = false;
+ /*
+ * Was the VM object contended when vm_map_lookup_locked locked it?
+ * If so, the zero fill path will drop the lock.
+ * NB: Ideally we would always drop the lock rather than rely on
+ * this heuristic, but vm_object_unlock currently takes > 30 cycles.
+ */
+ bool object_is_contended = false;
+
+ real_vaddr = vaddr;
+ trace_real_vaddr = vaddr;
+
+ if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
+ fault_phys_offset = (vm_map_offset_t)-1;
+ fault_page_size = VM_MAP_PAGE_SIZE(original_map);
+ fault_page_mask = VM_MAP_PAGE_MASK(original_map);
+ if (fault_page_size < PAGE_SIZE) {
+ DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
+ vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
+ }
+ } else {
+ fault_phys_offset = 0;
+ fault_page_size = PAGE_SIZE;
+ fault_page_mask = PAGE_MASK;
+ vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
+ }
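+
+ /*
+ * Example (illustrative): with a 4K map page size on a 16K PAGE_SIZE
+ * system, fault_page_mask is 0xfff, so a fault at vaddr 0x100a234 is
+ * truncated to 0x100a000 above; a native-page-size map would truncate
+ * the same vaddr to the 16K boundary 0x1008000.
+ */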
+
+ if (map == kernel_map) {
+ trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
+ trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
+ } else {
+ trace_vaddr = vaddr;
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ (map == kernel_map),
+ 0,
+ 0);
+
+ if (get_preemption_level() != 0) {
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ KERN_FAILURE,
+ 0,
+ 0);
+
+ return KERN_FAILURE;
+ }
+
+ thread_t cthread = current_thread();
+ bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
+ uint64_t fstart = 0;
+
+ if (rtfault) {
+ fstart = mach_continuous_time();
+ }
+
+ interruptible_state = thread_interrupt_level(interruptible);
+
+ fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
+
+ counter_inc(&vm_statistics_faults);
+ counter_inc(&current_task()->faults);
+ original_fault_type = fault_type;
+
+ need_copy = FALSE;
+ if (fault_type & VM_PROT_WRITE) {
+ need_copy = TRUE;
+ }
+
+ if (need_copy || change_wiring) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ } else {
+ object_lock_type = OBJECT_LOCK_SHARED;
+ }
+
+ cur_object_lock_type = OBJECT_LOCK_SHARED;
+
+ if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
+ if (compressor_map) {
+ if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
+ panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
+ }
+ }
+ }
+RetryFault:
+ assert(written_on_object == VM_OBJECT_NULL);
+
+ /*
+ * assume we will hit a page in the cache
+ * otherwise, explicitly override with
+ * the real fault type once we determine it
+ */
+ type_of_fault = DBG_CACHE_HIT_FAULT;
+
+ /*
+ * Find the backing store object and offset into
+ * it to begin the search.
+ */
+ fault_type = original_fault_type;
+ map = original_map;
+ vm_map_lock_read(map);
+
+ if (resilient_media_retry) {
+ /*
+ * If we have to insert a fake zero-filled page to hide
+ * a media failure to provide the real page, we need to
+ * resolve any pending copy-on-write on this mapping.
+ * VM_PROT_COPY tells vm_map_lookup_locked() to deal
+ * with that even if this is not a "write" fault.
+ */
+ need_copy = TRUE;
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ }
+
+ kr = vm_map_lookup_locked(&map, vaddr,
+ (fault_type | (need_copy ? VM_PROT_COPY : 0)),
+ object_lock_type, &version,
+ &object, &offset, &prot, &wired,
+ &fault_info,
+ &real_map,
+ &object_is_contended);
+
+ if (kr != KERN_SUCCESS) {
+ vm_map_unlock_read(map);
+ goto done;
+ }
+
+
+ pmap = real_map->pmap;
+ fault_info.interruptible = interruptible;
+ fault_info.stealth = FALSE;
+ fault_info.io_sync = FALSE;
+ fault_info.mark_zf_absent = FALSE;
+ fault_info.batch_pmap_op = FALSE;
+
+ if (resilient_media_retry) {
+ /*
+ * We're retrying this fault after having detected a media
+ * failure from a "resilient_media" mapping.
+ * Check that the mapping is still pointing at the object
+ * that just failed to provide a page.
+ */
+ assert(resilient_media_object != VM_OBJECT_NULL);
+ assert(resilient_media_offset != (vm_object_offset_t)-1);
+ if (object != VM_OBJECT_NULL &&
+ object == resilient_media_object &&
+ offset == resilient_media_offset &&
+ fault_info.resilient_media) {
+ /*
+ * This mapping still points at the same object
+ * and is still "resilient_media": proceed in
+ * "recovery-from-media-failure" mode, where we'll
+ * insert a zero-filled page in the top object.
+ */
+// printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
+ } else {
+ /* not recovering: reset state */
+// printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
+ resilient_media_retry = FALSE;
+ /* release our extra reference on failed object */
+// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+ vm_object_deallocate(resilient_media_object);
+ resilient_media_object = VM_OBJECT_NULL;
+ resilient_media_offset = (vm_object_offset_t)-1;
+ }
+ } else {
+ assert(resilient_media_object == VM_OBJECT_NULL);
+ resilient_media_offset = (vm_object_offset_t)-1;
+ }
+
+ /*
+ * If the page is wired, we must fault for the current protection
+ * value, to avoid further faults.
+ */
+ if (wired) {
+ fault_type = prot | VM_PROT_WRITE;
+ }
+ if (wired || need_copy) {
+ /*
+ * since we're treating this fault as a 'write'
+ * we must hold the top object lock exclusively
+ */
+ if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly
+ * take the lock exclusively
+ */
+ vm_object_lock(object);
+ }
+ }
+ }
+
+#if VM_FAULT_CLASSIFY
+ /*
+ * Temporary data gathering code
+ */
+ vm_fault_classify(object, offset, fault_type);
+#endif
+ /*
+ * Fast fault code. The basic idea is to do as much as
+ * possible while holding the map lock and object locks.
+ * Busy pages are not used until the object lock has to
+ * be dropped to do something (copy, zero fill, pmap enter).
+ * Similarly, paging references aren't acquired until that
+ * point, and object references aren't used.
+ *
+ * If we can figure out what to do
+ * (zero fill, copy on write, pmap enter) while holding
+ * the locks, then it gets done. Otherwise, we give up,
+ * and use the original fault path (which doesn't hold
+ * the map lock, and relies on busy pages).
+ * The give up cases include:
+ * - Have to talk to pager.
+ * - Page is busy, absent or in error.
+ * - Pager has locked out desired access.
+ * - Fault needs to be restarted.
+ * - Have to push page into copy object.
+ *
+ * The code is an infinite loop that moves one level down
+ * the shadow chain each time. cur_object and cur_offset
+ * refer to the current object being examined. object and offset
+ * are the original object from the map. The loop is at the
+ * top level if and only if object and cur_object are the same.
+ *
+ * Invariants: Map lock is held throughout. Lock is held on
+ * original object and cur_object (if different) when
+ * continuing or exiting loop.
+ *
+ */
+
+#if defined(__arm64__)
+ /*
+ * Fail if reading an execute-only page in a
+ * pmap that enforces execute-only protection.
+ */
+ if (fault_type == VM_PROT_READ &&
+ (prot & VM_PROT_EXECUTE) &&
+ !(prot & VM_PROT_READ) &&
+ pmap_enforces_execute_only(pmap)) {
+ vm_object_unlock(object);
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ kr = KERN_PROTECTION_FAILURE;
+ goto done;
+ }
+#endif
+
+ fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
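+ /*
+ * Example (illustrative): for a 4K mapping of a 16K-page object, an
+ * object offset of 0x5000 gives fault_phys_offset = 0x5000 - 0x4000 =
+ * 0x1000, i.e. the second 4K chunk within the 16K physical page; on a
+ * native-page-size map this is always 0 (asserted before pmap_enter).
+ */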
+
+ /*
+ * If this page is to be inserted in a copy delay object
+ * for writing, and if the object has a copy, then the
+ * copy delay strategy is implemented in the slow fault path.
+ */
+ if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
+ object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
+ goto handle_copy_delay;
+ }
+
+ cur_object = object;
+ cur_offset = offset;
+
+ grab_options = 0;
+#if CONFIG_SECLUDED_MEMORY
+ if (object->can_grab_secluded) {
+ grab_options |= VM_PAGE_GRAB_SECLUDED;
+ }
+#endif /* CONFIG_SECLUDED_MEMORY */
+
+ while (TRUE) {
+ if (!cur_object->pager_created &&
+ cur_object->phys_contiguous) { /* superpage */
+ break;
+ }
+
+ if (cur_object->blocked_access) {
+ /*
+ * Access to this VM object has been blocked.
+ * Let the slow path handle it.
+ */
+ break;
+ }
+
+ m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
+ m_object = NULL;
+
+ if (m != VM_PAGE_NULL) {
+ m_object = cur_object;
+
+ if (m->vmp_busy) {
+ wait_result_t result;
+
+ /*
+ * in order to do the PAGE_ASSERT_WAIT, we must
+ * have the object that 'm' belongs to locked exclusively
+ */
+ if (object != cur_object) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(cur_object) == FALSE) {
+ /*
+ * couldn't upgrade so go do a full retry
+ * immediately since we can no longer be
+ * certain about cur_object (since we
+ * don't hold a reference on it)...
+ * first drop the top object lock
+ */
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
+ /*
+ * m->vmp_busy == TRUE and the object is locked exclusively;
+ * if m->pageout_queue == TRUE after we acquire the
+ * queues lock, we are guaranteed that it is stable on
+ * the pageout queue and therefore reclaimable
+ *
+ * NOTE: this is only true for the internal pageout queue
+ * in the compressor world
+ */
+ assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+
+ vm_page_lock_queues();
+
+ if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
+ vm_pageout_throttle_up(m);
+ vm_page_unlock_queues();
+
+ PAGE_WAKEUP_DONE(m);
+ goto reclaimed_from_pageout;
+ }
+ vm_page_unlock_queues();
+ }
+ if (object != cur_object) {
+ vm_object_unlock(object);
+ }
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ result = PAGE_ASSERT_WAIT(m, interruptible);
+
+ vm_object_unlock(cur_object);
+
+ if (result == THREAD_WAITING) {
+ result = thread_block(THREAD_CONTINUE_NULL);
+ }
+ if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
+ goto RetryFault;
+ }
+
+ kr = KERN_ABORTED;
+ goto done;
+ }
+reclaimed_from_pageout:
+ if (m->vmp_laundry) {
+ if (object != cur_object) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ vm_pageout_steal_laundry(m, FALSE);
+ }
+
+ if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
+ /*
+ * Guard page: let the slow path deal with it
+ */
+ break;
+ }
+ if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
+ /*
+ * Unusual case... let the slow path deal with it
+ */
+ break;
+ }
+ if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
+ if (object != cur_object) {
+ vm_object_unlock(object);
+ }
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ vm_object_unlock(cur_object);
+ kr = KERN_MEMORY_ERROR;
+ goto done;
+ }
+ assert(m_object == VM_PAGE_OBJECT(m));
+
+ if (vm_fault_cs_need_validation(map->pmap, m, m_object,
+ PAGE_SIZE, 0) ||
+ (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
+upgrade_lock_and_retry:
+ /*
+ * We might need to validate this page
+ * against its code signature, so we
+ * want to hold the VM object exclusively.
+ */
+ if (object != cur_object) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ }
+ /*
+ * Two cases where the page can be mapped in directly:
+ * - At top level w/o copy object.
+ * - Read fault anywhere.
+ * --> must disallow write.
+ */
+
+ if (object == cur_object && object->copy == VM_OBJECT_NULL) {
+ goto FastPmapEnter;
+ }
+
+ if (!need_copy &&
+ !fault_info.no_copy_on_read &&
+ cur_object != object &&
+ !cur_object->internal &&
+ !cur_object->pager_trusted &&
+ vm_protect_privileged_from_untrusted &&
+ !((prot & VM_PROT_EXECUTE) &&
+ cur_object->code_signed &&
+ pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
+ current_proc_is_privileged()) {
+ /*
+ * We're faulting on a page in "object" and
+ * went down the shadow chain to "cur_object"
+ * to find out that "cur_object"'s pager
+ * is not "trusted", i.e. we can not trust it
+ * to always return the same contents.
+ * Since the target is a "privileged" process,
+ * let's treat this as a copy-on-read fault, as
+ * if it were a copy-on-write fault.
+ * Once "object" gets a copy of this page, it
+ * won't have to rely on "cur_object" to
+ * provide the contents again.
+ *
+ * This is done by setting "need_copy" and
+ * retrying the fault from the top with the
+ * appropriate locking.
+ *
+ * Special case: if the mapping is executable
+ * and the untrusted object is code-signed and
+ * the process is "cs_enforced", we do not
+ * copy-on-read because that would break
+ * code-signing enforcement expectations (an
+ * executable page must belong to a code-signed
+ * object) and we can rely on code-signing
+ * to re-validate the page if it gets evicted
+ * and paged back in.
+ */
+// printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+ vm_copied_on_read++;
+ need_copy = TRUE;
+
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ goto RetryFault;
+ }
+
+ if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
+ if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
+ prot &= ~VM_PROT_WRITE;
+ } else {
+ /*
+ * For a protection that the pmap cares
+ * about, we must hand over the full
+ * set of protections (so that the pmap
+ * layer can apply any desired policy).
+ * This means that cs_bypass must be
+ * set, as this can force us to pass
+ * RWX.
+ */
+ assert(fault_info.cs_bypass);
+ }
+
+ if (object != cur_object) {
+ /*
+ * We still need to hold the top object
+ * lock here to prevent a race between
+ * a read fault (taking only "shared"
+ * locks) and a write fault (taking
+ * an "exclusive" lock on the top
+ * object).
+ * Otherwise, as soon as we release the
+ * top lock, the write fault could
+ * proceed and actually complete before
+ * the read fault, and the copied page's
+ * translation could then be overwritten
+ * by the read fault's translation for
+ * the original page.
+ *
+ * Let's just record what the top object
+ * is and we'll release it later.
+ */
+ top_object = object;
+
+ /*
+ * switch to the object that has the new page
+ */
+ object = cur_object;
+ object_lock_type = cur_object_lock_type;
+ }
+FastPmapEnter:
+ assert(m_object == VM_PAGE_OBJECT(m));
+
+ /*
+ * prepare for the pmap_enter...
+ * object and map are both locked
+ * m contains valid data
+ * object == m->vmp_object
+ * cur_object == NULL or it's been unlocked
+ * no paging references on either object or cur_object
+ */
+ if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
+ need_retry_ptr = &need_retry;
+ } else {
+ need_retry_ptr = NULL;
+ }
+
+ if (fault_page_size < PAGE_SIZE) {
+ DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
+ assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
+ fault_phys_offset < PAGE_SIZE),
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ } else {
+ assertf(fault_phys_offset == 0,
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ }
+
+ if (caller_pmap) {
+ kr = vm_fault_enter(m,
+ caller_pmap,
+ caller_pmap_addr,
+ fault_page_size,
+ fault_phys_offset,
+ prot,
+ caller_prot,
+ wired,
+ change_wiring,
+ wire_tag,
+ &fault_info,
+ need_retry_ptr,
+ &type_of_fault);
+ } else {
+ kr = vm_fault_enter(m,
+ pmap,
+ vaddr,
+ fault_page_size,
+ fault_phys_offset,
+ prot,
+ caller_prot,
+ wired,
+ change_wiring,
+ wire_tag,
+ &fault_info,
+ need_retry_ptr,
+ &type_of_fault);
+ }
+
+ vm_fault_complete(
+ map,
+ real_map,
+ object,
+ m_object,
+ m,
+ offset,
+ trace_real_vaddr,
+ &fault_info,
+ caller_prot,
+ real_vaddr,
+ vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
+ need_retry,
+ kr,
+ physpage_p,
+ prot,
+ top_object,
+ need_collapse,
+ cur_offset,
+ fault_type,
+ &written_on_object,
+ &written_on_pager,
+ &written_on_offset);
+ top_object = VM_OBJECT_NULL;
+ if (need_retry == TRUE) {
+ /*
+ * vm_fault_enter couldn't complete the PMAP_ENTER...
+ * at this point we don't hold any locks so it's safe
+ * to ask the pmap layer to expand the page table to
+ * accommodate this mapping... once expanded, we'll
+ * re-drive the fault which should result in vm_fault_enter
+ * being able to successfully enter the mapping this time around
+ */
+ (void)pmap_enter_options(
+ pmap, vaddr, 0, 0, 0, 0, 0,
+ PMAP_OPTIONS_NOENTER, NULL);
+
+ need_retry = FALSE;
+ goto RetryFault;
+ }
+ goto done;
+ }
+ /*
+ * COPY ON WRITE FAULT
+ */
+ assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
+
+ /*
+ * If objects match, then
+ * object->copy must not be NULL (else control
+ * would be in previous code block), and we
+ * have a potential push into the copy object
+ * which we can't cope with here.
+ */
+ if (cur_object == object) {
+ /*
+ * must take the slow path to
+ * deal with the copy push
+ */
+ break;
+ }
+
+ /*
+ * This is now a shadow based copy on write
+ * fault -- it requires a copy up the shadow
+ * chain.
+ */
+ assert(m_object == VM_PAGE_OBJECT(m));
+
+ if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
+ vm_fault_cs_need_validation(NULL, m, m_object,
+ PAGE_SIZE, 0)) {
+ goto upgrade_lock_and_retry;
+ }
+
+ /*
+ * Allocate a page in the original top level
+ * object. Give up if allocate fails. Also
+ * need to remember current page, as it's the
+ * source of the copy.
+ *
+ * at this point we hold locks on both
+ * object and cur_object... no need to take
+ * paging refs or mark pages BUSY since
+ * we don't drop either object lock until
+ * the page has been copied and inserted
+ */
+ cur_m = m;
+ m = vm_page_grab_options(grab_options);
+ m_object = NULL;
+
+ if (m == VM_PAGE_NULL) {
+ /*
+ * no free page currently available...
+ * must take the slow path
+ */
+ break;
+ }
+ /*
+ * Now do the copy. Mark the source page busy...
+ *
+ * NOTE: This code holds the map lock across
+ * the page copy.
+ */
+ vm_page_copy(cur_m, m);
+ vm_page_insert(m, object, vm_object_trunc_page(offset));
+ if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
+ DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+ }
+ m_object = object;
+ SET_PAGE_DIRTY(m, FALSE);
+
+ /*
+ * Now cope with the source page and object
+ */
+ if (object->ref_count > 1 && cur_m->vmp_pmapped) {
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
+ } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
+ /*
+ * We've copied the full 16K page but we're
+ * about to call vm_fault_enter() only for
+ * the 4K chunk we're faulting on. The other
+ * three 4K chunks in that page could still
+ * be pmapped in this pmap.
+ * Since the VM object layer thinks that the
+ * entire page has been dealt with and the
+ * original page might no longer be needed,
+ * it might collapse/bypass the original VM
+ * object and free its pages, which would be
+ * bad (and would trigger pmap_verify_free()
+ * assertions) if the other 4K chunks are still
+ * pmapped.
+ */
+ /*
+ * XXX FBDP TODO4K: to be revisited
+ * Technically, we need to pmap_disconnect()
+ * only the target pmap's mappings for the 4K
+ * chunks of this 16K VM page. If other pmaps
+ * have PTEs on these chunks, that means that
+ * the associated VM map must have a reference
+ * on the VM object, so no need to worry about
+ * those.
+ * pmap_protect() for each 4K chunk would be
+ * better but we'd have to check which chunks
+ * are actually mapped before and after this
+ * one.
+ * A full-blown pmap_disconnect() is easier
+ * for now but not efficient.
+ */
+ DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
+ }
+
+ if (cur_m->vmp_clustered) {
+ VM_PAGE_COUNT_AS_PAGEIN(cur_m);
+ VM_PAGE_CONSUME_CLUSTERED(cur_m);
+ vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
+ }
+ need_collapse = TRUE;
+
+ if (!cur_object->internal &&
+ cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
+ /*
+ * The object from which we've just
+ * copied a page is most probably backed
+ * by a vnode. We don't want to waste too
+ * much time trying to collapse the VM objects
+ * and create a bottleneck when several tasks
+ * map the same file.
+ */
+ if (cur_object->copy == object) {
+ /*
+ * Shared mapping or no COW yet.
+ * We can never collapse a copy
+ * object into its backing object.
+ */
+ need_collapse = FALSE;
+ } else if (cur_object->copy == object->shadow &&
+ object->shadow->resident_page_count == 0) {
+ /*
+ * Shared mapping after a COW occurred.
+ */
+ need_collapse = FALSE;
+ }
+ }
+ vm_object_unlock(cur_object);
+
+ if (need_collapse == FALSE) {
+ vm_fault_collapse_skipped++;
+ }
+ vm_fault_collapse_total++;
+
+ type_of_fault = DBG_COW_FAULT;
+ counter_inc(&vm_statistics_cow_faults);
+ DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
+ current_task()->cow_faults++;
+
+ goto FastPmapEnter;
+ } else {
+ /*