+ if (wired) {
+ fault_type = prot | VM_PROT_WRITE;
+ }
+ if (wired || need_copy) {
+ /*
+ * since we're treating this fault as a 'write'
+ * we must hold the top object lock exclusively
+ */
+ if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly
+ * take the lock exclusively
+ */
+ vm_object_lock(object);
+ }
+ }
+ }
+
+#if VM_FAULT_CLASSIFY
+ /*
+ * Temporary data gathering code
+ */
+ vm_fault_classify(object, offset, fault_type);
+#endif
+ /*
+ * Fast fault code. The basic idea is to do as much as
+ * possible while holding the map lock and object locks.
+ * Busy pages are not used until the object lock has to
+ * be dropped to do something (copy, zero fill, pmap enter).
+ * Similarly, paging references aren't acquired until that
+ * point, and object references aren't used.
+ *
+ * If we can figure out what to do
+ * (zero fill, copy on write, pmap enter) while holding
+ * the locks, then it gets done. Otherwise, we give up,
+ * and use the original fault path (which doesn't hold
+ * the map lock, and relies on busy pages).
+ * The give up cases include:
+ * - Have to talk to pager.
+ * - Page is busy, absent or in error.
+ * - Pager has locked out desired access.
+ * - Fault needs to be restarted.
+ * - Have to push page into copy object.
+ *
+ * The code is an infinite loop that moves one level down
+ * the shadow chain each time. cur_object and cur_offset
+ * refer to the current object being examined. object and offset
+ * are the original object from the map. The loop is at the
+ * top level if and only if object and cur_object are the same.
+ *
+ * Invariants: Map lock is held throughout. Lock is held on
+ * original object and cur_object (if different) when
+ * continuing or exiting loop.
+ *
+ */
+
+#if defined(__arm64__)
+ /*
+ * Fail if reading an execute-only page in a
+ * pmap that enforces execute-only protection.
+ */
+ if (fault_type == VM_PROT_READ &&
+ (prot & VM_PROT_EXECUTE) &&
+ !(prot & VM_PROT_READ) &&
+ pmap_enforces_execute_only(pmap)) {
+ vm_object_unlock(object);
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ kr = KERN_PROTECTION_FAILURE;
+ goto done;
+ }
+#endif
+
+ fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
+
+ /*
+ * If this page is to be inserted in a copy delay object
+ * for writing, and if the object has a copy, then the
+ * copy delay strategy is implemented in the slow fault path.
+ */
+ if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
+ object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
+ goto handle_copy_delay;
+ }
+
+ cur_object = object;
+ cur_offset = offset;
+
+ grab_options = 0;
+#if CONFIG_SECLUDED_MEMORY
+ if (object->can_grab_secluded) {
+ grab_options |= VM_PAGE_GRAB_SECLUDED;
+ }
+#endif /* CONFIG_SECLUDED_MEMORY */
+
+ while (TRUE) {
+ if (!cur_object->pager_created &&
+ cur_object->phys_contiguous) { /* superpage */
+ break;
+ }
+
+ if (cur_object->blocked_access) {
+ /*
+ * Access to this VM object has been blocked.
+ * Let the slow path handle it.
+ */
+ break;
+ }
+
+ m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
+ m_object = NULL;
+
+ if (m != VM_PAGE_NULL) {
+ m_object = cur_object;
+
+ if (m->vmp_busy) {
+ wait_result_t result;
+
+ /*
+ * in order to do the PAGE_ASSERT_WAIT, we must
+ * have object that 'm' belongs to locked exclusively
+ */
+ if (object != cur_object) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(cur_object) == FALSE) {
+ /*
+ * couldn't upgrade so go do a full retry
+ * immediately since we can no longer be
+ * certain about cur_object (since we
+ * don't hold a reference on it)...
+ * first drop the top object lock
+ */
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
+ /*
+ * m->vmp_busy == TRUE and the object is locked exclusively
+ * if m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q after we acquire
+ * the queues lock, we are guaranteed that it is stable on
+ * the pageout queue and therefore reclaimable
+ *
+ * NOTE: this is only true for the internal pageout queue
+ * in the compressor world
+ */
+ assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+
+ vm_page_lock_queues();
+
+ if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
+ vm_pageout_throttle_up(m);
+ vm_page_unlock_queues();
+
+ PAGE_WAKEUP_DONE(m);
+ goto reclaimed_from_pageout;
+ }
+ vm_page_unlock_queues();
+ }
+ if (object != cur_object) {
+ vm_object_unlock(object);
+ }
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ result = PAGE_ASSERT_WAIT(m, interruptible);
+
+ vm_object_unlock(cur_object);
+
+ if (result == THREAD_WAITING) {
+ result = thread_block(THREAD_CONTINUE_NULL);
+ }
+ if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
+ goto RetryFault;
+ }
+
+ kr = KERN_ABORTED;
+ goto done;
+ }
+reclaimed_from_pageout:
+ if (m->vmp_laundry) {
+ if (object != cur_object) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ vm_pageout_steal_laundry(m, FALSE);
+ }
+
+ if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
+ /*
+ * Guard page: let the slow path deal with it
+ */
+ break;
+ }
+ if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
+ /*
+ * Unusual case... let the slow path deal with it
+ */
+ break;
+ }
+ if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
+ if (object != cur_object) {
+ vm_object_unlock(object);
+ }
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ vm_object_unlock(cur_object);
+ kr = KERN_MEMORY_ERROR;
+ goto done;
+ }
+ assert(m_object == VM_PAGE_OBJECT(m));
+
+ if (vm_fault_cs_need_validation(map->pmap, m, m_object,
+ PAGE_SIZE, 0) ||
+ (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
+upgrade_lock_and_retry:
+ /*
+ * We might need to validate this page
+ * against its code signature, so we
+ * want to hold the VM object exclusively.
+ */
+ if (object != cur_object) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ }
+ /*
+ * Two cases of map in faults:
+ * - At top level w/o copy object.
+ * - Read fault anywhere.
+ * --> must disallow write.
+ */
+
+ if (object == cur_object && object->copy == VM_OBJECT_NULL) {
+ goto FastPmapEnter;
+ }
+
+ if (!need_copy &&
+ !fault_info.no_copy_on_read &&
+ cur_object != object &&
+ !cur_object->internal &&
+ !cur_object->pager_trusted &&
+ vm_protect_privileged_from_untrusted &&
+ !((prot & VM_PROT_EXECUTE) &&
+ cur_object->code_signed &&
+ pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
+ current_proc_is_privileged()) {
+ /*
+ * We're faulting on a page in "object" and
+ * went down the shadow chain to "cur_object"
+ * to find out that "cur_object"'s pager
+ * is not "trusted", i.e. we can not trust it
+ * to always return the same contents.
+ * Since the target is a "privileged" process,
+ * let's treat this as a copy-on-read fault, as
+ * if it was a copy-on-write fault.
+ * Once "object" gets a copy of this page, it
+ * won't have to rely on "cur_object" to
+ * provide the contents again.
+ *
+ * This is done by setting "need_copy" and
+ * retrying the fault from the top with the
+ * appropriate locking.
+ *
+ * Special case: if the mapping is executable
+ * and the untrusted object is code-signed and
+ * the process is "cs_enforced", we do not
+ * copy-on-read because that would break
+ * code-signing enforcement expectations (an
+ * executable page must belong to a code-signed
+ * object) and we can rely on code-signing
+ * to re-validate the page if it gets evicted
+ * and paged back in.
+ */
+// printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+ vm_copied_on_read++;
+ need_copy = TRUE;
+
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ goto RetryFault;
+ }
+
+ if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
+ if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
+ prot &= ~VM_PROT_WRITE;
+ } else {
+ /*
+ * For a protection that the pmap cares
+ * about, we must hand over the full
+ * set of protections (so that the pmap
+ * layer can apply any desired policy).
+ * This means that cs_bypass must be
+ * set, as this can force us to pass
+ * RWX.
+ */
+ assert(fault_info.cs_bypass);
+ }
+
+ if (object != cur_object) {
+ /*
+ * We still need to hold the top object
+ * lock here to prevent a race between
+ * a read fault (taking only "shared"
+ * locks) and a write fault (taking
+ * an "exclusive" lock on the top
+ * object).
+ * Otherwise, as soon as we release the
+ * top lock, the write fault could
+ * proceed and actually complete before
+ * the read fault, and the copied page's
+ * translation could then be overwritten
+ * by the read fault's translation for
+ * the original page.
+ *
+ * Let's just record what the top object
+ * is and we'll release it later.
+ */
+ top_object = object;
+
+ /*
+ * switch to the object that has the new page
+ */
+ object = cur_object;
+ object_lock_type = cur_object_lock_type;
+ }
+FastPmapEnter:
+ assert(m_object == VM_PAGE_OBJECT(m));
+
+ /*
+ * prepare for the pmap_enter...
+ * object and map are both locked
+ * m contains valid data
+ * object == m->vmp_object
+ * cur_object == NULL or it's been unlocked
+ * no paging references on either object or cur_object
+ */
+ if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
+ need_retry_ptr = &need_retry;
+ } else {
+ need_retry_ptr = NULL;
+ }
+
+ if (fault_page_size < PAGE_SIZE) {
+ DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
+ assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
+ fault_phys_offset < PAGE_SIZE),
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ } else {
+ assertf(fault_phys_offset == 0,
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ }
+
+ if (caller_pmap) {
+ kr = vm_fault_enter(m,
+ caller_pmap,
+ caller_pmap_addr,
+ fault_page_size,
+ fault_phys_offset,
+ prot,
+ caller_prot,
+ wired,
+ change_wiring,
+ wire_tag,
+ &fault_info,
+ need_retry_ptr,
+ &type_of_fault);
+ } else {
+ kr = vm_fault_enter(m,
+ pmap,
+ vaddr,
+ fault_page_size,
+ fault_phys_offset,
+ prot,
+ caller_prot,
+ wired,
+ change_wiring,
+ wire_tag,
+ &fault_info,
+ need_retry_ptr,
+ &type_of_fault);
+ }
+
+ vm_fault_complete(
+ map,
+ real_map,
+ object,
+ m_object,
+ m,
+ offset,
+ trace_real_vaddr,
+ &fault_info,
+ caller_prot,
+ real_vaddr,
+ vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
+ need_retry,
+ kr,
+ physpage_p,
+ prot,
+ top_object,
+ need_collapse,
+ cur_offset,
+ fault_type,
+ &written_on_object,
+ &written_on_pager,
+ &written_on_offset);
+ top_object = VM_OBJECT_NULL;
+ if (need_retry == TRUE) {
+ /*
+ * vm_fault_enter couldn't complete the PMAP_ENTER...
+ * at this point we don't hold any locks so it's safe
+ * to ask the pmap layer to expand the page table to
+ * accommodate this mapping... once expanded, we'll
+ * re-drive the fault which should result in vm_fault_enter
+ * being able to successfully enter the mapping this time around
+ */
+ (void)pmap_enter_options(
+ pmap, vaddr, 0, 0, 0, 0, 0,
+ PMAP_OPTIONS_NOENTER, NULL);
+
+ need_retry = FALSE;
+ goto RetryFault;
+ }
+ goto done;
+ }
+ /*
+ * COPY ON WRITE FAULT
+ */
+ assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
+
+ /*
+ * If objects match, then
+ * object->copy must not be NULL (else control
+ * would be in previous code block), and we
+ * have a potential push into the copy object
+ * which we can't cope with here.
+ */
+ if (cur_object == object) {
+ /*
+ * must take the slow path to
+ * deal with the copy push
+ */
+ break;
+ }
+
+ /*
+ * This is now a shadow based copy on write
+ * fault -- it requires a copy up the shadow
+ * chain.
+ */
+ assert(m_object == VM_PAGE_OBJECT(m));
+
+ if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
+ vm_fault_cs_need_validation(NULL, m, m_object,
+ PAGE_SIZE, 0)) {
+ goto upgrade_lock_and_retry;
+ }
+
+ /*
+ * Allocate a page in the original top level
+ * object. Give up if allocate fails. Also
+ * need to remember current page, as it's the
+ * source of the copy.
+ *
+ * at this point we hold locks on both
+ * object and cur_object... no need to take
+ * paging refs or mark pages BUSY since
+ * we don't drop either object lock until
+ * the page has been copied and inserted
+ */
+ cur_m = m;
+ m = vm_page_grab_options(grab_options);
+ m_object = NULL;
+
+ if (m == VM_PAGE_NULL) {
+ /*
+ * no free page currently available...
+ * must take the slow path
+ */
+ break;
+ }
+ /*
+ * Now do the copy. The source page is not marked busy:
+ * both object locks stay held until the copy is inserted.
+ *
+ * NOTE: This code holds the map lock across
+ * the page copy.
+ */
+ vm_page_copy(cur_m, m);
+ vm_page_insert(m, object, vm_object_trunc_page(offset));
+ if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
+ DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+ }
+ m_object = object;
+ SET_PAGE_DIRTY(m, FALSE);
+
+ /*
+ * Now cope with the source page and object
+ */
+ if (object->ref_count > 1 && cur_m->vmp_pmapped) {
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
+ } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
+ /*
+ * We've copied the full 16K page but we're
+ * about to call vm_fault_enter() only for
+ * the 4K chunk we're faulting on. The other
+ * three 4K chunks in that page could still
+ * be pmapped in this pmap.
+ * Since the VM object layer thinks that the
+ * entire page has been dealt with and the
+ * original page might no longer be needed,
+ * it might collapse/bypass the original VM
+ * object and free its pages, which would be
+ * bad (and would trigger pmap_verify_free()
+ * assertions) if the other 4K chunks are still
+ * pmapped.
+ */
+ /*
+ * XXX FBDP TODO4K: to be revisited
+ * Technically, we need to pmap_disconnect()
+ * only the target pmap's mappings for the 4K
+ * chunks of this 16K VM page. If other pmaps
+ * have PTEs on these chunks, that means that
+ * the associated VM map must have a reference
+ * on the VM object, so no need to worry about
+ * those.
+ * pmap_protect() for each 4K chunk would be
+ * better but we'd have to check which chunks
+ * are actually mapped before and after this
+ * one.
+ * A full-blown pmap_disconnect() is easier
+ * for now but not efficient.
+ */
+ DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
+ }
+
+ if (cur_m->vmp_clustered) {
+ VM_PAGE_COUNT_AS_PAGEIN(cur_m);
+ VM_PAGE_CONSUME_CLUSTERED(cur_m);
+ vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
+ }
+ need_collapse = TRUE;
+
+ if (!cur_object->internal &&
+ cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
+ /*
+ * The object from which we've just
+ * copied a page is most probably backed
+ * by a vnode. We don't want to waste too
+ * much time trying to collapse the VM objects
+ * and create a bottleneck when several tasks
+ * map the same file.
+ */
+ if (cur_object->copy == object) {
+ /*
+ * Shared mapping or no COW yet.
+ * We can never collapse a copy
+ * object into its backing object.
+ */
+ need_collapse = FALSE;
+ } else if (cur_object->copy == object->shadow &&
+ object->shadow->resident_page_count == 0) {
+ /*
+ * Shared mapping after a COW occurred.
+ */
+ need_collapse = FALSE;
+ }
+ }
+ vm_object_unlock(cur_object);
+
+ if (need_collapse == FALSE) {
+ vm_fault_collapse_skipped++;
+ }
+ vm_fault_collapse_total++;
+
+ type_of_fault = DBG_COW_FAULT;
+ counter_inc(&vm_statistics_cow_faults);
+ DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
+ current_task()->cow_faults++;
+
+ goto FastPmapEnter;
+ } else {
+ /*
+ * No page at cur_object, cur_offset... m == NULL
+ */
+ if (cur_object->pager_created) {
+ vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
+
+ if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
+ int my_fault_type;
+ uint8_t c_flags = C_DONT_BLOCK;
+ bool insert_cur_object = FALSE;
+
+ /*
+ * May have to talk to a pager...
+ * if so, take the slow path by
+ * doing a 'break' from the while (TRUE) loop
+ *
+ * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS
+ * if the compressor is active and the page exists there
+ */
+ if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
+ break;
+ }
+
+ if (map == kernel_map || real_map == kernel_map) {
+ /*
+ * can't call into the compressor with the kernel_map
+ * lock held, since the compressor may try to operate
+ * on the kernel map in order to return an empty c_segment
+ */
+ break;
+ }
+ if (object != cur_object) {
+ if (fault_type & VM_PROT_WRITE) {
+ c_flags |= C_KEEP;
+ } else {
+ insert_cur_object = TRUE;
+ }
+ }
+ if (insert_cur_object == TRUE) {
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(cur_object) == FALSE) {
+ /*
+ * couldn't upgrade so go do a full retry
+ * immediately since we can no longer be
+ * certain about cur_object (since we
+ * don't hold a reference on it)...
+ * first drop the top object lock
+ */
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ }
+ } else if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (object != cur_object) {
+ /*
+ * we can't go for the upgrade on the top
+ * lock since the upgrade may block waiting
+ * for readers to drain... since we hold
+ * cur_object locked at this point, waiting
+ * for the readers to drain would represent
+ * a lock order inversion since the lock order
+ * for objects is the reference order in the
+ * shadow chain
+ */
+ vm_object_unlock(object);
+ vm_object_unlock(cur_object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly take the lock
+ * exclusively and go relookup the page since we
+ * will have dropped the object lock and
+ * a different thread could have inserted
+ * a page at this offset
+ * no need for a full retry since we're
+ * at the top level of the object chain
+ */
+ vm_object_lock(object);
+
+ continue;
+ }
+ }
+ m = vm_page_grab_options(grab_options);
+ m_object = NULL;
+
+ if (m == VM_PAGE_NULL) {
+ /*
+ * no free page currently available...
+ * must take the slow path
+ */
+ break;
+ }
+
+ /*
+ * The object is and remains locked
+ * so no need to take a
+ * "paging_in_progress" reference.
+ */
+ bool shared_lock;
+ if ((object == cur_object &&
+ object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
+ (object != cur_object &&
+ cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
+ shared_lock = FALSE;
+ } else {
+ shared_lock = TRUE;
+ }
+
+ kr = vm_compressor_pager_get(
+ cur_object->pager,
+ (vm_object_trunc_page(cur_offset)
+ + cur_object->paging_offset),
+ VM_PAGE_GET_PHYS_PAGE(m),
+ &my_fault_type,
+ c_flags,
+ &compressed_count_delta);
+
+ vm_compressor_pager_count(
+ cur_object->pager,
+ compressed_count_delta,
+ shared_lock,
+ cur_object);
+
+ if (kr != KERN_SUCCESS) {
+ vm_page_release(m, FALSE);
+ m = VM_PAGE_NULL;
+ }
+ /*
+ * If vm_compressor_pager_get() returns
+ * KERN_MEMORY_FAILURE, then the
+ * compressed data is permanently lost,
+ * so return this error immediately.
+ */
+ if (kr == KERN_MEMORY_FAILURE) {
+ if (object != cur_object) {
+ vm_object_unlock(cur_object);
+ }
+ vm_object_unlock(object);
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ goto done;
+ } else if (kr != KERN_SUCCESS) {
+ break;
+ }
+ m->vmp_dirty = TRUE;
+
+ /*
+ * If the object is purgeable, its
+ * owner's purgeable ledgers will be
+ * updated in vm_page_insert() but the
+ * page was also accounted for in a
+ * "compressed purgeable" ledger, so
+ * update that now.
+ */
+ if (object != cur_object &&
+ !insert_cur_object) {
+ /*
+ * We're not going to insert
+ * the decompressed page into
+ * the object it came from.
+ *
+ * We're dealing with a
+ * copy-on-write fault on
+ * "object".
+ * We're going to decompress
+ * the page directly into the
+ * target "object" while
+ * keeping the compressed
+ * page for "cur_object", so
+ * no ledger update in that
+ * case.
+ */
+ } else if (((cur_object->purgable ==
+ VM_PURGABLE_DENY) &&
+ (!cur_object->vo_ledger_tag)) ||
+ (cur_object->vo_owner ==
+ NULL)) {
+ /*
+ * "cur_object" is not purgeable
+ * and is not ledger-tagged, or
+ * there's no owner for it,
+ * so no owner's ledgers to
+ * update.
+ */
+ } else {
+ /*
+ * One less compressed
+ * purgeable/tagged page for
+ * cur_object's owner.
+ */
+ vm_object_owner_compressed_update(
+ cur_object,
+ -1);
+ }
+
+ if (insert_cur_object) {
+ vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
+ m_object = cur_object;
+ } else {
+ vm_page_insert(m, object, vm_object_trunc_page(offset));
+ m_object = object;
+ }
+
+ if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
+ /*
+ * If the page is not cacheable,
+ * we can't let its contents
+ * linger in the data cache
+ * after the decompression.
+ */
+ pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
+ }
+
+ type_of_fault = my_fault_type;
+
+ VM_STAT_DECOMPRESSIONS();
+
+ if (cur_object != object) {
+ if (insert_cur_object) {
+ top_object = object;
+ /*
+ * switch to the object that has the new page
+ */
+ object = cur_object;
+ object_lock_type = cur_object_lock_type;
+ } else {
+ vm_object_unlock(cur_object);
+ cur_object = object;
+ }
+ }
+ goto FastPmapEnter;
+ }
+ /*
+ * existence map present and indicates
+ * that the pager doesn't have this page
+ */
+ }
+ if (cur_object->shadow == VM_OBJECT_NULL ||
+ resilient_media_retry) {
+ /*
+ * Zero fill fault. Page gets
+ * inserted into the original object.
+ */
+ if (cur_object->shadow_severed ||
+ VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
+ cur_object == compressor_object ||
+ cur_object == kernel_object ||
+ cur_object == vm_submap_object) {
+ if (object != cur_object) {
+ vm_object_unlock(cur_object);
+ }
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ kr = KERN_MEMORY_ERROR;
+ goto done;
+ }
+ if (cur_object != object) {
+ vm_object_unlock(cur_object);
+
+ cur_object = object;
+ }
+ if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade so do a full retry on the fault
+ * since we dropped the object lock which
+ * could allow another thread to insert
+ * a page at this offset
+ */
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ goto RetryFault;
+ }
+ }
+ if (!object->internal) {
+ panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
+ }
+ m = vm_page_alloc(object, vm_object_trunc_page(offset));
+ m_object = NULL;
+
+ if (m == VM_PAGE_NULL) {
+ /*
+ * no free page currently available...
+ * must take the slow path
+ */
+ break;
+ }
+ m_object = object;
+
+ /*
+ * Zeroing the page and entering it into the pmap
+ * represents a significant amount of the zero fill fault handler's work.
+ *
+ * To improve fault scalability, we'll drop the object lock, if it appears contended,
+ * now that we've inserted the page into the vm object.
+ * Before dropping the lock, we need to check protection bits and set the
+ * mapped bits on the page. Then we can mark the page busy, drop the lock,
+ * zero it, and do the pmap enter. We'll need to reacquire the lock
+ * to clear the busy bit and wake up any waiters.
+ */
+ vm_fault_cs_clear(m);
+ m->vmp_pmapped = TRUE;
+ if (map->no_zero_fill) {
+ type_of_fault = DBG_NZF_PAGE_FAULT;
+ } else {
+ type_of_fault = DBG_ZERO_FILL_FAULT;
+ }
+ {
+ pmap_t destination_pmap;
+ vm_map_offset_t destination_pmap_vaddr;
+ vm_prot_t enter_fault_type;
+ if (caller_pmap) {
+ destination_pmap = caller_pmap;
+ destination_pmap_vaddr = caller_pmap_addr;
+ } else {
+ destination_pmap = pmap;
+ destination_pmap_vaddr = vaddr;
+ }
+ if (change_wiring) {
+ enter_fault_type = VM_PROT_NONE;
+ } else {
+ enter_fault_type = caller_prot;
+ }
+ kr = vm_fault_enter_prepare(m,
+ destination_pmap,
+ destination_pmap_vaddr,
+ &prot,
+ caller_prot,
+ fault_page_size,
+ fault_phys_offset,
+ change_wiring,
+ enter_fault_type,
+ &fault_info,
+ &type_of_fault,
+ &page_needs_data_sync);
+ if (kr != KERN_SUCCESS) {
+ goto zero_fill_cleanup;
+ }
+
+ if (object_is_contended) {
+ /*
+ * At this point the page is in the vm object, but not on a paging queue.
+ * Since it's accessible to another thread but its contents are invalid
+ * (it hasn't been zeroed) mark it busy before dropping the object lock.
+ */
+ m->vmp_busy = TRUE;
+ vm_object_unlock(object);
+ }
+ if (type_of_fault == DBG_ZERO_FILL_FAULT) {
+ /*
+ * Now zero fill page...
+ * the page is probably going to
+ * be written soon, so don't bother
+ * to clear the modified bit
+ *
+ * NOTE: This code holds the map
+ * lock across the zero fill.
+ */
+ vm_page_zero_fill(m);
+ counter_inc(&vm_statistics_zero_fill_count);
+ DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
+ }
+ if (page_needs_data_sync) {
+ pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
+ }
+
+ if (top_object != VM_OBJECT_NULL) {
+ need_retry_ptr = &need_retry;
+ } else {
+ need_retry_ptr = NULL;
+ }
+ if (object_is_contended) {
+ kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
+ fault_page_size, fault_phys_offset,
+ m, &prot, caller_prot, enter_fault_type, wired,
+ fault_info.pmap_options, need_retry_ptr);
+ vm_object_lock(object);
+ } else {
+ kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
+ fault_page_size, fault_phys_offset,
+ m, &prot, caller_prot, enter_fault_type, wired,
+ fault_info.pmap_options, need_retry_ptr);
+ }
+ }
+zero_fill_cleanup:
+ if (!VM_DYNAMIC_PAGING_ENABLED() &&
+ (object->purgable == VM_PURGABLE_DENY ||
+ object->purgable == VM_PURGABLE_NONVOLATILE ||
+ object->purgable == VM_PURGABLE_VOLATILE)) {
+ vm_page_lockspin_queues();
+ if (!VM_DYNAMIC_PAGING_ENABLED()) {
+ vm_fault_enqueue_throttled_locked(m);
+ }
+ vm_page_unlock_queues();
+ }
+ vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
+
+ vm_fault_complete(
+ map,
+ real_map,
+ object,
+ m_object,
+ m,
+ offset,
+ trace_real_vaddr,
+ &fault_info,
+ caller_prot,
+ real_vaddr,
+ type_of_fault,
+ need_retry,
+ kr,
+ physpage_p,
+ prot,
+ top_object,
+ need_collapse,
+ cur_offset,
+ fault_type,
+ &written_on_object,
+ &written_on_pager,
+ &written_on_offset);
+ top_object = VM_OBJECT_NULL;
+ if (need_retry == TRUE) {
+ /*
+ * vm_fault_enter couldn't complete the PMAP_ENTER...
+ * at this point we don't hold any locks so it's safe
+ * to ask the pmap layer to expand the page table to
+ * accommodate this mapping... once expanded, we'll
+ * re-drive the fault which should result in vm_fault_enter
+ * being able to successfully enter the mapping this time around
+ */
+ (void)pmap_enter_options(
+ pmap, vaddr, 0, 0, 0, 0, 0,
+ PMAP_OPTIONS_NOENTER, NULL);
+
+ need_retry = FALSE;
+ goto RetryFault;
+ }
+ goto done;
+ }
+ /*
+ * On to the next level in the shadow chain
+ */
+ cur_offset += cur_object->vo_shadow_offset;
+ new_object = cur_object->shadow;
+ fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
+
+ /*
+ * take the new_object's lock with the indicated state
+ */
+ if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
+ vm_object_lock_shared(new_object);
+ } else {
+ vm_object_lock(new_object);
+ }
+
+ if (cur_object != object) {
+ vm_object_unlock(cur_object);
+ }
+
+ cur_object = new_object;
+
+ continue;
+ }
+ }
+ /*
+ * Cleanup from fast fault failure. Drop any object
+ * lock other than original and drop map lock.
+ */
+ if (object != cur_object) {
+ vm_object_unlock(cur_object);
+ }
+
+ /*
+ * must own the object lock exclusively at this point
+ */
+ if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly
+ * take the lock exclusively
+ * no need to retry the fault at this
+ * point since "vm_fault_page" will
+ * completely re-evaluate the state
+ */
+ vm_object_lock(object);
+ }
+ }
+
+handle_copy_delay:
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ if (__improbable(object == compressor_object ||
+ object == kernel_object ||
+ object == vm_submap_object)) {
+ /*
+ * These objects are explicitly managed and populated by the
+ * kernel. The virtual ranges backed by these objects should
+ * either have wired pages or "holes" that are not supposed to
+ * be accessed at all until they get explicitly populated.
+ * We should never have to resolve a fault on a mapping backed
+ * by one of these VM objects and providing a zero-filled page
+ * would be wrong here, so let's fail the fault and let the
+ * caller crash or recover.
+ */
+ vm_object_unlock(object);
+ kr = KERN_MEMORY_ERROR;
+ goto done;
+ }
+
+ assert(object != compressor_object);
+ assert(object != kernel_object);
+ assert(object != vm_submap_object);
+
+ if (resilient_media_retry) {
+ /*
+ * We could get here if we failed to get a free page
+ * to zero-fill and had to take the slow path again.
+ * Reset our "recovery-from-failed-media" state.
+ */
+ assert(resilient_media_object != VM_OBJECT_NULL);
+ assert(resilient_media_offset != (vm_object_offset_t)-1);
+ /* release our extra reference on failed object */
+// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+ vm_object_deallocate(resilient_media_object);
+ resilient_media_object = VM_OBJECT_NULL;
+ resilient_media_offset = (vm_object_offset_t)-1;
+ resilient_media_retry = FALSE;
+ }
+
+ /*
+ * Make a reference to this object to
+ * prevent its disposal while we are messing with
+ * it. Once we have the reference, the map is free
+ * to be diddled. Since objects reference their
+ * shadows (and copies), they will stay around as well.
+ */
+ vm_object_reference_locked(object);
+ vm_object_paging_begin(object);
+
+ set_thread_pagein_error(cthread, 0);
+ error_code = 0;
+
+ result_page = VM_PAGE_NULL;
+ kr = vm_fault_page(object, offset, fault_type,
+ (change_wiring && !wired),
+ FALSE, /* page not looked up */
+ &prot, &result_page, &top_page,
+ &type_of_fault,
+ &error_code, map->no_zero_fill,
+ FALSE, &fault_info);
+
+ /*
+ * if kr != VM_FAULT_SUCCESS, then the paging reference
+ * has been dropped and the object unlocked... the ref_count
+ * is still held
+ *
+ * if kr == VM_FAULT_SUCCESS, then the paging reference
+ * is still held along with the ref_count on the original object
+ *
+ * the object is returned locked with a paging reference
+ *
+ * if top_page != NULL, then it's BUSY and the
+ * object it belongs to has a paging reference
+ * but is returned unlocked
+ */
+ if (kr != VM_FAULT_SUCCESS &&
+ kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
+ if (kr == VM_FAULT_MEMORY_ERROR &&
+ fault_info.resilient_media) {
+ assertf(object->internal, "object %p", object);
+ /*
+ * This fault failed but the mapping was
+ * "media resilient", so we'll retry the fault in
+ * recovery mode to get a zero-filled page in the
+ * top object.
+ * Keep the reference on the failing object so
+ * that we can check that the mapping is still
+ * pointing to it when we retry the fault.
+ */
+// printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
+ assert(!resilient_media_retry); /* no double retry */
+ assert(resilient_media_object == VM_OBJECT_NULL);
+ assert(resilient_media_offset == (vm_object_offset_t)-1);
+ resilient_media_retry = TRUE;
+ resilient_media_object = object;
+ resilient_media_offset = offset;
+// printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
+ goto RetryFault;
+ } else {
+ /*
+ * we didn't succeed, lose the object reference
+ * immediately.
+ */
+ vm_object_deallocate(object);
+ object = VM_OBJECT_NULL; /* no longer valid */
+ }
+
+ /*
+ * See why we failed, and take corrective action.
+ */
+ switch (kr) {
+ case VM_FAULT_MEMORY_SHORTAGE:
+ if (vm_page_wait((change_wiring) ?
+ THREAD_UNINT :
+ THREAD_ABORTSAFE)) {
+ goto RetryFault;
+ }
+ OS_FALLTHROUGH;
+ case VM_FAULT_INTERRUPTED:
+ kr = KERN_ABORTED;
+ goto done;
+ case VM_FAULT_RETRY:
+ goto RetryFault;
+ case VM_FAULT_MEMORY_ERROR:
+ if (error_code) {
+ kr = error_code;
+ } else {
+ kr = KERN_MEMORY_ERROR;
+ }
+ goto done;
+ default:
+ panic("vm_fault: unexpected error 0x%x from "
+ "vm_fault_page()\n", kr);
+ }
+ }
+ m = result_page;
+ m_object = NULL;
+
+ if (m != VM_PAGE_NULL) {
+ m_object = VM_PAGE_OBJECT(m);
+ assert((change_wiring && !wired) ?
+ (top_page == VM_PAGE_NULL) :
+ ((top_page == VM_PAGE_NULL) == (m_object == object)));
+ }
+
+ /*
+ * What to do with the resulting page from vm_fault_page
+ * if it doesn't get entered into the physical map:
+ */
+#define RELEASE_PAGE(m) \
+ MACRO_BEGIN \
+ PAGE_WAKEUP_DONE(m); \
+ if ( !VM_PAGE_PAGEABLE(m)) { \
+ vm_page_lockspin_queues(); \
+ if ( !VM_PAGE_PAGEABLE(m)) \
+ vm_page_activate(m); \
+ vm_page_unlock_queues(); \
+ } \
+ MACRO_END
+
+
+ object_locks_dropped = FALSE;
+ /*
+ * We must verify that the maps have not changed
+ * since our last lookup. vm_map_verify() needs the
+ * map lock (shared) but we are holding object locks.
+ * So we do a try_lock() first and, if that fails, we
+ * drop the object locks and go in for the map lock again.
+ */
+ if (!vm_map_try_lock_read(original_map)) {
+ if (m != VM_PAGE_NULL) {
+ old_copy_object = m_object->copy;
+ vm_object_unlock(m_object);
+ } else {
+ old_copy_object = VM_OBJECT_NULL;
+ vm_object_unlock(object);
+ }
+
+ object_locks_dropped = TRUE;
+
+ vm_map_lock_read(original_map);
+ }
+
+ if ((map != original_map) || !vm_map_verify(map, &version)) {
+ if (object_locks_dropped == FALSE) {
+ if (m != VM_PAGE_NULL) {
+ old_copy_object = m_object->copy;
+ vm_object_unlock(m_object);
+ } else {
+ old_copy_object = VM_OBJECT_NULL;
+ vm_object_unlock(object);
+ }
+
+ object_locks_dropped = TRUE;
+ }
+
+ /*
+ * no object locks are held at this point
+ */
+ vm_object_t retry_object;
+ vm_object_offset_t retry_offset;
+ vm_prot_t retry_prot;
+
+ /*
+ * To avoid trying to write_lock the map while another
+ * thread has it read_locked (in vm_map_pageable), we
+ * do not try for write permission. If the page is
+ * still writable, we will get write permission. If it
+ * is not, or has been marked needs_copy, we enter the
+ * mapping without write permission, and will merely
+ * take another fault.
+ */
+ map = original_map;
+
+ kr = vm_map_lookup_locked(&map, vaddr,
+ fault_type & ~VM_PROT_WRITE,
+ OBJECT_LOCK_EXCLUSIVE, &version,
+ &retry_object, &retry_offset, &retry_prot,
+ &wired,
+ &fault_info,
+ &real_map,
+ NULL);
+ pmap = real_map->pmap;
+
+ if (kr != KERN_SUCCESS) {
+ vm_map_unlock_read(map);
+
+ if (m != VM_PAGE_NULL) {
+ assert(VM_PAGE_OBJECT(m) == m_object);
+
+ /*
+ * retake the lock so that
+ * we can drop the paging reference
+ * in vm_fault_cleanup and do the
+ * PAGE_WAKEUP_DONE in RELEASE_PAGE
+ */
+ vm_object_lock(m_object);
+
+ RELEASE_PAGE(m);
+
+ vm_fault_cleanup(m_object, top_page);
+ } else {
+ /*
+ * retake the lock so that
+ * we can drop the paging reference
+ * in vm_fault_cleanup
+ */
+ vm_object_lock(object);
+
+ vm_fault_cleanup(object, top_page);
+ }
+ vm_object_deallocate(object);
+
+ goto done;
+ }
+ vm_object_unlock(retry_object);
+
+ if ((retry_object != object) || (retry_offset != offset)) {
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ if (m != VM_PAGE_NULL) {
+ assert(VM_PAGE_OBJECT(m) == m_object);
+
+ /*
+ * retake the lock so that
+ * we can drop the paging reference
+ * in vm_fault_cleanup and do the
+ * PAGE_WAKEUP_DONE in RELEASE_PAGE
+ */
+ vm_object_lock(m_object);
+
+ RELEASE_PAGE(m);
+
+ vm_fault_cleanup(m_object, top_page);
+ } else {
+ /*
+ * retake the lock so that
+ * we can drop the paging reference
+ * in vm_fault_cleanup
+ */
+ vm_object_lock(object);
+
+ vm_fault_cleanup(object, top_page);
+ }
+ vm_object_deallocate(object);
+
+ goto RetryFault;
+ }
+ /*
+ * Check whether the protection has changed or the object
+ * has been copied while we left the map unlocked.
+ */
+ if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
+ /* If the pmap layer cares, pass the full set. */
+ prot = retry_prot;
+ } else {
+ prot &= retry_prot;
+ }
+ }
+
+ if (object_locks_dropped == TRUE) {
+ if (m != VM_PAGE_NULL) {
+ vm_object_lock(m_object);
+
+ if (m_object->copy != old_copy_object) {
+ /*
+ * The copy object changed while the top-level object
+ * was unlocked, so take away write permission.
+ */
+ assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
+ prot &= ~VM_PROT_WRITE;
+ }
+ } else {
+ vm_object_lock(object);
+ }
+
+ object_locks_dropped = FALSE;
+ }
+
+ if (!need_copy &&
+ !fault_info.no_copy_on_read &&
+ m != VM_PAGE_NULL &&
+ VM_PAGE_OBJECT(m) != object &&
+ !VM_PAGE_OBJECT(m)->pager_trusted &&
+ vm_protect_privileged_from_untrusted &&
+ !((prot & VM_PROT_EXECUTE) &&
+ VM_PAGE_OBJECT(m)->code_signed &&
+ pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
+ current_proc_is_privileged()) {
+ /*
+ * We found the page we want in an "untrusted" VM object
+ * down the shadow chain. Since the target is "privileged"
+ * we want to perform a copy-on-read of that page, so that the
+ * mapped object gets a stable copy and does not have to
+ * rely on the "untrusted" object to provide the same
+ * contents if the page gets reclaimed and has to be paged
+ * in again later on.
+ *
+ * Special case: if the mapping is executable and the untrusted
+ * object is code-signed and the process is "cs_enforced", we
+ * do not copy-on-read because that would break code-signing
+ * enforcement expectations (an executable page must belong
+ * to a code-signed object) and we can rely on code-signing
+ * to re-validate the page if it gets evicted and paged back in.
+ */
+// printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+ vm_copied_on_read++;
+ need_copy_on_read = TRUE;
+ need_copy = TRUE;
+ } else {
+ need_copy_on_read = FALSE;
+ }
+
+ /*
+ * If we want to wire down this page, but no longer have
+ * adequate permissions, we must start all over.
+ * If we decided to copy-on-read, we must also start all over.
+ */
+ if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
+ need_copy_on_read) {
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ if (m != VM_PAGE_NULL) {
+ assert(VM_PAGE_OBJECT(m) == m_object);
+
+ RELEASE_PAGE(m);
+
+ vm_fault_cleanup(m_object, top_page);
+ } else {
+ vm_fault_cleanup(object, top_page);
+ }
+
+ vm_object_deallocate(object);
+
+ goto RetryFault;
+ }
+ if (m != VM_PAGE_NULL) {
+ /*
+ * Put this page into the physical map.
+ * We had to do the unlock above because pmap_enter
+ * may cause other faults. The page may be on
+ * the pageout queues. If the pageout daemon comes
+ * across the page, it will remove it from the queues.
+ */
+ if (fault_page_size < PAGE_SIZE) {
+ DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
+ assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
+ fault_phys_offset < PAGE_SIZE),
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ } else {
+ assertf(fault_phys_offset == 0,
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ }
+ if (caller_pmap) {
+ kr = vm_fault_enter(m,
+ caller_pmap,
+ caller_pmap_addr,
+ fault_page_size,
+ fault_phys_offset,
+ prot,
+ caller_prot,
+ wired,
+ change_wiring,
+ wire_tag,
+ &fault_info,
+ NULL,
+ &type_of_fault);
+ } else {
+ kr = vm_fault_enter(m,
+ pmap,
+ vaddr,
+ fault_page_size,
+ fault_phys_offset,
+ prot,
+ caller_prot,
+ wired,
+ change_wiring,
+ wire_tag,
+ &fault_info,
+ NULL,
+ &type_of_fault);
+ }
+ assert(VM_PAGE_OBJECT(m) == m_object);
+
+ {
+ int event_code = 0;
+
+ if (m_object->internal) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
+ } else if (m_object->object_is_shared_cache) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
+ } else {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0);
+ KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0);
+
+ DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
+ }
+ if (kr != KERN_SUCCESS) {
+ /* abort this page fault */
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+ PAGE_WAKEUP_DONE(m);
+ vm_fault_cleanup(m_object, top_page);
+ vm_object_deallocate(object);
+ goto done;
+ }
+ if (physpage_p != NULL) {
+ /* for vm_map_wire_and_extract() */
+ *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
+ if (prot & VM_PROT_WRITE) {
+ vm_object_lock_assert_exclusive(m_object);
+ m->vmp_dirty = TRUE;
+ }
+ }
+ } else {
+ vm_map_entry_t entry;
+ vm_map_offset_t laddr;
+ vm_map_offset_t ldelta, hdelta;
+
+ /*
+ * do a pmap block mapping from the physical address
+ * in the object
+ */
+
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ if (original_map != map) {
+ vm_map_unlock_read(map);
+ vm_map_lock_read(original_map);
+ map = original_map;
+ }
+ real_map = map;
+
+ laddr = vaddr;
+ hdelta = 0xFFFFF000;
+ ldelta = 0xFFFFF000;
+
+ while (vm_map_lookup_entry(map, laddr, &entry)) {
+ if (ldelta > (laddr - entry->vme_start)) {
+ ldelta = laddr - entry->vme_start;
+ }
+ if (hdelta > (entry->vme_end - laddr)) {
+ hdelta = entry->vme_end - laddr;
+ }
+ if (entry->is_sub_map) {
+ laddr = ((laddr - entry->vme_start)
+ + VME_OFFSET(entry));
+ vm_map_lock_read(VME_SUBMAP(entry));
+
+ if (map != real_map) {
+ vm_map_unlock_read(map);
+ }
+ if (entry->use_pmap) {
+ vm_map_unlock_read(real_map);
+ real_map = VME_SUBMAP(entry);
+ }
+ map = VME_SUBMAP(entry);
+ } else {
+ break;
+ }
+ }
+
+ if (vm_map_lookup_entry(map, laddr, &entry) &&
+ (VME_OBJECT(entry) != NULL) &&
+ (VME_OBJECT(entry) == object)) {
+ uint16_t superpage;
+
+ if (!object->pager_created &&
+ object->phys_contiguous &&
+ VME_OFFSET(entry) == 0 &&
+ (entry->vme_end - entry->vme_start == object->vo_size) &&
+ VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
+ superpage = VM_MEM_SUPERPAGE;
+ } else {
+ superpage = 0;
+ }
+
+ if (superpage && physpage_p) {
+ /* for vm_map_wire_and_extract() */
+ *physpage_p = (ppnum_t)
+ ((((vm_map_offset_t)
+ object->vo_shadow_offset)
+ + VME_OFFSET(entry)
+ + (laddr - entry->vme_start))
+ >> PAGE_SHIFT);
+ }
+
+ if (caller_pmap) {
+ /*
+ * Set up a block mapped area
+ */
+ assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
+ kr = pmap_map_block(caller_pmap,
+ (addr64_t)(caller_pmap_addr - ldelta),
+ (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
+ VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
+ (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
+ (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
+
+ if (kr != KERN_SUCCESS) {
+ goto cleanup;
+ }
+ } else {
+ /*
+ * Set up a block mapped area
+ */
+ assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
+ kr = pmap_map_block(real_map->pmap,
+ (addr64_t)(vaddr - ldelta),
+ (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
+ VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
+ (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
+ (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
+
+ if (kr != KERN_SUCCESS) {
+ goto cleanup;
+ }
+ }
+ }
+ }
+
+ /*
+ * Success
+ */
+ kr = KERN_SUCCESS;
+
+ /*
+ * TODO: could most of the done cases just use cleanup?
+ */
+cleanup:
+ /*
+ * Unlock everything, and return
+ */
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+
+ if (m != VM_PAGE_NULL) {
+ assert(VM_PAGE_OBJECT(m) == m_object);
+
+ if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
+ vm_object_paging_begin(m_object);
+
+ assert(written_on_object == VM_OBJECT_NULL);
+ written_on_object = m_object;
+ written_on_pager = m_object->pager;
+ written_on_offset = m_object->paging_offset + m->vmp_offset;
+ }
+ PAGE_WAKEUP_DONE(m);
+
+ vm_fault_cleanup(m_object, top_page);
+ } else {
+ vm_fault_cleanup(object, top_page);
+ }
+
+ vm_object_deallocate(object);
+
+#undef RELEASE_PAGE
+
+done:
+ thread_interrupt_level(interruptible_state);
+
+ if (resilient_media_object != VM_OBJECT_NULL) {
+ assert(resilient_media_retry);
+ assert(resilient_media_offset != (vm_object_offset_t)-1);
+ /* release extra reference on failed object */
+// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+ vm_object_deallocate(resilient_media_object);
+ resilient_media_object = VM_OBJECT_NULL;
+ resilient_media_offset = (vm_object_offset_t)-1;
+ resilient_media_retry = FALSE;
+ }
+ assert(!resilient_media_retry);
+
+ /*
+ * Only I/O throttle on faults which cause a pagein/swapin.
+ */
+ if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
+ throttle_lowpri_io(1);
+ } else {
+ if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
+ if ((throttle_delay = vm_page_throttled(TRUE))) {
+ if (vm_debug_events) {
+ if (type_of_fault == DBG_COMPRESSOR_FAULT) {
+ VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+ } else if (type_of_fault == DBG_COW_FAULT) {
+ VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+ } else {
+ VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+ }
+ }
+ delay(throttle_delay);
+ }
+ }
+ }
+
+ if (written_on_object) {
+ vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
+
+ vm_object_lock(written_on_object);
+ vm_object_paging_end(written_on_object);
+ vm_object_unlock(written_on_object);
+
+ written_on_object = VM_OBJECT_NULL;
+ }
+
+ if (rtfault) {
+ vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ kr,
+ vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
+ 0);
+
+ if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
+ DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
+ }
+
+ return kr;
+}
+
+/*
+ * vm_fault_wire:
+ *
+ * Wire down the virtual address range covered by "entry" in "map",
+ * one page at a time.
+ *
+ * Nothing is done (KERN_SUCCESS is returned right away) when the
+ * entry is not a submap and its VM object is phys_contiguous.
+ *
+ * Each page is first handed to vm_fault_wire_fast(); if that does
+ * not succeed, the page goes through vm_fault_internal(), called
+ * with THREAD_UNINT when "pmap" is kernel_pmap and with
+ * THREAD_ABORTSAFE otherwise.
+ *
+ * "pmap_addr" is the address in "pmap" that corresponds to
+ * entry->vme_start; "physpage_p" is forwarded untouched to both
+ * fault routines.
+ *
+ * Returns KERN_SUCCESS once every page has been processed. On the
+ * first failure, the pages in front of the failing address are
+ * unwired again and the failing routine's return code is returned.
+ */
+kern_return_t
+vm_fault_wire(
+	vm_map_t map,
+	vm_map_entry_t entry,
+	vm_prot_t prot,
+	vm_tag_t wire_tag,
+	pmap_t pmap,
+	vm_map_offset_t pmap_addr,
+	ppnum_t *physpage_p)
+{
+	vm_map_offset_t wire_end = entry->vme_end;
+	vm_map_offset_t cur;
+	vm_map_size_t stride;
+	kern_return_t kr;
+
+	assert(entry->in_transition);
+
+	if (VME_OBJECT(entry) != NULL && !entry->is_sub_map) {
+		if (VME_OBJECT(entry)->phys_contiguous) {
+			/* shortcut: no per-page work for this object */
+			return KERN_SUCCESS;
+		}
+	}
+
+	/*
+	 * Tell the pmap layer that this range may not fault, so
+	 * that page tables and the like can be locked down too.
+	 */
+	pmap_pageable(pmap, pmap_addr,
+	    pmap_addr + (wire_end - entry->vme_start), FALSE);
+
+	/*
+	 * Walk the range, simulating a fault on each page so that it
+	 * gets entered in the physical map. The stride is the smaller
+	 * of the map's page size and PAGE_SIZE.
+	 */
+	stride = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
+
+	for (cur = entry->vme_start; cur < wire_end; cur += stride) {
+		/* try the fast path first */
+		kr = vm_fault_wire_fast(map, cur, prot, wire_tag, entry, pmap,
+		    pmap_addr + (cur - entry->vme_start),
+		    physpage_p);
+		if (kr == KERN_SUCCESS) {
+			continue;
+		}
+
+		/* fast path gave up: take the general fault path */
+		kr = vm_fault_internal(map, cur, prot, TRUE, wire_tag,
+		    ((pmap == kernel_pmap)
+		    ? THREAD_UNINT
+		    : THREAD_ABORTSAFE),
+		    pmap,
+		    (pmap_addr +
+		    (cur - entry->vme_start)),
+		    physpage_p);
+		DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
+		if (kr == KERN_SUCCESS) {
+			continue;
+		}
+
+		/*
+		 * Both paths failed for this page: undo the wiring of
+		 * [vme_start, cur) using a private copy of the entry
+		 * that ends at the failing address, without
+		 * deallocating the pages, then report the error.
+		 */
+		struct vm_map_entry partial = *entry;
+
+		partial.vme_end = cur;
+		vm_fault_unwire(map,
+		    &partial, FALSE, pmap, pmap_addr);
+
+		return kr;
+	}
+
+	return KERN_SUCCESS;
+}
+
+/*
+ * vm_fault_unwire:
+ *
+ * Unwire a range of virtual addresses in a map.
+ */
+void
+vm_fault_unwire(
+ vm_map_t map,
+ vm_map_entry_t entry,
+ boolean_t deallocate,
+ pmap_t pmap,
+ vm_map_offset_t pmap_addr)
+{
+ vm_map_offset_t va;
+ vm_map_offset_t end_addr = entry->vme_end;
+ vm_object_t object;
+ struct vm_object_fault_info fault_info = {};
+ unsigned int unwired_pages;
+ vm_map_size_t effective_page_size;
+
+ object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
+
+ /*
+ * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
+ * do anything since such memory is wired by default. So we don't have
+ * anything to undo here.
+ */
+
+ if (object != VM_OBJECT_NULL && object->phys_contiguous) {
+ return;
+ }
+
+ fault_info.interruptible = THREAD_UNINT;
+ fault_info.behavior = entry->behavior;
+ fault_info.user_tag = VME_ALIAS(entry);
+ if (entry->iokit_acct ||
+ (!entry->is_sub_map && !entry->use_pmap)) {
+ fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
+ }
+ fault_info.lo_offset = VME_OFFSET(entry);
+ fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
+ fault_info.no_cache = entry->no_cache;
+ fault_info.stealth = TRUE;
+
+ unwired_pages = 0;
+
+ /*
+ * Since the pages are wired down, we must be able to
+ * get their mappings from the physical map system.
+ */
+
+ effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
+ for (va = entry->vme_start;
+ va < end_addr;
+ va += effective_page_size) {
+ if (object == VM_OBJECT_NULL) {
+ if (pmap) {
+ pmap_change_wiring(pmap,
+ pmap_addr + (va - entry->vme_start), FALSE);
+ }
+ (void) vm_fault(map, va, VM_PROT_NONE,
+ TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
+ } else {
+ vm_prot_t prot;
+ vm_page_t result_page;
+ vm_page_t top_page;
+ vm_object_t result_object;
+ vm_fault_return_t result;
+
+ /* cap cluster size at maximum UPL size */
+ upl_size_t cluster_size;
+ if (os_sub_overflow(end_addr, va, &cluster_size)) {
+ cluster_size = 0 - (upl_size_t)PAGE_SIZE;
+ }
+ fault_info.cluster_size = cluster_size;
+
+ do {
+ prot = VM_PROT_NONE;
+
+ vm_object_lock(object);
+ vm_object_paging_begin(object);
+ result_page = VM_PAGE_NULL;
+ result = vm_fault_page(
+ object,
+ (VME_OFFSET(entry) +
+ (va - entry->vme_start)),
+ VM_PROT_NONE, TRUE,
+ FALSE, /* page not looked up */
+ &prot, &result_page, &top_page,
+ (int *)0,
+ NULL, map->no_zero_fill,
+ FALSE, &fault_info);
+ } while (result == VM_FAULT_RETRY);
+
+ /*
+ * If this was a mapping to a file on a device that has been forcibly
+ * unmounted, then we won't get a page back from vm_fault_page(). Just
+ * move on to the next one in case the remaining pages are mapped from
+ * different objects. During a forced unmount, the object is terminated
+ * so the alive flag will be false if this happens. A forced unmount will
+ * occur when an external disk is unplugged before the user does an
+ * eject, so we don't want to panic in that situation.
+ */
+
+ if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
+ continue;
+ }
+
+ if (result == VM_FAULT_MEMORY_ERROR &&
+ object == kernel_object) {
+ /*
+ * This must have been allocated with
+ * KMA_KOBJECT and KMA_VAONLY and there's
+ * no physical page at this offset.
+ * We're done (no page to free).
+ */
+ assert(deallocate);
+ continue;
+ }
+
+ if (result != VM_FAULT_SUCCESS) {
+ panic("vm_fault_unwire: failure");
+ }
+
+ result_object = VM_PAGE_OBJECT(result_page);
+
+ if (deallocate) {
+ assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
+ vm_page_fictitious_addr);
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
+ if (VM_PAGE_WIRED(result_page)) {
+ unwired_pages++;
+ }
+ VM_PAGE_FREE(result_page);
+ } else {
+ if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
+ pmap_change_wiring(pmap,
+ pmap_addr + (va - entry->vme_start), FALSE);
+ }
+
+
+ if (VM_PAGE_WIRED(result_page)) {
+ vm_page_lockspin_queues();
+ vm_page_unwire(result_page, TRUE);
+ vm_page_unlock_queues();
+ unwired_pages++;
+ }
+ if (entry->zero_wired_pages) {
+ pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
+ entry->zero_wired_pages = FALSE;
+ }
+
+ PAGE_WAKEUP_DONE(result_page);
+ }
+ vm_fault_cleanup(result_object, top_page);
+ }
+ }