+ if (wait_result != THREAD_AWAKENED) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ if (wait_result == THREAD_RESTART) {
+ return VM_FAULT_RETRY;
+ } else {
+ return VM_FAULT_INTERRUPTED;
+ }
+ }
+ continue;
+ }
+ if (m->vmp_laundry) {
+ m->vmp_free_when_done = FALSE;
+
+ if (!m->vmp_cleaning) {
+ vm_pageout_steal_laundry(m, FALSE);
+ }
+ }
+ if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
+ /*
+ * Guard page: off limits !
+ */
+ if (fault_type == VM_PROT_NONE) {
+ /*
+ * The fault is not requesting any
+ * access to the guard page, so it must
+ * be just to wire or unwire it.
+ * Let's pretend it succeeded...
+ */
+ m->vmp_busy = TRUE;
+ *result_page = m;
+ assert(first_m == VM_PAGE_NULL);
+ *top_page = first_m;
+ if (type_of_fault) {
+ *type_of_fault = DBG_GUARD_FAULT;
+ }
+ thread_interrupt_level(interruptible_state);
+ return VM_FAULT_SUCCESS;
+ } else {
+ /*
+ * The fault requests access to the
+ * guard page: let's deny that !
+ */
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+ return VM_FAULT_MEMORY_ERROR;
+ }
+ }
+
+ if (m->vmp_error) {
+ /*
+ * The page is in error, give up now.
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
+#endif
+ if (error_code) {
+ *error_code = KERN_MEMORY_ERROR;
+ }
+ VM_PAGE_FREE(m);
+
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_ERROR;
+ }
+ if (m->vmp_restart) {
+ /*
+ * The pager wants us to restart
+ * at the top of the chain,
+ * typically because it has moved the
+ * page to another pager, then do so.
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
+#endif
+ VM_PAGE_FREE(m);
+
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_RETRY;
+ }
+ if (m->vmp_absent) {
+ /*
+ * The page isn't busy, but is absent,
+ * therefore it's deemed "unavailable".
+ *
+ * Remove the non-existent page (unless it's
+ * in the top object) and move on down to the
+ * next object (if there is one).
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
+#endif
+ next_object = object->shadow;
+
+ if (next_object == VM_OBJECT_NULL) {
+ /*
+ * Absent page at bottom of shadow
+ * chain; zero fill the page we left
+ * busy in the first object, and free
+ * the absent page.
+ */
+ assert(!must_be_resident);
+
+ /*
+ * check for any conditions that prevent
+ * us from creating a new zero-fill page
+ * vm_fault_check will do all of the
+ * fault cleanup in the case of an error condition
+ * including resetting the thread_interrupt_level
+ */
+ error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
+
+ if (error != VM_FAULT_SUCCESS) {
+ return error;
+ }
+
+ if (object != first_object) {
+ /*
+ * free the absent page we just found
+ */
+ VM_PAGE_FREE(m);
+
+ /*
+ * drop reference and lock on current object
+ */
+ vm_object_paging_end(object);
+ vm_object_unlock(object);
+
+ /*
+ * grab the original page we
+ * 'soldered' in place and
+ * retake lock on 'first_object'
+ */
+ m = first_m;
+ first_m = VM_PAGE_NULL;
+
+ object = first_object;
+ offset = first_offset;
+
+ vm_object_lock(object);
+ } else {
+ /*
+ * we're going to use the absent page we just found
+ * so convert it to a 'busy' page
+ */
+ m->vmp_absent = FALSE;
+ m->vmp_busy = TRUE;
+ }
+ if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
+ m->vmp_absent = TRUE;
+ }
+ /*
+ * zero-fill the page and put it on
+ * the correct paging queue
+ */
+ my_fault = vm_fault_zero_page(m, no_zero_fill);
+
+ break;
+ } else {
+ if (must_be_resident) {
+ vm_object_paging_end(object);
+ } else if (object != first_object) {
+ vm_object_paging_end(object);
+ VM_PAGE_FREE(m);
+ } else {
+ first_m = m;
+ m->vmp_absent = FALSE;
+ m->vmp_busy = TRUE;
+
+ vm_page_lockspin_queues();
+ vm_page_queues_remove(m, FALSE);
+ vm_page_unlock_queues();
+ }
+
+ offset += object->vo_shadow_offset;
+ fault_info->lo_offset += object->vo_shadow_offset;
+ fault_info->hi_offset += object->vo_shadow_offset;
+ access_required = VM_PROT_READ;
+
+ vm_object_lock(next_object);
+ vm_object_unlock(object);
+ object = next_object;
+ vm_object_paging_begin(object);
+
+ /*
+ * reset to default type of fault
+ */
+ my_fault = DBG_CACHE_HIT_FAULT;
+
+ continue;
+ }
+ }
+ if ((m->vmp_cleaning)
+ && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
+ && (fault_type & VM_PROT_WRITE)) {
+ /*
+ * This is a copy-on-write fault that will
+ * cause us to revoke access to this page, but
+ * this page is in the process of being cleaned
+ * in a clustered pageout. We must wait until
+ * the cleaning operation completes before
+ * revoking access to the original page,
+ * otherwise we might attempt to remove a
+ * wired mapping.
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
+#endif
+ /*
+ * take an extra ref so that object won't die
+ */
+ vm_object_reference_locked(object);
+
+ vm_fault_cleanup(object, first_m);
+
+ vm_object_lock(object);
+ assert(object->ref_count > 0);
+
+ m = vm_page_lookup(object, vm_object_trunc_page(offset));
+
+ if (m != VM_PAGE_NULL && m->vmp_cleaning) {
+ PAGE_ASSERT_WAIT(m, interruptible);
+
+ vm_object_unlock(object);
+ wait_result = thread_block(THREAD_CONTINUE_NULL);
+ vm_object_deallocate(object);
+
+ goto backoff;
+ } else {
+ vm_object_unlock(object);
+
+ vm_object_deallocate(object);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
+ !(fault_info != NULL && fault_info->stealth)) {
+ /*
+ * If we were passed a non-NULL pointer for
+ * "type_of_fault", then we came from
+ * vm_fault... we'll let it deal with
+ * this condition, since it
+ * needs to see m->vmp_speculative to correctly
+ * account the pageins, otherwise...
+ * take it off the speculative queue, we'll
+ * let the caller of vm_fault_page deal
+ * with getting it onto the correct queue
+ *
+ * If the caller specified in fault_info that
+ * it wants a "stealth" fault, we also leave
+ * the page in the speculative queue.
+ */
+ vm_page_lockspin_queues();
+ if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
+ vm_page_queues_remove(m, FALSE);
+ }
+ vm_page_unlock_queues();
+ }
+ assert(object == VM_PAGE_OBJECT(m));
+
+ if (object->code_signed) {
+ /*
+ * CODE SIGNING:
+ * We just paged in a page from a signed
+ * memory object but we don't need to
+ * validate it now. We'll validate it
+ * when it gets mapped into a user address
+ * space for the first time or when the page
+ * gets copied to another object as a result
+ * of a copy-on-write.
+ */
+ }
+
+ /*
+ * We mark the page busy and leave it on
+ * the pageout queues. If the pageout
+ * daemon comes across it, then it will
+ * remove the page from the queue, but not the object
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
+#endif
+ assert(!m->vmp_busy);
+ assert(!m->vmp_absent);
+
+ m->vmp_busy = TRUE;
+ break;
+ }
+
+
+ /*
+ * we get here when there is no page present in the object at
+ * the offset we're interested in... we'll allocate a page
+ * at this point if the pager associated with
+ * this object can provide the data or we're the top object...
+ * object is locked; m == NULL
+ */
+
+ if (must_be_resident) {
+ if (fault_type == VM_PROT_NONE &&
+ object == kernel_object) {
+ /*
+ * We've been called from vm_fault_unwire()
+ * while removing a map entry that was allocated
+ * with KMA_KOBJECT and KMA_VAONLY. This page
+ * is not present and there's nothing more to
+ * do here (nothing to unwire).
+ */
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_ERROR;
+ }
+
+ goto dont_look_for_page;
+ }
+
+ /* Don't expect to fault pages into the kernel object. */
+ assert(object != kernel_object);
+
+ data_supply = FALSE;
+
+ look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
+
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
+#endif
+ if (!look_for_page && object == first_object && !object->phys_contiguous) {
+ /*
+ * Allocate a new page for this object/offset pair as a placeholder
+ */
+ m = vm_page_grab_options(grab_options);
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
+#endif
+ if (m == VM_PAGE_NULL) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_SHORTAGE;
+ }
+
+ if (fault_info && fault_info->batch_pmap_op == TRUE) {
+ vm_page_insert_internal(m, object,
+ vm_object_trunc_page(offset),
+ VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
+ } else {
+ vm_page_insert(m, object, vm_object_trunc_page(offset));
+ }
+ }
+ if (look_for_page) {
+ kern_return_t rc;
+ int my_fault_type;
+
+ /*
+ * If the memory manager is not ready, we
+ * cannot make requests.
+ */
+ if (!object->pager_ready) {
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
+#endif
+ if (m != VM_PAGE_NULL) {
+ VM_PAGE_FREE(m);
+ }
+
+ /*
+ * take an extra ref so object won't die
+ */
+ vm_object_reference_locked(object);
+ vm_fault_cleanup(object, first_m);
+
+ vm_object_lock(object);
+ assert(object->ref_count > 0);
+
+ if (!object->pager_ready) {
+ wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
+
+ vm_object_unlock(object);
+ if (wait_result == THREAD_WAITING) {
+ wait_result = thread_block(THREAD_CONTINUE_NULL);
+ }
+ vm_object_deallocate(object);
+
+ goto backoff;
+ } else {
+ vm_object_unlock(object);
+ vm_object_deallocate(object);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
+ /*
+ * If there are too many outstanding page
+ * requests pending on this external object, we
+ * wait for them to be resolved now.
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
+#endif
+ if (m != VM_PAGE_NULL) {
+ VM_PAGE_FREE(m);
+ }
+ /*
+ * take an extra ref so object won't die
+ */
+ vm_object_reference_locked(object);
+
+ vm_fault_cleanup(object, first_m);
+
+ vm_object_lock(object);
+ assert(object->ref_count > 0);
+
+ if (object->paging_in_progress >= vm_object_pagein_throttle) {
+ vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
+
+ vm_object_unlock(object);
+ wait_result = thread_block(THREAD_CONTINUE_NULL);
+ vm_object_deallocate(object);
+
+ goto backoff;
+ } else {
+ vm_object_unlock(object);
+ vm_object_deallocate(object);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (object->internal) {
+ int compressed_count_delta;
+
+ assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+
+ if (m == VM_PAGE_NULL) {
+ /*
+ * Allocate a new page for this object/offset pair as a placeholder
+ */
+ m = vm_page_grab_options(grab_options);
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
+#endif
+ if (m == VM_PAGE_NULL) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_SHORTAGE;
+ }
+
+ m->vmp_absent = TRUE;
+ if (fault_info && fault_info->batch_pmap_op == TRUE) {
+ vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
+ } else {
+ vm_page_insert(m, object, vm_object_trunc_page(offset));
+ }
+ }
+ assert(m->vmp_busy);
+
+ m->vmp_absent = TRUE;
+ pager = object->pager;
+
+ assert(object->paging_in_progress > 0);
+ vm_object_unlock(object);
+
+ rc = vm_compressor_pager_get(
+ pager,
+ offset + object->paging_offset,
+ VM_PAGE_GET_PHYS_PAGE(m),
+ &my_fault_type,
+ 0,
+ &compressed_count_delta);
+
+ if (type_of_fault == NULL) {
+ int throttle_delay;
+
+ /*
+ * we weren't called from vm_fault, so we
+ * need to apply page creation throttling
+ * do it before we re-acquire any locks
+ */
+ if (my_fault_type == DBG_COMPRESSOR_FAULT) {
+ if ((throttle_delay = vm_page_throttled(TRUE))) {
+ VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
+ delay(throttle_delay);
+ }
+ }
+ }
+ vm_object_lock(object);
+ assert(object->paging_in_progress > 0);
+
+ vm_compressor_pager_count(
+ pager,
+ compressed_count_delta,
+ FALSE, /* shared_lock */
+ object);
+
+ switch (rc) {
+ case KERN_SUCCESS:
+ m->vmp_absent = FALSE;
+ m->vmp_dirty = TRUE;
+ if ((object->wimg_bits &
+ VM_WIMG_MASK) !=
+ VM_WIMG_USE_DEFAULT) {
+ /*
+ * If the page is not cacheable,
+ * we can't let its contents
+ * linger in the data cache
+ * after the decompression.
+ */
+ pmap_sync_page_attributes_phys(
+ VM_PAGE_GET_PHYS_PAGE(m));
+ } else {
+ m->vmp_written_by_kernel = TRUE;
+ }
+
+ /*
+ * If the object is purgeable, its
+ * owner's purgeable ledgers have been
+ * updated in vm_page_insert() but the
+ * page was also accounted for in a
+ * "compressed purgeable" ledger, so
+ * update that now.
+ */
+ if (((object->purgable !=
+ VM_PURGABLE_DENY) ||
+ object->vo_ledger_tag) &&
+ (object->vo_owner !=
+ NULL)) {
+ /*
+ * One less compressed
+ * purgeable/tagged page.
+ */
+ vm_object_owner_compressed_update(
+ object,
+ -1);
+ }
+
+ break;
+ case KERN_MEMORY_FAILURE:
+ m->vmp_unusual = TRUE;
+ m->vmp_error = TRUE;
+ m->vmp_absent = FALSE;
+ break;
+ case KERN_MEMORY_ERROR:
+ assert(m->vmp_absent);
+ break;
+ default:
+ panic("vm_fault_page(): unexpected "
+ "error %d from "
+ "vm_compressor_pager_get()\n",
+ rc);
+ }
+ PAGE_WAKEUP_DONE(m);
+
+ rc = KERN_SUCCESS;
+ goto data_requested;
+ }
+ my_fault_type = DBG_PAGEIN_FAULT;
+
+ if (m != VM_PAGE_NULL) {
+ VM_PAGE_FREE(m);
+ m = VM_PAGE_NULL;
+ }
+
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
+#endif
+
+ /*
+ * It's possible someone called vm_object_destroy while we weren't
+ * holding the object lock. If that has happened, then bail out
+ * here.
+ */
+
+ pager = object->pager;
+
+ if (pager == MEMORY_OBJECT_NULL) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+ return VM_FAULT_MEMORY_ERROR;
+ }
+
+ /*
+ * We have an absent page in place for the faulting offset,
+ * so we can release the object lock.
+ */
+
+ if (object->object_is_shared_cache) {
+ set_thread_rwlock_boost();
+ }
+
+ vm_object_unlock(object);
+
+ /*
+ * If this object uses a copy_call strategy,
+ * and we are interested in a copy of this object
+ * (having gotten here only by following a
+ * shadow chain), then tell the memory manager
+ * via a flag added to the desired_access
+ * parameter, so that it can detect a race
+ * between our walking down the shadow chain
+ * and its pushing pages up into a copy of
+ * the object that it manages.
+ */
+ if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
+ wants_copy_flag = VM_PROT_WANTS_COPY;
+ } else {
+ wants_copy_flag = VM_PROT_NONE;
+ }
+
+ if (object->copy == first_object) {
+ /*
+ * if we issue the memory_object_data_request in
+ * this state, we are subject to a deadlock with
+ * the underlying filesystem if it is trying to
+ * shrink the file resulting in a push of pages
+ * into the copy object... that push will stall
+ * on the placeholder page, and if the pushing thread
+ * is holding a lock that is required on the pagein
+ * path (such as a truncate lock), we'll deadlock...
+ * to avoid this potential deadlock, we throw away
+ * our placeholder page before calling memory_object_data_request
+ * and force this thread to retry the vm_fault_page after
+ * we have issued the I/O. the second time through this path
+ * we will find the page already in the cache (presumably still
+ * busy waiting for the I/O to complete) and then complete
+ * the fault w/o having to go through memory_object_data_request again
+ */
+ assert(first_m != VM_PAGE_NULL);
+ assert(VM_PAGE_OBJECT(first_m) == first_object);
+
+ vm_object_lock(first_object);
+ VM_PAGE_FREE(first_m);
+ vm_object_paging_end(first_object);
+ vm_object_unlock(first_object);
+
+ first_m = VM_PAGE_NULL;
+ force_fault_retry = TRUE;
+
+ vm_fault_page_forced_retry++;
+ }
+
+ if (data_already_requested == TRUE) {
+ orig_behavior = fault_info->behavior;
+ orig_cluster_size = fault_info->cluster_size;
+
+ fault_info->behavior = VM_BEHAVIOR_RANDOM;
+ fault_info->cluster_size = PAGE_SIZE;
+ }
+ /*
+ * Call the memory manager to retrieve the data.
+ */
+ rc = memory_object_data_request(
+ pager,
+ vm_object_trunc_page(offset) + object->paging_offset,
+ PAGE_SIZE,
+ access_required | wants_copy_flag,
+ (memory_object_fault_info_t)fault_info);
+
+ if (data_already_requested == TRUE) {
+ fault_info->behavior = orig_behavior;
+ fault_info->cluster_size = orig_cluster_size;
+ } else {
+ data_already_requested = TRUE;
+ }
+
+ DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
+#endif
+ vm_object_lock(object);
+
+ if (object->object_is_shared_cache) {
+ clear_thread_rwlock_boost();
+ }
+
+data_requested:
+ if (rc != KERN_SUCCESS) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return (rc == MACH_SEND_INTERRUPTED) ?
+ VM_FAULT_INTERRUPTED :
+ VM_FAULT_MEMORY_ERROR;
+ } else {
+ clock_sec_t tv_sec;
+ clock_usec_t tv_usec;
+
+ if (my_fault_type == DBG_PAGEIN_FAULT) {
+ clock_get_system_microtime(&tv_sec, &tv_usec);
+ current_thread()->t_page_creation_time = tv_sec;
+ current_thread()->t_page_creation_count = 0;
+ }
+ }
+ if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_INTERRUPTED;
+ }
+ if (force_fault_retry == TRUE) {
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_RETRY;
+ }
+ if (m == VM_PAGE_NULL && object->phys_contiguous) {
+ /*
+ * No page here means that the object we
+ * initially looked up was "physically
+ * contiguous" (i.e. device memory). However,
+ * with Virtual VRAM, the object might not
+ * be backed by that device memory anymore,
+ * so we're done here only if the object is
+ * still "phys_contiguous".
+ * Otherwise, if the object is no longer
+ * "phys_contiguous", we need to retry the
+ * page fault against the object's new backing
+ * store (different memory object).
+ */
+phys_contig_object:
+ goto done;
+ }
+ /*
+ * potentially a pagein fault
+ * if we make it through the state checks
+ * above, then we'll count it as such
+ */
+ my_fault = my_fault_type;
+
+ /*
+ * Retry with same object/offset, since new data may
+ * be in a different page (i.e., m is meaningless at
+ * this point).
+ */
+ continue;
+ }
+dont_look_for_page:
+ /*
+ * We get here if the object has no pager, or an existence map
+ * exists and indicates the page isn't present on the pager
+ * or we're unwiring a page. If a pager exists, but there
+ * is no existence map, then the m->vmp_absent case above handles
+ * the ZF case when the pager can't provide the page
+ */
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
+#endif
+ if (object == first_object) {
+ first_m = m;
+ } else {
+ assert(m == VM_PAGE_NULL);
+ }
+
+ next_object = object->shadow;
+
+ if (next_object == VM_OBJECT_NULL) {
+ /*
+ * we've hit the bottom of the shadow chain,
+ * fill the page in the top object with zeros.
+ */
+ assert(!must_be_resident);
+
+ if (object != first_object) {
+ vm_object_paging_end(object);
+ vm_object_unlock(object);
+
+ object = first_object;
+ offset = first_offset;
+ vm_object_lock(object);
+ }
+ m = first_m;
+ assert(VM_PAGE_OBJECT(m) == object);
+ first_m = VM_PAGE_NULL;
+
+ /*
+ * check for any conditions that prevent
+ * us from creating a new zero-fill page
+ * vm_fault_check will do all of the
+ * fault cleanup in the case of an error condition
+ * including resetting the thread_interrupt_level
+ */
+ error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
+
+ if (error != VM_FAULT_SUCCESS) {
+ return error;
+ }
+
+ if (m == VM_PAGE_NULL) {
+ m = vm_page_grab_options(grab_options);
+
+ if (m == VM_PAGE_NULL) {
+ vm_fault_cleanup(object, VM_PAGE_NULL);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_SHORTAGE;
+ }
+ vm_page_insert(m, object, vm_object_trunc_page(offset));
+ }
+ if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
+ m->vmp_absent = TRUE;
+ }
+
+ my_fault = vm_fault_zero_page(m, no_zero_fill);
+
+ break;
+ } else {
+ /*
+ * Move on to the next object. Lock the next
+ * object before unlocking the current one.
+ */
+ if ((object != first_object) || must_be_resident) {
+ vm_object_paging_end(object);
+ }
+
+ offset += object->vo_shadow_offset;
+ fault_info->lo_offset += object->vo_shadow_offset;
+ fault_info->hi_offset += object->vo_shadow_offset;
+ access_required = VM_PROT_READ;
+
+ vm_object_lock(next_object);
+ vm_object_unlock(object);
+
+ object = next_object;
+ vm_object_paging_begin(object);
+ }
+ }
+
+ /*
+ * PAGE HAS BEEN FOUND.
+ *
+ * This page (m) is:
+ * busy, so that we can play with it;
+ * not absent, so that nobody else will fill it;
+ * possibly eligible for pageout;
+ *
+ * The top-level page (first_m) is:
+ * VM_PAGE_NULL if the page was found in the
+ * top-level object;
+ * busy, not absent, and ineligible for pageout.
+ *
+ * The current object (object) is locked. A paging
+ * reference is held for the current and top-level
+ * objects.
+ */
+
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
+#endif
+#if EXTRA_ASSERTIONS
+ assert(m->vmp_busy && !m->vmp_absent);
+ assert((first_m == VM_PAGE_NULL) ||
+ (first_m->vmp_busy && !first_m->vmp_absent &&
+ !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
+#endif /* EXTRA_ASSERTIONS */
+
+ /*
+ * If the page is being written, but isn't
+ * already owned by the top-level object,
+ * we have to copy it into a new page owned
+ * by the top-level object.
+ */
+ if (object != first_object) {
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
+#endif
+ if (fault_type & VM_PROT_WRITE) {
+ vm_page_t copy_m;
+
+ /*
+ * We only really need to copy if we
+ * want to write it.
+ */
+ assert(!must_be_resident);
+
+ /*
+ * If we try to collapse first_object at this
+ * point, we may deadlock when we try to get
+ * the lock on an intermediate object (since we
+ * have the bottom object locked). We can't
+ * unlock the bottom object, because the page
+ * we found may move (by collapse) if we do.
+ *
+ * Instead, we first copy the page. Then, when
+ * we have no more use for the bottom object,
+ * we unlock it and try to collapse.
+ *
+ * Note that we copy the page even if we didn't
+ * need to... that's the breaks.
+ */
+
+ /*
+ * Allocate a page for the copy
+ */
+ copy_m = vm_page_grab_options(grab_options);
+
+ if (copy_m == VM_PAGE_NULL) {
+ RELEASE_PAGE(m);
+
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_SHORTAGE;
+ }
+
+ vm_page_copy(m, copy_m);
+
+ /*
+ * If another map is truly sharing this
+ * page with us, we have to flush all
+ * uses of the original page, since we
+ * can't distinguish those which want the
+ * original from those which need the
+ * new copy.
+ *
+ * XXXO If we know that only one map has
+ * access to this page, then we could
+ * avoid the pmap_disconnect() call.
+ */
+ if (m->vmp_pmapped) {
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
+ }
+
+ if (m->vmp_clustered) {
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+ VM_PAGE_CONSUME_CLUSTERED(m);
+ }
+ assert(!m->vmp_cleaning);
+
+ /*
+ * We no longer need the old page or object.
+ */
+ RELEASE_PAGE(m);
+
+ /*
+ * This check helps with marking the object as having a sequential pattern
+ * Normally we'll miss doing this below because this fault is about COW to
+ * the first_object i.e. bring page in from disk, push to object above but
+ * don't update the file object's sequential pattern.
+ */
+ if (object->internal == FALSE) {
+ vm_fault_is_sequential(object, offset, fault_info->behavior);
+ }
+
+ vm_object_paging_end(object);
+ vm_object_unlock(object);
+
+ my_fault = DBG_COW_FAULT;
+ counter_inc(&vm_statistics_cow_faults);
+ DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
+ current_task()->cow_faults++;
+
+ object = first_object;
+ offset = first_offset;
+
+ vm_object_lock(object);
+ /*
+ * get rid of the place holder
+ * page that we soldered in earlier
+ */
+ VM_PAGE_FREE(first_m);
+ first_m = VM_PAGE_NULL;
+
+ /*
+ * and replace it with the
+ * page we just copied into
+ */
+ assert(copy_m->vmp_busy);
+ vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
+ SET_PAGE_DIRTY(copy_m, TRUE);
+
+ m = copy_m;
+ /*
+ * Now that we've gotten the copy out of the
+ * way, let's try to collapse the top object.
+ * But we have to play ugly games with
+ * paging_in_progress to do that...
+ */
+ vm_object_paging_end(object);
+ vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
+ vm_object_paging_begin(object);
+ } else {
+ *protection &= (~VM_PROT_WRITE);
+ }
+ }
+ /*
+ * Now check whether the page needs to be pushed into the
+ * copy object. The use of asymmetric copy on write for
+ * shared temporary objects means that we may do two copies to
+ * satisfy the fault; one above to get the page from a
+ * shadowed object, and one here to push it into the copy.
+ */
+ try_failed_count = 0;
+
+ while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
+ vm_object_offset_t copy_offset;
+ vm_page_t copy_m;
+
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
+#endif
+ /*
+ * If the page is being written, but hasn't been
+ * copied to the copy-object, we have to copy it there.
+ */
+ if ((fault_type & VM_PROT_WRITE) == 0) {
+ *protection &= ~VM_PROT_WRITE;
+ break;
+ }
+
+ /*
+ * If the page was guaranteed to be resident,
+ * we must have already performed the copy.
+ */
+ if (must_be_resident) {
+ break;
+ }
+
+ /*
+ * Try to get the lock on the copy_object.
+ */
+ if (!vm_object_lock_try(copy_object)) {
+ vm_object_unlock(object);
+ try_failed_count++;
+
+ mutex_pause(try_failed_count); /* wait a bit */
+ vm_object_lock(object);
+
+ continue;
+ }
+ try_failed_count = 0;
+
+ /*
+ * Make another reference to the copy-object,
+ * to keep it from disappearing during the
+ * copy.
+ */
+ vm_object_reference_locked(copy_object);
+
+ /*
+ * Does the page exist in the copy?
+ */
+ copy_offset = first_offset - copy_object->vo_shadow_offset;
+ copy_offset = vm_object_trunc_page(copy_offset);
+
+ if (copy_object->vo_size <= copy_offset) {
+ /*
+ * Copy object doesn't cover this page -- do nothing.
+ */
+ ;
+ } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
+ /*
+ * Page currently exists in the copy object
+ */
+ if (copy_m->vmp_busy) {
+ /*
+ * If the page is being brought
+ * in, wait for it and then retry.
+ */
+ RELEASE_PAGE(m);
+
+ /*
+ * take an extra ref so object won't die
+ */
+ vm_object_reference_locked(copy_object);
+ vm_object_unlock(copy_object);
+ vm_fault_cleanup(object, first_m);
+
+ vm_object_lock(copy_object);
+ assert(copy_object->ref_count > 0);
+ vm_object_lock_assert_exclusive(copy_object);
+ copy_object->ref_count--;
+ assert(copy_object->ref_count > 0);
+ copy_m = vm_page_lookup(copy_object, copy_offset);
+
+ if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
+ PAGE_ASSERT_WAIT(copy_m, interruptible);
+
+ vm_object_unlock(copy_object);
+ wait_result = thread_block(THREAD_CONTINUE_NULL);
+ vm_object_deallocate(copy_object);
+
+ goto backoff;
+ } else {
+ vm_object_unlock(copy_object);
+ vm_object_deallocate(copy_object);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_RETRY;
+ }
+ }
+ } else if (!PAGED_OUT(copy_object, copy_offset)) {
+ /*
+ * If PAGED_OUT is TRUE, then the page used to exist
+ * in the copy-object, and has already been paged out.
+ * We don't need to repeat this. If PAGED_OUT is
+ * FALSE, then either we don't know (!pager_created,
+ * for example) or it hasn't been paged out.
+ * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
+ * We must copy the page to the copy object.
+ *
+ * Allocate a page for the copy
+ */
+ copy_m = vm_page_alloc(copy_object, copy_offset);
+
+ if (copy_m == VM_PAGE_NULL) {
+ RELEASE_PAGE(m);
+
+ vm_object_lock_assert_exclusive(copy_object);
+ copy_object->ref_count--;
+ assert(copy_object->ref_count > 0);
+
+ vm_object_unlock(copy_object);
+ vm_fault_cleanup(object, first_m);
+ thread_interrupt_level(interruptible_state);
+
+ return VM_FAULT_MEMORY_SHORTAGE;
+ }
+ /*
+ * Must copy page into copy-object.
+ */
+ vm_page_copy(m, copy_m);
+
+ /*
+ * If the old page was in use by any users
+ * of the copy-object, it must be removed
+ * from all pmaps. (We can't know which
+ * pmaps use it.)
+ */
+ if (m->vmp_pmapped) {
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
+ }
+
+ if (m->vmp_clustered) {
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+ VM_PAGE_CONSUME_CLUSTERED(m);
+ }
+ /*
+ * If there's a pager, then immediately
+ * page out this page, using the "initialize"
+ * option. Else, we use the copy.
+ */
+ if ((!copy_object->pager_ready)
+ || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
+ ) {
+ vm_page_lockspin_queues();
+ assert(!m->vmp_cleaning);
+ vm_page_activate(copy_m);
+ vm_page_unlock_queues();
+
+ SET_PAGE_DIRTY(copy_m, TRUE);
+ PAGE_WAKEUP_DONE(copy_m);
+ } else {
+ assert(copy_m->vmp_busy == TRUE);
+ assert(!m->vmp_cleaning);
+
+ /*
+ * dirty is protected by the object lock
+ */
+ SET_PAGE_DIRTY(copy_m, TRUE);
+
+ /*
+ * The page is already ready for pageout:
+ * not on pageout queues and busy.
+ * Unlock everything except the
+ * copy_object itself.
+ */
+ vm_object_unlock(object);
+
+ /*
+ * Write the page to the copy-object,
+ * flushing it from the kernel.
+ */
+ vm_pageout_initialize_page(copy_m);
+
+ /*
+ * Since the pageout may have
+ * temporarily dropped the
+ * copy_object's lock, we
+ * check whether we'll have
+ * to deallocate the hard way.
+ */
+ if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
+ vm_object_unlock(copy_object);
+ vm_object_deallocate(copy_object);
+ vm_object_lock(object);
+
+ continue;
+ }
+ /*
+ * Pick back up the old object's
+ * lock. [It is safe to do so,
+ * since it must be deeper in the
+ * object tree.]
+ */
+ vm_object_lock(object);
+ }
+
+ /*
+ * Because we're pushing a page upward
+ * in the object tree, we must restart
+ * any faults that are waiting here.
+ * [Note that this is an expansion of
+ * PAGE_WAKEUP that uses the THREAD_RESTART
+ * wait result]. Can't turn off the page's
+ * busy bit because we're not done with it.
+ */
+ if (m->vmp_wanted) {
+ m->vmp_wanted = FALSE;
+ thread_wakeup_with_result((event_t) m, THREAD_RESTART);
+ }
+ }
+ /*
+ * The reference count on copy_object must be
+ * at least 2: one for our extra reference,
+ * and at least one from the outside world
+ * (we checked that when we last locked
+ * copy_object).
+ */
+ vm_object_lock_assert_exclusive(copy_object);
+ copy_object->ref_count--;
+ assert(copy_object->ref_count > 0);
+
+ vm_object_unlock(copy_object);
+
+ break;
+ }
+
+done:
+ *result_page = m;
+ *top_page = first_m;
+
+ if (m != VM_PAGE_NULL) {
+ assert(VM_PAGE_OBJECT(m) == object);
+
+ retval = VM_FAULT_SUCCESS;
+
+ if (my_fault == DBG_PAGEIN_FAULT) {
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+
+ if (object->internal) {
+ my_fault = DBG_PAGEIND_FAULT;
+ } else {
+ my_fault = DBG_PAGEINV_FAULT;
+ }
+
+ /*
+ * evaluate access pattern and update state
+ * vm_fault_deactivate_behind depends on the
+ * state being up to date
+ */
+ vm_fault_is_sequential(object, offset, fault_info->behavior);
+ vm_fault_deactivate_behind(object, offset, fault_info->behavior);
+ } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
+ /*
+ * we weren't called from vm_fault, so handle the
+ * accounting here for hits in the cache
+ */
+ if (m->vmp_clustered) {
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+ VM_PAGE_CONSUME_CLUSTERED(m);
+ }
+ vm_fault_is_sequential(object, offset, fault_info->behavior);
+ vm_fault_deactivate_behind(object, offset, fault_info->behavior);
+ } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
+ VM_STAT_DECOMPRESSIONS();
+ }
+ if (type_of_fault) {
+ *type_of_fault = my_fault;
+ }
+ } else {
+ retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
+ assert(first_m == VM_PAGE_NULL);
+ assert(object == first_object);
+ }
+
+ thread_interrupt_level(interruptible_state);
+
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
+#endif
+ return retval;
+
+backoff:
+ thread_interrupt_level(interruptible_state);
+
+ if (wait_result == THREAD_INTERRUPTED) {
+ return VM_FAULT_INTERRUPTED;
+ }
+ return VM_FAULT_RETRY;
+
+#undef RELEASE_PAGE
+}
+
+
+/* Supplied by the BSD side of the kernel: process identity helpers and the
+ * boot-arg-controlled "panic when code signing kills a process" knob. */
+extern int panic_on_cs_killed;
+extern int proc_selfpid(void);
+extern char *proc_name_address(void *p);
+/* Global counters: pages found tainted at fault time that were rejected
+ * (fault aborted) vs. accepted (mapped anyway, possibly after disconnect). */
+unsigned long cs_enter_tainted_rejected = 0;
+unsigned long cs_enter_tainted_accepted = 0;
+
+/*
+ * CODE SIGNING:
+ * When soft faulting a page, we have to validate the page if:
+ * 1. the page is being mapped in user space
+ * 2. the page hasn't already been found to be "tainted"
+ * 3. the page belongs to a code-signed object
+ * 4. the page has not been validated yet or has been mapped for write.
+ *
+ * Returns true iff the caller must run code-signing validation on this
+ * page (or, for sub-page faults, on the sub-page range at
+ * fault_phys_offset) before entering it into the pmap.
+ */
+static bool
+vm_fault_cs_need_validation(
+	pmap_t pmap,
+	vm_page_t page,
+	vm_object_t page_obj,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset)
+{
+	if (pmap == kernel_pmap) {
+		/* 1 - not user space */
+		return false;
+	}
+	if (!page_obj->code_signed) {
+		/* 3 - page does not belong to a code-signed object */
+		return false;
+	}
+	if (fault_page_size == PAGE_SIZE) {
+		/* looking at the whole page */
+		assertf(fault_phys_offset == 0,
+		    "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
+		    (uint64_t)fault_page_size,
+		    (uint64_t)fault_phys_offset);
+		if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
+			/* 2 - page is all tainted */
+			return false;
+		}
+		if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
+		    !page->vmp_wpmapped) {
+			/* 4 - already fully validated and never mapped writable */
+			return false;
+		}
+	} else {
+		/* looking at a specific sub-page */
+		if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
+			/* 2 - sub-page was already marked as tainted */
+			return false;
+		}
+		if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
+		    !page->vmp_wpmapped) {
+			/* 4 - already validated and never mapped writable */
+			return false;
+		}
+	}
+	/* page needs to be validated */
+	return true;
+}
+
+
+/*
+ * Decide whether a page should be treated as "immutable" for code-signing
+ * enforcement.  Currently a page qualifies as soon as the faulting sub-page
+ * range has been CS-validated: the additional execute-permission test is
+ * deliberately left commented out (prot is unused), so validated data pages
+ * are treated as immutable too.
+ */
+static bool
+vm_fault_cs_page_immutable(
+	vm_page_t m,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	vm_prot_t prot __unused)
+{
+	if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
+	    /*&& ((prot) & VM_PROT_EXECUTE)*/) {
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Returns true if the code signature marks this page (sub-page range at
+ * fault_phys_offset) as non-executable (NX).
+ */
+static bool
+vm_fault_cs_page_nx(
+	vm_page_t m,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset)
+{
+	return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
+}
+
+/*
+ * Check if the page being entered into the pmap violates code signing.
+ *
+ * On KERN_SUCCESS, *cs_violation tells the caller whether the page must be
+ * sent through the violation path (vm_fault_cs_handle_violation).
+ * Returns KERN_CODESIGN_ERROR directly — aborting the fault — for two
+ * hard failures: a write fault on an immutable page in a switched,
+ * switch-protected map, and an execute fault on an NX-marked page.
+ * May validate the page's signature as a side effect (can temporarily
+ * drop the object lock inside vm_page_validate_cs).
+ */
+static kern_return_t
+vm_fault_cs_check_violation(
+	bool cs_bypass,
+	vm_object_t object,
+	vm_page_t m,
+	pmap_t pmap,
+	vm_prot_t prot,
+	vm_prot_t caller_prot,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	vm_object_fault_info_t fault_info,
+	bool map_is_switched,
+	bool map_is_switch_protected,
+	bool *cs_violation)
+{
+#if !PMAP_CS
+#pragma unused(caller_prot)
+#pragma unused(fault_info)
+#endif /* !PMAP_CS */
+	int cs_enforcement_enabled;
+	if (!cs_bypass &&
+	    vm_fault_cs_need_validation(pmap, m, object,
+	    fault_page_size, fault_phys_offset)) {
+		vm_object_lock_assert_exclusive(object);
+
+		if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
+			/* already validated once: this is a re-validation */
+			vm_cs_revalidates++;
+		}
+
+		/* VM map is locked, so 1 ref will remain on VM object -
+		 * so no harm if vm_page_validate_cs drops the object lock */
+
+		vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
+	}
+
+	/* If the map is switched, and is switch-protected, we must protect
+	 * some pages from being write-faulted: immutable pages because by
+	 * definition they may not be written, and executable pages because that
+	 * would provide a way to inject unsigned code.
+	 * If the page is immutable, we can simply return. However, we can't
+	 * immediately determine whether a page is executable anywhere. But,
+	 * we can disconnect it everywhere and remove the executable protection
+	 * from the current map. We do that below right before we do the
+	 * PMAP_ENTER.
+	 */
+	if (pmap == kernel_pmap) {
+		/* kernel fault: cs_enforcement does not apply */
+		cs_enforcement_enabled = 0;
+	} else {
+		cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
+	}
+
+	if (cs_enforcement_enabled && map_is_switched &&
+	    map_is_switch_protected &&
+	    vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
+	    (prot & VM_PROT_WRITE)) {
+		/* write to an immutable page in a switch-protected map: deny */
+		return KERN_CODESIGN_ERROR;
+	}
+
+	if (cs_enforcement_enabled &&
+	    vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
+	    (prot & VM_PROT_EXECUTE)) {
+		if (cs_debug) {
+			printf("page marked to be NX, not letting it be mapped EXEC\n");
+		}
+		return KERN_CODESIGN_ERROR;
+	}
+
+	/* A page could be tainted, or pose a risk of being tainted later.
+	 * Check whether the receiving process wants it, and make it feel
+	 * the consequences (that happens in cs_invalid_page()).
+	 * For CS Enforcement, two other conditions will
+	 * cause that page to be tainted as well:
+	 * - pmapping an unsigned page executable - this means unsigned code;
+	 * - writeable mapping of a validated page - the content of that page
+	 *   can be changed without the kernel noticing, therefore unsigned
+	 *   code can be created
+	 */
+	if (cs_bypass) {
+		/* code-signing is bypassed */
+		*cs_violation = FALSE;
+	} else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
+		/* tainted page */
+		*cs_violation = TRUE;
+	} else if (!cs_enforcement_enabled) {
+		/* no further code-signing enforcement */
+		*cs_violation = FALSE;
+	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
+	    ((prot & VM_PROT_WRITE) ||
+	    m->vmp_wpmapped)) {
+		/*
+		 * The page should be immutable, but is in danger of being
+		 * modified.
+		 * This is the case where we want policy from the code
+		 * directory - is the page immutable or not? For now we have
+		 * to assume that code pages will be immutable, data pages not.
+		 * We'll assume a page is a code page if it has a code directory
+		 * and we fault for execution.
+		 * That is good enough since if we faulted the code page for
+		 * writing in another map before, it is wpmapped; if we fault
+		 * it for writing in this map later it will also be faulted for
+		 * executing at the same time; and if we fault for writing in
+		 * another map later, we will disconnect it from this pmap so
+		 * we'll notice the change.
+		 */
+		*cs_violation = TRUE;
+	} else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
+	    (prot & VM_PROT_EXECUTE)
+	    ) {
+		/* executing a page that was never validated: violation */
+		*cs_violation = TRUE;
+	} else {
+		*cs_violation = FALSE;
+	}
+	return KERN_SUCCESS;
+}
+
+/*
+ * Handles a code signing violation by either rejecting the page or forcing a disconnect.
+ * @param must_disconnect This value will be set to true if the caller must disconnect
+ * this page.
+ * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
+ *
+ * On rejection this also gathers diagnostics (file path, mtimes, shadow
+ * depth), logs them, and — when cs_invalid_page() killed the process —
+ * attaches a CODESIGNING exit reason to the current thread.
+ */
+static kern_return_t
+vm_fault_cs_handle_violation(
+	vm_object_t object,
+	vm_page_t m,
+	pmap_t pmap,
+	vm_prot_t prot,
+	vm_map_offset_t vaddr,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	bool map_is_switched,
+	bool map_is_switch_protected,
+	bool *must_disconnect)
+{
+#if !MACH_ASSERT
+#pragma unused(pmap)
+#pragma unused(map_is_switch_protected)
+#endif /* !MACH_ASSERT */
+	/*
+	 * We will have a tainted page. Have to handle the special case
+	 * of a switched map now. If the map is not switched, standard
+	 * procedure applies - call cs_invalid_page().
+	 * If the map is switched, the real owner is invalid already.
+	 * There is no point in invalidating the switching process since
+	 * it will not be executing from the map. So we don't call
+	 * cs_invalid_page() in that case.
+	 */
+	boolean_t reject_page, cs_killed;
+	kern_return_t kr;
+	if (map_is_switched) {
+		assert(pmap == vm_map_pmap(current_thread()->map));
+		assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
+		reject_page = FALSE;
+	} else {
+		if (cs_debug > 5) {
+			printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
+			    object->code_signed ? "yes" : "no",
+			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
+			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
+			    m->vmp_wpmapped ? "yes" : "no",
+			    (int)prot);
+		}
+		/* ask the CS policy; cs_killed reports if the process was killed */
+		reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
+	}
+
+	if (reject_page) {
+		/* reject the invalid page: abort the page fault */
+		int pid;
+		const char *procname;
+		task_t task;
+		vm_object_t file_object, shadow;
+		vm_object_offset_t file_offset;
+		char *pathname, *filename;
+		vm_size_t pathname_len, filename_len;
+		boolean_t truncated_path;
+#define __PATH_MAX 1024
+		struct timespec mtime, cs_mtime;
+		int shadow_depth;
+		os_reason_t codesigning_exit_reason = OS_REASON_NULL;
+
+		kr = KERN_CODESIGN_ERROR;
+		cs_enter_tainted_rejected++;
+
+		/* get process name and pid */
+		procname = "?";
+		task = current_task();
+		pid = proc_selfpid();
+		if (task->bsd_info != NULL) {
+			procname = proc_name_address(task->bsd_info);
+		}
+
+		/* get file's VM object */
+		/* Walk down the shadow chain to the backing (pager) object,
+		 * hand-over-hand: lock the next object before unlocking the
+		 * current one, accumulating the offset at each level. */
+		file_object = object;
+		file_offset = m->vmp_offset;
+		for (shadow = file_object->shadow,
+		    shadow_depth = 0;
+		    shadow != VM_OBJECT_NULL;
+		    shadow = file_object->shadow,
+		    shadow_depth++) {
+			vm_object_lock_shared(shadow);
+			if (file_object != object) {
+				vm_object_unlock(file_object);
+			}
+			file_offset += file_object->vo_shadow_offset;
+			file_object = shadow;
+		}
+
+		mtime.tv_sec = 0;
+		mtime.tv_nsec = 0;
+		cs_mtime.tv_sec = 0;
+		cs_mtime.tv_nsec = 0;
+
+		/* get file's pathname and/or filename */
+		pathname = NULL;
+		filename = NULL;
+		pathname_len = 0;
+		filename_len = 0;
+		truncated_path = FALSE;
+		/* no pager -> no file -> no pathname, use "<nil>" in that case */
+		if (file_object->pager != NULL) {
+			/* one allocation holds both buffers: pathname then filename */
+			pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK);
+			if (pathname) {
+				pathname[0] = '\0';
+				pathname_len = __PATH_MAX;
+				filename = pathname + pathname_len;
+				filename_len = __PATH_MAX;
+
+				if (vnode_pager_get_object_name(file_object->pager,
+				    pathname,
+				    pathname_len,
+				    filename,
+				    filename_len,
+				    &truncated_path) == KERN_SUCCESS) {
+					/* safety first... */
+					pathname[__PATH_MAX - 1] = '\0';
+					filename[__PATH_MAX - 1] = '\0';
+
+					vnode_pager_get_object_mtime(file_object->pager,
+					    &mtime,
+					    &cs_mtime);
+				} else {
+					kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
+					pathname = NULL;
+					filename = NULL;
+					pathname_len = 0;
+					filename_len = 0;
+					truncated_path = FALSE;
+				}
+			}
+		}
+		printf("CODE SIGNING: process %d[%s]: "
+		    "rejecting invalid page at address 0x%llx "
+		    "from offset 0x%llx in file \"%s%s%s\" "
+		    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
+		    "(signed:%d validated:%d tainted:%d nx:%d "
+		    "wpmapped:%d dirty:%d depth:%d)\n",
+		    pid, procname, (addr64_t) vaddr,
+		    file_offset,
+		    (pathname ? pathname : "<nil>"),
+		    (truncated_path ? "/.../" : ""),
+		    (truncated_path ? filename : ""),
+		    cs_mtime.tv_sec, cs_mtime.tv_nsec,
+		    ((cs_mtime.tv_sec == mtime.tv_sec &&
+		    cs_mtime.tv_nsec == mtime.tv_nsec)
+		    ? "=="
+		    : "!="),
+		    mtime.tv_sec, mtime.tv_nsec,
+		    object->code_signed,
+		    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
+		    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
+		    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
+		    m->vmp_wpmapped,
+		    m->vmp_dirty,
+		    shadow_depth);
+
+		/*
+		 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
+		 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
+		 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
+		 * will deal with the segmentation fault.
+		 */
+		if (cs_killed) {
+			KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
+			    pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
+
+			codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
+			if (codesigning_exit_reason == NULL) {
+				printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
+			} else {
+				mach_vm_address_t data_addr = 0;
+				struct codesigning_exit_reason_info *ceri = NULL;
+				uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
+
+				if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
+					printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
+				} else {
+					if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
+					    EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
+						ceri = (struct codesigning_exit_reason_info *)data_addr;
+						static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
+
+						ceri->ceri_virt_addr = vaddr;
+						ceri->ceri_file_offset = file_offset;
+						if (pathname) {
+							strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
+						} else {
+							ceri->ceri_pathname[0] = '\0';
+						}
+						if (filename) {
+							strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
+						} else {
+							ceri->ceri_filename[0] = '\0';
+						}
+						ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
+						ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
+						ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
+						ceri->ceri_page_modtime_secs = mtime.tv_sec;
+						ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
+						ceri->ceri_object_codesigned = (object->code_signed);
+						ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
+						ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
+						ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
+						ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
+						ceri->ceri_page_slid = 0;
+						ceri->ceri_page_dirty = (m->vmp_dirty);
+						ceri->ceri_page_shadow_depth = shadow_depth;
+					} else {
+#if DEBUG || DEVELOPMENT
+						panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
+#else
+						printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
+#endif /* DEBUG || DEVELOPMENT */
+						/* Free the buffer */
+						os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
+					}
+				}
+			}
+
+			set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
+		}
+		if (panic_on_cs_killed &&
+		    object->object_is_shared_cache) {
+			/* debug aid: snapshot the tainted page's contents before panicking */
+			char *tainted_contents;
+			vm_map_offset_t src_vaddr;
+			src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
+			tainted_contents = kalloc(PAGE_SIZE);
+			bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
+			printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
+			panic("CODE SIGNING: process %d[%s]: "
+			    "rejecting invalid page (phys#0x%x) at address 0x%llx "
+			    "from offset 0x%llx in file \"%s%s%s\" "
+			    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
+			    "(signed:%d validated:%d tainted:%d nx:%d"
+			    "wpmapped:%d dirty:%d depth:%d)\n",
+			    pid, procname,
+			    VM_PAGE_GET_PHYS_PAGE(m),
+			    (addr64_t) vaddr,
+			    file_offset,
+			    (pathname ? pathname : "<nil>"),
+			    (truncated_path ? "/.../" : ""),
+			    (truncated_path ? filename : ""),
+			    cs_mtime.tv_sec, cs_mtime.tv_nsec,
+			    ((cs_mtime.tv_sec == mtime.tv_sec &&
+			    cs_mtime.tv_nsec == mtime.tv_nsec)
+			    ? "=="
+			    : "!="),
+			    mtime.tv_sec, mtime.tv_nsec,
+			    object->code_signed,
+			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
+			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
+			    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
+			    m->vmp_wpmapped,
+			    m->vmp_dirty,
+			    shadow_depth);
+		}
+
+		/* drop the shared lock taken by the shadow-chain walk above */
+		if (file_object != object) {
+			vm_object_unlock(file_object);
+		}
+		if (pathname_len != 0) {
+			kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
+			pathname = NULL;
+			filename = NULL;
+		}
+	} else {
+		/* proceed with the invalid page */
+		kr = KERN_SUCCESS;
+		if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
+		    !object->code_signed) {
+			/*
+			 * This page has not been (fully) validated but
+			 * does not belong to a code-signed object
+			 * so it should not be forcefully considered
+			 * as tainted.
+			 * We're just concerned about it here because
+			 * we've been asked to "execute" it but that
+			 * does not mean that it should cause other
+			 * accesses to fail.
+			 * This happens when a debugger sets a
+			 * breakpoint and we then execute code in
+			 * that page.  Marking the page as "tainted"
+			 * would cause any inspection tool ("leaks",
+			 * "vmmap", "CrashReporter", ...) to get killed
+			 * due to code-signing violation on that page,
+			 * even though they're just reading it and not
+			 * executing from it.
+			 */
+		} else {
+			/*
+			 * Page might have been tainted before or not;
+			 * now it definitively is. If the page wasn't
+			 * tainted, we must disconnect it from all
+			 * pmaps later, to force existing mappings
+			 * through that code path for re-consideration
+			 * of the validity of that page.
+			 */
+			if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
+				*must_disconnect = TRUE;
+				VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
+			}
+		}
+		cs_enter_tainted_accepted++;
+	}
+	if (kr != KERN_SUCCESS) {
+		if (cs_debug) {
+			printf("CODESIGNING: vm_fault_enter(0x%llx): "
+			    "*** INVALID PAGE ***\n",
+			    (long long)vaddr);
+		}
+#if !SECURE_KERNEL
+		if (cs_enforcement_panic) {
+			panic("CODESIGNING: panicking on invalid page\n");
+		}
+#endif
+	}
+	return kr;
+}
+
<br>
+/*
+ * Check that the code signature is valid for the given page being inserted into
+ * the pmap.
+ *
+ * Convenience wrapper: computes the map-switch state for the current
+ * thread/task, runs vm_fault_cs_check_violation(), and dispatches to
+ * vm_fault_cs_handle_violation() only when a violation was detected.
+ *
+ * @param must_disconnect This value will be set to true if the caller must disconnect
+ * this page.
+ * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
+ */
+static kern_return_t
+vm_fault_validate_cs(
+	bool cs_bypass,
+	vm_object_t object,
+	vm_page_t m,
+	pmap_t pmap,
+	vm_map_offset_t vaddr,
+	vm_prot_t prot,
+	vm_prot_t caller_prot,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	vm_object_fault_info_t fault_info,
+	bool *must_disconnect)
+{
+	bool map_is_switched, map_is_switch_protected, cs_violation;
+	kern_return_t kr;
+	/* Validate code signature if necessary. */
+	/* "switched": the thread is running on a map other than its task's map */
+	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
+	    (pmap == vm_map_pmap(current_thread()->map)));
+	map_is_switch_protected = current_thread()->map->switch_protect;
+	kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
+	    prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
+	    map_is_switched, map_is_switch_protected, &cs_violation);
+	if (kr != KERN_SUCCESS) {
+		return kr;
+	}
+	if (cs_violation) {
+		kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
+		    fault_page_size, fault_phys_offset,
+		    map_is_switched, map_is_switch_protected, must_disconnect);
+	}
+	return kr;
+}
+
+/*
+ * Enqueue the page on the appropriate paging queue.
+ *
+ * kr is the outcome of the fault so far: on failure the page is wired or
+ * deactivated rather than activated.  The page-queues lock is taken lazily
+ * (via the macros below) and is always released before returning.
+ */
+static void
+vm_fault_enqueue_page(
+	vm_object_t object,
+	vm_page_t m,
+	bool wired,
+	bool change_wiring,
+	vm_tag_t wire_tag,
+	bool no_cache,
+	int *type_of_fault,
+	kern_return_t kr)
+{
+	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
+	boolean_t page_queues_locked = FALSE;
+	boolean_t previously_pmapped = m->vmp_pmapped;
+/* Take/drop the page-queues lock at most once, on demand. */
+#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()	\
+MACRO_BEGIN					\
+	if (! page_queues_locked) {		\
+		page_queues_locked = TRUE;	\
+		vm_page_lockspin_queues();	\
+	}					\
+MACRO_END
+#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()	\
+MACRO_BEGIN					\
+	if (page_queues_locked) {		\
+		page_queues_locked = FALSE;	\
+		vm_page_unlock_queues();	\
+	}					\
+MACRO_END
+
+#if CONFIG_BACKGROUND_QUEUE
+	vm_page_update_background_state(m);
+#endif
+	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+		/*
+		 * Compressor pages are neither wired
+		 * nor pageable and should never change.
+		 */
+		assert(object == compressor_object);
+	} else if (change_wiring) {
+		__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
+
+		if (wired) {
+			if (kr == KERN_SUCCESS) {
+				vm_page_wire(m, wire_tag, TRUE);
+			}
+		} else {
+			vm_page_unwire(m, TRUE);
+		}
+		/* we keep the page queues lock, if we need it later */
+	} else {
+		if (object->internal == TRUE) {
+			/*
+			 * don't allow anonymous pages on
+			 * the speculative queues
+			 */
+			no_cache = FALSE;
+		}
+		if (kr != KERN_SUCCESS) {
+			__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
+			vm_page_deactivate(m);
+			/* we keep the page queues lock, if we need it later */
+		} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
+		    (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
+		    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
+		    ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
+		    !VM_PAGE_WIRED(m)) {
+			if (vm_page_local_q &&
+			    (*type_of_fault == DBG_COW_FAULT ||
+			    *type_of_fault == DBG_ZERO_FILL_FAULT)) {
+				struct vpl *lq;
+				uint32_t lid;
+
+				assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
+
+				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
+				vm_object_lock_assert_exclusive(object);
+
+				/*
+				 * we got a local queue to stuff this
+				 * new page on...
+				 * its safe to manipulate local and
+				 * local_id at this point since we're
+				 * behind an exclusive object lock and
+				 * the page is not on any global queue.
+				 *
+				 * we'll use the current cpu number to
+				 * select the queue note that we don't
+				 * need to disable preemption... we're
+				 * going to be behind the local queue's
+				 * lock to do the real work
+				 */
+				lid = cpu_number();
+
+				lq = zpercpu_get_cpu(vm_page_local_q, lid);
+
+				VPL_LOCK(&lq->vpl_lock);
+
+				vm_page_check_pageable_safe(m);
+				vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
+				m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
+				m->vmp_local_id = lid;
+				lq->vpl_count++;
+
+				if (object->internal) {
+					lq->vpl_internal_count++;
+				} else {
+					lq->vpl_external_count++;
+				}
+
+				VPL_UNLOCK(&lq->vpl_lock);
+
+				if (lq->vpl_count > vm_page_local_q_soft_limit) {
+					/*
+					 * we're beyond the soft limit
+					 * for the local queue
+					 * vm_page_reactivate_local will
+					 * 'try' to take the global page
+					 * queue lock... if it can't
+					 * that's ok... we'll let the
+					 * queue continue to grow up
+					 * to the hard limit... at that
+					 * point we'll wait for the
+					 * lock... once we've got the
+					 * lock, we'll transfer all of
+					 * the pages from the local
+					 * queue to the global active
+					 * queue
+					 */
+					vm_page_reactivate_local(lid, FALSE, FALSE);
+				}
+			} else {
+				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
+
+				/*
+				 * test again now that we hold the
+				 * page queue lock
+				 */
+				if (!VM_PAGE_WIRED(m)) {
+					if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
+						vm_page_queues_remove(m, FALSE);
+
+						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
+						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
+					}
+
+					if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
+					    no_cache) {
+						/*
+						 * If this is a no_cache mapping
+						 * and the page has never been
+						 * mapped before or was
+						 * previously a no_cache page,
+						 * then we want to leave pages
+						 * in the speculative state so
+						 * that they can be readily
+						 * recycled if free memory runs
+						 * low.  Otherwise the page is
+						 * activated as normal.
+						 */
+
+						if (no_cache &&
+						    (!previously_pmapped ||
+						    m->vmp_no_cache)) {
+							m->vmp_no_cache = TRUE;
+
+							if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
+								vm_page_speculate(m, FALSE);
+							}
+						} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
+							vm_page_activate(m);
+						}
+					}
+				}
+				/* we keep the page queues lock, if we need it later */
+			}
+		}
+	}
+	/* we're done with the page queues lock, if we ever took it */
+	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
+}
+
+/*
+ * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
+ * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys
+ * before being inserted into the pmap.
+ */
+static bool
+vm_fault_enter_set_mapped(
+	vm_object_t object,
+	vm_page_t m,
+	vm_prot_t prot,
+	vm_prot_t fault_type)
+{
+	bool page_needs_sync = false;
+	/*
+	 * NOTE: we may only hold the vm_object lock SHARED
+	 * at this point, so we need the phys_page lock to
+	 * properly serialize updating the pmapped and
+	 * xpmapped bits
+	 */
+	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
+		ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
+
+		pmap_lock_phys_page(phys_page);
+		m->vmp_pmapped = TRUE;
+
+		/* re-check xpmapped under the phys_page lock */
+		if (!m->vmp_xpmapped) {
+			m->vmp_xpmapped = TRUE;
+
+			pmap_unlock_phys_page(phys_page);
+
+			if (!object->internal) {
+				OSAddAtomic(1, &vm_page_xpmapped_external_count);
+			}
+
+#if defined(__arm__) || defined(__arm64__)
+			/* ARM: always sync I$/D$ on first executable mapping */
+			page_needs_sync = true;
+#else
+			if (object->internal &&
+			    object->pager != NULL) {
+				/*
+				 * This page could have been
+				 * uncompressed by the
+				 * compressor pager and its
+				 * contents might be only in
+				 * the data cache.
+				 * Since it's being mapped for
+				 * "execute" for the first time,
+				 * make sure the icache is in
+				 * sync.
+				 */
+				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+				page_needs_sync = true;
+			}
+#endif
+		} else {
+			pmap_unlock_phys_page(phys_page);
+		}
+	} else {
+		if (m->vmp_pmapped == FALSE) {
+			ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
+
+			pmap_lock_phys_page(phys_page);
+			m->vmp_pmapped = TRUE;
+			pmap_unlock_phys_page(phys_page);
+		}
+	}
+
+	if (fault_type & VM_PROT_WRITE) {
+		if (m->vmp_wpmapped == FALSE) {
+			/* wpmapped updates require the exclusive object lock */
+			vm_object_lock_assert_exclusive(object);
+			if (!object->internal && object->pager) {
+				task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
+			}
+			m->vmp_wpmapped = TRUE;
+		}
+	}
+	return page_needs_sync;
+}
+
+/*
+ * Try to enter the given page into the pmap.
+ * Will retry without execute permission iff PMAP_CS is enabled and we encounter
+ * a codesigning failure on a non-execute fault.
+ *
+ * Thin wrapper around PMAP_ENTER_OPTIONS that sanity-checks the sub-page
+ * physical offset for 4K-on-16K (fault_page_size != PAGE_SIZE) faults.
+ * Returns the kern_return_t produced by PMAP_ENTER_OPTIONS.
+ */
+static kern_return_t
+vm_fault_attempt_pmap_enter(
+	pmap_t pmap,
+	vm_map_offset_t vaddr,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	vm_page_t m,
+	vm_prot_t *prot,
+	vm_prot_t caller_prot,
+	vm_prot_t fault_type,
+	bool wired,
+	int pmap_options)
+{
+#if !PMAP_CS
+#pragma unused(caller_prot)
+#endif /* !PMAP_CS */
+	kern_return_t kr;
+	if (fault_page_size != PAGE_SIZE) {
+		DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
+		/* sub-page offset must be 4K-aligned and within the page */
+		assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
+		    fault_phys_offset < PAGE_SIZE),
+		    "0x%llx\n", (uint64_t)fault_phys_offset);
+	} else {
+		/* whole-page fault: no sub-page offset expected */
+		assertf(fault_phys_offset == 0,
+		    "0x%llx\n", (uint64_t)fault_phys_offset);
+	}
+
+	PMAP_ENTER_OPTIONS(pmap, vaddr,
+	    fault_phys_offset,
+	    m, *prot, fault_type, 0,
+	    wired,
+	    pmap_options,
+	    kr);
+	return kr;
+}
+
+/*
+ * Enter the given page into the pmap.
+ * The map must be locked shared.
+ * The vm object must NOT be locked.
+ *
+ * @param need_retry if not null, avoid making a (potentially) blocking call into
+ * the pmap layer. When such a call would be necessary, return true in this boolean instead.
+ */
+static kern_return_t
+vm_fault_pmap_enter(
+	pmap_t pmap,
+	vm_map_offset_t vaddr,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	vm_page_t m,
+	vm_prot_t *prot,
+	vm_prot_t caller_prot,
+	vm_prot_t fault_type,
+	bool wired,
+	int pmap_options,
+	boolean_t *need_retry)
+{
+	kern_return_t kr;
+	if (need_retry != NULL) {
+		/*
+		 * Although we don't hold a lock on this object, we hold a lock
+		 * on the top object in the chain.  To prevent a deadlock, we
+		 * can't allow the pmap layer to block.
+		 */
+		pmap_options |= PMAP_OPTIONS_NOWAIT;
+	}
+	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
+	    fault_page_size, fault_phys_offset,
+	    m, prot, caller_prot, fault_type, wired, pmap_options);
+	if (kr == KERN_RESOURCE_SHORTAGE) {
+		/* the non-blocking pmap enter could not complete */
+		if (need_retry) {
+			/*
+			 * There's nothing we can do here since we hold the
+			 * lock on the top object in the chain. The caller
+			 * will need to deal with this by dropping that lock and retrying.
+			 */
+			*need_retry = TRUE;
+			vm_pmap_enter_retried++;
+		}
+	}
+	return kr;
+}
+
+/*
+ * Enter the given page into the pmap.
+ * The vm map must be locked shared.
+ * The vm object must be locked exclusive, unless this is a soft fault.
+ * For a soft fault, the object must be locked shared or exclusive.
+ *
+ * First attempts a non-blocking pmap enter; if that fails for lack of
+ * resources and the caller can't retry at a higher level, falls back to a
+ * blocking PMAP_ENTER_OPTIONS with the page marked busy and the object
+ * lock dropped across the call.
+ *
+ * @param need_retry if not null, avoid making a (potentially) blocking call into
+ * the pmap layer. When such a call would be necessary, return true in this boolean instead.
+ */
+static kern_return_t
+vm_fault_pmap_enter_with_object_lock(
+	vm_object_t object,
+	pmap_t pmap,
+	vm_map_offset_t vaddr,
+	vm_map_size_t fault_page_size,
+	vm_map_offset_t fault_phys_offset,
+	vm_page_t m,
+	vm_prot_t *prot,
+	vm_prot_t caller_prot,
+	vm_prot_t fault_type,
+	bool wired,
+	int pmap_options,
+	boolean_t *need_retry)
+{
+	kern_return_t kr;
+	/*
+	 * Prevent a deadlock by not
+	 * holding the object lock if we need to wait for a page in
+	 * pmap_enter() - <rdar://problem/7138958>
+	 */
+	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
+	    fault_page_size, fault_phys_offset,
+	    m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
+#if __x86_64__
+	if (kr == KERN_INVALID_ARGUMENT &&
+	    pmap == PMAP_NULL &&
+	    wired) {
+		/*
+		 * Wiring a page in a pmap-less VM map:
+		 * VMware's "vmmon" kernel extension does this
+		 * to grab pages.
+		 * Let it proceed even though the PMAP_ENTER() failed.
+		 */
+		kr = KERN_SUCCESS;
+	}
+#endif /* __x86_64__ */
+
+	if (kr == KERN_RESOURCE_SHORTAGE) {
+		if (need_retry) {
+			/*
+			 * this will be non-null in the case where we hold the lock
+			 * on the top-object in this chain... we can't just drop
+			 * the lock on the object we're inserting the page into
+			 * and recall the PMAP_ENTER since we can still cause
+			 * a deadlock if one of the critical paths tries to
+			 * acquire the lock on the top-object and we're blocked
+			 * in PMAP_ENTER waiting for memory... our only recourse
+			 * is to deal with it at a higher level where we can
+			 * drop both locks.
+			 */
+			*need_retry = TRUE;
+			vm_pmap_enter_retried++;
+			goto done;
+		}
+		/*
+		 * The nonblocking version of pmap_enter did not succeed.
+		 * and we don't need to drop other locks and retry
+		 * at the level above us, so
+		 * use the blocking version instead. Requires marking
+		 * the page busy and unlocking the object
+		 */
+		boolean_t was_busy = m->vmp_busy;
+
+		vm_object_lock_assert_exclusive(object);
+
+		/* busy keeps the page from disappearing while unlocked */
+		m->vmp_busy = TRUE;
+		vm_object_unlock(object);
+
+		PMAP_ENTER_OPTIONS(pmap, vaddr,
+		    fault_phys_offset,
+		    m, *prot, fault_type,
+		    0, wired,
+		    pmap_options, kr);
+
+		assert(VM_PAGE_OBJECT(m) == object);
+
+		/* Take the object lock again. */
+		vm_object_lock(object);
+
+		/* If the page was busy, someone else will wake it up.
+		 * Otherwise, we have to do it now. */
+		assert(m->vmp_busy);
+		if (!was_busy) {
+			PAGE_WAKEUP_DONE(m);
+		}
+		vm_pmap_enter_blocked++;
+	}
+
+done:
+	return kr;
+}
+
+/*
+ * Prepare to enter a page into the pmap by checking CS, protection bits,
+ * and setting mapped bits on the page_t.
+ * Does not modify the page's paging queue.
+ *
+ * page queue lock must NOT be held
+ * m->vmp_object must be locked
+ *
+ * NOTE: m->vmp_object could be locked "shared" only if we are called
+ * from vm_fault() as part of a soft fault.
+ */
+static kern_return_t
+vm_fault_enter_prepare(
+ vm_page_t m,
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_prot_t *prot,
+ vm_prot_t caller_prot,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ boolean_t change_wiring,
+ vm_prot_t fault_type,
+ vm_object_fault_info_t fault_info,
+ int *type_of_fault,
+ bool *page_needs_data_sync)
+{
+ kern_return_t kr;
+ bool is_tainted = false;
+ vm_object_t object;
+ boolean_t cs_bypass = fault_info->cs_bypass;
+
+ object = VM_PAGE_OBJECT(m);
+
+ /* Caller holds the page's object lock (possibly only shared, per the
+ * header comment above); exclusive is asserted below only where needed. */
+ vm_object_lock_assert_held(object);
+
+#if KASAN
+ /* Tell KASAN about kernel-space addresses being faulted in. */
+ if (pmap == kernel_pmap) {
+ kasan_notify_address(vaddr, PAGE_SIZE);
+ }
+#endif
+
+ LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
+
+ if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
+ /* Zero-fill implies we inserted the page, which requires the
+ * exclusive object lock. */
+ vm_object_lock_assert_exclusive(object);
+ } else if ((fault_type & VM_PROT_WRITE) == 0 &&
+ !change_wiring &&
+ (!m->vmp_wpmapped
+#if VM_OBJECT_ACCESS_TRACKING
+ || object->access_tracking
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+ )) {
+ /*
+ * This is not a "write" fault, so we
+ * might not have taken the object lock
+ * exclusively and we might not be able
+ * to update the "wpmapped" bit in
+ * vm_fault_enter().
+ * Let's just grant read access to
+ * the page for now and we'll
+ * soft-fault again if we need write
+ * access later...
+ */
+
+ /* This had better not be a JIT page. */
+ if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
+ *prot &= ~VM_PROT_WRITE;
+ } else {
+ /* pmap dictates the protection; only legal with cs_bypass set. */
+ assert(cs_bypass);
+ }
+ }
+ if (m->vmp_pmapped == FALSE) {
+ if (m->vmp_clustered) {
+ if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
+ /*
+ * found it in the cache, but this
+ * is the first fault-in of the page (m->vmp_pmapped == FALSE)
+ * so it must have come in as part of
+ * a cluster... account 1 pagein against it
+ */
+ if (object->internal) {
+ *type_of_fault = DBG_PAGEIND_FAULT;
+ } else {
+ *type_of_fault = DBG_PAGEINV_FAULT;
+ }
+
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+ }
+ VM_PAGE_CONSUME_CLUSTERED(m);
+ }
+ }
+
+ /* Statistics: count everything except COW faults as address-space faults. */
+ if (*type_of_fault != DBG_COW_FAULT) {
+ DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
+
+ if (pmap == kernel_pmap) {
+ DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
+ }
+ }
+
+ /* Code-signing validation; may mark the page tainted yet still accepted. */
+ kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
+ *prot, caller_prot, fault_page_size, fault_phys_offset,
+ fault_info, &is_tainted);
+ if (kr == KERN_SUCCESS) {
+ /*
+ * We either have a good page, or a tainted page that has been accepted by the process.
+ * In both cases the page will be entered into the pmap.
+ */
+ *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
+ if ((fault_type & VM_PROT_WRITE) && is_tainted) {
+ /*
+ * This page is tainted but we're inserting it anyways.
+ * Since it's writeable, we need to disconnect it from other pmaps
+ * now so those processes can take note.
+ */
+
+ /*
+ * We can only get here
+ * because of the CSE logic
+ */
+ assert(pmap_get_vm_map_cs_enforced(pmap));
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
+ /*
+ * If we are faulting for a write, we can clear
+ * the execute bit - that will ensure the page is
+ * checked again before being executable, which
+ * protects against a map switch.
+ * This only happens the first time the page
+ * gets tainted, so we won't get stuck here
+ * to make an already writeable page executable.
+ */
+ if (!cs_bypass) {
+ assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
+ *prot &= ~VM_PROT_EXECUTE;
+ }
+ }
+ assert(VM_PAGE_OBJECT(m) == object);
+
+#if VM_OBJECT_ACCESS_TRACKING
+ /* Record read/write access counts when tracking is enabled on the object. */
+ if (object->access_tracking) {
+ DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
+ if (fault_type & VM_PROT_WRITE) {
+ object->access_tracking_writes++;
+ vm_object_access_tracking_writes++;
+ } else {
+ object->access_tracking_reads++;
+ vm_object_access_tracking_reads++;
+ }
+ }
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+ }
+
+ return kr;
+}
+
+/*
+ * page queue lock must NOT be held
+ * m->vmp_object must be locked
+ *
+ * NOTE: m->vmp_object could be locked "shared" only if we are called
+ * from vm_fault() as part of a soft fault. If so, we must be
+ * careful not to modify the VM object in any way that is not
+ * legal under a shared lock...
+ */
+kern_return_t
+vm_fault_enter(
+ vm_page_t m,
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_prot_t prot,
+ vm_prot_t caller_prot,
+ boolean_t wired,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag,
+ vm_object_fault_info_t fault_info,
+ boolean_t *need_retry,
+ int *type_of_fault)
+{
+ kern_return_t kr;
+ vm_object_t object;
+ bool page_needs_data_sync;
+ vm_prot_t fault_type;
+ int pmap_options = fault_info->pmap_options;
+
+ /* Guard pages are fictitious and are never entered into the pmap. */
+ if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
+ assert(m->vmp_fictitious);
+ return KERN_SUCCESS;
+ }
+
+ /* A wiring pseudo-fault carries no access type. */
+ fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
+
+ /* May downgrade *prot (e.g. strip write/execute) per CS policy. */
+ kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
+ fault_page_size, fault_phys_offset, change_wiring, fault_type,
+ fault_info, type_of_fault, &page_needs_data_sync);
+ object = VM_PAGE_OBJECT(m);
+
+ /* Called even when kr != KERN_SUCCESS: kr is passed along so the
+ * enqueue code can act on the outcome. */
+ vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
+
+ if (kr == KERN_SUCCESS) {
+ if (page_needs_data_sync) {
+ pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
+ }
+
+ kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
+ fault_page_size, fault_phys_offset, m,
+ &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
+ }
+
+ return kr;
+}
+
+void
+vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
+{
+ vm_map_t map = current_map();
+
+ /* Nothing to do if a physical translation already exists. */
+ if (pmap_find_phys(map->pmap, vaddr) != 0) {
+ return;
+ }
+
+ vm_fault(map, /* map */
+ vaddr, /* vaddr */
+ prot, /* fault_type */
+ FALSE, /* change_wiring */
+ VM_KERN_MEMORY_NONE, /* tag - not wiring */
+ THREAD_UNINT, /* interruptible */
+ NULL, /* caller_pmap */
+ 0 /* caller_pmap_addr */);
+}
+
+
+/*
+ * Routine: vm_fault
+ * Purpose:
+ * Handle page faults, including pseudo-faults
+ * used to change the wiring status of pages.
+ * Returns:
+ * Explicit continuations have been removed.
+ * Implementation:
+ * vm_fault and vm_fault_page save mucho state
+ * in the moral equivalent of a closure. The state
+ * structure is allocated when first entering vm_fault
+ * and deallocated when leaving vm_fault.
+ */
+
+extern uint64_t get_current_unique_pid(void);
+
+/* Fault-path object-collapse counters; presumably bumped where
+ * vm_object_collapse() is considered — increment sites are not in this chunk. */
+unsigned long vm_fault_collapse_total = 0;
+unsigned long vm_fault_collapse_skipped = 0;
+
+
+kern_return_t
+vm_fault_external(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t fault_type,
+ boolean_t change_wiring,
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr)
+{
+ vm_tag_t wire_tag;
+
+ /* Only capture a backtrace-derived wiring tag when actually wiring. */
+ if (change_wiring) {
+ wire_tag = vm_tag_bt();
+ } else {
+ wire_tag = VM_KERN_MEMORY_NONE;
+ }
+ return vm_fault_internal(map, vaddr, fault_type, change_wiring,
+ wire_tag, interruptible, caller_pmap, caller_pmap_addr,
+ NULL);
+}
+
+kern_return_t
+vm_fault(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t fault_type,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr)
+{
+ /* Thin wrapper over the common fault path; no physpage out-parameter. */
+ kern_return_t result;
+
+ result = vm_fault_internal(map, vaddr, fault_type, change_wiring,
+ wire_tag, interruptible, caller_pmap, caller_pmap_addr,
+ NULL);
+ return result;
+}
+
+static boolean_t
+current_proc_is_privileged(void)
+{
+ /* "Privileged" here means the current process is a platform binary. */
+ boolean_t is_platform_binary;
+
+ is_platform_binary = csproc_get_platform_binary(current_proc());
+ return is_platform_binary;
+}
+
+/* Counter for pages copied on read faults — TODO confirm increment site (not visible in this chunk). */
+uint64_t vm_copied_on_read = 0;
+
+/*
+ * Cleanup after a vm_fault_enter.
+ * At this point, the fault should either have failed (kr != KERN_SUCCESS)
+ * or the page should be in the pmap and on the correct paging queue.
+ *
+ * Precondition:
+ * map must be locked shared.
+ * m_object must be locked.
+ * If top_object != VM_OBJECT_NULL, it must be locked.
+ * real_map must be locked.
+ *
+ * Postcondition:
+ * map will be unlocked
+ * m_object will be unlocked
+ * top_object will be unlocked
+ * If real_map != map, it will be unlocked
+ */
+static void
+vm_fault_complete(
+ vm_map_t map,
+ vm_map_t real_map,
+ vm_object_t object,
+ vm_object_t m_object,
+ vm_page_t m,
+ vm_map_offset_t offset,
+ vm_map_offset_t trace_real_vaddr,
+ vm_object_fault_info_t fault_info,
+ vm_prot_t caller_prot,
+#if CONFIG_DTRACE
+ vm_map_offset_t real_vaddr,
+#else
+ __unused vm_map_offset_t real_vaddr,
+#endif /* CONFIG_DTRACE */
+ int type_of_fault,
+ boolean_t need_retry,
+ kern_return_t kr,
+ ppnum_t *physpage_p,
+ vm_prot_t prot,
+ vm_object_t top_object,
+ boolean_t need_collapse,
+ vm_map_offset_t cur_offset,
+ vm_prot_t fault_type,
+ vm_object_t *written_on_object,
+ memory_object_t *written_on_pager,
+ vm_object_offset_t *written_on_offset)
+{
+ int event_code = 0;
+ /* Entry lock state (see header comment): map shared, m_object held,
+ * top_object (if any) held, real_map held. All are released below. */
+ vm_map_lock_assert_shared(map);
+ vm_object_lock_assert_held(m_object);
+ if (top_object != VM_OBJECT_NULL) {
+ vm_object_lock_assert_held(top_object);
+ }
+ vm_map_lock_assert_held(real_map);
+
+ /* Classify the fault target for the working-set tracepoint. */
+ if (m_object->internal) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
+ } else if (m_object->object_is_shared_cache) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
+ } else {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
+ if (need_retry == FALSE) {
+ KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
+ }
+ DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
+ if (kr == KERN_SUCCESS &&
+ physpage_p != NULL) {
+ /* for vm_map_wire_and_extract() */
+ *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
+ if (prot & VM_PROT_WRITE) {
+ /* Dirtying the page requires the exclusive object lock. */
+ vm_object_lock_assert_exclusive(m_object);
+ m->vmp_dirty = TRUE;
+ }
+ }
+
+ if (top_object != VM_OBJECT_NULL) {
+ /*
+ * It's safe to drop the top object
+ * now that we've done our
+ * vm_fault_enter(). Any other fault
+ * in progress for that virtual
+ * address will either find our page
+ * and translation or put in a new page
+ * and translation.
+ */
+ vm_object_unlock(top_object);
+ top_object = VM_OBJECT_NULL;
+ }
+
+ if (need_collapse == TRUE) {
+ vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
+ }
+
+ if (need_retry == FALSE &&
+ (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
+ /*
+ * evaluate access pattern and update state
+ * vm_fault_deactivate_behind depends on the
+ * state being up to date
+ */
+ vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
+
+ vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
+ }
+ /*
+ * That's it, clean up and return.
+ */
+ if (m->vmp_busy) {
+ vm_object_lock_assert_exclusive(m_object);
+ PAGE_WAKEUP_DONE(m);
+ }
+
+ /* For a successful write to an external object, begin a paging
+ * operation and report the pager/offset to the caller via the
+ * written_on_* out-parameters — presumably so the caller can notify
+ * the pager after dropping locks (notification site not visible here). */
+ if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
+ vm_object_paging_begin(m_object);
+
+ assert(*written_on_object == VM_OBJECT_NULL);
+ *written_on_object = m_object;
+ *written_on_pager = m_object->pager;
+ *written_on_offset = m_object->paging_offset + m->vmp_offset;
+ }
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+}
+
+static inline int
+vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
+{
+ /*
+ * A copy-on-write fault taken through a copy-on-read mapping is
+ * traced as a distinct "copy on read" fault; everything else is
+ * reported unchanged.
+ */
+ int traced_type = type_of_fault;
+
+ if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
+ traced_type = DBG_COR_FAULT;
+ }
+ return traced_type;
+}
+
+kern_return_t
+vm_fault_internal(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t caller_prot,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr,
+ ppnum_t *physpage_p)
+{
+ vm_map_version_t version; /* Map version for verification */
+ boolean_t wired; /* Should mapping be wired down? */
+ vm_object_t object; /* Top-level object */
+ vm_object_offset_t offset; /* Top-level offset */
+ vm_prot_t prot; /* Protection for mapping */
+ vm_object_t old_copy_object; /* Saved copy object */
+ vm_page_t result_page; /* Result of vm_fault_page */
+ vm_page_t top_page; /* Placeholder page */
+ kern_return_t kr;
+
+ vm_page_t m; /* Fast access to result_page */
+ kern_return_t error_code;
+ vm_object_t cur_object;
+ vm_object_t m_object = NULL;
+ vm_object_offset_t cur_offset;
+ vm_page_t cur_m;
+ vm_object_t new_object;
+ int type_of_fault;
+ pmap_t pmap;
+ wait_interrupt_t interruptible_state;
+ vm_map_t real_map = map;
+ vm_map_t original_map = map;
+ bool object_locks_dropped = FALSE;
+ vm_prot_t fault_type;
+ vm_prot_t original_fault_type;
+ struct vm_object_fault_info fault_info = {};
+ bool need_collapse = FALSE;
+ boolean_t need_retry = FALSE;
+ boolean_t *need_retry_ptr = NULL;
+ uint8_t object_lock_type = 0;
+ uint8_t cur_object_lock_type;
+ vm_object_t top_object = VM_OBJECT_NULL;
+ vm_object_t written_on_object = VM_OBJECT_NULL;
+ memory_object_t written_on_pager = NULL;
+ vm_object_offset_t written_on_offset = 0;
+ int throttle_delay;
+ int compressed_count_delta;
+ uint8_t grab_options;
+ bool need_copy;
+ bool need_copy_on_read;
+ vm_map_offset_t trace_vaddr;
+ vm_map_offset_t trace_real_vaddr;
+ vm_map_size_t fault_page_size;
+ vm_map_size_t fault_page_mask;
+ vm_map_offset_t fault_phys_offset;
+ vm_map_offset_t real_vaddr;
+ bool resilient_media_retry = FALSE;
+ vm_object_t resilient_media_object = VM_OBJECT_NULL;
+ vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
+ bool page_needs_data_sync = false;
+ /*
+ * Was the VM object contended when vm_map_lookup_locked locked it?
+ * If so, the zero fill path will drop the lock
+ * NB: Ideally we would always drop the lock rather than rely on
+ * this heuristic, but vm_object_unlock currently takes > 30 cycles.
+ */
+ bool object_is_contended = false;
+
+ real_vaddr = vaddr;
+ trace_real_vaddr = vaddr;
+
+ if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
+ fault_phys_offset = (vm_map_offset_t)-1;
+ fault_page_size = VM_MAP_PAGE_SIZE(original_map);
+ fault_page_mask = VM_MAP_PAGE_MASK(original_map);
+ if (fault_page_size < PAGE_SIZE) {
+ DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
+ vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
+ }
+ } else {
+ fault_phys_offset = 0;
+ fault_page_size = PAGE_SIZE;
+ fault_page_mask = PAGE_MASK;
+ vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
+ }
+
+ if (map == kernel_map) {
+ trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
+ trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
+ } else {
+ trace_vaddr = vaddr;
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ (map == kernel_map),
+ 0,
+ 0);
+
+ if (get_preemption_level() != 0) {
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ KERN_FAILURE,
+ 0,
+ 0);
+
+ return KERN_FAILURE;
+ }
+
+ thread_t cthread = current_thread();
+ bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
+ uint64_t fstart = 0;
+
+ if (rtfault) {
+ fstart = mach_continuous_time();
+ }
+
+ interruptible_state = thread_interrupt_level(interruptible);
+
+ fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
+
+ counter_inc(&vm_statistics_faults);
+ counter_inc(¤t_task()->faults);
+ original_fault_type = fault_type;
+
+ need_copy = FALSE;
+ if (fault_type & VM_PROT_WRITE) {
+ need_copy = TRUE;
+ }
+
+ if (need_copy || change_wiring) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ } else {
+ object_lock_type = OBJECT_LOCK_SHARED;
+ }
+
+ cur_object_lock_type = OBJECT_LOCK_SHARED;
+
+ if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
+ if (compressor_map) {
+ if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
+ panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
+ }
+ }
+ }
+RetryFault:
+ assert(written_on_object == VM_OBJECT_NULL);
+
+ /*
+ * assume we will hit a page in the cache
+ * otherwise, explicitly override with
+ * the real fault type once we determine it
+ */
+ type_of_fault = DBG_CACHE_HIT_FAULT;
+
+ /*
+ * Find the backing store object and offset into
+ * it to begin the search.
+ */
+ fault_type = original_fault_type;
+ map = original_map;
+ vm_map_lock_read(map);
+
+ if (resilient_media_retry) {
+ /*
+ * If we have to insert a fake zero-filled page to hide
+ * a media failure to provide the real page, we need to
+ * resolve any pending copy-on-write on this mapping.
+ * VM_PROT_COPY tells vm_map_lookup_locked() to deal
+ * with that even if this is not a "write" fault.
+ */
+ need_copy = TRUE;
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ }
+
+ kr = vm_map_lookup_locked(&map, vaddr,
+ (fault_type | (need_copy ? VM_PROT_COPY : 0)),
+ object_lock_type, &version,
+ &object, &offset, &prot, &wired,
+ &fault_info,
+ &real_map,
+ &object_is_contended);
+
+ if (kr != KERN_SUCCESS) {
+ vm_map_unlock_read(map);
+ goto done;
+ }
+
+
+ pmap = real_map->pmap;
+ fault_info.interruptible = interruptible;
+ fault_info.stealth = FALSE;
+ fault_info.io_sync = FALSE;
+ fault_info.mark_zf_absent = FALSE;
+ fault_info.batch_pmap_op = FALSE;
+
+ if (resilient_media_retry) {
+ /*
+ * We're retrying this fault after having detected a media
+ * failure from a "resilient_media" mapping.
+ * Check that the mapping is still pointing at the object
+ * that just failed to provide a page.
+ */
+ assert(resilient_media_object != VM_OBJECT_NULL);
+ assert(resilient_media_offset != (vm_object_offset_t)-1);
+ if (object != VM_OBJECT_NULL &&
+ object == resilient_media_object &&
+ offset == resilient_media_offset &&
+ fault_info.resilient_media) {
+ /*
+ * This mapping still points at the same object
+ * and is still "resilient_media": proceed in
+ * "recovery-from-media-failure" mode, where we'll
+ * insert a zero-filled page in the top object.
+ */
+// printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
+ } else {
+ /* not recovering: reset state */
+// printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
+ resilient_media_retry = FALSE;
+ /* release our extra reference on failed object */
+// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+ vm_object_deallocate(resilient_media_object);
+ resilient_media_object = VM_OBJECT_NULL;
+ resilient_media_offset = (vm_object_offset_t)-1;
+ }
+ } else {
+ assert(resilient_media_object == VM_OBJECT_NULL);
+ resilient_media_offset = (vm_object_offset_t)-1;
+ }
+
+ /*
+ * If the page is wired, we must fault for the current protection
+ * value, to avoid further faults.
+ */
+ if (wired) {
+ fault_type = prot | VM_PROT_WRITE;
+ }
+ if (wired || need_copy) {
+ /*
+ * since we're treating this fault as a 'write'
+ * we must hold the top object lock exclusively
+ */
+ if (object_lock_type == OBJECT_LOCK_SHARED) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+
+ if (vm_object_lock_upgrade(object) == FALSE) {
+ /*
+ * couldn't upgrade, so explicitly
+ * take the lock exclusively
+ */
+ vm_object_lock(object);