+ /*
+ * Write the page to the copy-object,
+ * flushing it from the kernel.
+ */
+ vm_pageout_initialize_page(copy_m);
+
+ /*
+ * Since the pageout may have
+ * temporarily dropped the
+ * copy_object's lock, we
+ * check whether we'll have
+ * to deallocate the hard way.
+ */
+ if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
+ vm_object_unlock(copy_object);
+ vm_object_deallocate(copy_object);
+ vm_object_lock(object);
+
+ continue;
+ }
+ /*
+ * Pick back up the old object's
+ * lock. [It is safe to do so,
+ * since it must be deeper in the
+ * object tree.]
+ */
+ vm_object_lock(object);
+ }
+
+ /*
+ * Because we're pushing a page upward
+ * in the object tree, we must restart
+ * any faults that are waiting here.
+ * [Note that this is an expansion of
+ * PAGE_WAKEUP that uses the THREAD_RESTART
+ * wait result]. Can't turn off the page's
+ * busy bit because we're not done with it.
+ */
+ if (m->vmp_wanted) {
+ m->vmp_wanted = FALSE;
+ thread_wakeup_with_result((event_t) m, THREAD_RESTART);
+ }
+ }
+ /*
+ * The reference count on copy_object must be
+ * at least 2: one for our extra reference,
+ * and at least one from the outside world
+ * (we checked that when we last locked
+ * copy_object).
+ */
+ vm_object_lock_assert_exclusive(copy_object);
+ copy_object->ref_count--;
+ assert(copy_object->ref_count > 0);
+
+ vm_object_unlock(copy_object);
+
+ break;
+ }
+
+done:
+ *result_page = m;
+ *top_page = first_m;
+
+ if (m != VM_PAGE_NULL) {
+ assert(VM_PAGE_OBJECT(m) == object);
+
+ retval = VM_FAULT_SUCCESS;
+
+ if (my_fault == DBG_PAGEIN_FAULT) {
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+
+ if (object->internal) {
+ my_fault = DBG_PAGEIND_FAULT;
+ } else {
+ my_fault = DBG_PAGEINV_FAULT;
+ }
+
+ /*
+ * evaluate access pattern and update state;
+ * vm_fault_deactivate_behind depends on the
+ * state being up to date
+ */
+ vm_fault_is_sequential(object, offset, fault_info->behavior);
+ vm_fault_deactivate_behind(object, offset, fault_info->behavior);
+ } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
+ /*
+ * we weren't called from vm_fault, so handle the
+ * accounting here for hits in the cache
+ */
+ if (m->vmp_clustered) {
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+ VM_PAGE_CONSUME_CLUSTERED(m);
+ }
+ vm_fault_is_sequential(object, offset, fault_info->behavior);
+ vm_fault_deactivate_behind(object, offset, fault_info->behavior);
+ } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
+ VM_STAT_DECOMPRESSIONS();
+ }
+ if (type_of_fault) {
+ *type_of_fault = my_fault;
+ }
+ } else {
+ retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
+ assert(first_m == VM_PAGE_NULL);
+ assert(object == first_object);
+ }
+
+ thread_interrupt_level(interruptible_state);
+
+#if TRACEFAULTPAGE
+ dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
+#endif
+ return retval;
+
+backoff:
+ thread_interrupt_level(interruptible_state);
+
+ if (wait_result == THREAD_INTERRUPTED) {
+ return VM_FAULT_INTERRUPTED;
+ }
+ return VM_FAULT_RETRY;
+
+#undef RELEASE_PAGE
+}
+
+
+extern int panic_on_cs_killed;
+extern int proc_selfpid(void);
+extern char *proc_name_address(void *p);
+unsigned long cs_enter_tainted_rejected = 0;
+unsigned long cs_enter_tainted_accepted = 0;
+
+/*
+ * CODE SIGNING:
+ * When soft faulting a page, we have to validate the page if:
+ * 1. the page is being mapped in user space
+ * 2. the page hasn't already been found to be "tainted"
+ * 3. the page belongs to a code-signed object
+ * 4. the page has not been validated yet or has been mapped for write.
+ */
+static bool
+vm_fault_cs_need_validation(
+ pmap_t pmap,
+ vm_page_t page,
+ vm_object_t page_obj,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset)
+{
+ if (pmap == kernel_pmap) {
+ /* 1 - not user space */
+ return false;
+ }
+ if (!page_obj->code_signed) {
+ /* 3 - page does not belong to a code-signed object */
+ return false;
+ }
+ if (fault_page_size == PAGE_SIZE) {
+ /* looking at the whole page */
+ assertf(fault_phys_offset == 0,
+ "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
+ (uint64_t)fault_page_size,
+ (uint64_t)fault_phys_offset);
+ if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
+ /* 2 - page is all tainted */
+ return false;
+ }
+ if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
+ !page->vmp_wpmapped) {
+ /* 4 - already fully validated and never mapped writable */
+ return false;
+ }
+ } else {
+ /* looking at a specific sub-page */
+ if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
+ /* 2 - sub-page was already marked as tainted */
+ return false;
+ }
+ if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
+ !page->vmp_wpmapped) {
+ /* 4 - already validated and never mapped writable */
+ return false;
+ }
+ }
+ /* page needs to be validated */
+ return true;
+}
+
+
+static bool
+vm_fault_cs_page_immutable(
+ vm_page_t m,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_prot_t prot __unused)
+{
+ if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
+ /*&& ((prot) & VM_PROT_EXECUTE)*/) {
+ return true;
+ }
+ return false;
+}
+
+static bool
+vm_fault_cs_page_nx(
+ vm_page_t m,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset)
+{
+ return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
+}
+
+/*
+ * Check if the page being entered into the pmap violates code signing.
+ */
+static kern_return_t
+vm_fault_cs_check_violation(
+ bool cs_bypass,
+ vm_object_t object,
+ vm_page_t m,
+ pmap_t pmap,
+ vm_prot_t prot,
+ vm_prot_t caller_prot,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_object_fault_info_t fault_info,
+ bool map_is_switched,
+ bool map_is_switch_protected,
+ bool *cs_violation)
+{
+#if !PMAP_CS
+#pragma unused(caller_prot)
+#pragma unused(fault_info)
+#endif /* !PMAP_CS */
+ int cs_enforcement_enabled;
+ if (!cs_bypass &&
+ vm_fault_cs_need_validation(pmap, m, object,
+ fault_page_size, fault_phys_offset)) {
+ vm_object_lock_assert_exclusive(object);
+
+ if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
+ vm_cs_revalidates++;
+ }
+
+ /* VM map is locked, so 1 ref will remain on VM object -
+ * so no harm if vm_page_validate_cs drops the object lock */
+
+ vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
+ }
+
+ /* If the map is switched, and is switch-protected, we must protect
+ * some pages from being write-faulted: immutable pages because by
+ * definition they may not be written, and executable pages because that
+ * would provide a way to inject unsigned code.
+ * If the page is immutable, we can simply return. However, we can't
+ * immediately determine whether a page is executable anywhere. But,
+ * we can disconnect it everywhere and remove the executable protection
+ * from the current map. We do that below right before we do the
+ * PMAP_ENTER.
+ */
+ if (pmap == kernel_pmap) {
+ /* kernel fault: cs_enforcement does not apply */
+ cs_enforcement_enabled = 0;
+ } else {
+ cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
+ }
+
+ if (cs_enforcement_enabled && map_is_switched &&
+ map_is_switch_protected &&
+ vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
+ (prot & VM_PROT_WRITE)) {
+ return KERN_CODESIGN_ERROR;
+ }
+
+ if (cs_enforcement_enabled &&
+ vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
+ (prot & VM_PROT_EXECUTE)) {
+ if (cs_debug) {
+ printf("page marked to be NX, not letting it be mapped EXEC\n");
+ }
+ return KERN_CODESIGN_ERROR;
+ }
+
+ /* A page could be tainted, or pose a risk of being tainted later.
+ * Check whether the receiving process wants it, and make it feel
+ * the consequences (that happens in cs_invalid_page()).
+ * For CS Enforcement, two other conditions will
+ * cause that page to be tainted as well:
+ * - pmapping an unsigned page executable - this means unsigned code;
+ * - writeable mapping of a validated page - the content of that page
+ * can be changed without the kernel noticing, therefore unsigned
+ * code can be created
+ */
+ if (cs_bypass) {
+ /* code-signing is bypassed */
+ *cs_violation = FALSE;
+ } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
+ /* tainted page */
+ *cs_violation = TRUE;
+ } else if (!cs_enforcement_enabled) {
+ /* no further code-signing enforcement */
+ *cs_violation = FALSE;
+ } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
+ ((prot & VM_PROT_WRITE) ||
+ m->vmp_wpmapped)) {
+ /*
+ * The page should be immutable, but is in danger of being
+ * modified.
+ * This is the case where we want policy from the code
+ * directory - is the page immutable or not? For now we have
+ * to assume that code pages will be immutable, data pages not.
+ * We'll assume a page is a code page if it has a code directory
+ * and we fault for execution.
+ * That is good enough since if we faulted the code page for
+ * writing in another map before, it is wpmapped; if we fault
+ * it for writing in this map later it will also be faulted for
+ * executing at the same time; and if we fault for writing in
+ * another map later, we will disconnect it from this pmap so
+ * we'll notice the change.
+ */
+ *cs_violation = TRUE;
+ } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
+ (prot & VM_PROT_EXECUTE)
+ ) {
+ *cs_violation = TRUE;
+ } else {
+ *cs_violation = FALSE;
+ }
+ return KERN_SUCCESS;
+}
+
+/*
+ * Handles a code signing violation by either rejecting the page or forcing a disconnect.
+ * @param must_disconnect This value will be set to true if the caller must disconnect
+ * this page.
+ * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
+ */
+static kern_return_t
+vm_fault_cs_handle_violation(
+ vm_object_t object,
+ vm_page_t m,
+ pmap_t pmap,
+ vm_prot_t prot,
+ vm_map_offset_t vaddr,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ bool map_is_switched,
+ bool map_is_switch_protected,
+ bool *must_disconnect)
+{
+#if !MACH_ASSERT
+#pragma unused(pmap)
+#pragma unused(map_is_switch_protected)
+#endif /* !MACH_ASSERT */
+ /*
+ * We will have a tainted page. Have to handle the special case
+ * of a switched map now. If the map is not switched, standard
+ * procedure applies - call cs_invalid_page().
+ * If the map is switched, the real owner is invalid already.
+ * There is no point in invalidating the switching process since
+ * it will not be executing from the map. So we don't call
+ * cs_invalid_page() in that case.
+ */
+ boolean_t reject_page, cs_killed;
+ kern_return_t kr;
+ if (map_is_switched) {
+ assert(pmap == vm_map_pmap(current_thread()->map));
+ assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
+ reject_page = FALSE;
+ } else {
+ if (cs_debug > 5) {
+ printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
+ object->code_signed ? "yes" : "no",
+ VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
+ VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
+ m->vmp_wpmapped ? "yes" : "no",
+ (int)prot);
+ }
+ reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
+ }
+
+ if (reject_page) {
+ /* reject the invalid page: abort the page fault */
+ int pid;
+ const char *procname;
+ task_t task;
+ vm_object_t file_object, shadow;
+ vm_object_offset_t file_offset;
+ char *pathname, *filename;
+ vm_size_t pathname_len, filename_len;
+ boolean_t truncated_path;
+#define __PATH_MAX 1024
+ struct timespec mtime, cs_mtime;
+ int shadow_depth;
+ os_reason_t codesigning_exit_reason = OS_REASON_NULL;
+
+ kr = KERN_CODESIGN_ERROR;
+ cs_enter_tainted_rejected++;
+
+ /* get process name and pid */
+ procname = "?";
+ task = current_task();
+ pid = proc_selfpid();
+ if (task->bsd_info != NULL) {
+ procname = proc_name_address(task->bsd_info);
+ }
+
+ /* get file's VM object */
+ file_object = object;
+ file_offset = m->vmp_offset;
+ for (shadow = file_object->shadow,
+ shadow_depth = 0;
+ shadow != VM_OBJECT_NULL;
+ shadow = file_object->shadow,
+ shadow_depth++) {
+ vm_object_lock_shared(shadow);
+ if (file_object != object) {
+ vm_object_unlock(file_object);
+ }
+ file_offset += file_object->vo_shadow_offset;
+ file_object = shadow;
+ }
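+ /*
+ * file_object is now the bottommost object in the shadow chain
+ * (the one backed by the file's pager, if any) and file_offset
+ * is the page's offset within it.
+ */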
+
+ mtime.tv_sec = 0;
+ mtime.tv_nsec = 0;
+ cs_mtime.tv_sec = 0;
+ cs_mtime.tv_nsec = 0;
+
+ /* get file's pathname and/or filename */
+ pathname = NULL;
+ filename = NULL;
+ pathname_len = 0;
+ filename_len = 0;
+ truncated_path = FALSE;
+ /* no pager -> no file -> no pathname, use "<nil>" in that case */
+ if (file_object->pager != NULL) {
+ pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK);
+ if (pathname) {
+ pathname[0] = '\0';
+ pathname_len = __PATH_MAX;
+ filename = pathname + pathname_len;
+ filename_len = __PATH_MAX;
+
+ if (vnode_pager_get_object_name(file_object->pager,
+ pathname,
+ pathname_len,
+ filename,
+ filename_len,
+ &truncated_path) == KERN_SUCCESS) {
+ /* safety first... */
+ pathname[__PATH_MAX - 1] = '\0';
+ filename[__PATH_MAX - 1] = '\0';
+
+ vnode_pager_get_object_mtime(file_object->pager,
+ &mtime,
+ &cs_mtime);
+ } else {
+ kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
+ pathname = NULL;
+ filename = NULL;
+ pathname_len = 0;
+ filename_len = 0;
+ truncated_path = FALSE;
+ }
+ }
+ }
+ printf("CODE SIGNING: process %d[%s]: "
+ "rejecting invalid page at address 0x%llx "
+ "from offset 0x%llx in file \"%s%s%s\" "
+ "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
+ "(signed:%d validated:%d tainted:%d nx:%d "
+ "wpmapped:%d dirty:%d depth:%d)\n",
+ pid, procname, (addr64_t) vaddr,
+ file_offset,
+ (pathname ? pathname : "<nil>"),
+ (truncated_path ? "/.../" : ""),
+ (truncated_path ? filename : ""),
+ cs_mtime.tv_sec, cs_mtime.tv_nsec,
+ ((cs_mtime.tv_sec == mtime.tv_sec &&
+ cs_mtime.tv_nsec == mtime.tv_nsec)
+ ? "=="
+ : "!="),
+ mtime.tv_sec, mtime.tv_nsec,
+ object->code_signed,
+ VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
+ VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
+ VMP_CS_NX(m, fault_page_size, fault_phys_offset),
+ m->vmp_wpmapped,
+ m->vmp_dirty,
+ shadow_depth);
+
+ /*
+ * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
+ * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
+ * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
+ * will deal with the segmentation fault.
+ */
+ if (cs_killed) {
+ KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
+ pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
+
+ codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
+ if (codesigning_exit_reason == NULL) {
+ printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
+ } else {
+ mach_vm_address_t data_addr = 0;
+ struct codesigning_exit_reason_info *ceri = NULL;
+ uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
+
+ if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
+ printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
+ } else {
+ if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
+ EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
+ ceri = (struct codesigning_exit_reason_info *)data_addr;
+ static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
+
+ ceri->ceri_virt_addr = vaddr;
+ ceri->ceri_file_offset = file_offset;
+ if (pathname) {
+ strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
+ } else {
+ ceri->ceri_pathname[0] = '\0';
+ }
+ if (filename) {
+ strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
+ } else {
+ ceri->ceri_filename[0] = '\0';
+ }
+ ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
+ ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
+ ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
+ ceri->ceri_page_modtime_secs = mtime.tv_sec;
+ ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
+ ceri->ceri_object_codesigned = (object->code_signed);
+ ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
+ ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
+ ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
+ ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
+ ceri->ceri_page_slid = 0;
+ ceri->ceri_page_dirty = (m->vmp_dirty);
+ ceri->ceri_page_shadow_depth = shadow_depth;
+ } else {
+#if DEBUG || DEVELOPMENT
+ panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
+#else
+ printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
+#endif /* DEBUG || DEVELOPMENT */
+ /* Free the buffer */
+ os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
+ }
+ }
+ }
+
+ set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
+ }
+ if (panic_on_cs_killed &&
+ object->object_is_shared_cache) {
+ char *tainted_contents;
+ vm_map_offset_t src_vaddr;
+ src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
+ tainted_contents = kalloc(PAGE_SIZE);
+ bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
+ printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
+ panic("CODE SIGNING: process %d[%s]: "
+ "rejecting invalid page (phys#0x%x) at address 0x%llx "
+ "from offset 0x%llx in file \"%s%s%s\" "
+ "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
+ "(signed:%d validated:%d tainted:%d nx:%d "
+ "wpmapped:%d dirty:%d depth:%d)\n",
+ pid, procname,
+ VM_PAGE_GET_PHYS_PAGE(m),
+ (addr64_t) vaddr,
+ file_offset,
+ (pathname ? pathname : "<nil>"),
+ (truncated_path ? "/.../" : ""),
+ (truncated_path ? filename : ""),
+ cs_mtime.tv_sec, cs_mtime.tv_nsec,
+ ((cs_mtime.tv_sec == mtime.tv_sec &&
+ cs_mtime.tv_nsec == mtime.tv_nsec)
+ ? "=="
+ : "!="),
+ mtime.tv_sec, mtime.tv_nsec,
+ object->code_signed,
+ VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
+ VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
+ VMP_CS_NX(m, fault_page_size, fault_phys_offset),
+ m->vmp_wpmapped,
+ m->vmp_dirty,
+ shadow_depth);
+ }
+
+ if (file_object != object) {
+ vm_object_unlock(file_object);
+ }
+ if (pathname_len != 0) {
+ kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
+ pathname = NULL;
+ filename = NULL;
+ }
+ } else {
+ /* proceed with the invalid page */
+ kr = KERN_SUCCESS;
+ if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
+ !object->code_signed) {
+ /*
+ * This page has not been (fully) validated but
+ * does not belong to a code-signed object
+ * so it should not be forcefully considered
+ * as tainted.
+ * We're just concerned about it here because
+ * we've been asked to "execute" it but that
+ * does not mean that it should cause other
+ * accesses to fail.
+ * This happens when a debugger sets a
+ * breakpoint and we then execute code in
+ * that page. Marking the page as "tainted"
+ * would cause any inspection tool ("leaks",
+ * "vmmap", "CrashReporter", ...) to get killed
+ * due to code-signing violation on that page,
+ * even though they're just reading it and not
+ * executing from it.
+ */
+ } else {
+ /*
+ * Page might have been tainted before or not;
+ * now it definitively is. If the page wasn't
+ * tainted, we must disconnect it from all
+ * pmaps later, to force existing mappings
+ * through that code path for re-consideration
+ * of the validity of that page.
+ */
+ if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
+ *must_disconnect = TRUE;
+ VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
+ }
+ }
+ cs_enter_tainted_accepted++;
+ }
+ if (kr != KERN_SUCCESS) {
+ if (cs_debug) {
+ printf("CODESIGNING: vm_fault_enter(0x%llx): "
+ "*** INVALID PAGE ***\n",
+ (long long)vaddr);
+ }
+#if !SECURE_KERNEL
+ if (cs_enforcement_panic) {
+ panic("CODESIGNING: panicking on invalid page\n");
+ }
+#endif
+ }
+ return kr;
+}
+
+/*
+ * Check that the code signature is valid for the given page being inserted into
+ * the pmap.
+ *
+ * @param must_disconnect This value will be set to true if the caller must disconnect
+ * this page.
+ * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
+ */
+static kern_return_t
+vm_fault_validate_cs(
+ bool cs_bypass,
+ vm_object_t object,
+ vm_page_t m,
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_prot_t prot,
+ vm_prot_t caller_prot,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_object_fault_info_t fault_info,
+ bool *must_disconnect)
+{
+ bool map_is_switched, map_is_switch_protected, cs_violation;
+ kern_return_t kr;
+ /* Validate code signature if necessary. */
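+ /*
+ * The map is considered "switched" when the pmap we're entering into
+ * belongs to the current thread's map but not to the current task's
+ * map, i.e. the thread is temporarily running on another map.
+ */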
+ map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
+ (pmap == vm_map_pmap(current_thread()->map)));
+ map_is_switch_protected = current_thread()->map->switch_protect;
+ kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
+ prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
+ map_is_switched, map_is_switch_protected, &cs_violation);
+ if (kr != KERN_SUCCESS) {
+ return kr;
+ }
+ if (cs_violation) {
+ kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
+ fault_page_size, fault_phys_offset,
+ map_is_switched, map_is_switch_protected, must_disconnect);
+ }
+ return kr;
+}
+
+/*
+ * Enqueue the page on the appropriate paging queue.
+ */
+static void
+vm_fault_enqueue_page(
+ vm_object_t object,
+ vm_page_t m,
+ bool wired,
+ bool change_wiring,
+ vm_tag_t wire_tag,
+ bool no_cache,
+ int *type_of_fault,
+ kern_return_t kr)
+{
+ assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
+ boolean_t page_queues_locked = FALSE;
+ boolean_t previously_pmapped = m->vmp_pmapped;
+#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
+MACRO_BEGIN \
+ if (! page_queues_locked) { \
+ page_queues_locked = TRUE; \
+ vm_page_lockspin_queues(); \
+ } \
+MACRO_END
+#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
+MACRO_BEGIN \
+ if (page_queues_locked) { \
+ page_queues_locked = FALSE; \
+ vm_page_unlock_queues(); \
+ } \
+MACRO_END
+
+#if CONFIG_BACKGROUND_QUEUE
+ vm_page_update_background_state(m);
+#endif
+ if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+ /*
+ * Compressor pages are neither wired
+ * nor pageable and should never change.
+ */
+ assert(object == compressor_object);
+ } else if (change_wiring) {
+ __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
+
+ if (wired) {
+ if (kr == KERN_SUCCESS) {
+ vm_page_wire(m, wire_tag, TRUE);
+ }
+ } else {
+ vm_page_unwire(m, TRUE);
+ }
+ /* we keep the page queues lock, if we need it later */
+ } else {
+ if (object->internal == TRUE) {
+ /*
+ * don't allow anonymous pages on
+ * the speculative queues
+ */
+ no_cache = FALSE;
+ }
+ if (kr != KERN_SUCCESS) {
+ __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
+ vm_page_deactivate(m);
+ /* we keep the page queues lock, if we need it later */
+ } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
+ (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
+ (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
+ ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
+ !VM_PAGE_WIRED(m)) {
+ if (vm_page_local_q &&
+ (*type_of_fault == DBG_COW_FAULT ||
+ *type_of_fault == DBG_ZERO_FILL_FAULT)) {
+ struct vpl *lq;
+ uint32_t lid;
+
+ assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
+
+ __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
+ vm_object_lock_assert_exclusive(object);
+
+ /*
+ * we got a local queue to stuff this
+ * new page on...
+ * it's safe to manipulate local and
+ * local_id at this point since we're
+ * behind an exclusive object lock and
+ * the page is not on any global queue.
+ *
+ * we'll use the current cpu number to
+ * select the queue; note that we don't
+ * need to disable preemption... we're
+ * going to be behind the local queue's
+ * lock to do the real work
+ */
+ lid = cpu_number();
+
+ lq = zpercpu_get_cpu(vm_page_local_q, lid);
+
+ VPL_LOCK(&lq->vpl_lock);
+
+ vm_page_check_pageable_safe(m);
+ vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
+ m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
+ m->vmp_local_id = lid;
+ lq->vpl_count++;
+
+ if (object->internal) {
+ lq->vpl_internal_count++;
+ } else {
+ lq->vpl_external_count++;
+ }
+
+ VPL_UNLOCK(&lq->vpl_lock);
+
+ if (lq->vpl_count > vm_page_local_q_soft_limit) {
+ /*
+ * we're beyond the soft limit
+ * for the local queue;
+ * vm_page_reactivate_local will
+ * 'try' to take the global page
+ * queue lock... if it can't
+ * that's ok... we'll let the
+ * queue continue to grow up
+ * to the hard limit... at that
+ * point we'll wait for the
+ * lock... once we've got the
+ * lock, we'll transfer all of
+ * the pages from the local
+ * queue to the global active
+ * queue
+ */
+ vm_page_reactivate_local(lid, FALSE, FALSE);
+ }
+ } else {
+ __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
+
+ /*
+ * test again now that we hold the
+ * page queue lock
+ */
+ if (!VM_PAGE_WIRED(m)) {
+ if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
+ vm_page_queues_remove(m, FALSE);
+
+ VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
+ VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
+ }
+
+ if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
+ no_cache) {
+ /*
+ * If this is a no_cache mapping
+ * and the page has never been
+ * mapped before or was
+ * previously a no_cache page,
+ * then we want to leave pages
+ * in the speculative state so
+ * that they can be readily
+ * recycled if free memory runs
+ * low. Otherwise the page is
+ * activated as normal.
+ */
+
+ if (no_cache &&
+ (!previously_pmapped ||
+ m->vmp_no_cache)) {
+ m->vmp_no_cache = TRUE;
+
+ if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
+ vm_page_speculate(m, FALSE);
+ }
+ } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
+ vm_page_activate(m);
+ }
+ }
+ }
+ /* we keep the page queues lock, if we need it later */
+ }
+ }
+ }
+ /* we're done with the page queues lock, if we ever took it */
+ __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
+}
+
+/*
+ * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
+ * @return true if the page needs to be synced via pmap_sync_page_data_phys()
+ * before being inserted into the pmap.
+ */
+static bool
+vm_fault_enter_set_mapped(
+ vm_object_t object,
+ vm_page_t m,
+ vm_prot_t prot,
+ vm_prot_t fault_type)
+{
+ bool page_needs_sync = false;
+ /*
+ * NOTE: we may only hold the vm_object lock SHARED
+ * at this point, so we need the phys_page lock to
+ * properly serialize updating the pmapped and
+ * xpmapped bits
+ */
+ if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
+ ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
+
+ pmap_lock_phys_page(phys_page);
+ m->vmp_pmapped = TRUE;
+
+ if (!m->vmp_xpmapped) {
+ m->vmp_xpmapped = TRUE;
+
+ pmap_unlock_phys_page(phys_page);
+
+ if (!object->internal) {
+ OSAddAtomic(1, &vm_page_xpmapped_external_count);
+ }
+
+#if defined(__arm__) || defined(__arm64__)
+ page_needs_sync = true;
+#else
+ if (object->internal &&
+ object->pager != NULL) {
+ /*
+ * This page could have been
+ * uncompressed by the
+ * compressor pager and its
+ * contents might be only in
+ * the data cache.
+ * Since it's being mapped for
+ * "execute" for the first time,
+ * make sure the icache is in
+ * sync.
+ */
+ assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+ page_needs_sync = true;
+ }
+#endif
+ } else {
+ pmap_unlock_phys_page(phys_page);
+ }
+ } else {
+ if (m->vmp_pmapped == FALSE) {
+ ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
+
+ pmap_lock_phys_page(phys_page);
+ m->vmp_pmapped = TRUE;
+ pmap_unlock_phys_page(phys_page);
+ }
+ }
+
+ if (fault_type & VM_PROT_WRITE) {
+ if (m->vmp_wpmapped == FALSE) {
+ vm_object_lock_assert_exclusive(object);
+ if (!object->internal && object->pager) {
+ task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
+ }
+ m->vmp_wpmapped = TRUE;
+ }
+ }
+ return page_needs_sync;
+}
+
+/*
+ * Try to enter the given page into the pmap.
+ * Will retry without execute permission iff PMAP_CS is enabled and we encounter
+ * a codesigning failure on a non-execute fault.
+ */
+static kern_return_t
+vm_fault_attempt_pmap_enter(
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_page_t m,
+ vm_prot_t *prot,
+ vm_prot_t caller_prot,
+ vm_prot_t fault_type,
+ bool wired,
+ int pmap_options)
+{
+#if !PMAP_CS
+#pragma unused(caller_prot)
+#endif /* !PMAP_CS */
+ kern_return_t kr;
+ if (fault_page_size != PAGE_SIZE) {
+ DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
+ assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
+ fault_phys_offset < PAGE_SIZE),
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ } else {
+ assertf(fault_phys_offset == 0,
+ "0x%llx\n", (uint64_t)fault_phys_offset);
+ }
+
+ PMAP_ENTER_OPTIONS(pmap, vaddr,
+ fault_phys_offset,
+ m, *prot, fault_type, 0,
+ wired,
+ pmap_options,
+ kr);
+ return kr;
+}
+
+/*
+ * Enter the given page into the pmap.
+ * The map must be locked shared.
+ * The vm object must NOT be locked.
+ *
+ * @param need_retry if not null, avoid making a (potentially) blocking call into
+ * the pmap layer. When such a call would be necessary, return true in this boolean instead.
+ */
+static kern_return_t
+vm_fault_pmap_enter(
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_page_t m,
+ vm_prot_t *prot,
+ vm_prot_t caller_prot,
+ vm_prot_t fault_type,
+ bool wired,
+ int pmap_options,
+ boolean_t *need_retry)
+{
+ kern_return_t kr;
+ if (need_retry != NULL) {
+ /*
+ * Although we don't hold a lock on this object, we hold a lock
+ * on the top object in the chain. To prevent a deadlock, we
+ * can't allow the pmap layer to block.
+ */
+ pmap_options |= PMAP_OPTIONS_NOWAIT;
+ }
+ kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
+ fault_page_size, fault_phys_offset,
+ m, prot, caller_prot, fault_type, wired, pmap_options);
+ if (kr == KERN_RESOURCE_SHORTAGE) {
+ if (need_retry) {
+ /*
+ * There's nothing we can do here since we hold the
+ * lock on the top object in the chain. The caller
+ * will need to deal with this by dropping that lock and retrying.
+ */
+ *need_retry = TRUE;
+ vm_pmap_enter_retried++;
+ }
+ }
+ return kr;
+}
+
+/*
+ * Enter the given page into the pmap.
+ * The vm map must be locked shared.
+ * The vm object must be locked exclusive, unless this is a soft fault.
+ * For a soft fault, the object must be locked shared or exclusive.
+ *
+ * @param need_retry if not null, avoid making a (potentially) blocking call into
+ * the pmap layer. When such a call would be necessary, return true in this boolean instead.
+ */
+static kern_return_t
+vm_fault_pmap_enter_with_object_lock(
+ vm_object_t object,
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_page_t m,
+ vm_prot_t *prot,
+ vm_prot_t caller_prot,
+ vm_prot_t fault_type,
+ bool wired,
+ int pmap_options,
+ boolean_t *need_retry)
+{
+ kern_return_t kr;
+ /*
+ * Prevent a deadlock by not
+ * holding the object lock if we need to wait for a page in
+ * pmap_enter() - <rdar://problem/7138958>
+ */
+ kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
+ fault_page_size, fault_phys_offset,
+ m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
+#if __x86_64__
+ if (kr == KERN_INVALID_ARGUMENT &&
+ pmap == PMAP_NULL &&
+ wired) {
+ /*
+ * Wiring a page in a pmap-less VM map:
+ * VMware's "vmmon" kernel extension does this
+ * to grab pages.
+ * Let it proceed even though the PMAP_ENTER() failed.
+ */
+ kr = KERN_SUCCESS;
+ }
+#endif /* __x86_64__ */
+
+ if (kr == KERN_RESOURCE_SHORTAGE) {
+ if (need_retry) {
+ /*
+ * this will be non-null in the case where we hold the lock
+ * on the top-object in this chain... we can't just drop
+ * the lock on the object we're inserting the page into
+ * and recall the PMAP_ENTER since we can still cause
+ * a deadlock if one of the critical paths tries to
+ * acquire the lock on the top-object and we're blocked
+ * in PMAP_ENTER waiting for memory... our only recourse
+ * is to deal with it at a higher level where we can
+ * drop both locks.
+ */
+ *need_retry = TRUE;
+ vm_pmap_enter_retried++;
+ goto done;
+ }
+ /*
+ * The nonblocking version of pmap_enter did not succeed,
+ * and we don't need to drop other locks and retry
+ * at the level above us, so we
+ * use the blocking version instead. This requires marking
+ * the page busy and unlocking the object.
+ */
+ boolean_t was_busy = m->vmp_busy;
+
+ vm_object_lock_assert_exclusive(object);
+
+ m->vmp_busy = TRUE;
+ vm_object_unlock(object);
+
+ PMAP_ENTER_OPTIONS(pmap, vaddr,
+ fault_phys_offset,
+ m, *prot, fault_type,
+ 0, wired,
+ pmap_options, kr);
+
+ assert(VM_PAGE_OBJECT(m) == object);
+
+ /* Take the object lock again. */
+ vm_object_lock(object);
+
+ /* If the page was busy, someone else will wake it up.
+ * Otherwise, we have to do it now. */
+ assert(m->vmp_busy);
+ if (!was_busy) {
+ PAGE_WAKEUP_DONE(m);
+ }
+ vm_pmap_enter_blocked++;
+ }
+
+done:
+ return kr;
+}
+
+/*
+ * Prepare to enter a page into the pmap by checking CS, protection bits,
+ * and setting mapped bits on the page_t.
+ * Does not modify the page's paging queue.
+ *
+ * page queue lock must NOT be held
+ * m->vmp_object must be locked
+ *
+ * NOTE: m->vmp_object could be locked "shared" only if we are called
+ * from vm_fault() as part of a soft fault.
+ */
+static kern_return_t
+vm_fault_enter_prepare(
+ vm_page_t m,
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_prot_t *prot,
+ vm_prot_t caller_prot,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ boolean_t change_wiring,
+ vm_prot_t fault_type,
+ vm_object_fault_info_t fault_info,
+ int *type_of_fault,
+ bool *page_needs_data_sync)
+{
+ kern_return_t kr;
+ bool is_tainted = false;
+ vm_object_t object;
+ boolean_t cs_bypass = fault_info->cs_bypass;
+
+ object = VM_PAGE_OBJECT(m);
+
+ vm_object_lock_assert_held(object);
+
+#if KASAN
+ if (pmap == kernel_pmap) {
+ kasan_notify_address(vaddr, PAGE_SIZE);
+ }
+#endif
+
+ LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
+
+ if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
+ vm_object_lock_assert_exclusive(object);
+ } else if ((fault_type & VM_PROT_WRITE) == 0 &&
+ !change_wiring &&
+ (!m->vmp_wpmapped
+#if VM_OBJECT_ACCESS_TRACKING
+ || object->access_tracking
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+ )) {
+ /*
+ * This is not a "write" fault, so we
+ * might not have taken the object lock
+ * exclusively and we might not be able
+ * to update the "wpmapped" bit in
+ * vm_fault_enter().
+ * Let's just grant read access to
+ * the page for now and we'll
+ * soft-fault again if we need write
+ * access later...
+ */
+
+ /* This had better not be a JIT page. */
+ if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
+ *prot &= ~VM_PROT_WRITE;
+ } else {
+ assert(cs_bypass);
+ }
+ }
+ if (m->vmp_pmapped == FALSE) {
+ if (m->vmp_clustered) {
+ if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
+ /*
+ * found it in the cache, but this
+ * is the first fault-in of the page (m->vmp_pmapped == FALSE)
+ * so it must have come in as part of
+ * a cluster... account 1 pagein against it
+ */
+ if (object->internal) {
+ *type_of_fault = DBG_PAGEIND_FAULT;
+ } else {
+ *type_of_fault = DBG_PAGEINV_FAULT;
+ }
+
+ VM_PAGE_COUNT_AS_PAGEIN(m);
+ }
+ VM_PAGE_CONSUME_CLUSTERED(m);
+ }
+ }
+
+ if (*type_of_fault != DBG_COW_FAULT) {
+ DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
+
+ if (pmap == kernel_pmap) {
+ DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
+ }
+ }
+
+ kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
+ *prot, caller_prot, fault_page_size, fault_phys_offset,
+ fault_info, &is_tainted);
+ if (kr == KERN_SUCCESS) {
+ /*
+ * We either have a good page, or a tainted page that has been accepted by the process.
+ * In both cases the page will be entered into the pmap.
+ */
+ *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
+ if ((fault_type & VM_PROT_WRITE) && is_tainted) {
+ /*
+ * This page is tainted but we're inserting it anyway.
+ * Since it's writeable, we need to disconnect it from other pmaps
+ * now so those processes can take note.
+ */
+
+ /*
+ * We can only get here
+ * because of the CS enforcement (CSE) logic
+ */
+ assert(pmap_get_vm_map_cs_enforced(pmap));
+ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
+ /*
+ * If we are faulting for a write, we can clear
+ * the execute bit - that will ensure the page is
+ * checked again before being executable, which
+ * protects against a map switch.
+ * This only happens the first time the page
+ * gets tainted, so we won't get stuck here
+ * to make an already writeable page executable.
+ */
+ if (!cs_bypass) {
+ assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
+ *prot &= ~VM_PROT_EXECUTE;
+ }
+ }
+ assert(VM_PAGE_OBJECT(m) == object);
+
+#if VM_OBJECT_ACCESS_TRACKING
+ if (object->access_tracking) {
+ DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
+ if (fault_type & VM_PROT_WRITE) {
+ object->access_tracking_writes++;
+ vm_object_access_tracking_writes++;
+ } else {
+ object->access_tracking_reads++;
+ vm_object_access_tracking_reads++;
+ }
+ }
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+ }
+
+ return kr;
+}
+
+/*
+ * page queue lock must NOT be held
+ * m->vmp_object must be locked
+ *
+ * NOTE: m->vmp_object could be locked "shared" only if we are called
+ * from vm_fault() as part of a soft fault. If so, we must be
+ * careful not to modify the VM object in any way that is not
+ * legal under a shared lock...
+ */
+kern_return_t
+vm_fault_enter(
+ vm_page_t m,
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ vm_map_size_t fault_page_size,
+ vm_map_offset_t fault_phys_offset,
+ vm_prot_t prot,
+ vm_prot_t caller_prot,
+ boolean_t wired,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag,
+ vm_object_fault_info_t fault_info,
+ boolean_t *need_retry,
+ int *type_of_fault)
+{
+ kern_return_t kr;
+ vm_object_t object;
+ bool page_needs_data_sync;
+ vm_prot_t fault_type;
+ int pmap_options = fault_info->pmap_options;
+
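+ /*
+ * Guard pages are fictitious and are never entered into the pmap;
+ * treat the fault as satisfied.
+ */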
+ if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
+ assert(m->vmp_fictitious);
+ return KERN_SUCCESS;
+ }
+
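+ /*
+ * A wiring change is a pseudo-fault and does not assert any
+ * particular access, so use VM_PROT_NONE as the fault type.
+ */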
+ fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
+
+ kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
+ fault_page_size, fault_phys_offset, change_wiring, fault_type,
+ fault_info, type_of_fault, &page_needs_data_sync);
+ object = VM_PAGE_OBJECT(m);
+
+ vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
+
+ if (kr == KERN_SUCCESS) {
+ if (page_needs_data_sync) {
+ pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
+ }
+
+ kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
+ fault_page_size, fault_phys_offset, m,
+ &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
+ }
+
+ return kr;
+}
+
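+/*
+ * Fault in the page at "vaddr" with the given protection, but only if
+ * it is not already resident in the current pmap.
+ */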
+void
+vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
+{
+ if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
+ vm_fault(current_map(), /* map */
+ vaddr, /* vaddr */
+ prot, /* fault_type */
+ FALSE, /* change_wiring */
+ VM_KERN_MEMORY_NONE, /* tag - not wiring */
+ THREAD_UNINT, /* interruptible */
+ NULL, /* caller_pmap */
+ 0 /* caller_pmap_addr */);
+ }
+}
+
+
+/*
+ * Routine: vm_fault
+ * Purpose:
+ * Handle page faults, including pseudo-faults
+ * used to change the wiring status of pages.
+ * Returns:
+ * Explicit continuations have been removed.
+ * Implementation:
+ * vm_fault and vm_fault_page save mucho state
+ * in the moral equivalent of a closure. The state
+ * structure is allocated when first entering vm_fault
+ * and deallocated when leaving vm_fault.
+ */
+
+extern uint64_t get_current_unique_pid(void);
+
+unsigned long vm_fault_collapse_total = 0;
+unsigned long vm_fault_collapse_skipped = 0;
+
+
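+/*
+ * External entry point for vm_fault_internal(); if this fault is a
+ * wiring change, the wiring tag is derived from the caller's backtrace.
+ */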
+kern_return_t
+vm_fault_external(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t fault_type,
+ boolean_t change_wiring,
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr)
+{
+ return vm_fault_internal(map, vaddr, fault_type, change_wiring,
+ change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
+ interruptible, caller_pmap, caller_pmap_addr,
+ NULL);
+}
+
+kern_return_t
+vm_fault(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t fault_type,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr)
+{
+ return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
+ interruptible, caller_pmap, caller_pmap_addr,
+ NULL);
+}
+
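+/*
+ * "Privileged" here means the current process is a platform binary.
+ */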
+static boolean_t
+current_proc_is_privileged(void)
+{
+ return csproc_get_platform_binary(current_proc());
+}
+
+uint64_t vm_copied_on_read = 0;
+
+/*
+ * Cleanup after a vm_fault_enter.
+ * At this point, the fault should either have failed (kr != KERN_SUCCESS)
+ * or the page should be in the pmap and on the correct paging queue.
+ *
+ * Precondition:
+ * map must be locked shared.
+ * m_object must be locked.
+ * If top_object != VM_OBJECT_NULL, it must be locked.
+ * real_map must be locked.
+ *
+ * Postcondition:
+ * map will be unlocked
+ * m_object will be unlocked
+ * top_object will be unlocked
+ * If real_map != map, it will be unlocked
+ */
+static void
+vm_fault_complete(
+ vm_map_t map,
+ vm_map_t real_map,
+ vm_object_t object,
+ vm_object_t m_object,
+ vm_page_t m,
+ vm_map_offset_t offset,
+ vm_map_offset_t trace_real_vaddr,
+ vm_object_fault_info_t fault_info,
+ vm_prot_t caller_prot,
+#if CONFIG_DTRACE
+ vm_map_offset_t real_vaddr,
+#else
+ __unused vm_map_offset_t real_vaddr,
+#endif /* CONFIG_DTRACE */
+ int type_of_fault,
+ boolean_t need_retry,
+ kern_return_t kr,
+ ppnum_t *physpage_p,
+ vm_prot_t prot,
+ vm_object_t top_object,
+ boolean_t need_collapse,
+ vm_map_offset_t cur_offset,
+ vm_prot_t fault_type,
+ vm_object_t *written_on_object,
+ memory_object_t *written_on_pager,
+ vm_object_offset_t *written_on_offset)
+{
+ int event_code = 0;
+ vm_map_lock_assert_shared(map);
+ vm_object_lock_assert_held(m_object);
+ if (top_object != VM_OBJECT_NULL) {
+ vm_object_lock_assert_held(top_object);
+ }
+ vm_map_lock_assert_held(real_map);
+
+ if (m_object->internal) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
+ } else if (m_object->object_is_shared_cache) {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
+ } else {
+ event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
+ if (need_retry == FALSE) {
+ KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
+ }
+ DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
+ if (kr == KERN_SUCCESS &&
+ physpage_p != NULL) {
+ /* for vm_map_wire_and_extract() */
+ *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
+ if (prot & VM_PROT_WRITE) {
+ vm_object_lock_assert_exclusive(m_object);
+ m->vmp_dirty = TRUE;
+ }
+ }
+
+ if (top_object != VM_OBJECT_NULL) {
+ /*
+ * It's safe to drop the top object
+ * now that we've done our
+ * vm_fault_enter(). Any other fault
+ * in progress for that virtual
+ * address will either find our page
+ * and translation or put in a new page
+ * and translation.
+ */
+ vm_object_unlock(top_object);
+ top_object = VM_OBJECT_NULL;
+ }
+
+ if (need_collapse == TRUE) {
+ vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
+ }
+
+ if (need_retry == FALSE &&
+ (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
+ /*
+ * evaluate access pattern and update state;
+ * vm_fault_deactivate_behind depends on the
+ * state being up to date
+ */
+ vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
+
+ vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
+ }
+ /*
+ * That's it, clean up and return.
+ */
+ if (m->vmp_busy) {
+ vm_object_lock_assert_exclusive(m_object);
+ PAGE_WAKEUP_DONE(m);
+ }
+
+ if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
+ vm_object_paging_begin(m_object);
+
+ assert(*written_on_object == VM_OBJECT_NULL);
+ *written_on_object = m_object;
+ *written_on_pager = m_object->pager;
+ *written_on_offset = m_object->paging_offset + m->vmp_offset;
+ }
+ vm_object_unlock(object);
+
+ vm_map_unlock_read(map);
+ if (real_map != map) {
+ vm_map_unlock(real_map);
+ }
+}
+
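+/*
+ * For tracing purposes, report a copy-on-write fault that was triggered
+ * by a copy-on-read as DBG_COR_FAULT instead of DBG_COW_FAULT.
+ */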
+static inline int
+vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
+{
+ if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
+ return DBG_COR_FAULT;
+ }
+ return type_of_fault;
+}
+
+kern_return_t
+vm_fault_internal(
+ vm_map_t map,
+ vm_map_offset_t vaddr,
+ vm_prot_t caller_prot,
+ boolean_t change_wiring,
+ vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
+ int interruptible,
+ pmap_t caller_pmap,
+ vm_map_offset_t caller_pmap_addr,
+ ppnum_t *physpage_p)
+{
+ vm_map_version_t version; /* Map version for verification */
+ boolean_t wired; /* Should mapping be wired down? */
+ vm_object_t object; /* Top-level object */
+ vm_object_offset_t offset; /* Top-level offset */
+ vm_prot_t prot; /* Protection for mapping */
+ vm_object_t old_copy_object; /* Saved copy object */
+ vm_page_t result_page; /* Result of vm_fault_page */
+ vm_page_t top_page; /* Placeholder page */
+ kern_return_t kr;
+
+ vm_page_t m; /* Fast access to result_page */
+ kern_return_t error_code;
+ vm_object_t cur_object;
+ vm_object_t m_object = NULL;
+ vm_object_offset_t cur_offset;
+ vm_page_t cur_m;
+ vm_object_t new_object;
+ int type_of_fault;
+ pmap_t pmap;
+ wait_interrupt_t interruptible_state;
+ vm_map_t real_map = map;
+ vm_map_t original_map = map;
+ bool object_locks_dropped = FALSE;
+ vm_prot_t fault_type;
+ vm_prot_t original_fault_type;
+ struct vm_object_fault_info fault_info = {};
+ bool need_collapse = FALSE;
+ boolean_t need_retry = FALSE;
+ boolean_t *need_retry_ptr = NULL;
+ uint8_t object_lock_type = 0;
+ uint8_t cur_object_lock_type;
+ vm_object_t top_object = VM_OBJECT_NULL;
+ vm_object_t written_on_object = VM_OBJECT_NULL;
+ memory_object_t written_on_pager = NULL;
+ vm_object_offset_t written_on_offset = 0;
+ int throttle_delay;
+ int compressed_count_delta;
+ uint8_t grab_options;
+ bool need_copy;
+ bool need_copy_on_read;
+ vm_map_offset_t trace_vaddr;
+ vm_map_offset_t trace_real_vaddr;
+ vm_map_size_t fault_page_size;
+ vm_map_size_t fault_page_mask;
+ vm_map_offset_t fault_phys_offset;
+ vm_map_offset_t real_vaddr;
+ bool resilient_media_retry = FALSE;
+ vm_object_t resilient_media_object = VM_OBJECT_NULL;
+ vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
+ bool page_needs_data_sync = false;
+ /*
+ * Was the VM object contended when vm_map_lookup_locked locked it?
+ * If so, the zero fill path will drop the lock.
+ * NB: Ideally we would always drop the lock rather than rely on
+ * this heuristic, but vm_object_unlock currently takes > 30 cycles.
+ */
+ bool object_is_contended = false;
+
+ real_vaddr = vaddr;
+ trace_real_vaddr = vaddr;
+
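+ /*
+ * If the VM map uses a smaller page size than the kernel's PAGE_SIZE,
+ * fault at the map's page granularity: remember the sub-page size and
+ * mask and truncate the fault address to that boundary.
+ */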
+ if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
+ fault_phys_offset = (vm_map_offset_t)-1;
+ fault_page_size = VM_MAP_PAGE_SIZE(original_map);
+ fault_page_mask = VM_MAP_PAGE_MASK(original_map);
+ if (fault_page_size < PAGE_SIZE) {
+ DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
+ vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
+ }
+ } else {
+ fault_phys_offset = 0;
+ fault_page_size = PAGE_SIZE;
+ fault_page_mask = PAGE_MASK;
+ vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
+ }
+
+ if (map == kernel_map) {
+ trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
+ trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
+ } else {
+ trace_vaddr = vaddr;
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ (map == kernel_map),
+ 0,
+ 0);
+
+ if (get_preemption_level() != 0) {
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
+ ((uint64_t)trace_vaddr >> 32),
+ trace_vaddr,
+ KERN_FAILURE,
+ 0,
+ 0);
+
+ return KERN_FAILURE;
+ }
+
+ thread_t cthread = current_thread();
+ bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
+ uint64_t fstart = 0;
+
+ if (rtfault) {
+ fstart = mach_continuous_time();
+ }
+
+ interruptible_state = thread_interrupt_level(interruptible);
+
+ fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
+
+ counter_inc(&vm_statistics_faults);
+ counter_inc(¤t_task()->faults);
+ original_fault_type = fault_type;
+
+ need_copy = FALSE;
+ if (fault_type & VM_PROT_WRITE) {
+ need_copy = TRUE;
+ }
+
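+ /*
+ * Write faults and wiring changes need the top object lock held
+ * exclusive; read faults can start out with a shared lock.
+ */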
+ if (need_copy || change_wiring) {
+ object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+ } else {
+ object_lock_type = OBJECT_LOCK_SHARED;
+ }
+
+ cur_object_lock_type = OBJECT_LOCK_SHARED;
+
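+ /*
+ * Sanity check: a kernel write fault that lands inside the compressor
+ * map's range is unexpected, so panic with the address and map bounds.
+ */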
+ if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
+ if (compressor_map) {
+ if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
+ panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));