X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/060df5ea7c632b1ac8cc8aac1fb59758165c2084..ecc0ceb4089d506a0b8d16686a95817b331af9cb:/osfmk/i386/pmap_x86_common.c

diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c
index a8c3423b4..c6352893a 100644
--- a/osfmk/i386/pmap_x86_common.c
+++ b/osfmk/i386/pmap_x86_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  *
@@ -25,30 +25,35 @@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
+
+#include 
+
 #include 
 #include 
+#include 
 #include 
-
 void		pmap_remove_range(
			pmap_t		pmap,
			vm_map_offset_t	va,
			pt_entry_t	*spte,
			pt_entry_t	*epte);
-pv_rooted_entry_t	pv_head_table;		/* array of entries, one per
-						 * page */
-thread_call_t		mapping_adjust_call;
-static thread_call_data_t	mapping_adjust_call_data;
-uint32_t		mappingrecurse = 0;
+void		pmap_remove_range_options(
+			pmap_t		pmap,
+			vm_map_offset_t	va,
+			pt_entry_t	*spte,
+			pt_entry_t	*epte,
+			int		options);
-pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[PMAP_PAGETABLE_CORRUPTION_MAX_LOG];
-uint32_t pmap_pagetable_corruption_incidents;
-uint64_t pmap_pagetable_corruption_last_abstime = (~(0ULL) >> 1);
-uint64_t pmap_pagetable_corruption_interval_abstime;
-thread_call_t	pmap_pagetable_corruption_log_call;
-static thread_call_data_t	pmap_pagetable_corruption_log_call_data;
-boolean_t pmap_pagetable_corruption_timeout = FALSE;
+void		pmap_reusable_range(
+			pmap_t		pmap,
+			vm_map_offset_t	va,
+			pt_entry_t	*spte,
+			pt_entry_t	*epte,
+			boolean_t	reusable);
+
+uint32_t pmap_update_clear_pte_count;
 /*
  * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
@@ -89,6 +94,9 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 	unsigned int	i;
 	uint64_t	num_pde;
+	assert(!is_ept_pmap(grand));
+	assert(!is_ept_pmap(subord));
+
 	if ((size & (pmap_nesting_size_min-1)) ||
 	    (va_start & (pmap_nesting_size_min-1)) ||
 	    (nstart & (pmap_nesting_size_min-1)) ||
@@ -103,8 +111,8 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 		panic("pmap_nest: va_start(0x%llx) != nstart(0x%llx)\n", va_start, nstart);
 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
-		   (int) grand, (int) subord,
-		   (int) (va_start>>32), (int) va_start, 0);
+		   (uintptr_t) grand, (uintptr_t) subord,
+		   (uintptr_t) (va_start>>32), (uintptr_t) va_start, 0);
 	nvaddr = (vm_map_offset_t)nstart;
 	num_pde = size >> PDESHIFT;
@@ -120,7 +128,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 		while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
 			PMAP_UNLOCK(subord);
-			pmap_expand_pdpt(subord, nvaddr);
+			pmap_expand_pdpt(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
 			PMAP_LOCK(subord);
 			npde = pmap64_pdpt(subord, nvaddr);
 		}
@@ -133,7 +141,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 		while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
 			PMAP_UNLOCK(subord);
-			pmap_expand(subord, nvaddr);
+			pmap_expand(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
 			PMAP_LOCK(subord);
 			npde = pmap_pde(subord, nvaddr);
 		}
@@ -159,7 +167,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 		pde = pmap64_pdpt(grand, vaddr);
 		if (0 == pde) {
 			PMAP_UNLOCK(grand);
-			pmap_expand_pml4(grand, vaddr);
+			pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
 			PMAP_LOCK(grand);
 			pde = pmap64_pdpt(grand, vaddr);
 		}
@@ -178,7 +186,7 @@ kern_return_t
pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t pde = pmap_pde(grand, vaddr); if ((0 == pde) && cpu_64bit) { PMAP_UNLOCK(grand); - pmap_expand_pdpt(grand, vaddr); + pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE); PMAP_LOCK(grand); pde = pmap_pde(grand, vaddr); } @@ -216,8 +224,8 @@ kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) { uint64_t npdpt = PMAP_INVALID_PDPTNUM; PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START, - (int) grand, - (int) (vaddr>>32), (int) vaddr, 0, 0); + (uintptr_t) grand, + (uintptr_t) (vaddr>>32), (uintptr_t) vaddr, 0, 0); if ((size & (pmap_nesting_size_min-1)) || (vaddr & (pmap_nesting_size_min-1))) { @@ -225,6 +233,8 @@ kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) { grand, vaddr, size); } + assert(!is_ept_pmap(grand)); + /* align everything to PDE boundaries */ va_start = vaddr & ~(NBPDE-1); va_end = (vaddr + size + NBPDE - 1) & ~(NBPDE-1); @@ -263,6 +273,15 @@ kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) { return KERN_SUCCESS; } +kern_return_t +pmap_unnest_options( + pmap_t grand, + addr64_t vaddr, + __unused uint64_t size, + __unused unsigned int options) { + return pmap_unnest(grand, vaddr, size); +} + /* Invoked by the Mach VM to determine the platform specific unnest region */ boolean_t pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e) { @@ -306,6 +325,9 @@ pmap_find_phys(pmap_t pmap, addr64_t va) ppnum_t ppn = 0; pd_entry_t pde; pt_entry_t pte; + boolean_t is_ept; + + is_ept = is_ept_pmap(pmap); mp_disable_preemption(); @@ -319,14 +341,14 @@ pmap_find_phys(pmap_t pmap, addr64_t va) pdep = pmap_pde(pmap, va); - if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & INTEL_PTE_VALID)) { - if (pde & INTEL_PTE_PS) { + if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) { + if (pde & PTE_PS) { ppn = (ppnum_t) i386_btop(pte_to_pa(pde)); ppn += (ppnum_t) ptenum(va); } else { ptp = pmap_pte(pmap, va); - if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & INTEL_PTE_VALID) != 0)) { + if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) { ppn = (ppnum_t) i386_btop(pte_to_pa(pte)); } } @@ -337,6 +359,86 @@ pfp_exit: return ppn; } +/* + * Update cache attributes for all extant managed mappings. + * Assumes PV for this page is locked, and that the page + * is managed. We assume that this physical page may be mapped in + * both EPT and normal Intel PTEs, so we convert the attributes + * to the corresponding format for each pmap. + * + * We assert that the passed set of attributes is a subset of the + * PHYS_CACHEABILITY_MASK. + */ +void +pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { + pv_rooted_entry_t pv_h, pv_e; + pv_hashed_entry_t pvh_e, nexth; + vm_map_offset_t vaddr; + pmap_t pmap; + pt_entry_t *ptep; + boolean_t is_ept; + unsigned ept_attributes; + + assert(IS_MANAGED_PAGE(pn)); + assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0); + + /* We don't support the PTA bit for EPT PTEs */ + if (attributes & INTEL_PTE_NCACHE) + ept_attributes = INTEL_EPT_NCACHE; + else + ept_attributes = INTEL_EPT_WB; + + pv_h = pai_to_pvh(pn); + /* TODO: translate the PHYS_* bits to PTE bits, while they're + * currently identical, they may not remain so + * Potential optimization (here and in page_protect), + * parallel shootdowns, check for redundant + * attribute modifications. 
+ */ + + /* + * Alter attributes on all mappings + */ + if (pv_h->pmap != PMAP_NULL) { + pv_e = pv_h; + pvh_e = (pv_hashed_entry_t)pv_e; + + do { + pmap = pv_e->pmap; + vaddr = pv_e->va; + ptep = pmap_pte(pmap, vaddr); + + if (0 == ptep) + panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap); + + is_ept = is_ept_pmap(pmap); + + nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); + if (!is_ept) { + pmap_update_pte(ptep, PHYS_CACHEABILITY_MASK, attributes); + } else { + pmap_update_pte(ptep, INTEL_EPT_CACHE_MASK, ept_attributes); + } + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + pvh_e = nexth; + } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h); + } +} + +void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { + assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); + + if (dofilter) { + CPU_CR3_MARK_INACTIVE(); + } else { + CPU_CR3_MARK_ACTIVE(); + mfence(); + if (current_cpu_datap()->cpu_tlb_invalid) + process_pmap_updates(); + } +} + + /* * Insert the given physical page (p) at * the specified virtual address (v) in the @@ -349,18 +451,36 @@ pfp_exit: * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ + void pmap_enter( register pmap_t pmap, vm_map_offset_t vaddr, ppnum_t pn, vm_prot_t prot, + vm_prot_t fault_type, unsigned int flags, boolean_t wired) +{ + (void) pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL); +} + + +kern_return_t +pmap_enter_options( + register pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t pn, + vm_prot_t prot, + __unused vm_prot_t fault_type, + unsigned int flags, + boolean_t wired, + unsigned int options, + void *arg) { pt_entry_t *pte; pv_rooted_entry_t pv_h; - int pai; + ppnum_t pai; pv_hashed_entry_t pvh_e; pv_hashed_entry_t pvh_new; pt_entry_t template; @@ -373,27 +493,40 @@ pmap_enter( /* 2MiB mappings are confined to x86_64 by VM */ boolean_t superpage = flags & VM_MEM_SUPERPAGE; vm_object_t delpage_pm_obj = NULL; - int delpage_pde_index = 0; + uint64_t delpage_pde_index = 0; pt_entry_t old_pte; + kern_return_t kr_expand; + boolean_t is_ept; pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); if (pmap == PMAP_NULL) - return; + return KERN_INVALID_ARGUMENT; + + is_ept = is_ept_pmap(pmap); + + /* N.B. We can be supplied a zero page frame in the NOENTER case, it's an + * unused value for that scenario. + */ + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) - return; + return KERN_INVALID_ARGUMENT; PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, - pmap, - (uint32_t) (vaddr >> 32), (uint32_t) vaddr, - pn, prot); + pmap, + (uint32_t) (vaddr >> 32), (uint32_t) vaddr, + pn, prot); if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) set_NX = FALSE; else set_NX = TRUE; + if (__improbable(set_NX && (pmap == kernel_pmap) && ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) || (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) { + set_NX = FALSE; + } + /* * Must allocate a new pvlist entry while we're unlocked; * zalloc may cause pageout (which will lock the pmap system). 
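The hunk above ends on the comment explaining why pmap_enter_options() pre-allocates its pv list entry while the pmap is unlocked: zalloc() can block on pageout, which may in turn need the pmap system. The routine therefore drops the lock, allocates, and jumps back to its Retry: label so all state is re-examined under the lock. Below is a minimal, self-contained sketch of that unlock-allocate-retry idiom; the names (table_lock, entry_pool_alloc, insert_locked, insert_with_retry) are invented for illustration and are not the kernel's APIs, and error handling such as allocation failure is omitted.

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

/* All names below are invented for the illustration. */

struct entry {
	struct entry	*next;
	unsigned long	key;
};

static pthread_mutex_t	table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry	*table_head;	/* protected by table_lock */

/* May block (as zalloc() may page out), so never called with table_lock held. */
static struct entry *
entry_pool_alloc(void)
{
	return calloc(1, sizeof(struct entry));
}

/*
 * Insert 'key' if it is not already present.  Consumes *sparep (sets it
 * to NULL) when it links a new entry in.  Returns false when a fresh
 * entry is required but *sparep is NULL: the caller must unlock,
 * allocate, and retry.
 */
static bool
insert_locked(unsigned long key, struct entry **sparep)
{
	for (struct entry *e = table_head; e != NULL; e = e->next)
		if (e->key == key)
			return true;	/* already mapped: nothing to add */
	if (*sparep == NULL)
		return false;
	(*sparep)->key = key;
	(*sparep)->next = table_head;
	table_head = *sparep;
	*sparep = NULL;
	return true;
}

void
insert_with_retry(unsigned long key)
{
	struct entry *spare = NULL;

Retry:
	pthread_mutex_lock(&table_lock);
	if (!insert_locked(key, &spare)) {
		/* Drop the lock before the potentially blocking allocation,
		 * then start over: the table may have changed meanwhile. */
		pthread_mutex_unlock(&table_lock);
		spare = entry_pool_alloc();
		goto Retry;
	}
	pthread_mutex_unlock(&table_lock);
	free(spare);	/* pre-allocated entry that ended up unused, if any */
}

The two points mirrored from pmap_enter_options() are that the allocation happens with no locks held, and that a pre-allocated entry left unused is released at the end rather than leaked.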
@@ -417,7 +550,9 @@ Retry: while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) { /* need room for another pde entry */ PMAP_UNLOCK(pmap); - pmap_expand_pdpt(pmap, vaddr); + kr_expand = pmap_expand_pdpt(pmap, vaddr, options); + if (kr_expand != KERN_SUCCESS) + return kr_expand; PMAP_LOCK(pmap); } } else { @@ -427,28 +562,40 @@ Retry: * going to grow pde level page(s) */ PMAP_UNLOCK(pmap); - pmap_expand(pmap, vaddr); + kr_expand = pmap_expand(pmap, vaddr, options); + if (kr_expand != KERN_SUCCESS) + return kr_expand; PMAP_LOCK(pmap); } } + if (options & PMAP_EXPAND_OPTIONS_NOENTER) { + PMAP_UNLOCK(pmap); + return KERN_SUCCESS; + } - if (superpage && *pte && !(*pte & INTEL_PTE_PS)) { + if (superpage && *pte && !(*pte & PTE_PS)) { /* * There is still an empty page table mapped that * was used for a previous base page mapping. * Remember the PDE and the PDE index, so that we * can free the page at the end of this function. */ - delpage_pde_index = (int)pdeidx(pmap, vaddr); + delpage_pde_index = pdeidx(pmap, vaddr); delpage_pm_obj = pmap->pm_obj; *pte = 0; } - old_pa = pte_to_pa(*pte); pai = pa_index(old_pa); old_pa_locked = FALSE; + if (old_pa == 0 && + (*pte & PTE_COMPRESSED)) { + /* one less "compressed" */ + OSAddAtomic64(-1, &pmap->stats.compressed); + /* marker will be cleared below */ + } + /* * if we have a previous managed page, lock the pv entry now. after * we lock it, check to see if someone beat us to the lock and if so @@ -469,50 +616,106 @@ Retry: * at this address. */ if (old_pa == pa) { + pt_entry_t old_attributes = + *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept)); /* * May be changing its wired attribute or protection */ - template = pa_to_pte(pa) | INTEL_PTE_VALID; + template = pa_to_pte(pa); + + /* ?: WORTH ASSERTING THAT AT LEAST ONE RWX (implicit valid) PASSED FOR EPT? 
*/ + if (!is_ept) { + template |= INTEL_PTE_VALID; + } else { + template |= INTEL_EPT_IPTA; + } - if (VM_MEM_NOT_CACHEABLE == - (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) { + template |= pmap_get_cache_attributes(pa_index(pa), is_ept); + + /* + * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs + */ + if (!is_ept && (VM_MEM_NOT_CACHEABLE == + (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) { if (!(flags & VM_MEM_GUARDED)) template |= INTEL_PTE_PTA; template |= INTEL_PTE_NCACHE; } - if (pmap != kernel_pmap) + if (pmap != kernel_pmap && !is_ept) template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; + + if (prot & VM_PROT_READ) + template |= PTE_READ(is_ept); + + if (prot & VM_PROT_WRITE) { + template |= PTE_WRITE(is_ept); + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_MOD(is_ept); + if (old_pa_locked) { + assert(IS_MANAGED_PAGE(pai)); + pmap_phys_attributes[pai] |= PHYS_MODIFIED; + } + } + } + if (prot & VM_PROT_EXECUTE) { + assert(set_NX == 0); + template = pte_set_ex(template, is_ept); + } if (set_NX) - template |= INTEL_PTE_NX; + template = pte_remove_ex(template, is_ept); if (wired) { - template |= INTEL_PTE_WIRED; - if (!iswired(*pte)) - OSAddAtomic(+1, - &pmap->stats.wired_count); + template |= PTE_WIRED; + if (!iswired(old_attributes)) { + OSAddAtomic(+1, &pmap->stats.wired_count); + pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); + } } else { - if (iswired(*pte)) { + if (iswired(old_attributes)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } } + if (superpage) /* this path can not be used */ - template |= INTEL_PTE_PS; /* to change the page size! */ + template |= PTE_PS; /* to change the page size! 
*/ + + if (old_attributes == template) + goto dont_update_pte; + + /* Determine delta, PV locked */ + need_tlbflush = + ((old_attributes ^ template) != PTE_WIRED); + + if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) { + if ((old_attributes ^ template) == PTE_WRITE(is_ept)) + need_tlbflush = FALSE; + } + + /* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */ + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_REF(is_ept); + if (old_pa_locked) { + assert(IS_MANAGED_PAGE(pai)); + pmap_phys_attributes[pai] |= PHYS_REFERENCED; + } + } /* store modified PTE and preserve RC bits */ - pmap_update_pte(pte, *pte, - template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD))); + pt_entry_t npte, opte;; + do { + opte = *pte; + npte = template | (opte & (PTE_REF(is_ept) | PTE_MOD(is_ept))); + } while (!pmap_cmpx_pte(pte, opte, npte)); +dont_update_pte: if (old_pa_locked) { UNLOCK_PVH(pai); old_pa_locked = FALSE; } - need_tlbflush = TRUE; goto Done; } @@ -538,34 +741,45 @@ Retry: */ /* invalidate the PTE */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); + pmap_update_pte(pte, PTE_VALID_MASK(is_ept), 0); /* propagate invalidate everywhere */ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); /* remember reference and change */ old_pte = *pte; - oattr = (char) (old_pte & (PHYS_MODIFIED | PHYS_REFERENCED)); + oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept))); /* completely invalidate the PTE */ pmap_store_pte(pte, 0); if (IS_MANAGED_PAGE(pai)) { -#if TESTING - if (pmap->stats.resident_count < 1) - panic("pmap_enter: resident_count"); -#endif + pmap_assert(old_pa_locked == TRUE); + pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, - &pmap->stats.resident_count); - + OSAddAtomic(-1, &pmap->stats.resident_count); + if (pmap != kernel_pmap) { + if (IS_REUSABLE_PAGE(pai)) { + assert(pmap->stats.reusable > 0); + OSAddAtomic(-1, &pmap->stats.reusable); + } else if (IS_INTERNAL_PAGE(pai)) { + assert(pmap->stats.internal > 0); + OSAddAtomic(-1, &pmap->stats.internal); + } else { + assert(pmap->stats.external > 0); + OSAddAtomic(-1, &pmap->stats.external); + } + } if (iswired(*pte)) { -#if TESTING - if (pmap->stats.wired_count < 1) - panic("pmap_enter: wired_count"); -#endif assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, + PAGE_SIZE); + } + + if (!is_ept) { + pmap_phys_attributes[pai] |= oattr; + } else { + pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr); } - pmap_phys_attributes[pai] |= oattr; /* * Remove the mapping from the pvlist for @@ -582,10 +796,16 @@ Retry: * Do removal part of accounting. 
*/ + if (pmap != kernel_pmap) { +#if 00 + assert(pmap->stats.device > 0); + OSAddAtomic(-1, &pmap->stats.device); +#endif + } if (iswired(*pte)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } } } @@ -616,6 +836,17 @@ Retry: pv_h->va = vaddr; pv_h->pmap = pmap; queue_init(&pv_h->qlink); + + if (options & PMAP_OPTIONS_INTERNAL) { + pmap_phys_attributes[pai] |= PHYS_INTERNAL; + } else { + pmap_phys_attributes[pai] &= ~PHYS_INTERNAL; + } + if (options & PMAP_OPTIONS_REUSABLE) { + pmap_phys_attributes[pai] |= PHYS_REUSABLE; + } else { + pmap_phys_attributes[pai] &= ~PHYS_REUSABLE; + } } else { /* * Add new pv_hashed_entry after header. @@ -624,7 +855,7 @@ Retry: pvh_e = pvh_new; pvh_new = PV_HASHED_ENTRY_NULL; } else if (PV_HASHED_ENTRY_NULL == pvh_e) { - PV_HASHED_ALLOC(pvh_e); + PV_HASHED_ALLOC(&pvh_e); if (PV_HASHED_ENTRY_NULL == pvh_e) { /* * the pv list is empty. if we are on @@ -636,10 +867,11 @@ Retry: * us. */ if (kernel_pmap == pmap) { - PV_HASHED_KERN_ALLOC(pvh_e); + PV_HASHED_KERN_ALLOC(&pvh_e); } else { UNLOCK_PVH(pai); PMAP_UNLOCK(pmap); + pmap_pv_throttle(pmap); pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); goto Retry; } @@ -664,15 +896,37 @@ Retry: * only count the mapping * for 'managed memory' */ - OSAddAtomic(+1, & pmap->stats.resident_count); + pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); + pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + OSAddAtomic(+1, &pmap->stats.resident_count); if (pmap->stats.resident_count > pmap->stats.resident_max) { pmap->stats.resident_max = pmap->stats.resident_count; } + if (pmap != kernel_pmap) { + if (IS_REUSABLE_PAGE(pai)) { + OSAddAtomic(+1, &pmap->stats.reusable); + PMAP_STATS_PEAK(pmap->stats.reusable); + } else if (IS_INTERNAL_PAGE(pai)) { + OSAddAtomic(+1, &pmap->stats.internal); + PMAP_STATS_PEAK(pmap->stats.internal); + } else { + OSAddAtomic(+1, &pmap->stats.external); + PMAP_STATS_PEAK(pmap->stats.external); + } + } } else if (last_managed_page == 0) { /* Account for early mappings created before "managed pages" * are determined. Consider consulting the available DRAM map. */ + pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); + pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); + if (pmap != kernel_pmap) { +#if 00 + OSAddAtomic(+1, &pmap->stats.device); + PMAP_STATS_PEAK(pmap->stats.device); +#endif + } } /* * Step 3) Enter the mapping. @@ -680,25 +934,64 @@ Retry: * Build a template to speed up entering - * only the pfn changes. */ - template = pa_to_pte(pa) | INTEL_PTE_VALID; + template = pa_to_pte(pa); + + if (!is_ept) { + template |= INTEL_PTE_VALID; + } else { + template |= INTEL_EPT_IPTA; + } + + + /* + * DRK: It may be worth asserting on cache attribute flags that diverge + * from the existing physical page attributes. 
+ */ + + template |= pmap_get_cache_attributes(pa_index(pa), is_ept); - if (flags & VM_MEM_NOT_CACHEABLE) { + /* + * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs + */ + if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) { if (!(flags & VM_MEM_GUARDED)) template |= INTEL_PTE_PTA; template |= INTEL_PTE_NCACHE; } - if (pmap != kernel_pmap) + if (pmap != kernel_pmap && !is_ept) template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; + if (prot & VM_PROT_READ) + template |= PTE_READ(is_ept); + if (prot & VM_PROT_WRITE) { + template |= PTE_WRITE(is_ept); + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_MOD(is_ept); + if (IS_MANAGED_PAGE(pai)) + pmap_phys_attributes[pai] |= PHYS_MODIFIED; + } + } + if (prot & VM_PROT_EXECUTE) { + assert(set_NX == 0); + template = pte_set_ex(template, is_ept); + } + if (set_NX) - template |= INTEL_PTE_NX; + template = pte_remove_ex(template, is_ept); if (wired) { template |= INTEL_PTE_WIRED; OSAddAtomic(+1, & pmap->stats.wired_count); + pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } if (superpage) template |= INTEL_PTE_PS; + + /* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */ + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_REF(is_ept); + if (IS_MANAGED_PAGE(pai)) + pmap_phys_attributes[pai] |= PHYS_REFERENCED; + } + pmap_store_pte(pte, template); /* @@ -710,9 +1003,12 @@ Retry: UNLOCK_PVH(pai); } Done: - if (need_tlbflush == TRUE) - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - + if (need_tlbflush == TRUE) { + if (options & PMAP_OPTIONS_NOFLUSH) + PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg); + else + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + } if (pvh_e != PV_HASHED_ENTRY_NULL) { PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1); } @@ -725,15 +1021,17 @@ Done: vm_page_t m; vm_object_lock(delpage_pm_obj); - m = vm_page_lookup(delpage_pm_obj, delpage_pde_index); + m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE)); if (m == VM_PAGE_NULL) panic("pmap_enter: pte page not in object"); VM_PAGE_FREE(m); - OSAddAtomic(-1, &inuse_ptepages_count); vm_object_unlock(delpage_pm_obj); + OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(pmap, PAGE_SIZE); } PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); + return KERN_SUCCESS; } /* @@ -754,6 +1052,18 @@ pmap_remove_range( vm_map_offset_t start_vaddr, pt_entry_t *spte, pt_entry_t *epte) +{ + pmap_remove_range_options(pmap, start_vaddr, spte, epte, + PMAP_OPTIONS_REMOVE); +} + +void +pmap_remove_range_options( + pmap_t pmap, + vm_map_offset_t start_vaddr, + pt_entry_t *spte, + pt_entry_t *epte, + int options) { pt_entry_t *cpte; pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; @@ -761,27 +1071,22 @@ pmap_remove_range( pv_hashed_entry_t pvh_e; int pvh_cnt = 0; int num_removed, num_unwired, num_found, num_invalid; - int pai; + int num_device, num_external, num_internal, num_reusable; + uint64_t num_compressed; + ppnum_t pai; pmap_paddr_t pa; vm_map_offset_t vaddr; + boolean_t is_ept = is_ept_pmap(pmap); num_removed = 0; num_unwired = 0; num_found = 0; num_invalid = 0; -#if defined(__i386__) - if (pmap != kernel_pmap && - pmap->pm_task_map == TASK_MAP_32BIT && - start_vaddr >= HIGH_MEM_BASE) { - /* - * The range is in the "high_shared_pde" which is shared - * between the kernel and all 32-bit tasks. It holds - * the 32-bit commpage but also the trampolines, GDT, etc... - * so we can't let user tasks remove anything from it. 
- */ - return; - } -#endif + num_device = 0; + num_external = 0; + num_internal = 0; + num_reusable = 0; + num_compressed = 0; /* invalidate the PTEs first to "freeze" them */ for (cpte = spte, vaddr = start_vaddr; cpte < epte; @@ -789,8 +1094,18 @@ pmap_remove_range( pt_entry_t p = *cpte; pa = pte_to_pa(p); - if (pa == 0) + if (pa == 0) { + if (pmap != kernel_pmap && + (options & PMAP_OPTIONS_REMOVE) && + (p & PTE_COMPRESSED)) { + /* one less "compressed" */ + num_compressed++; + /* clear marker */ + /* XXX probably does not need to be atomic! */ + pmap_update_pte(cpte, PTE_COMPRESSED, 0); + } continue; + } num_found++; if (iswired(p)) @@ -804,14 +1119,15 @@ pmap_remove_range( * Just remove the mappings. */ pmap_store_pte(cpte, 0); + num_device++; continue; } - if ((p & INTEL_PTE_VALID) == 0) + if ((p & PTE_VALID_MASK(is_ept)) == 0) num_invalid++; - /* invalidate the PTE */ - pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID)); + /* invalidate the PTE */ + pmap_update_pte(cpte, PTE_VALID_MASK(is_ept), 0); } if (num_found == 0) { @@ -841,6 +1157,13 @@ pmap_remove_range( continue; } num_removed++; + if (IS_REUSABLE_PAGE(pai)) { + num_reusable++; + } else if (IS_INTERNAL_PAGE(pai)) { + num_internal++; + } else { + num_external++; + } /* * Get the modify and reference bits, then @@ -882,15 +1205,38 @@ update_counts: if (pmap->stats.resident_count < num_removed) panic("pmap_remove_range: resident_count"); #endif + pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed)); + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(num_removed)); assert(pmap->stats.resident_count >= num_removed); OSAddAtomic(-num_removed, &pmap->stats.resident_count); + if (pmap != kernel_pmap) { +#if 00 + assert(pmap->stats.device >= num_device); + if (num_device) + OSAddAtomic(-num_device, &pmap->stats.device); +#endif /* 00 */ + assert(pmap->stats.external >= num_external); + if (num_external) + OSAddAtomic(-num_external, &pmap->stats.external); + assert(pmap->stats.internal >= num_internal); + if (num_internal) + OSAddAtomic(-num_internal, &pmap->stats.internal); + assert(pmap->stats.reusable >= num_reusable); + if (num_reusable) + OSAddAtomic(-num_reusable, &pmap->stats.reusable); + assert(pmap->stats.compressed >= num_compressed); + if (num_compressed) + OSAddAtomic64(-num_compressed, &pmap->stats.compressed); + } + #if TESTING if (pmap->stats.wired_count < num_unwired) panic("pmap_remove_range: wired_count"); #endif assert(pmap->stats.wired_count >= num_unwired); OSAddAtomic(-num_unwired, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired)); return; } @@ -908,17 +1254,30 @@ pmap_remove( pmap_t map, addr64_t s64, addr64_t e64) +{ + pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE); +} + +void +pmap_remove_options( + pmap_t map, + addr64_t s64, + addr64_t e64, + int options) { pt_entry_t *pde; pt_entry_t *spte, *epte; addr64_t l64; uint64_t deadline; + boolean_t is_ept; pmap_intr_assert(); if (map == PMAP_NULL || s64 == e64) return; + is_ept = is_ept_pmap(map); + PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, map, (uint32_t) (s64 >> 32), s64, @@ -967,8 +1326,8 @@ pmap_remove( l64 = e64; pde = pmap_pde(map, s64); - if (pde && (*pde & INTEL_PTE_VALID)) { - if (*pde & INTEL_PTE_PS) { + if (pde && (*pde & PTE_VALID_MASK(is_ept))) { + if (*pde & PTE_PS) { /* * If we're removing a superpage, pmap_remove_range() * must work on level 2 instead of level 1; and we're @@ -982,12 +1341,21 @@ pmap_remove( spte = 
&spte[ptenum(s64)]; epte = &spte[intel_btop(l64 - s64)]; } - pmap_remove_range(map, s64, spte, epte); + pmap_remove_range_options(map, s64, spte, epte, + options); } s64 = l64; if (s64 < e64 && rdtsc64() >= deadline) { PMAP_UNLOCK(map) + /* TODO: Rapid release/reacquisition can defeat + * the "backoff" intent here; either consider a + * fair spinlock, or a scheme whereby each lock + * attempt marks the processor as within a spinlock + * acquisition, and scan CPUs here to determine + * if a backoff is necessary, to avoid sacrificing + * performance in the common case. + */ PMAP_LOCK(map) deadline = rdtsc64() + max_preemption_latency_tsc; } @@ -1000,17 +1368,27 @@ pmap_remove( } +void +pmap_page_protect( + ppnum_t pn, + vm_prot_t prot) +{ + pmap_page_protect_options(pn, prot, 0, NULL); +} + /* - * Routine: pmap_page_protect + * Routine: pmap_page_protect_options * * Function: * Lower the permission for all mappings to a given * page. */ void -pmap_page_protect( +pmap_page_protect_options( ppnum_t pn, - vm_prot_t prot) + vm_prot_t prot, + unsigned int options, + void *arg) { pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; @@ -1023,6 +1401,8 @@ pmap_page_protect( int pai; pmap_t pmap; boolean_t remove; + pt_entry_t new_pte_value; + boolean_t is_ept; pmap_intr_assert(); assert(pn != vm_page_fictitious_addr); @@ -1072,14 +1452,21 @@ pmap_page_protect( do { vm_map_offset_t vaddr; + if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) && + (pmap_phys_attributes[pai] & PHYS_MODIFIED)) { + /* page was modified, so it will be compressed */ + options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + options |= PMAP_OPTIONS_COMPRESSOR; + } + pmap = pv_e->pmap; + is_ept = is_ept_pmap(pmap); vaddr = pv_e->va; pte = pmap_pte(pmap, vaddr); -#if DEBUG - if (pa_index(pte_to_pa(*pte)) != pn) - panic("pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte); -#endif + pmap_assert2((pa_index(pte_to_pa(*pte)) == pn), + "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte); + if (0 == pte) { panic("pmap_page_protect() " "pmap=%p pn=0x%x vaddr=0x%llx\n", @@ -1089,24 +1476,98 @@ pmap_page_protect( /* * Remove the mapping if new protection is NONE - * or if write-protecting a kernel mapping. */ - if (remove || pmap == kernel_pmap) { - /* - * Remove the mapping, collecting dirty bits. - */ - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID); - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); - pmap_phys_attributes[pai] |= - *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - pmap_store_pte(pte, 0); + if (remove) { + + /* Remove per-pmap wired count */ + if (iswired(*pte)) { + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); + } + + if (pmap != kernel_pmap && + (options & PMAP_OPTIONS_COMPRESSOR) && + IS_INTERNAL_PAGE(pai)) { + /* mark this PTE as having been "reclaimed" */ + new_pte_value = PTE_COMPRESSED; + } else { + new_pte_value = 0; + } + + if (options & PMAP_OPTIONS_NOREFMOD) { + pmap_store_pte(pte, new_pte_value); + + if (options & PMAP_OPTIONS_NOFLUSH) + PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg); + else + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + } else { + /* + * Remove the mapping, collecting dirty bits. 
+ */ + pmap_update_pte(pte, PTE_VALID_MASK(is_ept), 0); + + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + if ((options & + PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) && + ! (pmap_phys_attributes[pai] & + PHYS_MODIFIED) && + (*pte & PHYS_MODIFIED)) { + /* + * Page is actually "modified" and + * will be compressed. Start + * accounting for it as "compressed". + */ + options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + options |= PMAP_OPTIONS_COMPRESSOR; + new_pte_value = PTE_COMPRESSED; + } + if (!is_ept) { + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + } else { + pmap_phys_attributes[pai] |= + ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED); + } + pmap_store_pte(pte, new_pte_value); + } + + if (new_pte_value == PTE_COMPRESSED) { + /* one more "compressed" page */ + OSAddAtomic64(+1, &pmap->stats.compressed); + PMAP_STATS_PEAK(pmap->stats.compressed); + pmap->stats.compressed_lifetime++; + } #if TESTING if (pmap->stats.resident_count < 1) panic("pmap_page_protect: resident_count"); #endif + pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); OSAddAtomic(-1, &pmap->stats.resident_count); + if (options & PMAP_OPTIONS_COMPRESSOR) { + /* + * This removal is only being done so we can send this page to + * the compressor; therefore it mustn't affect total task footprint. + */ + pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE); + } else { + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + } + + if (pmap != kernel_pmap) { + if (IS_REUSABLE_PAGE(pai)) { + assert(pmap->stats.reusable > 0); + OSAddAtomic(-1, &pmap->stats.reusable); + } else if (IS_INTERNAL_PAGE(pai)) { + assert(pmap->stats.internal > 0); + OSAddAtomic(-1, &pmap->stats.internal); + } else { + assert(pmap->stats.external > 0); + OSAddAtomic(-1, &pmap->stats.external); + } + } /* * Deal with the pv_rooted_entry. @@ -1117,8 +1578,6 @@ pmap_page_protect( * Fix up head later. */ pv_h->pmap = PMAP_NULL; - - pmap_phys_attributes[pai] &= ~PHYS_NOENCRYPT; } else { /* * Delete this entry. @@ -1133,10 +1592,21 @@ pmap_page_protect( } } else { /* - * Write-protect. 
+ * Write-protect, after opportunistic refmod collect */ - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE); - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + if (!is_ept) { + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + } else { + pmap_phys_attributes[pai] |= + ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED); + } + pmap_update_pte(pte, PTE_WRITE(is_ept), 0); + + if (options & PMAP_OPTIONS_NOFLUSH) + PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg); + else + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); } pvh_e = nexth; } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h); @@ -1170,180 +1640,476 @@ done: 0, 0, 0, 0, 0); } -__private_extern__ void -pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1,2)) { - if (pmap_pagetable_corruption_incidents > 0) { - int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG); - (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout); - for (i = 0; i < e; i++) { - (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime); - } - } -} +/* + * Clear specified attribute bits. + */ void -mapping_free_prime(void) +phys_attribute_clear( + ppnum_t pn, + int bits, + unsigned int options, + void *arg) { - int i; - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; + pv_rooted_entry_t pv_h; + pv_hashed_entry_t pv_e; + pt_entry_t *pte; + int pai; + pmap_t pmap; + char attributes = 0; + boolean_t is_internal, is_reusable, is_ept; + int ept_bits_to_clear; + boolean_t ept_keep_global_mod = FALSE; + + if ((bits & PHYS_MODIFIED) && + (options & PMAP_OPTIONS_NOFLUSH) && + arg == NULL) { + panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): " + "should not clear 'modified' without flushing TLBs\n", + pn, bits, options, arg); + } - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + /* We only support converting MOD and REF bits for EPT PTEs in this function */ + assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED)); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; + pmap_intr_assert(); + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) + return; + + pai = ppn_to_pai(pn); + + if (!IS_MANAGED_PAGE(pai)) { + /* + * Not a managed page. + */ + return; } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, + pn, bits, 0, 0, 0); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + pv_h = pai_to_pvh(pai); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; + LOCK_PVH(pai); + + + /* + * Walk down PV list, clearing all modify or reference bits. 
+ * We do not have to lock the pv_list because we have + * the per-pmap lock + */ + if (pv_h->pmap != PMAP_NULL) { + /* + * There are some mappings. + */ + + is_internal = IS_INTERNAL_PAGE(pai); + is_reusable = IS_REUSABLE_PAGE(pai); + + pv_e = (pv_hashed_entry_t)pv_h; + + do { + vm_map_offset_t va; + char pte_bits; + + pmap = pv_e->pmap; + is_ept = is_ept_pmap(pmap); + va = pv_e->va; + pte_bits = 0; + + if (bits) { + pte = pmap_pte(pmap, va); + /* grab ref/mod bits from this PTE */ + pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept))); + /* propagate to page's global attributes */ + if (!is_ept) { + attributes |= pte_bits; + } else { + attributes |= ept_refmod_to_physmap(pte_bits); + if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) { + ept_keep_global_mod = TRUE; + } + } + /* which bits to clear for this PTE? */ + if (!is_ept) { + pte_bits &= bits; + } else { + pte_bits &= ept_bits_to_clear; + } + } + + /* + * Clear modify and/or reference bits. + */ + if (pte_bits) { + pmap_update_pte(pte, bits, 0); + + /* Ensure all processors using this translation + * invalidate this TLB entry. The invalidation + * *must* follow the PTE update, to ensure that + * the TLB shadow of the 'D' bit (in particular) + * is synchronized with the updated PTE. + */ + if (! (options & PMAP_OPTIONS_NOFLUSH)) { + /* flush TLBS now */ + PMAP_UPDATE_TLBS(pmap, + va, + va + PAGE_SIZE); + } else if (arg) { + /* delayed TLB flush: add "pmap" info */ + PMAP_UPDATE_TLBS_DELAYED( + pmap, + va, + va + PAGE_SIZE, + (pmap_flush_context *)arg); + } else { + /* no TLB flushing at all */ + } + } + + /* update pmap "reusable" stats */ + if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && + is_reusable && + pmap != kernel_pmap) { + /* one less "reusable" */ + assert(pmap->stats.reusable > 0); + OSAddAtomic(-1, &pmap->stats.reusable); + if (is_internal) { + /* one more "internal" */ + OSAddAtomic(+1, &pmap->stats.internal); + PMAP_STATS_PEAK(pmap->stats.internal); + } else { + /* one more "external" */ + OSAddAtomic(+1, &pmap->stats.external); + PMAP_STATS_PEAK(pmap->stats.external); + } + } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && + !is_reusable && + pmap != kernel_pmap) { + /* one more "reusable" */ + OSAddAtomic(+1, &pmap->stats.reusable); + PMAP_STATS_PEAK(pmap->stats.reusable); + if (is_internal) { + /* one less "internal" */ + assert(pmap->stats.internal > 0); + OSAddAtomic(-1, &pmap->stats.internal); + } else { + /* one less "external" */ + assert(pmap->stats.external > 0); + OSAddAtomic(-1, &pmap->stats.external); + } + } + + pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); + + } while (pv_e != (pv_hashed_entry_t)pv_h); } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + /* Opportunistic refmod collection, annulled + * if both REF and MOD are being cleared. 
+ */ -} + pmap_phys_attributes[pai] |= attributes; -static inline void -pmap_pagetable_corruption_log_setup(void) { - if (pmap_pagetable_corruption_log_call == NULL) { - nanotime_to_absolutetime(PMAP_PAGETABLE_CORRUPTION_INTERVAL, 0, &pmap_pagetable_corruption_interval_abstime); - thread_call_setup(&pmap_pagetable_corruption_log_call_data, - (thread_call_func_t) pmap_pagetable_corruption_msg_log, - (thread_call_param_t) &printf); - pmap_pagetable_corruption_log_call = &pmap_pagetable_corruption_log_call_data; + if (ept_keep_global_mod) { + /* + * If the hardware doesn't support AD bits for EPT PTEs and someone is + * requesting that we clear the modified bit for a phys page, we need + * to ensure that there are no EPT mappings for the page with the + * modified bit set. If there are, we cannot clear the global modified bit. + */ + bits &= ~PHYS_MODIFIED; + } + pmap_phys_attributes[pai] &= ~(bits); + + /* update this page's "reusable" status */ + if (options & PMAP_OPTIONS_CLEAR_REUSABLE) { + pmap_phys_attributes[pai] &= ~PHYS_REUSABLE; + } else if (options & PMAP_OPTIONS_SET_REUSABLE) { + pmap_phys_attributes[pai] |= PHYS_REUSABLE; } + + UNLOCK_PVH(pai); + + PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END, + 0, 0, 0, 0, 0); } -void -mapping_adjust(void) +/* + * Check specified attribute bits. + */ +int +phys_attribute_test( + ppnum_t pn, + int bits) { - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - int i; - - if (mapping_adjust_call == NULL) { - thread_call_setup(&mapping_adjust_call_data, - (thread_call_func_t) mapping_adjust, - (thread_call_param_t) NULL); - mapping_adjust_call = &mapping_adjust_call_data; + pv_rooted_entry_t pv_h; + pv_hashed_entry_t pv_e; + pt_entry_t *pte; + int pai; + pmap_t pmap; + int attributes = 0; + boolean_t is_ept; + + pmap_intr_assert(); + assert(pn != vm_page_fictitious_addr); + assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0); + if (pn == vm_page_guard_addr) + return 0; + + pai = ppn_to_pai(pn); + + if (!IS_MANAGED_PAGE(pai)) { + /* + * Not a managed page. + */ + return 0; } - pmap_pagetable_corruption_log_setup(); + /* + * Fast check... if bits already collected + * no need to take any locks... + * if not set, we need to recheck after taking + * the lock in case they got pulled in while + * we were waiting for the lock + */ + if ((pmap_phys_attributes[pai] & bits) == bits) + return bits; - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + pv_h = pai_to_pvh(pai); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + LOCK_PVH(pai); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - } + attributes = pmap_phys_attributes[pai] & bits; - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + /* + * Walk down PV list, checking the mappings until we + * reach the end or we've found the desired attributes. + */ + if (attributes != bits && + pv_h->pmap != PMAP_NULL) { + /* + * There are some mappings. 
+ */ + pv_e = (pv_hashed_entry_t)pv_h; + do { + vm_map_offset_t va; - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + pmap = pv_e->pmap; + is_ept = is_ept_pmap(pmap); + va = pv_e->va; + /* + * pick up modify and/or reference bits from mapping + */ + + pte = pmap_pte(pmap, va); + if (!is_ept) { + attributes |= (int)(*pte & bits); + } else { + attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED)); + + } + + pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); + + } while ((attributes != bits) && + (pv_e != (pv_hashed_entry_t)pv_h)); } - mappingrecurse = 0; -} + pmap_phys_attributes[pai] |= attributes; + UNLOCK_PVH(pai); + return (attributes); +} -boolean_t -pmap_is_noencrypt(ppnum_t pn) +/* + * Routine: pmap_change_wiring + * Function: Change the wiring attribute for a map/virtual-address + * pair. + * In/out conditions: + * The mapping must already exist in the pmap. + */ +void +pmap_change_wiring( + pmap_t map, + vm_map_offset_t vaddr, + boolean_t wired) { - int pai; + pt_entry_t *pte; - pai = ppn_to_pai(pn); + PMAP_LOCK(map); - if (!IS_MANAGED_PAGE(pai)) - return (TRUE); + if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) + panic("pmap_change_wiring(%p,0x%llx,%d): pte missing", + map, vaddr, wired); - if (pmap_phys_attributes[pai] & PHYS_NOENCRYPT) - return (TRUE); + if (wired && !iswired(*pte)) { + /* + * wiring down mapping + */ + pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE); + OSAddAtomic(+1, &map->stats.wired_count); + pmap_update_pte(pte, 0, PTE_WIRED); + } + else if (!wired && iswired(*pte)) { + /* + * unwiring mapping + */ + assert(map->stats.wired_count >= 1); + OSAddAtomic(-1, &map->stats.wired_count); + pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE); + pmap_update_pte(pte, PTE_WIRED, 0); + } - return (FALSE); + PMAP_UNLOCK(map); } +/* + * "Backdoor" direct map routine for early mappings. 
+ * Useful for mapping memory outside the range + * Sets A, D and NC if requested + */ -void -pmap_set_noencrypt(ppnum_t pn) +vm_offset_t +pmap_map_bd( + vm_offset_t virt, + vm_map_offset_t start_addr, + vm_map_offset_t end_addr, + vm_prot_t prot, + unsigned int flags) { - int pai; - - pai = ppn_to_pai(pn); + pt_entry_t template; + pt_entry_t *pte; + spl_t spl; + vm_offset_t base = virt; + template = pa_to_pte(start_addr) + | INTEL_PTE_REF + | INTEL_PTE_MOD + | INTEL_PTE_WIRED + | INTEL_PTE_VALID; + + if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) { + template |= INTEL_PTE_NCACHE; + if (!(flags & (VM_MEM_GUARDED))) + template |= INTEL_PTE_PTA; + } - if (IS_MANAGED_PAGE(pai)) { - LOCK_PVH(pai); +#if defined(__x86_64__) + if ((prot & VM_PROT_EXECUTE) == 0) + template |= INTEL_PTE_NX; +#endif - pmap_phys_attributes[pai] |= PHYS_NOENCRYPT; + if (prot & VM_PROT_WRITE) + template |= INTEL_PTE_WRITE; - UNLOCK_PVH(pai); + while (start_addr < end_addr) { + spl = splhigh(); + pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt); + if (pte == PT_ENTRY_NULL) { + panic("pmap_map_bd: Invalid kernel address\n"); + } + pmap_store_pte(pte, template); + splx(spl); + pte_increment_pa(template); + virt += PAGE_SIZE; + start_addr += PAGE_SIZE; } + flush_tlb_raw(); + PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr); + return(virt); } - -void -pmap_clear_noencrypt(ppnum_t pn) +mach_vm_size_t +pmap_query_resident( + pmap_t pmap, + addr64_t s64, + addr64_t e64, + mach_vm_size_t *compressed_bytes_p) { - int pai; + pt_entry_t *pde; + pt_entry_t *spte, *epte; + addr64_t l64; + uint64_t deadline; + mach_vm_size_t resident_bytes; + mach_vm_size_t compressed_bytes; + boolean_t is_ept; - pai = ppn_to_pai(pn); + pmap_intr_assert(); - if (IS_MANAGED_PAGE(pai)) { - LOCK_PVH(pai); + if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) { + if (compressed_bytes_p) { + *compressed_bytes_p = 0; + } + return 0; + } - pmap_phys_attributes[pai] &= ~PHYS_NOENCRYPT; + is_ept = is_ept_pmap(pmap); - UNLOCK_PVH(pai); + PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START, + pmap, + (uint32_t) (s64 >> 32), s64, + (uint32_t) (e64 >> 32), e64); + + resident_bytes = 0; + compressed_bytes = 0; + + PMAP_LOCK(pmap); + + deadline = rdtsc64() + max_preemption_latency_tsc; + + while (s64 < e64) { + l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1); + if (l64 > e64) + l64 = e64; + pde = pmap_pde(pmap, s64); + + if (pde && (*pde & PTE_VALID_MASK(is_ept))) { + if (*pde & PTE_PS) { + /* superpage: not supported */ + } else { + spte = pmap_pte(pmap, + (s64 & ~(pde_mapped_size - 1))); + spte = &spte[ptenum(s64)]; + epte = &spte[intel_btop(l64 - s64)]; + + for (; spte < epte; spte++) { + if (pte_to_pa(*spte) != 0) { + resident_bytes += PAGE_SIZE; + } else if (*spte & PTE_COMPRESSED) { + compressed_bytes += PAGE_SIZE; + } + } + + } + } + s64 = l64; + + if (s64 < e64 && rdtsc64() >= deadline) { + PMAP_UNLOCK(pmap); + PMAP_LOCK(pmap); + deadline = rdtsc64() + max_preemption_latency_tsc; + } } -} -void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { - assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); + PMAP_UNLOCK(pmap); - if (dofilter) { - CPU_CR3_MARK_INACTIVE(); - } else { - CPU_CR3_MARK_ACTIVE(); - __asm__ volatile("mfence"); - if (current_cpu_datap()->cpu_tlb_invalid) - process_pmap_updates(); + PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END, + pmap, 0, 0, 0, 0); + + if (compressed_bytes_p) { + *compressed_bytes_p = compressed_bytes; } + 
 	return resident_bytes;
 }
+#if MACH_ASSERT
+void
+pmap_set_process(
+	__unused pmap_t pmap,
+	__unused int pid,
+	__unused char *procname)
+{
+}
+#endif /* MACH_ASSERT */
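Two patterns recur throughout this diff: every direct test or update of INTEL_PTE_* bits is replaced by macros parameterized on is_ept_pmap(), and PTE rewrites that must not lose the hardware-maintained referenced/modified bits are performed with a compare-and-exchange loop (the do { ... } while (!pmap_cmpx_pte(...)) in pmap_enter_options()). The sketch below shows the shape of both, using C11 atomics and invented SK_* names rather than the kernel's own definitions; the bit positions follow the usual IA-32e and EPT layouts but are illustrative only.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sk_pt_entry_t;

/* Legacy IA-32e PTE bits (illustrative). */
#define SK_L_VALID	(1ULL << 0)
#define SK_L_WRITE	(1ULL << 1)
#define SK_L_REF	(1ULL << 5)
#define SK_L_MOD	(1ULL << 6)

/* EPT PTE bits (illustrative). */
#define SK_E_READ	(1ULL << 0)
#define SK_E_WRITE	(1ULL << 1)
#define SK_E_EXEC	(1ULL << 2)
#define SK_E_REF	(1ULL << 8)
#define SK_E_MOD	(1ULL << 9)

/* EPT entries are "valid" if any permission bit is set; legacy entries
 * have a dedicated present bit.  This mirrors the role of
 * PTE_VALID_MASK(is_ept) and friends in the diff. */
#define SK_VALID_MASK(is_ept) \
	((is_ept) ? (SK_E_READ | SK_E_WRITE | SK_E_EXEC) : SK_L_VALID)
#define SK_WRITE(is_ept)	((is_ept) ? SK_E_WRITE : SK_L_WRITE)
#define SK_REF(is_ept)		((is_ept) ? SK_E_REF : SK_L_REF)
#define SK_MOD(is_ept)		((is_ept) ? SK_E_MOD : SK_L_MOD)

/*
 * Replace the software-controlled part of a live PTE while preserving
 * the referenced/modified bits, which the CPU may be setting at the
 * same time.  Retries until the swap succeeds against an unchanged
 * snapshot, like the "store modified PTE and preserve RC bits" loop.
 */
static void
sk_pte_update_preserving_refmod(_Atomic sk_pt_entry_t *pte,
    sk_pt_entry_t template, bool is_ept)
{
	sk_pt_entry_t opte, npte;

	opte = atomic_load(pte);
	do {
		npte = template | (opte & (SK_REF(is_ept) | SK_MOD(is_ept)));
	} while (!atomic_compare_exchange_weak(pte, &opte, npte));
	/* A real pmap would follow a permission downgrade with a TLB
	 * shootdown (PMAP_UPDATE_TLBS) before relying on the new bits. */
}

The snapshot-and-retry is needed because the CPU's page walker can set the accessed/dirty bits concurrently with the software update, so a plain read-modify-write could silently drop them.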