X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/7ddcb079202367355dddccdfa4318e57d50318be..3e170ce000f1506b7b5d2c5c7faec85ceabb573d:/osfmk/i386/pmap_x86_common.c

diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c
index 9061d73cf..9841a0754 100644
--- a/osfmk/i386/pmap_x86_common.c
+++ b/osfmk/i386/pmap_x86_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -25,8 +25,12 @@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
+
+#include
+
 #include
 #include
+#include
 #include
 
 void pmap_remove_range(
@@ -35,6 +39,22 @@ void pmap_remove_range(
 	pt_entry_t *spte,
 	pt_entry_t *epte);
 
+void pmap_remove_range_options(
+	pmap_t pmap,
+	vm_map_offset_t va,
+	pt_entry_t *spte,
+	pt_entry_t *epte,
+	int options);
+
+void pmap_reusable_range(
+	pmap_t pmap,
+	vm_map_offset_t va,
+	pt_entry_t *spte,
+	pt_entry_t *epte,
+	boolean_t reusable);
+
+uint32_t pmap_update_clear_pte_count;
+
 /*
  * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
  * on a NBPDE boundary.
@@ -74,6 +94,9 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 	unsigned int i;
 	uint64_t num_pde;
 
+	assert(!is_ept_pmap(grand));
+	assert(!is_ept_pmap(subord));
+
 	if ((size & (pmap_nesting_size_min-1)) ||
 	    (va_start & (pmap_nesting_size_min-1)) ||
 	    (nstart & (pmap_nesting_size_min-1)) ||
@@ -105,7 +128,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 		while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
 			PMAP_UNLOCK(subord);
-			pmap_expand_pdpt(subord, nvaddr);
+			pmap_expand_pdpt(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
 			PMAP_LOCK(subord);
 			npde = pmap64_pdpt(subord, nvaddr);
 		}
@@ -118,7 +141,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 		while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
 			PMAP_UNLOCK(subord);
-			pmap_expand(subord, nvaddr);
+			pmap_expand(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
 			PMAP_LOCK(subord);
 			npde = pmap_pde(subord, nvaddr);
 		}
@@ -144,7 +167,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 	pde = pmap64_pdpt(grand, vaddr);
 	if (0 == pde) {
 		PMAP_UNLOCK(grand);
-		pmap_expand_pml4(grand, vaddr);
+		pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
 		PMAP_LOCK(grand);
 		pde = pmap64_pdpt(grand, vaddr);
 	}
@@ -163,7 +186,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t
 	pde = pmap_pde(grand, vaddr);
 	if ((0 == pde) && cpu_64bit) {
 		PMAP_UNLOCK(grand);
-		pmap_expand_pdpt(grand, vaddr);
+		pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
 		PMAP_LOCK(grand);
 		pde = pmap_pde(grand, vaddr);
 	}
@@ -210,6 +233,8 @@ kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {
 			grand, vaddr, size);
 	}
 
+	assert(!is_ept_pmap(grand));
+
 	/* align everything to PDE boundaries */
 	va_start = vaddr & ~(NBPDE-1);
 	va_end = (vaddr + size + NBPDE - 1) & ~(NBPDE-1);
@@ -248,6 +273,15 @@ kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {
 	return KERN_SUCCESS;
 }
 
+kern_return_t
+pmap_unnest_options(
+	pmap_t grand,
+	addr64_t vaddr,
+	__unused uint64_t size,
+	__unused unsigned int options) {
+	return pmap_unnest(grand, vaddr, size);
+}
+
 /* Invoked by the Mach VM to determine the platform specific unnest region */
 
 boolean_t pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e) {
@@ -291,6 +325,9 @@
pmap_find_phys(pmap_t pmap, addr64_t va) ppnum_t ppn = 0; pd_entry_t pde; pt_entry_t pte; + boolean_t is_ept; + + is_ept = is_ept_pmap(pmap); mp_disable_preemption(); @@ -304,14 +341,14 @@ pmap_find_phys(pmap_t pmap, addr64_t va) pdep = pmap_pde(pmap, va); - if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & INTEL_PTE_VALID)) { - if (pde & INTEL_PTE_PS) { + if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) { + if (pde & PTE_PS) { ppn = (ppnum_t) i386_btop(pte_to_pa(pde)); ppn += (ppnum_t) ptenum(va); } else { ptp = pmap_pte(pmap, va); - if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & INTEL_PTE_VALID) != 0)) { + if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) { ppn = (ppnum_t) i386_btop(pte_to_pa(pte)); } } @@ -325,9 +362,13 @@ pfp_exit: /* * Update cache attributes for all extant managed mappings. * Assumes PV for this page is locked, and that the page - * is managed. + * is managed. We assume that this physical page may be mapped in + * both EPT and normal Intel PTEs, so we convert the attributes + * to the corresponding format for each pmap. + * + * We assert that the passed set of attributes is a subset of the + * PHYS_CACHEABILITY_MASK. */ - void pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { pv_rooted_entry_t pv_h, pv_e; @@ -335,8 +376,17 @@ pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { vm_map_offset_t vaddr; pmap_t pmap; pt_entry_t *ptep; + boolean_t is_ept; + unsigned ept_attributes; assert(IS_MANAGED_PAGE(pn)); + assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0); + + /* We don't support the PTA bit for EPT PTEs */ + if (attributes & INTEL_PTE_NCACHE) + ept_attributes = INTEL_EPT_NCACHE; + else + ept_attributes = INTEL_EPT_WB; pv_h = pai_to_pvh(pn); /* TODO: translate the PHYS_* bits to PTE bits, while they're @@ -357,12 +407,18 @@ pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { pmap = pv_e->pmap; vaddr = pv_e->va; ptep = pmap_pte(pmap, vaddr); - + if (0 == ptep) panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap); + is_ept = is_ept_pmap(pmap); + nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); - pmap_update_pte(ptep, *ptep, (*ptep & ~PHYS_CACHEABILITY_MASK) | attributes); + if (!is_ept) { + pmap_update_pte(ptep, PHYS_CACHEABILITY_MASK, attributes); + } else { + pmap_update_pte(ptep, INTEL_EPT_CACHE_MASK, ept_attributes); + } PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); pvh_e = nexth; } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h); @@ -376,7 +432,7 @@ void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { CPU_CR3_MARK_INACTIVE(); } else { CPU_CR3_MARK_ACTIVE(); - __asm__ volatile("mfence"); + mfence(); if (current_cpu_datap()->cpu_tlb_invalid) process_pmap_updates(); } @@ -395,18 +451,36 @@ void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { * or lose information. That is, this routine must actually * insert this page into the given map NOW. 
*/ + void pmap_enter( register pmap_t pmap, vm_map_offset_t vaddr, ppnum_t pn, vm_prot_t prot, + vm_prot_t fault_type, unsigned int flags, boolean_t wired) +{ + (void) pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL); +} + + +kern_return_t +pmap_enter_options( + register pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t pn, + vm_prot_t prot, + __unused vm_prot_t fault_type, + unsigned int flags, + boolean_t wired, + unsigned int options, + void *arg) { pt_entry_t *pte; pv_rooted_entry_t pv_h; - int pai; + ppnum_t pai; pv_hashed_entry_t pvh_e; pv_hashed_entry_t pvh_new; pt_entry_t template; @@ -419,27 +493,40 @@ pmap_enter( /* 2MiB mappings are confined to x86_64 by VM */ boolean_t superpage = flags & VM_MEM_SUPERPAGE; vm_object_t delpage_pm_obj = NULL; - int delpage_pde_index = 0; + uint64_t delpage_pde_index = 0; pt_entry_t old_pte; + kern_return_t kr_expand; + boolean_t is_ept; pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); if (pmap == PMAP_NULL) - return; + return KERN_INVALID_ARGUMENT; + + is_ept = is_ept_pmap(pmap); + + /* N.B. We can be supplied a zero page frame in the NOENTER case, it's an + * unused value for that scenario. + */ + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) - return; + return KERN_INVALID_ARGUMENT; PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, - pmap, - (uint32_t) (vaddr >> 32), (uint32_t) vaddr, - pn, prot); + pmap, + (uint32_t) (vaddr >> 32), (uint32_t) vaddr, + pn, prot); if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) set_NX = FALSE; else set_NX = TRUE; + if (__improbable(set_NX && (pmap == kernel_pmap) && ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) || (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) { + set_NX = FALSE; + } + /* * Must allocate a new pvlist entry while we're unlocked; * zalloc may cause pageout (which will lock the pmap system). @@ -463,7 +550,9 @@ Retry: while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) { /* need room for another pde entry */ PMAP_UNLOCK(pmap); - pmap_expand_pdpt(pmap, vaddr); + kr_expand = pmap_expand_pdpt(pmap, vaddr, options); + if (kr_expand != KERN_SUCCESS) + return kr_expand; PMAP_LOCK(pmap); } } else { @@ -473,19 +562,25 @@ Retry: * going to grow pde level page(s) */ PMAP_UNLOCK(pmap); - pmap_expand(pmap, vaddr); + kr_expand = pmap_expand(pmap, vaddr, options); + if (kr_expand != KERN_SUCCESS) + return kr_expand; PMAP_LOCK(pmap); } } + if (options & PMAP_EXPAND_OPTIONS_NOENTER) { + PMAP_UNLOCK(pmap); + return KERN_SUCCESS; + } - if (superpage && *pte && !(*pte & INTEL_PTE_PS)) { + if (superpage && *pte && !(*pte & PTE_PS)) { /* * There is still an empty page table mapped that * was used for a previous base page mapping. * Remember the PDE and the PDE index, so that we * can free the page at the end of this function. */ - delpage_pde_index = (int)pdeidx(pmap, vaddr); + delpage_pde_index = pdeidx(pmap, vaddr); delpage_pm_obj = pmap->pm_obj; *pte = 0; } @@ -494,6 +589,13 @@ Retry: pai = pa_index(old_pa); old_pa_locked = FALSE; + if (old_pa == 0 && + (*pte & PTE_COMPRESSED)) { + /* one less "compressed" */ + OSAddAtomic64(-1, &pmap->stats.compressed); + /* marker will be cleared below */ + } + /* * if we have a previous managed page, lock the pv entry now. 
after * we lock it, check to see if someone beat us to the lock and if so @@ -515,50 +617,101 @@ Retry: */ if (old_pa == pa) { pt_entry_t old_attributes = - *pte & ~(INTEL_PTE_REF | INTEL_PTE_MOD); + *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept)); /* * May be changing its wired attribute or protection */ - template = pa_to_pte(pa) | INTEL_PTE_VALID; - template |= pmap_get_cache_attributes(pa_index(pa)); + template = pa_to_pte(pa); - if (VM_MEM_NOT_CACHEABLE == - (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) { + /* ?: WORTH ASSERTING THAT AT LEAST ONE RWX (implicit valid) PASSED FOR EPT? */ + if (!is_ept) { + template |= INTEL_PTE_VALID; + } else { + template |= INTEL_EPT_IPTA; + } + + template |= pmap_get_cache_attributes(pa_index(pa), is_ept); + + /* + * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs + */ + if (!is_ept && (VM_MEM_NOT_CACHEABLE == + (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) { if (!(flags & VM_MEM_GUARDED)) template |= INTEL_PTE_PTA; template |= INTEL_PTE_NCACHE; } - if (pmap != kernel_pmap) + if (pmap != kernel_pmap && !is_ept) template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; + + if (prot & VM_PROT_READ) + template |= PTE_READ(is_ept); + + if (prot & VM_PROT_WRITE) { + template |= PTE_WRITE(is_ept); + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_MOD(is_ept); + if (old_pa_locked) { + assert(IS_MANAGED_PAGE(pai)); + pmap_phys_attributes[pai] |= PHYS_MODIFIED; + } + } + } + if (prot & VM_PROT_EXECUTE) { + assert(set_NX == 0); + template = pte_set_ex(template, is_ept); + } if (set_NX) - template |= INTEL_PTE_NX; + template = pte_remove_ex(template, is_ept); if (wired) { - template |= INTEL_PTE_WIRED; - if (!iswired(old_attributes)) - OSAddAtomic(+1, - &pmap->stats.wired_count); + template |= PTE_WIRED; + if (!iswired(old_attributes)) { + OSAddAtomic(+1, &pmap->stats.wired_count); + pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); + } } else { if (iswired(old_attributes)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } } + if (superpage) /* this path can not be used */ - template |= INTEL_PTE_PS; /* to change the page size! */ + template |= PTE_PS; /* to change the page size! 
*/ + + if (old_attributes == template) + goto dont_update_pte; + /* Determine delta, PV locked */ need_tlbflush = - ((old_attributes ^ template) != INTEL_PTE_WIRED); + ((old_attributes ^ template) != PTE_WIRED); + + if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) { + if ((old_attributes ^ template) == PTE_WRITE(is_ept)) + need_tlbflush = FALSE; + } + + /* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */ + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_REF(is_ept); + if (old_pa_locked) { + assert(IS_MANAGED_PAGE(pai)); + pmap_phys_attributes[pai] |= PHYS_REFERENCED; + } + } /* store modified PTE and preserve RC bits */ - pmap_update_pte(pte, *pte, - template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD))); + pt_entry_t npte, opte;; + do { + opte = *pte; + npte = template | (opte & (PTE_REF(is_ept) | PTE_MOD(is_ept))); + } while (!pmap_cmpx_pte(pte, opte, npte)); +dont_update_pte: if (old_pa_locked) { UNLOCK_PVH(pai); old_pa_locked = FALSE; @@ -588,27 +741,45 @@ Retry: */ /* invalidate the PTE */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); + pmap_update_pte(pte, PTE_VALID_MASK(is_ept), 0); /* propagate invalidate everywhere */ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); /* remember reference and change */ old_pte = *pte; - oattr = (char) (old_pte & (PHYS_MODIFIED | PHYS_REFERENCED)); + oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept))); /* completely invalidate the PTE */ pmap_store_pte(pte, 0); if (IS_MANAGED_PAGE(pai)) { pmap_assert(old_pa_locked == TRUE); + pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, - &pmap->stats.resident_count); - + OSAddAtomic(-1, &pmap->stats.resident_count); + if (pmap != kernel_pmap) { + if (IS_REUSABLE_PAGE(pai)) { + assert(pmap->stats.reusable > 0); + OSAddAtomic(-1, &pmap->stats.reusable); + } else if (IS_INTERNAL_PAGE(pai)) { + assert(pmap->stats.internal > 0); + OSAddAtomic(-1, &pmap->stats.internal); + } else { + assert(pmap->stats.external > 0); + OSAddAtomic(-1, &pmap->stats.external); + } + } if (iswired(*pte)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, + PAGE_SIZE); + } + + if (!is_ept) { + pmap_phys_attributes[pai] |= oattr; + } else { + pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr); } - pmap_phys_attributes[pai] |= oattr; /* * Remove the mapping from the pvlist for @@ -625,10 +796,16 @@ Retry: * Do removal part of accounting. */ + if (pmap != kernel_pmap) { +#if 00 + assert(pmap->stats.device > 0); + OSAddAtomic(-1, &pmap->stats.device); +#endif + } if (iswired(*pte)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } } } @@ -659,6 +836,17 @@ Retry: pv_h->va = vaddr; pv_h->pmap = pmap; queue_init(&pv_h->qlink); + + if (options & PMAP_OPTIONS_INTERNAL) { + pmap_phys_attributes[pai] |= PHYS_INTERNAL; + } else { + pmap_phys_attributes[pai] &= ~PHYS_INTERNAL; + } + if (options & PMAP_OPTIONS_REUSABLE) { + pmap_phys_attributes[pai] |= PHYS_REUSABLE; + } else { + pmap_phys_attributes[pai] &= ~PHYS_REUSABLE; + } } else { /* * Add new pv_hashed_entry after header. 
@@ -708,15 +896,37 @@ Retry: * only count the mapping * for 'managed memory' */ + pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); + pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); if (pmap->stats.resident_count > pmap->stats.resident_max) { pmap->stats.resident_max = pmap->stats.resident_count; } + if (pmap != kernel_pmap) { + if (IS_REUSABLE_PAGE(pai)) { + OSAddAtomic(+1, &pmap->stats.reusable); + PMAP_STATS_PEAK(pmap->stats.reusable); + } else if (IS_INTERNAL_PAGE(pai)) { + OSAddAtomic(+1, &pmap->stats.internal); + PMAP_STATS_PEAK(pmap->stats.internal); + } else { + OSAddAtomic(+1, &pmap->stats.external); + PMAP_STATS_PEAK(pmap->stats.external); + } + } } else if (last_managed_page == 0) { /* Account for early mappings created before "managed pages" * are determined. Consider consulting the available DRAM map. */ + pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); + pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); + if (pmap != kernel_pmap) { +#if 00 + OSAddAtomic(+1, &pmap->stats.device); + PMAP_STATS_PEAK(pmap->stats.device); +#endif + } } /* * Step 3) Enter the mapping. @@ -724,31 +934,64 @@ Retry: * Build a template to speed up entering - * only the pfn changes. */ - template = pa_to_pte(pa) | INTEL_PTE_VALID; + template = pa_to_pte(pa); + + if (!is_ept) { + template |= INTEL_PTE_VALID; + } else { + template |= INTEL_EPT_IPTA; + } + + /* * DRK: It may be worth asserting on cache attribute flags that diverge * from the existing physical page attributes. */ - template |= pmap_get_cache_attributes(pa_index(pa)); - - if (flags & VM_MEM_NOT_CACHEABLE) { + template |= pmap_get_cache_attributes(pa_index(pa), is_ept); + + /* + * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs + */ + if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) { if (!(flags & VM_MEM_GUARDED)) template |= INTEL_PTE_PTA; template |= INTEL_PTE_NCACHE; } - if (pmap != kernel_pmap) + if (pmap != kernel_pmap && !is_ept) template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; + if (prot & VM_PROT_READ) + template |= PTE_READ(is_ept); + if (prot & VM_PROT_WRITE) { + template |= PTE_WRITE(is_ept); + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_MOD(is_ept); + if (IS_MANAGED_PAGE(pai)) + pmap_phys_attributes[pai] |= PHYS_MODIFIED; + } + } + if (prot & VM_PROT_EXECUTE) { + assert(set_NX == 0); + template = pte_set_ex(template, is_ept); + } + if (set_NX) - template |= INTEL_PTE_NX; + template = pte_remove_ex(template, is_ept); if (wired) { template |= INTEL_PTE_WIRED; OSAddAtomic(+1, & pmap->stats.wired_count); + pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } if (superpage) template |= INTEL_PTE_PS; + + /* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */ + if (is_ept && !pmap_ept_support_ad) { + template |= PTE_REF(is_ept); + if (IS_MANAGED_PAGE(pai)) + pmap_phys_attributes[pai] |= PHYS_REFERENCED; + } + pmap_store_pte(pte, template); /* @@ -760,9 +1003,12 @@ Retry: UNLOCK_PVH(pai); } Done: - if (need_tlbflush == TRUE) - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - + if (need_tlbflush == TRUE) { + if (options & PMAP_OPTIONS_NOFLUSH) + PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg); + else + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + } if (pvh_e != PV_HASHED_ENTRY_NULL) { PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1); } @@ -775,16 +1021,17 
@@ Done: vm_page_t m; vm_object_lock(delpage_pm_obj); - m = vm_page_lookup(delpage_pm_obj, delpage_pde_index); + m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE)); if (m == VM_PAGE_NULL) panic("pmap_enter: pte page not in object"); - vm_object_unlock(delpage_pm_obj); VM_PAGE_FREE(m); + vm_object_unlock(delpage_pm_obj); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); + PMAP_ZINFO_PFREE(pmap, PAGE_SIZE); } PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); + return KERN_SUCCESS; } /* @@ -805,6 +1052,18 @@ pmap_remove_range( vm_map_offset_t start_vaddr, pt_entry_t *spte, pt_entry_t *epte) +{ + pmap_remove_range_options(pmap, start_vaddr, spte, epte, + PMAP_OPTIONS_REMOVE); +} + +void +pmap_remove_range_options( + pmap_t pmap, + vm_map_offset_t start_vaddr, + pt_entry_t *spte, + pt_entry_t *epte, + int options) { pt_entry_t *cpte; pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; @@ -812,27 +1071,22 @@ pmap_remove_range( pv_hashed_entry_t pvh_e; int pvh_cnt = 0; int num_removed, num_unwired, num_found, num_invalid; - int pai; + int num_device, num_external, num_internal, num_reusable; + uint64_t num_compressed; + ppnum_t pai; pmap_paddr_t pa; vm_map_offset_t vaddr; + boolean_t is_ept = is_ept_pmap(pmap); num_removed = 0; num_unwired = 0; num_found = 0; num_invalid = 0; -#if defined(__i386__) - if (pmap != kernel_pmap && - pmap->pm_task_map == TASK_MAP_32BIT && - start_vaddr >= HIGH_MEM_BASE) { - /* - * The range is in the "high_shared_pde" which is shared - * between the kernel and all 32-bit tasks. It holds - * the 32-bit commpage but also the trampolines, GDT, etc... - * so we can't let user tasks remove anything from it. - */ - return; - } -#endif + num_device = 0; + num_external = 0; + num_internal = 0; + num_reusable = 0; + num_compressed = 0; /* invalidate the PTEs first to "freeze" them */ for (cpte = spte, vaddr = start_vaddr; cpte < epte; @@ -840,8 +1094,18 @@ pmap_remove_range( pt_entry_t p = *cpte; pa = pte_to_pa(p); - if (pa == 0) + if (pa == 0) { + if (pmap != kernel_pmap && + (options & PMAP_OPTIONS_REMOVE) && + (p & PTE_COMPRESSED)) { + /* one less "compressed" */ + num_compressed++; + /* clear marker */ + /* XXX probably does not need to be atomic! */ + pmap_update_pte(cpte, PTE_COMPRESSED, 0); + } continue; + } num_found++; if (iswired(p)) @@ -855,14 +1119,15 @@ pmap_remove_range( * Just remove the mappings. 
*/ pmap_store_pte(cpte, 0); + num_device++; continue; } - if ((p & INTEL_PTE_VALID) == 0) + if ((p & PTE_VALID_MASK(is_ept)) == 0) num_invalid++; - /* invalidate the PTE */ - pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID)); + /* invalidate the PTE */ + pmap_update_pte(cpte, PTE_VALID_MASK(is_ept), 0); } if (num_found == 0) { @@ -892,6 +1157,13 @@ pmap_remove_range( continue; } num_removed++; + if (IS_REUSABLE_PAGE(pai)) { + num_reusable++; + } else if (IS_INTERNAL_PAGE(pai)) { + num_internal++; + } else { + num_external++; + } /* * Get the modify and reference bits, then @@ -933,15 +1205,38 @@ update_counts: if (pmap->stats.resident_count < num_removed) panic("pmap_remove_range: resident_count"); #endif + pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed)); + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(num_removed)); assert(pmap->stats.resident_count >= num_removed); OSAddAtomic(-num_removed, &pmap->stats.resident_count); + if (pmap != kernel_pmap) { +#if 00 + assert(pmap->stats.device >= num_device); + if (num_device) + OSAddAtomic(-num_device, &pmap->stats.device); +#endif /* 00 */ + assert(pmap->stats.external >= num_external); + if (num_external) + OSAddAtomic(-num_external, &pmap->stats.external); + assert(pmap->stats.internal >= num_internal); + if (num_internal) + OSAddAtomic(-num_internal, &pmap->stats.internal); + assert(pmap->stats.reusable >= num_reusable); + if (num_reusable) + OSAddAtomic(-num_reusable, &pmap->stats.reusable); + assert(pmap->stats.compressed >= num_compressed); + if (num_compressed) + OSAddAtomic64(-num_compressed, &pmap->stats.compressed); + } + #if TESTING if (pmap->stats.wired_count < num_unwired) panic("pmap_remove_range: wired_count"); #endif assert(pmap->stats.wired_count >= num_unwired); OSAddAtomic(-num_unwired, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired)); return; } @@ -959,17 +1254,30 @@ pmap_remove( pmap_t map, addr64_t s64, addr64_t e64) +{ + pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE); +} + +void +pmap_remove_options( + pmap_t map, + addr64_t s64, + addr64_t e64, + int options) { pt_entry_t *pde; pt_entry_t *spte, *epte; addr64_t l64; uint64_t deadline; + boolean_t is_ept; pmap_intr_assert(); if (map == PMAP_NULL || s64 == e64) return; + is_ept = is_ept_pmap(map); + PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, map, (uint32_t) (s64 >> 32), s64, @@ -1018,8 +1326,8 @@ pmap_remove( l64 = e64; pde = pmap_pde(map, s64); - if (pde && (*pde & INTEL_PTE_VALID)) { - if (*pde & INTEL_PTE_PS) { + if (pde && (*pde & PTE_VALID_MASK(is_ept))) { + if (*pde & PTE_PS) { /* * If we're removing a superpage, pmap_remove_range() * must work on level 2 instead of level 1; and we're @@ -1033,12 +1341,21 @@ pmap_remove( spte = &spte[ptenum(s64)]; epte = &spte[intel_btop(l64 - s64)]; } - pmap_remove_range(map, s64, spte, epte); + pmap_remove_range_options(map, s64, spte, epte, + options); } s64 = l64; if (s64 < e64 && rdtsc64() >= deadline) { PMAP_UNLOCK(map) + /* TODO: Rapid release/reacquisition can defeat + * the "backoff" intent here; either consider a + * fair spinlock, or a scheme whereby each lock + * attempt marks the processor as within a spinlock + * acquisition, and scan CPUs here to determine + * if a backoff is necessary, to avoid sacrificing + * performance in the common case. 
+ */ PMAP_LOCK(map) deadline = rdtsc64() + max_preemption_latency_tsc; } @@ -1051,17 +1368,27 @@ pmap_remove( } +void +pmap_page_protect( + ppnum_t pn, + vm_prot_t prot) +{ + pmap_page_protect_options(pn, prot, 0, NULL); +} + /* - * Routine: pmap_page_protect + * Routine: pmap_page_protect_options * * Function: * Lower the permission for all mappings to a given * page. */ void -pmap_page_protect( +pmap_page_protect_options( ppnum_t pn, - vm_prot_t prot) + vm_prot_t prot, + unsigned int options, + void *arg) { pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; @@ -1074,6 +1401,8 @@ pmap_page_protect( int pai; pmap_t pmap; boolean_t remove; + pt_entry_t new_pte_value; + boolean_t is_ept; pmap_intr_assert(); assert(pn != vm_page_fictitious_addr); @@ -1123,7 +1452,15 @@ pmap_page_protect( do { vm_map_offset_t vaddr; + if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) && + (pmap_phys_attributes[pai] & PHYS_MODIFIED)) { + /* page was modified, so it will be compressed */ + options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + options |= PMAP_OPTIONS_COMPRESSOR; + } + pmap = pv_e->pmap; + is_ept = is_ept_pmap(pmap); vaddr = pv_e->va; pte = pmap_pte(pmap, vaddr); @@ -1141,27 +1478,96 @@ pmap_page_protect( * Remove the mapping if new protection is NONE */ if (remove) { - /* - * Remove the mapping, collecting dirty bits. - */ - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID); /* Remove per-pmap wired count */ if (iswired(*pte)) { OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); + } + + if (pmap != kernel_pmap && + (options & PMAP_OPTIONS_COMPRESSOR) && + IS_INTERNAL_PAGE(pai)) { + /* mark this PTE as having been "reclaimed" */ + new_pte_value = PTE_COMPRESSED; + } else { + new_pte_value = 0; } - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); - pmap_phys_attributes[pai] |= - *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - pmap_store_pte(pte, 0); + if (options & PMAP_OPTIONS_NOREFMOD) { + pmap_store_pte(pte, new_pte_value); + + if (options & PMAP_OPTIONS_NOFLUSH) + PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg); + else + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + } else { + /* + * Remove the mapping, collecting dirty bits. + */ + pmap_update_pte(pte, PTE_VALID_MASK(is_ept), 0); + + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + if ((options & + PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) && + ! (pmap_phys_attributes[pai] & + PHYS_MODIFIED) && + (*pte & PHYS_MODIFIED)) { + /* + * Page is actually "modified" and + * will be compressed. Start + * accounting for it as "compressed". 
+ */ + options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + options |= PMAP_OPTIONS_COMPRESSOR; + new_pte_value = PTE_COMPRESSED; + } + if (!is_ept) { + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + } else { + pmap_phys_attributes[pai] |= + ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED); + } + pmap_store_pte(pte, new_pte_value); + } + + if (new_pte_value == PTE_COMPRESSED) { + /* one more "compressed" page */ + OSAddAtomic64(+1, &pmap->stats.compressed); + PMAP_STATS_PEAK(pmap->stats.compressed); + pmap->stats.compressed_lifetime++; + } #if TESTING if (pmap->stats.resident_count < 1) panic("pmap_page_protect: resident_count"); #endif + pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); OSAddAtomic(-1, &pmap->stats.resident_count); + if (options & PMAP_OPTIONS_COMPRESSOR) { + /* + * This removal is only being done so we can send this page to + * the compressor; therefore it mustn't affect total task footprint. + */ + pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE); + } else { + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + } + + if (pmap != kernel_pmap) { + if (IS_REUSABLE_PAGE(pai)) { + assert(pmap->stats.reusable > 0); + OSAddAtomic(-1, &pmap->stats.reusable); + } else if (IS_INTERNAL_PAGE(pai)) { + assert(pmap->stats.internal > 0); + OSAddAtomic(-1, &pmap->stats.internal); + } else { + assert(pmap->stats.external > 0); + OSAddAtomic(-1, &pmap->stats.external); + } + } /* * Deal with the pv_rooted_entry. @@ -1188,11 +1594,19 @@ pmap_page_protect( /* * Write-protect, after opportunistic refmod collect */ - pmap_phys_attributes[pai] |= - *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + if (!is_ept) { + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + } else { + pmap_phys_attributes[pai] |= + ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED); + } + pmap_update_pte(pte, PTE_WRITE(is_ept), 0); - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE); - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + if (options & PMAP_OPTIONS_NOFLUSH) + PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg); + else + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); } pvh_e = nexth; } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h); @@ -1226,13 +1640,16 @@ done: 0, 0, 0, 0, 0); } + /* * Clear specified attribute bits. */ void phys_attribute_clear( ppnum_t pn, - int bits) + int bits, + unsigned int options, + void *arg) { pv_rooted_entry_t pv_h; pv_hashed_entry_t pv_e; @@ -1240,7 +1657,23 @@ phys_attribute_clear( int pai; pmap_t pmap; char attributes = 0; - + boolean_t is_internal, is_reusable, is_ept; + int ept_bits_to_clear; + boolean_t ept_keep_global_mod = FALSE; + + if ((bits & PHYS_MODIFIED) && + (options & PMAP_OPTIONS_NOFLUSH) && + arg == NULL) { + panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): " + "should not clear 'modified' without flushing TLBs\n", + pn, bits, options, arg); + } + + /* We only support converting MOD and REF bits for EPT PTEs in this function */ + assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0); + + ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED)); + pmap_intr_assert(); assert(pn != vm_page_fictitious_addr); if (pn == vm_page_guard_addr) @@ -1262,38 +1695,113 @@ phys_attribute_clear( LOCK_PVH(pai); + /* * Walk down PV list, clearing all modify or reference bits. 
* We do not have to lock the pv_list because we have - * the entire pmap system locked. + * the per-pmap lock */ if (pv_h->pmap != PMAP_NULL) { /* * There are some mappings. */ + is_internal = IS_INTERNAL_PAGE(pai); + is_reusable = IS_REUSABLE_PAGE(pai); + pv_e = (pv_hashed_entry_t)pv_h; do { vm_map_offset_t va; + char pte_bits; pmap = pv_e->pmap; + is_ept = is_ept_pmap(pmap); va = pv_e->va; + pte_bits = 0; + + if (bits) { + pte = pmap_pte(pmap, va); + /* grab ref/mod bits from this PTE */ + pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept))); + /* propagate to page's global attributes */ + if (!is_ept) { + attributes |= pte_bits; + } else { + attributes |= ept_refmod_to_physmap(pte_bits); + if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) { + ept_keep_global_mod = TRUE; + } + } + /* which bits to clear for this PTE? */ + if (!is_ept) { + pte_bits &= bits; + } else { + pte_bits &= ept_bits_to_clear; + } + } /* * Clear modify and/or reference bits. */ - pte = pmap_pte(pmap, va); - attributes |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - - pmap_update_pte(pte, *pte, (*pte & ~bits)); - /* Ensure all processors using this translation - * invalidate this TLB entry. The invalidation *must* - * follow the PTE update, to ensure that the TLB - * shadow of the 'D' bit (in particular) is - * synchronized with the updated PTE. - */ - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); + if (pte_bits) { + pmap_update_pte(pte, bits, 0); + + /* Ensure all processors using this translation + * invalidate this TLB entry. The invalidation + * *must* follow the PTE update, to ensure that + * the TLB shadow of the 'D' bit (in particular) + * is synchronized with the updated PTE. + */ + if (! (options & PMAP_OPTIONS_NOFLUSH)) { + /* flush TLBS now */ + PMAP_UPDATE_TLBS(pmap, + va, + va + PAGE_SIZE); + } else if (arg) { + /* delayed TLB flush: add "pmap" info */ + PMAP_UPDATE_TLBS_DELAYED( + pmap, + va, + va + PAGE_SIZE, + (pmap_flush_context *)arg); + } else { + /* no TLB flushing at all */ + } + } + + /* update pmap "reusable" stats */ + if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && + is_reusable && + pmap != kernel_pmap) { + /* one less "reusable" */ + assert(pmap->stats.reusable > 0); + OSAddAtomic(-1, &pmap->stats.reusable); + if (is_internal) { + /* one more "internal" */ + OSAddAtomic(+1, &pmap->stats.internal); + PMAP_STATS_PEAK(pmap->stats.internal); + } else { + /* one more "external" */ + OSAddAtomic(+1, &pmap->stats.external); + PMAP_STATS_PEAK(pmap->stats.external); + } + } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && + !is_reusable && + pmap != kernel_pmap) { + /* one more "reusable" */ + OSAddAtomic(+1, &pmap->stats.reusable); + PMAP_STATS_PEAK(pmap->stats.reusable); + if (is_internal) { + /* one less "internal" */ + assert(pmap->stats.internal > 0); + OSAddAtomic(-1, &pmap->stats.internal); + } else { + /* one less "external" */ + assert(pmap->stats.external > 0); + OSAddAtomic(-1, &pmap->stats.external); + } + } pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); @@ -1304,7 +1812,24 @@ phys_attribute_clear( */ pmap_phys_attributes[pai] |= attributes; - pmap_phys_attributes[pai] &= (~bits); + + if (ept_keep_global_mod) { + /* + * If the hardware doesn't support AD bits for EPT PTEs and someone is + * requesting that we clear the modified bit for a phys page, we need + * to ensure that there are no EPT mappings for the page with the + * modified bit set. If there are, we cannot clear the global modified bit. 
+ */ + bits &= ~PHYS_MODIFIED; + } + pmap_phys_attributes[pai] &= ~(bits); + + /* update this page's "reusable" status */ + if (options & PMAP_OPTIONS_CLEAR_REUSABLE) { + pmap_phys_attributes[pai] &= ~PHYS_REUSABLE; + } else if (options & PMAP_OPTIONS_SET_REUSABLE) { + pmap_phys_attributes[pai] |= PHYS_REUSABLE; + } UNLOCK_PVH(pai); @@ -1326,9 +1851,11 @@ phys_attribute_test( int pai; pmap_t pmap; int attributes = 0; + boolean_t is_ept; pmap_intr_assert(); assert(pn != vm_page_fictitious_addr); + assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0); if (pn == vm_page_guard_addr) return 0; @@ -1372,13 +1899,19 @@ phys_attribute_test( vm_map_offset_t va; pmap = pv_e->pmap; + is_ept = is_ept_pmap(pmap); va = pv_e->va; /* * pick up modify and/or reference bits from mapping */ pte = pmap_pte(pmap, va); - attributes |= (int)(*pte & bits); + if (!is_ept) { + attributes |= (int)(*pte & bits); + } else { + attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED)); + + } pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); @@ -1415,8 +1948,9 @@ pmap_change_wiring( /* * wiring down mapping */ + pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE); OSAddAtomic(+1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED)); + pmap_update_pte(pte, 0, PTE_WIRED); } else if (!wired && iswired(*pte)) { /* @@ -1424,7 +1958,8 @@ pmap_change_wiring( */ assert(map->stats.wired_count >= 1); OSAddAtomic(-1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED)); + pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE); + pmap_update_pte(pte, PTE_WIRED, 0); } PMAP_UNLOCK(map); @@ -1459,6 +1994,12 @@ pmap_map_bd( if (!(flags & (VM_MEM_GUARDED))) template |= INTEL_PTE_PTA; } + +#if defined(__x86_64__) + if ((prot & VM_PROT_EXECUTE) == 0) + template |= INTEL_PTE_NX; +#endif + if (prot & VM_PROT_WRITE) template |= INTEL_PTE_WRITE; @@ -1478,3 +2019,96 @@ pmap_map_bd( PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr); return(virt); } + +unsigned int +pmap_query_resident( + pmap_t pmap, + addr64_t s64, + addr64_t e64, + unsigned int *compressed_count_p) +{ + pt_entry_t *pde; + pt_entry_t *spte, *epte; + addr64_t l64; + uint64_t deadline; + unsigned int result; + boolean_t is_ept; + unsigned int compressed_count; + + pmap_intr_assert(); + + if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) { + if (compressed_count_p) { + *compressed_count_p = 0; + } + return 0; + } + + is_ept = is_ept_pmap(pmap); + + PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START, + pmap, + (uint32_t) (s64 >> 32), s64, + (uint32_t) (e64 >> 32), e64); + + result = 0; + compressed_count = 0; + + PMAP_LOCK(pmap); + + deadline = rdtsc64() + max_preemption_latency_tsc; + + while (s64 < e64) { + l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1); + if (l64 > e64) + l64 = e64; + pde = pmap_pde(pmap, s64); + + if (pde && (*pde & PTE_VALID_MASK(is_ept))) { + if (*pde & PTE_PS) { + /* superpage: not supported */ + } else { + spte = pmap_pte(pmap, + (s64 & ~(pde_mapped_size - 1))); + spte = &spte[ptenum(s64)]; + epte = &spte[intel_btop(l64 - s64)]; + + for (; spte < epte; spte++) { + if (pte_to_pa(*spte) != 0) { + result++; + } else if (*spte & PTE_COMPRESSED) { + compressed_count++; + } + } + + } + } + s64 = l64; + + if (s64 < e64 && rdtsc64() >= deadline) { + PMAP_UNLOCK(pmap); + PMAP_LOCK(pmap); + deadline = rdtsc64() + max_preemption_latency_tsc; + } + } + + PMAP_UNLOCK(pmap); + + 
PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
+		   pmap, 0, 0, 0, 0);
+
+	if (compressed_count_p) {
+		*compressed_count_p = compressed_count;
+	}
+	return result;
+}
+
+#if MACH_ASSERT
+void
+pmap_set_process(
+	__unused pmap_t pmap,
+	__unused int pid,
+	__unused char *procname)
+{
+}
+#endif /* MACH_ASSERT */
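
The change that runs through this whole diff is that every PTE-bit access is routed through is_ept_pmap()-aware helpers (PTE_VALID_MASK, PTE_WRITE, PTE_REF, PTE_MOD, and so on) so one code path can serve both legacy IA-32e page tables and EPT tables, whose bit layouts differ. Below is a minimal sketch of that dispatch pattern; the X_* bit values and helper names are invented for illustration and are not the xnu definitions.

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t pt_entry_t;

/* Illustrative bit positions only -- not the real INTEL_PTE_* / INTEL_EPT_* values. */
#define X_PTE_VALID   (1ULL << 0)   /* legacy: Present bit        */
#define X_PTE_WRITE   (1ULL << 1)   /* legacy: R/W bit            */
#define X_EPT_READ    (1ULL << 0)   /* EPT: read-access bit       */
#define X_EPT_WRITE   (1ULL << 1)   /* EPT: write-access bit      */

/* A legacy PTE is "valid" via the Present bit; an EPT entry is valid when
 * any access bit is set, so a different mask applies per pmap type.
 */
static inline pt_entry_t pte_valid_mask(bool is_ept)
{
	return is_ept ? (X_EPT_READ | X_EPT_WRITE /* | exec */) : X_PTE_VALID;
}

static inline pt_entry_t pte_write_bit(bool is_ept)
{
	return is_ept ? X_EPT_WRITE : X_PTE_WRITE;
}

/* Usage mirroring the diff: decide the pmap type once, then mask. */
static inline bool mapping_is_present(pt_entry_t pte, bool is_ept)
{
	return (pte & pte_valid_mask(is_ept)) != 0;
}

Deciding is_ept once per call and passing it down keeps the fast paths free of repeated type checks, which is why the diff hoists is_ept = is_ept_pmap(pmap) to the top of each function.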
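
pmap_enter_options() rewrites an existing mapping with a compare-and-swap retry loop so that reference/modify bits set by the CPU between the read and the write are never lost. The sketch below reproduces that shape in isolation; pte_cmpx() is a stand-in for xnu's pmap_cmpx_pte() and the bit values are made up.

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t pt_entry_t;

#define X_PTE_REF  (1ULL << 5)   /* hardware-set "accessed" bit (illustrative) */
#define X_PTE_MOD  (1ULL << 6)   /* hardware-set "dirty" bit (illustrative)    */

/* Stand-in for pmap_cmpx_pte(): atomically replace *pte with new_val only if
 * it still equals old_val; returns true on success.
 */
static inline bool pte_cmpx(pt_entry_t *pte, pt_entry_t old_val, pt_entry_t new_val)
{
	return __sync_bool_compare_and_swap(pte, old_val, new_val);
}

/* Install `template` while preserving whatever REF/MOD bits the CPU may have
 * set concurrently; the retry loop is what makes that preservation safe.
 */
static void pte_update_preserving_refmod(pt_entry_t *pte, pt_entry_t template)
{
	pt_entry_t opte, npte;

	do {
		opte = *pte;
		npte = template | (opte & (X_PTE_REF | X_PTE_MOD));
	} while (!pte_cmpx(pte, opte, npte));
}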
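
Several hunks add accounting for pages handed to the VM compressor: instead of zeroing the PTE, the removal paths may leave a software-only PTE_COMPRESSED marker behind and bump pmap->stats.compressed, and the enter/remove paths decrement the count again when the marker is cleared. A rough sketch of that bookkeeping follows, with a hypothetical marker bit and stats struct and no locking or atomics shown.

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t pt_entry_t;

/* Hypothetical marker: a software-available bit in a not-present PTE. */
#define X_PTE_COMPRESSED  (1ULL << 62)

struct x_pmap_stats {
	int64_t resident_count;
	int64_t compressed;          /* pages currently represented only by the marker */
};

/* Tear down one mapping; if it is going to the compressor, leave the marker
 * so later code can keep the "compressed" ledger consistent.
 */
static void remove_pte(pt_entry_t *pte, bool to_compressor,
		       struct x_pmap_stats *stats)
{
	if (*pte != 0) {
		stats->resident_count--;
	}
	if (to_compressor) {
		*pte = X_PTE_COMPRESSED;
		stats->compressed++;
	} else {
		*pte = 0;
	}
}

/* On re-entry of the page (the old_pa == 0 path in pmap_enter_options),
 * the marker is cleared and the compressed count drops again.
 */
static void reenter_pte(pt_entry_t *pte, pt_entry_t new_entry,
			struct x_pmap_stats *stats)
{
	if (*pte & X_PTE_COMPRESSED) {
		stats->compressed--;
	}
	*pte = new_entry;
	stats->resident_count++;
}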
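
The new *_options() entry points accept PMAP_OPTIONS_NOFLUSH together with a pmap_flush_context argument, letting callers batch TLB shootdowns (PMAP_UPDATE_TLBS_DELAYED) instead of issuing one per page. The sketch below shows only the general shape of such deferred invalidation, with invented types and a stubbed shootdown; it is not the xnu pmap_flush_context implementation.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef uint64_t vm_offset_t;

/* Hypothetical accumulator standing in for a flush context. */
struct flush_ctx {
	vm_offset_t ranges[16][2];
	size_t      count;
};

/* Stand-in for an IPI-based shootdown of one range on all CPUs. */
static void tlb_shootdown(vm_offset_t start, vm_offset_t end)
{
	printf("invalidate [%llx, %llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
}

/* Record a range now, flush later: the analogue of a delayed-TLBS update. */
static void flush_ctx_add(struct flush_ctx *ctx, vm_offset_t s, vm_offset_t e)
{
	if (ctx->count < 16) {
		ctx->ranges[ctx->count][0] = s;
		ctx->ranges[ctx->count][1] = e;
		ctx->count++;
	} else {
		tlb_shootdown(s, e);     /* fall back to an immediate flush */
	}
}

/* One pass over everything deferred, issued when the caller is done. */
static void flush_ctx_drain(struct flush_ctx *ctx)
{
	for (size_t i = 0; i < ctx->count; i++) {
		tlb_shootdown(ctx->ranges[i][0], ctx->ranges[i][1]);
	}
	ctx->count = 0;
}

The payoff is the same as in the diff: when many pages are downgraded or removed in one operation, the interprocessor-interrupt cost is paid once per batch rather than once per page.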