X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0b4c1975fb5e4eccf1012a35081f7e7799b81046..d26ffc64f583ab2d29df48f13518685602bc8832:/osfmk/i386/pmap_internal.h diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index eef4f7c4d..4ddabaa20 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,12 +26,17 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include -#include +#ifndef _I386_PMAP_INTERNAL_ +#define _I386_PMAP_INTERNAL_ #ifdef MACH_KERNEL_PRIVATE +#include +#include +#include +#include +#include + /* * pmap locking */ @@ -44,36 +49,57 @@ simple_unlock(&(pmap)->lock); \ } +#define PMAP_UPDATE_TLBS(pmap, s, e) \ + pmap_flush_tlbs(pmap, s, e, 0, NULL) + + +#define PMAP_DELAY_TLB_FLUSH 0x01 + +#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \ + pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c) -#define PMAP_UPDATE_TLBS(pmap, s, e) \ - pmap_flush_tlbs(pmap) #define iswired(pte) ((pte) & INTEL_PTE_WIRED) #ifdef PMAP_TRACES extern boolean_t pmap_trace; -#define PMAP_TRACE(x,a,b,c,d,e) \ - if (pmap_trace) { \ - KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ +#define PMAP_TRACE(...) \ + if (pmap_trace) { \ + KDBG_RELEASE(__VA_ARGS__); \ } #else -#define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e) +#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__) #endif /* PMAP_TRACES */ -void pmap_expand_pml4( +#define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__) + +kern_return_t pmap_expand_pml4( pmap_t map, - vm_map_offset_t v); + vm_map_offset_t v, + unsigned int options); -void pmap_expand_pdpt( +kern_return_t pmap_expand_pdpt( pmap_t map, - vm_map_offset_t v); -extern void pmap_flush_tlbs(pmap_t pmap); + vm_map_offset_t v, + unsigned int options); + +void phys_attribute_set( + ppnum_t phys, + int bits); + +void pmap_set_reference( + ppnum_t pn); + +boolean_t phys_page_exists( + ppnum_t pn); + +void +pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *); + +void +pmap_update_cache_attributes_locked(ppnum_t, unsigned); -#if defined(__x86_64__) extern const boolean_t cpu_64bit; -#else -extern boolean_t cpu_64bit; -#endif /* * Private data structures. @@ -99,8 +125,8 @@ extern boolean_t cpu_64bit; PV HASHING Changes - JK 1/2007 Pve's establish physical to virtual mappings. These are used for aliasing of a -physical page to (potentially many) virtual addresses within pmaps. In the previous -implementation the structure of the pv_entries (each 16 bytes in size) was +physical page to (potentially many) virtual addresses within pmaps. In the +previous implementation the structure of the pv_entries (each 16 bytes in size) was typedef struct pv_entry { struct pv_entry_t next; @@ -108,20 +134,23 @@ typedef struct pv_entry { vm_map_offset_t va; } *pv_entry_t; -An initial array of these is created at boot time, one per physical page of memory, -indexed by the physical page number. Additionally, a pool of entries is created from a -pv_zone to be used as needed by pmap_enter() when it is creating new mappings. -Originally, we kept this pool around because the code in pmap_enter() was unable to -block if it needed an entry and none were available - we'd panic. 
Some time ago I -restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing -a pv structure and restart, removing a panic from the code (in the case of the kernel -pmap we cannot block and still panic, so, we keep a separate hot pool for use only on -kernel pmaps). The pool has not been removed since there is a large performance gain -keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need. - -As pmap_enter() created new mappings it linked the new pve's for them off the fixed -pv array for that ppn (off the next pointer). These pve's are accessed for several -operations, one of them being address space teardown. In that case, we basically do this +An initial array of these is created at boot time, one per physical page of +memory, indexed by the physical page number. Additionally, a pool of entries +is created from a pv_zone to be used as needed by pmap_enter() when it is +creating new mappings. Originally, we kept this pool around because the code +in pmap_enter() was unable to block if it needed an entry and none were +available - we'd panic. Some time ago I restructured the pmap_enter() code +so that for user pmaps it can block while zalloc'ing a pv structure and restart, +removing a panic from the code (in the case of the kernel pmap we cannot block +and still panic, so, we keep a separate hot pool for use only on kernel pmaps). +The pool has not been removed since there is a large performance gain keeping +freed pv's around for reuse and not suffering the overhead of zalloc for every +new pv we need. + +As pmap_enter() created new mappings it linked the new pve's for them off the +fixed pv array for that ppn (off the next pointer). These pve's are accessed +for several operations, one of them being address space teardown. In that case, +we basically do this for (every page/pte in the space) { calc pve_ptr from the ppn in the pte @@ -133,124 +162,205 @@ operations, one of them being address space teardown. In that case, we basicall } } -The problem arose when we were running, say 8000 (or even 2000) apache or other processes -and one or all terminate. The list hanging off each pv array entry could have thousands of -entries. We were continuously linearly searching each of these lists as we stepped through -the address space we were tearing down. Because of the locks we hold, likely taking a cache -miss for each node, and interrupt disabling for MP issues the system became completely -unresponsive for many seconds while we did this. - -Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn -for operations like pmap_page_protect and finding and modifying/removing a single pve as -part of pmap_enter processing) has led to modifying the pve structures and databases. - -There are now two types of pve structures. A "rooted" structure which is basically the -original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a -hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of -minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of -pages in the system are not aliased and hence represented by a single pv entry I've kept -the rooted entry size as small as possible because there is one of these dedicated for -every physical page of memory. 
The hashed pve's are larger due to the addition of the hash -link and the ppn entry needed for matching while running the hash list to find the entry we -are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs) -will pay the extra memory price. Both structures have the same first three fields allowing -some simplification in the code. +The problem arose when we were running, say 8000 (or even 2000) apache or +other processes and one or all terminate. The list hanging off each pv array +entry could have thousands of entries. We were continuously linearly searching +each of these lists as we stepped through the address space we were tearing +down. Because of the locks we hold, likely taking a cache miss for each node, +and interrupt disabling for MP issues the system became completely unresponsive +for many seconds while we did this. + +Realizing that pve's are accessed in two distinct ways (linearly running the +list by ppn for operations like pmap_page_protect and finding and +modifying/removing a single pve as part of pmap_enter processing) has led to +modifying the pve structures and databases. + +There are now two types of pve structures. A "rooted" structure which is +basically the original structure accessed in an array by ppn, and a ''hashed'' +structure accessed on a hash list via a hash of [pmap, vaddr]. These have been +designed with the two goals of minimizing wired memory and making the lookup of +a ppn faster. Since a vast majority of pages in the system are not aliased +and hence represented by a single pv entry I've kept the rooted entry size as +small as possible because there is one of these dedicated for every physical +page of memory. The hashed pve's are larger due to the addition of the hash +link and the ppn entry needed for matching while running the hash list to find +the entry we are looking for. This way, only systems that have lots of +aliasing (like 2000+ httpd procs) will pay the extra memory price. Both +structures have the same first three fields allowing some simplification in +the code. They have these shapes typedef struct pv_rooted_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; } *pv_rooted_entry_t; typedef struct pv_hashed_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; } *pv_hashed_entry_t; -The main flow difference is that the code is now aware of the rooted entry and the hashed -entries. Code that runs the pv list still starts with the rooted entry and then continues -down the qlink onto the hashed entries. Code that is looking up a specific pv entry first -checks the rooted entry and then hashes and runs the hash list for the match. The hash list -lengths are much smaller than the original pv lists that contained all aliases for the specific ppn. +The main flow difference is that the code is now aware of the rooted entry and +the hashed entries. Code that runs the pv list still starts with the rooted +entry and then continues down the qlink onto the hashed entries. Code that is +looking up a specific pv entry first checks the rooted entry and then hashes +and runs the hash list for the match. The hash list lengths are much smaller +than the original pv lists that contained all aliases for the specific ppn. 
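An illustrative sketch of the lookup path for a single [pmap, vaddr, ppn]
(editorial shorthand, not the actual code):

	pv_h = pai_to_pvh(ppn_to_pai(ppn))		rooted entry, indexed by ppn
	if (pv_h->pmap == pmap && PVE_VA(pv_h) == vaddr)
		the rooted entry itself is the mapping
	else {
		pvhash_idx = pvhashidx(pmap, vaddr)	hash of [pmap, vaddr]
		walk pvhash(pvhash_idx) via nexth, matching on pmap, vaddr and ppn
	}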
*/ -typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; /* virtual address for mapping */ - pmap_t pmap; /* pmap where mapping lies */ +typedef struct pv_rooted_entry { + /* first three entries must match pv_hashed_entry_t */ + queue_head_t qlink; + vm_map_offset_t va_and_flags; /* virtual address for mapping */ + pmap_t pmap; /* pmap where mapping lies */ } *pv_rooted_entry_t; #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) - -typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; +typedef struct pv_hashed_entry { + /* first three entries must match pv_rooted_entry_t */ + queue_head_t qlink; + vm_map_offset_t va_and_flags; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; } *pv_hashed_entry_t; #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) -/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */ +#define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK) +#define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK) +#define PVE_IS_ALTACCT 0x001 +#define PVE_IS_ALTACCT_PAGE(pve) \ + (((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE) + +//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ #ifdef PV_DEBUG -#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized"); +#define CHK_NPVHASH() if(0 == npvhashmask) panic("npvhash uninitialized"); #else -#define CHK_NPVHASH() +#define CHK_NPVHASH(x) #endif -#define NPVHASH 4095 /* MUST BE 2^N - 1 */ -#define PV_HASHED_LOW_WATER_MARK 5000 -#define PV_HASHED_KERN_LOW_WATER_MARK 400 -#define PV_HASHED_ALLOC_CHUNK 2000 -#define PV_HASHED_KERN_ALLOC_CHUNK 200 - -#define PV_HASHED_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_free_list_lock); \ - if ((pvh_e = pv_hashed_free_list) != 0) { \ - pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_free_count--; \ - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_free_list_lock); \ +#define NPVHASHBUCKETS (4096) +#define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */ +#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 +#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 +#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 +#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 + +extern volatile uint32_t mappingrecurse; +extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; + +/* + * PV hash locking + */ + +#define LOCK_PV_HASH(hash) lock_hash_hash(hash) +#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) +extern uint32_t npvhashmask; +extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ +extern pv_hashed_entry_t pv_hashed_free_list; +extern pv_hashed_entry_t pv_hashed_kern_free_list; +decl_simple_lock_data(extern, pv_hashed_free_list_lock) +decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) +decl_simple_lock_data(extern, pv_hash_table_lock) +decl_simple_lock_data(extern, phys_backup_lock) + +extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry + * structures */ + +extern uint32_t pv_hashed_free_count; +extern uint32_t pv_hashed_kern_free_count; +/* + * Each entry in the pv_head_table is locked by a bit in the + * pv_lock_table. The lock bits are accessed by the address of + * the frame they lock. 
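 * (Editorial note, not part of the diff: concretely, locking physical page
 *  index pai is lock_pvh_pai(pai), i.e. bit_lock(pai, pv_lock_table), so the
 *  table needs only one bit per entry and pv_lock_table_size(npages) rounds
 *  that bit count up to whole bytes.)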
+ */ +#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +extern char *pv_lock_table; /* pointer to array of bits */ +extern char *pv_hash_lock_table; +extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ + +extern event_t mapping_replenish_event; + +static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { + pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); + simple_lock(&pv_hashed_free_list_lock); + /* If the kernel reserved pool is low, let non-kernel mappings allocate + * synchronously, possibly subject to a throttle. + */ + if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) { + pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; + pv_hashed_free_count--; + } + + simple_unlock(&pv_hashed_free_list_lock); + + if (pv_hashed_free_count <= pv_hashed_low_water_mark) { + if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + thread_wakeup(&mapping_replenish_event); + } +} + +static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { + simple_lock(&pv_hashed_free_list_lock); + pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; + pv_hashed_free_list = pvh_eh; + pv_hashed_free_count += pv_cnt; + simple_unlock(&pv_hashed_free_list_lock); } -#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \ - pv_hashed_free_list = pvh_eh; \ - pv_hashed_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_free_list_lock); \ +extern unsigned pmap_kern_reserve_alloc_stat; + +static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { + pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); + simple_lock(&pv_hashed_kern_free_list_lock); + + if ((*pvh_e = pv_hashed_kern_free_list) != 0) { + pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; + pv_hashed_kern_free_count--; + pmap_kern_reserve_alloc_stat++; + } + + simple_unlock(&pv_hashed_kern_free_list_lock); + + if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { + if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + thread_wakeup(&mapping_replenish_event); + } } -#define PV_HASHED_KERN_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - if ((pvh_e = pv_hashed_kern_free_list) != 0) { \ - pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_kern_free_count--; \ - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ +static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { + simple_lock(&pv_hashed_kern_free_list_lock); + pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; + pv_hashed_kern_free_list = pvh_eh; + pv_hashed_kern_free_count += pv_cnt; + simple_unlock(&pv_hashed_kern_free_list_lock); } -#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \ - pv_hashed_kern_free_list = pvh_eh; \ - pv_hashed_kern_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ +extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; +extern event_t pmap_user_pv_throttle_event; + 
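/*
 * Editorial sketch (not part of this diff): how callers batch frees through
 * PV_HASHED_FREE_LIST -- chain the entries together through qlink.next and
 * hand the whole chain back under a single free-list lock acquisition.
 * Real callers build the chain in place while tearing down mappings; the
 * array argument and the function name here are illustrative only.
 */
static inline void
pv_free_chain_example(pv_hashed_entry_t *entries, int nentries)
{
	pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;	/* head of chain */
	pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;	/* tail of chain */
	int pv_cnt = 0;
	int i;

	for (i = 0; i < nentries; i++) {
		pv_hashed_entry_t pvh_e = entries[i];
		pvh_e->qlink.next = (queue_entry_t) pvh_eh;	/* push onto local chain */
		if (pvh_eh == PV_HASHED_ENTRY_NULL)
			pvh_et = pvh_e;				/* first entry becomes the tail */
		pvh_eh = pvh_e;
		pv_cnt++;
	}
	if (pv_cnt)
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
}
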
+static inline void pmap_pv_throttle(__unused pmap_t p) { + pmap_assert(p != kernel_pmap); + /* Apply throttle on non-kernel mappings */ + if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { + pmap_pv_throttle_stat++; + /* This doesn't need to be strictly accurate, merely a hint + * to eliminate the timeout when the reserve is replenished. + */ + pmap_pv_throttled_waiters++; + assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC); + thread_block(THREAD_CONTINUE_NULL); + } } /* @@ -264,22 +374,40 @@ typedef struct pv_hashed_entry { /* first three entries must match pv_rooted #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) #define pvhash(idx) (&pv_hash_table[idx]) - #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) #define IS_MANAGED_PAGE(x) \ ((unsigned int)(x) <= last_managed_page && \ (pmap_phys_attributes[x] & PHYS_MANAGED)) +#define IS_INTERNAL_PAGE(x) \ + (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL)) +#define IS_REUSABLE_PAGE(x) \ + (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE)) +#define IS_ALTACCT_PAGE(x,pve) \ + (IS_MANAGED_PAGE((x)) && \ + (PVE_IS_ALTACCT_PAGE((pve)))) /* * Physical page attributes. Copy bits from PTE definition. */ #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ -#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ -#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ - +#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ +#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ +#define PHYS_NCACHE INTEL_PTE_NCACHE +#define PHYS_PTA INTEL_PTE_PTA +#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) +#define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */ +#define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */ + +extern boolean_t pmap_disable_kheap_nx; +extern boolean_t pmap_disable_kstack_nx; + +#define PMAP_EXPAND_OPTIONS_NONE (0x0) +#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) +#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER) +#define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U) /* * Amount of virtual memory mapped by one * page-directory entry. 
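(Editorial note, not part of the diff: with the 4KB pages and 512-entry page
tables used here, one page-directory entry maps NPTEPG * PAGE_SIZE = 2MB of
virtual address space -- presumably the span recorded by the pde_mapped_size
extern declared further down.)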
@@ -325,35 +453,15 @@ typedef struct pv_hashed_entry { /* first three entries must match pv_rooted unlock_pvh_pai(index); \ mp_enable_preemption(); \ } -/* - * PV hash locking - */ - -#define LOCK_PV_HASH(hash) lock_hash_hash(hash) -#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) -extern uint32_t npvhash; -extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ -extern pv_hashed_entry_t pv_hashed_free_list; -extern pv_hashed_entry_t pv_hashed_kern_free_list; -decl_simple_lock_data(extern, pv_hashed_free_list_lock) -decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) -decl_simple_lock_data(extern, pv_hash_table_lock) - -extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ - -extern int pv_hashed_free_count; -extern int pv_hashed_kern_free_count; -#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) -#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) -extern char *pv_lock_table; /* pointer to array of bits */ -extern char *pv_hash_lock_table; -extern pv_rooted_entry_t pv_head_table; /* array of entries, one - * per page */ extern uint64_t pde_mapped_size; extern char *pmap_phys_attributes; -extern unsigned int last_managed_page; +extern ppnum_t last_managed_page; + +extern ppnum_t lowest_lo; +extern ppnum_t lowest_hi; +extern ppnum_t highest_hi; /* * when spinning through pmap_remove @@ -365,25 +473,29 @@ extern unsigned int last_managed_page; #define MAX_PREEMPTION_LATENCY_NS 20000 extern uint64_t max_preemption_latency_tsc; -/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */ -#ifdef DEBUGINTERRUPTS +#if DEBUG +#define PMAP_INTR_DEBUG (1) +#endif + +#if PMAP_INTR_DEBUG #define pmap_intr_assert() { \ if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ - panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \ + panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \ } #else #define pmap_intr_assert() #endif -extern int nx_enabled; -extern unsigned int inuse_ptepages_count; +extern int nx_enabled; +extern unsigned int inuse_ptepages_count; static inline uint32_t pvhashidx(pmap_t pmap, vm_map_offset_t va) { - return ((uint32_t)(uintptr_t)pmap ^ - ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & - npvhash; + uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^ + ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & + npvhashmask; + return hashidx; } /* @@ -391,7 +503,6 @@ pvhashidx(pmap_t pmap, vm_map_offset_t va) * properly deals with the anchor. 
* must be called with the hash locked, does not unlock it */ - static inline void pmap_pvh_unlink(pv_hashed_entry_t pvh) { @@ -400,7 +511,7 @@ pmap_pvh_unlink(pv_hashed_entry_t pvh) int pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh->pmap, pvh->va); + pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh)); pprevh = pvhash(pvhash_idx); @@ -429,7 +540,7 @@ pv_hash_add(pv_hashed_entry_t pvh_e, int pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); + pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); insque(&pvh_e->qlink, &pv_h->qlink); hashp = pvhash(pvhash_idx); @@ -448,12 +559,12 @@ pv_hash_remove(pv_hashed_entry_t pvh_e) int pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); + pvhash_idx = pvhashidx(pvh_e->pmap,PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); remque(&pvh_e->qlink); pmap_pvh_unlink(pvh_e); UNLOCK_PV_HASH(pvhash_idx); -} +} static inline boolean_t popcnt1(uint64_t distance) { return ((distance & (distance - 1)) == 0); @@ -540,7 +651,7 @@ pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corru static inline pmap_pagetable_corruption_action_t pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { - pmap_pv_assertion_t action = PMAP_ACTION_ASSERT; + pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; pmap_pagetable_corruption_t suppress_reason = PTE_VALID; ppnum_t suppress_ppn = 0; pt_entry_t cpte = *ptep; @@ -550,8 +661,10 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * pv_rooted_entry_t pv_e = pv_h; uint32_t bitdex; pmap_t pvpmap = pv_h->pmap; - vm_map_offset_t pvva = pv_h->va; + vm_map_offset_t pvva = PVE_VA(pv_h); + vm_map_offset_t pve_flags; boolean_t ppcd = FALSE; + boolean_t is_ept; /* Ideally, we'd consult the Mach VM here to definitively determine * the nature of the mapping for this address space and address. 
@@ -563,21 +676,23 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * /* As a precautionary measure, mark A+D */ pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED); + is_ept = is_ept_pmap(pmap); /* * Correct potential single bit errors in either (but not both) element * of the PV */ do { - if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) || - (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) { + if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) || + (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) { + pve_flags = PVE_FLAGS(pv_e); pv_e->pmap = pmap; - pv_e->va = vaddr; + pv_h->va_and_flags = vaddr | pve_flags; suppress_reason = PV_BITFLIP; action = PMAP_ACTION_RETRY; goto pmap_cpc_exit; } - } while((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h); + } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h)); /* Discover root entries with a Hamming * distance of 1 from the supplied @@ -587,7 +702,7 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); if (IS_MANAGED_PAGE(npn)) { pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); - if (npv_h->va == vaddr && npv_h->pmap == pmap) { + if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) { suppress_reason = PTE_BITFLIP; suppress_ppn = npn; action = PMAP_ACTION_RETRY_RELOCK; @@ -603,9 +718,11 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * goto pmap_cpc_exit; } - /* Check for malformed/inconsistent entries */ - - if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) { + /* + * Check for malformed/inconsistent entries. + * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0 + */ + if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) { action = PMAP_ACTION_IGNORE; suppress_reason = PTE_INVALID_CACHEABILITY; } @@ -613,7 +730,7 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * action = PMAP_ACTION_IGNORE; suppress_reason = PTE_RSVD; } - else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) { + else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) { action = PMAP_ACTION_IGNORE; suppress_reason = PTE_SUPERVISOR; } @@ -635,17 +752,18 @@ pmap_cpc_exit: pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); return action; } + /* * Remove pv list entry. * Called with pv_head_table entry locked. * Returns pv entry to be freed (or NULL). 
*/ - static inline __attribute__((always_inline)) pv_hashed_entry_t -pmap_pv_remove( pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t *ppnp, - pt_entry_t *pte) +pmap_pv_remove(pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t *ppnp, + pt_entry_t *pte, + boolean_t *was_altacct) { pv_hashed_entry_t pvh_e; pv_rooted_entry_t pv_h; @@ -654,17 +772,18 @@ pmap_pv_remove( pmap_t pmap, uint32_t pv_cnt; ppnum_t ppn; + *was_altacct = FALSE; pmap_pv_remove_retry: ppn = *ppnp; pvh_e = PV_HASHED_ENTRY_NULL; pv_h = pai_to_pvh(ppn_to_pai(ppn)); - if (pv_h->pmap == PMAP_NULL) { + if (__improbable(pv_h->pmap == PMAP_NULL)) { pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT); if (pac == PMAP_ACTION_IGNORE) goto pmap_pv_remove_exit; else if (pac == PMAP_ACTION_ASSERT) - panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte); + panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents); else if (pac == PMAP_ACTION_RETRY_RELOCK) { LOCK_PVH(ppn_to_pai(*ppnp)); pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); @@ -674,7 +793,8 @@ pmap_pv_remove_retry: goto pmap_pv_remove_retry; } - if (pv_h->va == vaddr && pv_h->pmap == pmap) { + if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { + *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h); /* * Header is the pv_rooted_entry. * We can't free that. If there is a queued @@ -689,19 +809,20 @@ pmap_pv_remove_retry: * and install as new root. */ CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); + pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); remque(&pvh_e->qlink); pprevh = pvhash(pvhash_idx); if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_pv_remove(%p,0x%llx,0x%x): " - "empty hash, removing rooted", - pmap, vaddr, ppn); + panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): " + "empty hash, removing rooted, priors: %d", + pmap, vaddr, ppn, pmap_pagetable_corruption_incidents); } pmap_pvh_unlink(pvh_e); UNLOCK_PV_HASH(pvhash_idx); pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; /* dispose of pvh_e */ + pv_h->va_and_flags = pvh_e->va_and_flags; + /* dispose of pvh_e */ } else { /* none queued after rooted */ pv_h->pmap = PMAP_NULL; @@ -717,7 +838,8 @@ pmap_pv_remove_retry: LOCK_PV_HASH(pvhash_idx); pprevh = pvhash(pvhash_idx); if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn); + panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d", + pmap, vaddr, ppn, *pte, pte, pmap_pagetable_corruption_incidents); } pvh_e = *pprevh; pmap_pv_hashlist_walks++; @@ -725,17 +847,18 @@ pmap_pv_remove_retry: while (PV_HASHED_ENTRY_NULL != pvh_e) { pv_cnt++; if (pvh_e->pmap == pmap && - pvh_e->va == vaddr && + PVE_VA(pvh_e) == vaddr && pvh_e->ppn == ppn) break; pprevh = &pvh_e->nexth; pvh_e = pvh_e->nexth; } + if (PV_HASHED_ENTRY_NULL == pvh_e) { pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); if (pac == PMAP_ACTION_ASSERT) - panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va); + panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap, vaddr, ppn, *pte, ppnp, 
pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents); else { UNLOCK_PV_HASH(pvhash_idx); if (pac == PMAP_ACTION_RETRY_RELOCK) { @@ -751,6 +874,9 @@ pmap_pv_remove_retry: } } } + + *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e); + pmap_pv_hashlist_cnts += pv_cnt; if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt; @@ -762,4 +888,275 @@ pmap_pv_remove_exit: return pvh_e; } +static inline __attribute__((always_inline)) boolean_t +pmap_pv_is_altacct( + pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t ppn) +{ + pv_hashed_entry_t pvh_e; + pv_rooted_entry_t pv_h; + int pvhash_idx; + boolean_t is_altacct; + + pvh_e = PV_HASHED_ENTRY_NULL; + pv_h = pai_to_pvh(ppn_to_pai(ppn)); + + if (__improbable(pv_h->pmap == PMAP_NULL)) { + return FALSE; + } + + if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { + /* + * Header is the pv_rooted_entry. + */ + return IS_ALTACCT_PAGE(ppn, pv_h); + } + + CHK_NPVHASH(); + pvhash_idx = pvhashidx(pmap, vaddr); + LOCK_PV_HASH(pvhash_idx); + pvh_e = *(pvhash(pvhash_idx)); + if (PV_HASHED_ENTRY_NULL == pvh_e) { + panic("Possible memory corruption: pmap_pv_is_altacct(%p,0x%llx,0x%x): empty hash", + pmap, vaddr, ppn); + } + while (PV_HASHED_ENTRY_NULL != pvh_e) { + if (pvh_e->pmap == pmap && + PVE_VA(pvh_e) == vaddr && + pvh_e->ppn == ppn) + break; + pvh_e = pvh_e->nexth; + } + if (PV_HASHED_ENTRY_NULL == pvh_e) { + is_altacct = FALSE; + } else { + is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e); + } + UNLOCK_PV_HASH(pvhash_idx); + + return is_altacct; +} + +extern int pt_fake_zone_index; +static inline void +PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) +{ + pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); +} + +static inline void +PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) +{ + pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes); +} + +static inline void +PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes) +{ + pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes); +} + +static inline void +PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes) +{ + pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes); +} + +extern boolean_t pmap_initialized;/* Has pmap_init completed? */ +#define valid_page(x) (pmap_initialized && pmap_valid_page(x)) + +int phys_attribute_test( + ppnum_t phys, + int bits); +void phys_attribute_clear( + ppnum_t phys, + int bits, + unsigned int options, + void *arg); + +//#define PCID_DEBUG 1 +#if PCID_DEBUG +#define pmap_pcid_log(fmt, args...) \ + do { \ + kprintf(fmt, ##args); \ + printf(fmt, ##args); \ + } while(0) +#else +#define pmap_pcid_log(fmt, args...) +#endif +void pmap_pcid_configure(void); + + +/* + * Atomic 64-bit compare and exchange of a page table entry. + */ +static inline boolean_t +pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) +{ + boolean_t ret; + + /* + * Load the old value into %rax + * Load the new value into another register + * Compare-exchange-quad at address entryp + * If the compare succeeds, the new value is stored, return TRUE. + * Otherwise, no swap is made, return FALSE. 
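	 *
	 * (Editorial note, not part of the diff: this is a standard lock
	 * cmpxchg compare-and-swap; in compiler-builtin terms it behaves like
	 * __sync_bool_compare_and_swap(entryp, old, new). pmap_update_pte()
	 * below shows the usual read/modify/retry loop built on top of it.)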
+ */ + asm volatile( + " lock; cmpxchgq %2,(%3) \n\t" + " setz %%al \n\t" + " movzbl %%al,%0" + : "=a" (ret) + : "a" (old), + "r" (new), + "r" (entryp) + : "memory"); + return ret; +} + +extern uint32_t pmap_update_clear_pte_count; + +static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) { + pt_entry_t npte, opte; + do { + opte = *mptep; + if (__improbable(opte == 0)) { + pmap_update_clear_pte_count++; + break; + } + npte = opte & ~(pclear_bits); + npte |= pset_bits; + } while (!pmap_cmpx_pte(mptep, opte, npte)); +} + +/* + * The single pml4 page per pmap is allocated at pmap create time and exists + * for the duration of the pmap. we allocate this page in kernel vm. + * this returns the address of the requested pml4 entry in the top level page. + */ +static inline +pml4_entry_t * +pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) +{ + if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && + (vaddr < 0xFFFF800000000000ULL))) { + return (NULL); + } + +#if DEBUG + return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); +#else + return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; +#endif +} + +static inline pml4_entry_t * +pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr) +{ + if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && + (vaddr < 0xFFFF800000000000ULL))) { + return (NULL); + } + +#if DEBUG + return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); +#else + return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; +#endif +} + +/* + * Returns address of requested PDPT entry in the physmap. + */ +static inline pdpt_entry_t * +pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) +{ + pml4_entry_t newpf; + pml4_entry_t *pml4; + boolean_t is_ept; + + pml4 = pmap64_pml4(pmap, vaddr); + is_ept = is_ept_pmap(pmap); + + if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) { + newpf = *pml4 & PG_FRAME; + return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) + [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; + } + return (NULL); +} +/* + * Returns the address of the requested PDE entry in the physmap. + */ +static inline pd_entry_t * +pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) +{ + pdpt_entry_t newpf; + pdpt_entry_t *pdpt; + boolean_t is_ept; + + pdpt = pmap64_pdpt(pmap, vaddr); + is_ept = is_ept_pmap(pmap); + + if (pdpt && (*pdpt & PTE_VALID_MASK(is_ept))) { + newpf = *pdpt & PG_FRAME; + return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) + [(vaddr >> PDSHIFT) & (NPDPG-1)]; + } + return (NULL); +} + +static inline pd_entry_t * +pmap_pde(pmap_t m, vm_map_offset_t v) +{ + pd_entry_t *pde; + + pde = pmap64_pde(m, v); + + return pde; +} + + +/* + * return address of mapped pte for vaddr va in pmap pmap. + * + * In case the pde maps a superpage, return the pde, which, in this case + * is the actual page table entry. + */ +static inline pt_entry_t * +pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) +{ + pd_entry_t *pde; + pd_entry_t newpf; + boolean_t is_ept; + + assert(pmap); + pde = pmap64_pde(pmap, vaddr); + + is_ept = is_ept_pmap(pmap); + + if (pde && (*pde & PTE_VALID_MASK(is_ept))) { + if (*pde & PTE_PS) + return pde; + newpf = *pde & PG_FRAME; + return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) + [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; + } + return (NULL); +} +extern void pmap_alias( + vm_offset_t ava, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t prot, + unsigned int options); + +#if DEBUG +#define DPRINTF(x...) kprintf(x) +#else +#define DPRINTF(x...) 
+#endif + #endif /* MACH_KERNEL_PRIVATE */ +#endif /* _I386_PMAP_INTERNAL_ */
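
As a closing, editorial illustration (not part of the diff) of the table-walk
helpers above (pmap64_pml4, pmap64_pdpt, pmap64_pde, pmap_pte): each level
consumes 9 bits of the virtual address -- the shifts 39/30/21/12 correspond to
PML4SHIFT, PDPTSHIFT, PDSHIFT and PAGE_SHIFT -- and each table holds 512
entries, so each index is masked with 511. The kernel versions additionally
reject non-canonical addresses and stop at any level whose entry lacks the
valid bit (or return the PDE itself for a PTE_PS superpage). The standalone
user-space sketch below reproduces only the index arithmetic; the sample
address and variable names are illustrative.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t vaddr = 0x00007f8b12345678ULL;		/* sample canonical user VA */

	unsigned pml4_idx = (vaddr >> 39) & 511;	/* selects a PDPT */
	unsigned pdpt_idx = (vaddr >> 30) & 511;	/* selects a page directory */
	unsigned pde_idx  = (vaddr >> 21) & 511;	/* selects a page table (or a 2MB superpage) */
	unsigned pte_idx  = (vaddr >> 12) & 511;	/* selects the 4KB page */
	unsigned offset   = (unsigned)(vaddr & 0xfff);	/* byte offset within the page */

	printf("PML4 %u  PDPT %u  PDE %u  PTE %u  offset 0x%x\n",
	    pml4_idx, pdpt_idx, pde_idx, pte_idx, offset);
	return 0;
}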