X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/5c9f46613a83ebfc29a5b1f099448259e96a98f0..2a1bd2d3eef5c7a7bb14f4bb9fdbca9a96ee4752:/osfmk/i386/pmap_internal.h diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index 4ddabaa20..f9db3a737 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,12 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _I386_PMAP_INTERNAL_ +#ifndef _I386_PMAP_INTERNAL_ #define _I386_PMAP_INTERNAL_ #ifdef MACH_KERNEL_PRIVATE @@ -41,65 +41,100 @@ * pmap locking */ -#define PMAP_LOCK(pmap) { \ - simple_lock(&(pmap)->lock); \ +static inline void +PMAP_LOCK_EXCLUSIVE(pmap_t p) +{ + mp_disable_preemption(); + lck_rw_lock_exclusive(&p->pmap_rwl); } -#define PMAP_UNLOCK(pmap) { \ - simple_unlock(&(pmap)->lock); \ +static inline void +PMAP_LOCK_SHARED(pmap_t p) +{ + mp_disable_preemption(); + lck_rw_lock_shared(&p->pmap_rwl); } -#define PMAP_UPDATE_TLBS(pmap, s, e) \ - pmap_flush_tlbs(pmap, s, e, 0, NULL) - +static inline void +PMAP_LOCK_SHARED_TO_EXCLUSIVE(pmap_t p) +{ + lck_rw_lock_shared_to_exclusive(&p->pmap_rwl); +} -#define PMAP_DELAY_TLB_FLUSH 0x01 +static inline void +PMAP_LOCK_EXCLUSIVE_TO_SHARED(pmap_t p) +{ + lck_rw_lock_exclusive_to_shared(&p->pmap_rwl); +} -#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \ - pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c) +static inline void +PMAP_UNLOCK_EXCLUSIVE(pmap_t p) +{ + lck_rw_unlock_exclusive(&p->pmap_rwl); + mp_enable_preemption(); +} +static inline void +PMAP_UNLOCK_SHARED(pmap_t p) +{ + lck_rw_unlock_shared(&p->pmap_rwl); + mp_enable_preemption(); +} -#define iswired(pte) ((pte) & INTEL_PTE_WIRED) +#define iswired(pte) ((pte) & INTEL_PTE_WIRED) -#ifdef PMAP_TRACES -extern boolean_t pmap_trace; +#ifdef PMAP_TRACES +extern boolean_t pmap_trace; #define PMAP_TRACE(...) \ if (pmap_trace) { \ - KDBG_RELEASE(__VA_ARGS__); \ + KDBG_RELEASE(__VA_ARGS__); \ } #else -#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__) +#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__) #endif /* PMAP_TRACES */ #define PMAP_TRACE_CONSTANT(...) 
KDBG_RELEASE(__VA_ARGS__) -kern_return_t pmap_expand_pml4( - pmap_t map, - vm_map_offset_t v, - unsigned int options); +kern_return_t pmap_expand_pml4( + pmap_t map, + vm_map_offset_t v, + unsigned int options); -kern_return_t pmap_expand_pdpt( - pmap_t map, - vm_map_offset_t v, - unsigned int options); +kern_return_t pmap_expand_pdpt( + pmap_t map, + vm_map_offset_t v, + unsigned int options); -void phys_attribute_set( - ppnum_t phys, - int bits); +void phys_attribute_set( + ppnum_t phys, + int bits); -void pmap_set_reference( - ppnum_t pn); +void pmap_set_reference( + ppnum_t pn); -boolean_t phys_page_exists( - ppnum_t pn); +boolean_t phys_page_exists( + ppnum_t pn); void -pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *); + pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *); void -pmap_update_cache_attributes_locked(ppnum_t, unsigned); + pmap_update_cache_attributes_locked(ppnum_t, unsigned); -extern const boolean_t cpu_64bit; + +static inline void +PMAP_UPDATE_TLBS(pmap_t fp, addr64_t s, addr64_t e) +{ + pmap_flush_tlbs(fp, s, e, 0, NULL); +} + +#define PMAP_DELAY_TLB_FLUSH 0x01 + +static inline void +PMAP_UPDATE_TLBS_DELAYED(pmap_t fp, addr64_t s, addr64_t e, pmap_flush_context *pfc) +{ + pmap_flush_tlbs(fp, s, e, PMAP_DELAY_TLB_FLUSH, pfc); +} /* * Private data structures. @@ -121,121 +156,121 @@ extern const boolean_t cpu_64bit; */ /* - -PV HASHING Changes - JK 1/2007 - -Pve's establish physical to virtual mappings. These are used for aliasing of a -physical page to (potentially many) virtual addresses within pmaps. In the -previous implementation the structure of the pv_entries (each 16 bytes in size) was - -typedef struct pv_entry { - struct pv_entry_t next; - pmap_t pmap; - vm_map_offset_t va; -} *pv_entry_t; - -An initial array of these is created at boot time, one per physical page of -memory, indexed by the physical page number. Additionally, a pool of entries -is created from a pv_zone to be used as needed by pmap_enter() when it is -creating new mappings. Originally, we kept this pool around because the code -in pmap_enter() was unable to block if it needed an entry and none were -available - we'd panic. Some time ago I restructured the pmap_enter() code -so that for user pmaps it can block while zalloc'ing a pv structure and restart, -removing a panic from the code (in the case of the kernel pmap we cannot block -and still panic, so, we keep a separate hot pool for use only on kernel pmaps). -The pool has not been removed since there is a large performance gain keeping -freed pv's around for reuse and not suffering the overhead of zalloc for every -new pv we need. - -As pmap_enter() created new mappings it linked the new pve's for them off the -fixed pv array for that ppn (off the next pointer). These pve's are accessed -for several operations, one of them being address space teardown. In that case, -we basically do this - - for (every page/pte in the space) { - calc pve_ptr from the ppn in the pte - for (every pv in the list for the ppn) { - if (this pv is for this pmap/vaddr) { - do housekeeping - unlink/free the pv - } - } - } - -The problem arose when we were running, say 8000 (or even 2000) apache or -other processes and one or all terminate. The list hanging off each pv array -entry could have thousands of entries. We were continuously linearly searching -each of these lists as we stepped through the address space we were tearing -down. 
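The removal path sketched above is quadratic in the worst case: every PTE visited during teardown triggers a linear scan of its page's alias list. A rough user-space model of that old inner loop (types and names hypothetical):

#include <stdint.h>
#include <stddef.h>

struct pv_entry {
	struct pv_entry *next;
	void            *pmap;
	uint64_t         va;
};

extern struct pv_entry *pv_array[];	/* one list head per physical page */

/*
 * Old-style teardown step: for each mapping in the dying address space,
 * scan the whole alias list for its physical page. With thousands of
 * aliases per page (many forked servers sharing the same text pages),
 * this scan dominated address-space teardown.
 */
void
teardown_one(void *pmap, uint64_t va, uint32_t ppn)
{
	for (struct pv_entry *pv = pv_array[ppn]; pv != NULL; pv = pv->next) {
		if (pv->pmap == pmap && pv->va == va) {
			/* unlink and free pv here */
			break;
		}
	}
}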
Because of the locks we hold, likely taking a cache miss for each node, -and interrupt disabling for MP issues the system became completely unresponsive -for many seconds while we did this. - -Realizing that pve's are accessed in two distinct ways (linearly running the -list by ppn for operations like pmap_page_protect and finding and -modifying/removing a single pve as part of pmap_enter processing) has led to -modifying the pve structures and databases. - -There are now two types of pve structures. A "rooted" structure which is -basically the original structure accessed in an array by ppn, and a ''hashed'' -structure accessed on a hash list via a hash of [pmap, vaddr]. These have been -designed with the two goals of minimizing wired memory and making the lookup of -a ppn faster. Since a vast majority of pages in the system are not aliased -and hence represented by a single pv entry I've kept the rooted entry size as -small as possible because there is one of these dedicated for every physical -page of memory. The hashed pve's are larger due to the addition of the hash -link and the ppn entry needed for matching while running the hash list to find -the entry we are looking for. This way, only systems that have lots of -aliasing (like 2000+ httpd procs) will pay the extra memory price. Both -structures have the same first three fields allowing some simplification in -the code. - -They have these shapes - -typedef struct pv_rooted_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; -} *pv_rooted_entry_t; - - -typedef struct pv_hashed_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; -} *pv_hashed_entry_t; - -The main flow difference is that the code is now aware of the rooted entry and -the hashed entries. Code that runs the pv list still starts with the rooted -entry and then continues down the qlink onto the hashed entries. Code that is -looking up a specific pv entry first checks the rooted entry and then hashes -and runs the hash list for the match. The hash list lengths are much smaller -than the original pv lists that contained all aliases for the specific ppn. - -*/ + * + * PV HASHING Changes - JK 1/2007 + * + * Pve's establish physical to virtual mappings. These are used for aliasing of a + * physical page to (potentially many) virtual addresses within pmaps. In the + * previous implementation the structure of the pv_entries (each 16 bytes in size) was + * + * typedef struct pv_entry { + * struct pv_entry_t next; + * pmap_t pmap; + * vm_map_offset_t va; + * } *pv_entry_t; + * + * An initial array of these is created at boot time, one per physical page of + * memory, indexed by the physical page number. Additionally, a pool of entries + * is created from a pv_zone to be used as needed by pmap_enter() when it is + * creating new mappings. Originally, we kept this pool around because the code + * in pmap_enter() was unable to block if it needed an entry and none were + * available - we'd panic. Some time ago I restructured the pmap_enter() code + * so that for user pmaps it can block while zalloc'ing a pv structure and restart, + * removing a panic from the code (in the case of the kernel pmap we cannot block + * and still panic, so, we keep a separate hot pool for use only on kernel pmaps). + * The pool has not been removed since there is a large performance gain keeping + * freed pv's around for reuse and not suffering the overhead of zalloc for every + * new pv we need. 
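The pool described here survives below as PV_HASHED_ALLOC() / PV_HASHED_FREE_LIST(). A minimal single-threaded model of the idea — a LIFO free list with a low-water wakeup; the locking and the zalloc fallback are elided and all names are hypothetical:

#include <stdbool.h>
#include <stddef.h>

struct pv_node {
	struct pv_node *next;
};

struct pv_node *pv_free_list;		/* reusable entries */
unsigned int    pv_free_count;
unsigned int    pv_low_water = 200;	/* replenish threshold */
bool            replenish_requested;

/* Pop from the pool; the caller falls back to zalloc() on NULL. */
struct pv_node *
pv_alloc(void)
{
	struct pv_node *e = pv_free_list;

	if (e != NULL) {
		pv_free_list = e->next;
		pv_free_count--;
	}
	if (pv_free_count <= pv_low_water) {
		replenish_requested = true;	/* stands in for thread_wakeup() */
	}
	return e;
}

/* Return a pre-linked chain of entries to the pool in one push. */
void
pv_free_chain(struct pv_node *head, struct pv_node *tail, unsigned int cnt)
{
	tail->next = pv_free_list;
	pv_free_list = head;
	pv_free_count += cnt;
}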
+ * + * As pmap_enter() created new mappings it linked the new pve's for them off the + * fixed pv array for that ppn (off the next pointer). These pve's are accessed + * for several operations, one of them being address space teardown. In that case, + * we basically do this + * + * for (every page/pte in the space) { + * calc pve_ptr from the ppn in the pte + * for (every pv in the list for the ppn) { + * if (this pv is for this pmap/vaddr) { + * do housekeeping + * unlink/free the pv + * } + * } + * } + * + * The problem arose when we were running, say 8000 (or even 2000) apache or + * other processes and one or all terminate. The list hanging off each pv array + * entry could have thousands of entries. We were continuously linearly searching + * each of these lists as we stepped through the address space we were tearing + * down. Because of the locks we hold, likely taking a cache miss for each node, + * and interrupt disabling for MP issues the system became completely unresponsive + * for many seconds while we did this. + * + * Realizing that pve's are accessed in two distinct ways (linearly running the + * list by ppn for operations like pmap_page_protect and finding and + * modifying/removing a single pve as part of pmap_enter processing) has led to + * modifying the pve structures and databases. + * + * There are now two types of pve structures. A "rooted" structure which is + * basically the original structure accessed in an array by ppn, and a ''hashed'' + * structure accessed on a hash list via a hash of [pmap, vaddr]. These have been + * designed with the two goals of minimizing wired memory and making the lookup of + * a ppn faster. Since a vast majority of pages in the system are not aliased + * and hence represented by a single pv entry I've kept the rooted entry size as + * small as possible because there is one of these dedicated for every physical + * page of memory. The hashed pve's are larger due to the addition of the hash + * link and the ppn entry needed for matching while running the hash list to find + * the entry we are looking for. This way, only systems that have lots of + * aliasing (like 2000+ httpd procs) will pay the extra memory price. Both + * structures have the same first three fields allowing some simplification in + * the code. + * + * They have these shapes + * + * typedef struct pv_rooted_entry { + * queue_head_t qlink; + * vm_map_offset_t va; + * pmap_t pmap; + * } *pv_rooted_entry_t; + * + * + * typedef struct pv_hashed_entry { + * queue_head_t qlink; + * vm_map_offset_t va; + * pmap_t pmap; + * ppnum_t ppn; + * struct pv_hashed_entry *nexth; + * } *pv_hashed_entry_t; + * + * The main flow difference is that the code is now aware of the rooted entry and + * the hashed entries. Code that runs the pv list still starts with the rooted + * entry and then continues down the qlink onto the hashed entries. Code that is + * looking up a specific pv entry first checks the rooted entry and then hashes + * and runs the hash list for the match. The hash list lengths are much smaller + * than the original pv lists that contained all aliases for the specific ppn. 
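That two-step lookup — rooted entry first, hash chain second — distills to a few lines. A schematic user-space rendering of the flow described above (pv_hash(), the tables, and the entry layout are simplified stand-ins, and all locking is omitted):

#include <stdint.h>
#include <stddef.h>

struct pv_hashed {
	uint64_t          va;
	void             *pmap;
	uint32_t          ppn;
	struct pv_hashed *nexth;	/* hash chain; much shorter than the full pv list */
};

struct pv_rooted {
	uint64_t  va;
	void     *pmap;
};

extern struct pv_rooted   pv_head_table[];	/* one per physical page */
extern struct pv_hashed  *pv_hash_table[];
extern uint32_t           pv_hash(void *pmap, uint64_t va);

/*
 * Find the mapping [pmap, va] of physical page ppn: check the rooted
 * entry first (the common, unaliased case), then walk the short
 * [pmap, va] hash chain.
 */
void *
pv_lookup(void *pmap, uint64_t va, uint32_t ppn)
{
	struct pv_rooted *pv_h = &pv_head_table[ppn];

	if (pv_h->pmap == pmap && pv_h->va == va) {
		return pv_h;
	}
	for (struct pv_hashed *e = pv_hash_table[pv_hash(pmap, va)];
	    e != NULL; e = e->nexth) {
		if (e->pmap == pmap && e->va == va && e->ppn == ppn) {
			return e;
		}
	}
	return NULL;
}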
+ * + */ typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */ - queue_head_t qlink; - vm_map_offset_t va_and_flags; /* virtual address for mapping */ - pmap_t pmap; /* pmap where mapping lies */ + queue_head_t qlink; + vm_map_offset_t va_and_flags; /* virtual address for mapping */ + pmap_t pmap; /* pmap where mapping lies */ } *pv_rooted_entry_t; -#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) +#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */ - queue_head_t qlink; - vm_map_offset_t va_and_flags; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; + queue_head_t qlink; + vm_map_offset_t va_and_flags; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; } *pv_hashed_entry_t; #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) -#define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK) +#define PVE_VA(pve) ((pve)->va_and_flags & (vm_map_offset_t)~PAGE_MASK) #define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK) #define PVE_IS_ALTACCT 0x001 #define PVE_IS_ALTACCT_PAGE(pve) \ @@ -255,45 +290,47 @@ typedef struct pv_hashed_entry { #define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 #define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 -extern volatile uint32_t mappingrecurse; +extern volatile uint32_t mappingrecurse; extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; /* * PV hash locking */ -#define LOCK_PV_HASH(hash) lock_hash_hash(hash) -#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) +#define LOCK_PV_HASH(hash) lock_hash_hash(hash) +#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) extern uint32_t npvhashmask; -extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ -extern pv_hashed_entry_t pv_hashed_free_list; -extern pv_hashed_entry_t pv_hashed_kern_free_list; -decl_simple_lock_data(extern, pv_hashed_free_list_lock) -decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) -decl_simple_lock_data(extern, pv_hash_table_lock) -decl_simple_lock_data(extern, phys_backup_lock) - -extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry - * structures */ - -extern uint32_t pv_hashed_free_count; -extern uint32_t pv_hashed_kern_free_count; +extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ +extern pv_hashed_entry_t pv_hashed_free_list; +extern pv_hashed_entry_t pv_hashed_kern_free_list; +decl_simple_lock_data(extern, pv_hashed_free_list_lock); +decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock); +decl_simple_lock_data(extern, pv_hash_table_lock); +decl_simple_lock_data(extern, phys_backup_lock); + +extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry + * structures */ + +extern uint32_t pv_hashed_free_count; +extern uint32_t pv_hashed_kern_free_count; /* * Each entry in the pv_head_table is locked by a bit in the * pv_lock_table. The lock bits are accessed by the address of * the frame they lock. 
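Packing the locks as single bits keeps the table dense: one bit per physical page rather than one lock word. A stand-alone C11 approximation of the bit_lock()/bit_unlock() pair (the kernel versions additionally disable preemption, as LOCK_PVH below shows):

#include <stdatomic.h>
#include <stdint.h>

_Atomic uint32_t pv_lock_words[1024];	/* sized per pv_lock_table_size() */

void
bit_lock_sketch(unsigned int idx)
{
	_Atomic uint32_t *w = &pv_lock_words[idx / 32];
	uint32_t bit = 1u << (idx % 32);

	/* Spin until we are the thread that set the bit. */
	while (atomic_fetch_or_explicit(w, bit, memory_order_acquire) & bit) {
		/* cpu_pause()/yield in real code */
	}
}

void
bit_unlock_sketch(unsigned int idx)
{
	_Atomic uint32_t *w = &pv_lock_words[idx / 32];
	uint32_t bit = 1u << (idx % 32);

	atomic_fetch_and_explicit(w, ~bit, memory_order_release);
}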
*/ -#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) -extern char *pv_lock_table; /* pointer to array of bits */ -extern char *pv_hash_lock_table; -extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ +extern char *pv_lock_table; /* pointer to array of bits */ +extern char *pv_hash_lock_table; +extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ extern event_t mapping_replenish_event; -static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { +static inline void +PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) +{ pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); - simple_lock(&pv_hashed_free_list_lock); + simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL); /* If the kernel reserved pool is low, let non-kernel mappings allocate * synchronously, possibly subject to a throttle. */ @@ -305,24 +342,29 @@ static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { simple_unlock(&pv_hashed_free_list_lock); if (pv_hashed_free_count <= pv_hashed_low_water_mark) { - if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) { thread_wakeup(&mapping_replenish_event); + } } } -static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { - simple_lock(&pv_hashed_free_list_lock); +static inline void +PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) +{ + simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL); pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; pv_hashed_free_list = pvh_eh; - pv_hashed_free_count += pv_cnt; + pv_hashed_free_count += (uint32_t)pv_cnt; simple_unlock(&pv_hashed_free_list_lock); } extern unsigned pmap_kern_reserve_alloc_stat; -static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { +static inline void +PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) +{ pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); - simple_lock(&pv_hashed_kern_free_list_lock); + simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL); if ((*pvh_e = pv_hashed_kern_free_list) != 0) { pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; @@ -333,23 +375,28 @@ static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { simple_unlock(&pv_hashed_kern_free_list_lock); if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { - if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) { thread_wakeup(&mapping_replenish_event); + } } } -static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { - simple_lock(&pv_hashed_kern_free_list_lock); +static inline void +PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) +{ + simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL); pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; pv_hashed_kern_free_list = pvh_eh; - pv_hashed_kern_free_count += pv_cnt; + pv_hashed_kern_free_count += (uint32_t)pv_cnt; simple_unlock(&pv_hashed_kern_free_list_lock); } extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; extern event_t pmap_user_pv_throttle_event; -static inline void pmap_pv_throttle(__unused pmap_t p) { +static inline void +pmap_pv_throttle(__unused pmap_t p) +{ pmap_assert(p != kernel_pmap); 
/* Apply throttle on non-kernel mappings */ if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { @@ -367,42 +414,44 @@ static inline void pmap_pv_throttle(__unused pmap_t p) { * Index into pv_head table, its lock bits, and the modify/reference and managed bits */ -#define pa_index(pa) (i386_btop(pa)) -#define ppn_to_pai(ppn) ((int)ppn) - -#define pai_to_pvh(pai) (&pv_head_table[pai]) -#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) -#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) -#define pvhash(idx) (&pv_hash_table[idx]) -#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) -#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) - -#define IS_MANAGED_PAGE(x) \ - ((unsigned int)(x) <= last_managed_page && \ - (pmap_phys_attributes[x] & PHYS_MANAGED)) -#define IS_INTERNAL_PAGE(x) \ - (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL)) -#define IS_REUSABLE_PAGE(x) \ - (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE)) -#define IS_ALTACCT_PAGE(x,pve) \ - (IS_MANAGED_PAGE((x)) && \ +#define pa_index(pa) (i386_btop(pa)) +#define ppn_to_pai(ppn) ((int)ppn) + +#define pai_to_pvh(pai) (&pv_head_table[pai]) +#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) +#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) +#define pvhash(idx) (&pv_hash_table[idx]) +#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) +#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) + +#define IS_MANAGED_PAGE(x) \ + ((unsigned int)(x) <= last_managed_page && \ + ((unsigned long long)pmap_phys_attributes[x] & PHYS_MANAGED)) +#define IS_INTERNAL_PAGE(x) \ + (IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_INTERNAL)) +#define IS_REUSABLE_PAGE(x) \ + (IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_REUSABLE)) +#define IS_ALTACCT_PAGE(x, pve) \ + (IS_MANAGED_PAGE((x)) && \ (PVE_IS_ALTACCT_PAGE((pve)))) /* * Physical page attributes. Copy bits from PTE definition. 
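These attribute bits live in pmap_phys_attributes[], one byte per physical page, and the IS_*_PAGE() macros above are range-guarded bit tests against that array. Schematically (constant values hypothetical; the kernel actually reuses PTE bit positions, as the defines below show):

#include <stdbool.h>
#include <stdint.h>

#define ATTR_MANAGED	0x01
#define ATTR_INTERNAL	0x02

extern uint8_t       phys_attributes[];	/* one byte per physical page */
extern unsigned int  last_managed_page;

bool
is_managed(unsigned int pai)
{
	/* Range check first: pages past last_managed_page are untracked. */
	return pai <= last_managed_page &&
	       (phys_attributes[pai] & ATTR_MANAGED) != 0;
}

bool
is_internal(unsigned int pai)
{
	return is_managed(pai) && (phys_attributes[pai] & ATTR_INTERNAL) != 0;
}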
*/ -#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ -#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ -#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ -#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ -#define PHYS_NCACHE INTEL_PTE_NCACHE -#define PHYS_PTA INTEL_PTE_PTA -#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) -#define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */ -#define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */ - -extern boolean_t pmap_disable_kheap_nx; -extern boolean_t pmap_disable_kstack_nx; +#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ +#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ +#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ +#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ +#define PHYS_NCACHE INTEL_PTE_NCACHE +#define PHYS_PAT INTEL_PTE_PAT +#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PAT | INTEL_PTE_NCACHE) +#define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */ +#define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */ + +#if DEVELOPMENT || DEBUG +extern boolean_t pmap_disable_kheap_nx; +extern boolean_t pmap_disable_kstack_nx; +#endif #define PMAP_EXPAND_OPTIONS_NONE (0x0) #define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) @@ -412,8 +461,7 @@ extern boolean_t pmap_disable_kstack_nx; * Amount of virtual memory mapped by one * page-directory entry. */ -#define PDE_MAPPED_SIZE (pdetova(1)) - +#define PDE_MAPPED_SIZE (pdetova(1)) /* * Locking and TLB invalidation @@ -444,24 +492,29 @@ extern boolean_t pmap_disable_kstack_nx; * PV locking */ -#define LOCK_PVH(index) { \ - mp_disable_preemption(); \ - lock_pvh_pai(index); \ +#define LOCK_PVH(index) { \ + mp_disable_preemption(); \ + lock_pvh_pai(index); \ } -#define UNLOCK_PVH(index) { \ - unlock_pvh_pai(index); \ - mp_enable_preemption(); \ +#define UNLOCK_PVH(index) { \ + unlock_pvh_pai(index); \ + mp_enable_preemption(); \ } extern uint64_t pde_mapped_size; -extern char *pmap_phys_attributes; -extern ppnum_t last_managed_page; +extern char *pmap_phys_attributes; +extern ppnum_t last_managed_page; -extern ppnum_t lowest_lo; -extern ppnum_t lowest_hi; -extern ppnum_t highest_hi; +/* + * Used to record high memory allocated to kernel before + * pmap_init() gets called. 
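Returning to the PHYS_PTA → PHYS_PAT rename above: it tracks Intel's own terminology, where the PWT, PCD, and PAT bits of a 4-KByte PTE jointly select one of the eight IA32_PAT memory-type entries. How the three bits form that index (bit positions per the Intel SDM):

#include <stdint.h>

#define PTE_PWT	(1ULL << 3)	/* write-through */
#define PTE_PCD	(1ULL << 4)	/* cache disable */
#define PTE_PAT	(1ULL << 7)	/* PAT select, 4-KByte PTEs */

/* PAT index = PAT:PCD:PWT, choosing one of 8 IA32_PAT entries. */
unsigned int
pte_pat_index(uint64_t pte)
{
	return (unsigned int)(((pte & PTE_PAT) ? 4 : 0) |
	    ((pte & PTE_PCD) ? 2 : 0) |
	    ((pte & PTE_PWT) ? 1 : 0));
}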
+ */ +extern ppnum_t pmap_high_used_top; +extern ppnum_t pmap_high_used_bottom; +extern ppnum_t pmap_middle_used_top; +extern ppnum_t pmap_middle_used_bottom; /* * when spinning through pmap_remove @@ -478,24 +531,25 @@ extern uint64_t max_preemption_latency_tsc; #endif #if PMAP_INTR_DEBUG -#define pmap_intr_assert() { \ - if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ - panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \ +#define pmap_intr_assert() { \ + if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ + panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \ } #else #define pmap_intr_assert() #endif - -extern int nx_enabled; +#if DEVELOPMENT || DEBUG +extern int nx_enabled; +#endif extern unsigned int inuse_ptepages_count; static inline uint32_t pvhashidx(pmap_t pmap, vm_map_offset_t va) { uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^ - ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & - npvhashmask; - return hashidx; + ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & + npvhashmask; + return hashidx; } /* @@ -503,12 +557,12 @@ pvhashidx(pmap_t pmap, vm_map_offset_t va) * properly deals with the anchor. * must be called with the hash locked, does not unlock it */ -static inline void +static inline void pmap_pvh_unlink(pv_hashed_entry_t pvh) { - pv_hashed_entry_t curh; - pv_hashed_entry_t *pprevh; - int pvhash_idx; + pv_hashed_entry_t curh; + pv_hashed_entry_t *pprevh; + uint32_t pvhash_idx; CHK_NPVHASH(); pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh)); @@ -516,28 +570,32 @@ pmap_pvh_unlink(pv_hashed_entry_t pvh) pprevh = pvhash(pvhash_idx); #if PV_DEBUG - if (NULL == *pprevh) + if (NULL == *pprevh) { panic("pvh_unlink null anchor"); /* JK DEBUG */ + } #endif curh = *pprevh; while (PV_HASHED_ENTRY_NULL != curh) { - if (pvh == curh) + if (pvh == curh) { break; + } pprevh = &curh->nexth; curh = curh->nexth; } - if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh"); + if (PV_HASHED_ENTRY_NULL == curh) { + panic("pmap_pvh_unlink no pvh"); + } *pprevh = pvh->nexth; return; } static inline void -pv_hash_add(pv_hashed_entry_t pvh_e, - pv_rooted_entry_t pv_h) +pv_hash_add(pv_hashed_entry_t pvh_e, + pv_rooted_entry_t pv_h) { pv_hashed_entry_t *hashp; - int pvhash_idx; + uint32_t pvhash_idx; CHK_NPVHASH(); pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); @@ -545,8 +603,9 @@ pv_hash_add(pv_hashed_entry_t pvh_e, insque(&pvh_e->qlink, &pv_h->qlink); hashp = pvhash(pvhash_idx); #if PV_DEBUG - if (NULL==hashp) + if (NULL == hashp) { panic("pv_hash_add(%p) null hash bucket", pvh_e); + } #endif pvh_e->nexth = *hashp; *hashp = pvh_e; @@ -556,18 +615,20 @@ pv_hash_add(pv_hashed_entry_t pvh_e, static inline void pv_hash_remove(pv_hashed_entry_t pvh_e) { - int pvhash_idx; + uint32_t pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,PVE_VA(pvh_e)); + pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); remque(&pvh_e->qlink); pmap_pvh_unlink(pvh_e); UNLOCK_PV_HASH(pvhash_idx); -} +} -static inline boolean_t popcnt1(uint64_t distance) { - return ((distance & (distance - 1)) == 0); +static inline boolean_t +popcnt1(uint64_t distance) +{ + return (distance & (distance - 1)) == 0; } /* @@ -587,13 +648,14 @@ static inline boolean_t popcnt1(uint64_t distance) { */ typedef enum { - PTE_VALID = 0x0, - PTE_INVALID = 0x1, - PTE_RSVD = 0x2, - PTE_SUPERVISOR = 0x4, - PTE_BITFLIP = 0x8, - PV_BITFLIP = 0x10, - PTE_INVALID_CACHEABILITY = 0x20 + PTE_VALID = 0x0, + PTE_INVALID = 
0x1, + PTE_RSVD = 0x2, + PTE_SUPERVISOR = 0x4, + PTE_BITFLIP = 0x8, + PV_BITFLIP = 0x10, + PTE_INVALID_CACHEABILITY = 0x20, + PTE_NXBITFLIP = 0x40 } pmap_pagetable_corruption_t; typedef enum { @@ -602,13 +664,13 @@ typedef enum { } pmap_pv_assertion_t; typedef enum { - PMAP_ACTION_IGNORE = 0x0, - PMAP_ACTION_ASSERT = 0x1, - PMAP_ACTION_RETRY = 0x2, + PMAP_ACTION_IGNORE = 0x0, + PMAP_ACTION_ASSERT = 0x1, + PMAP_ACTION_RETRY = 0x2, PMAP_ACTION_RETRY_RELOCK = 0x4 } pmap_pagetable_corruption_action_t; -#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) +#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) extern uint64_t pmap_pagetable_corruption_interval_abstime; extern uint32_t pmap_pagetable_corruption_incidents; @@ -617,23 +679,38 @@ typedef struct { pmap_pv_assertion_t incident; pmap_pagetable_corruption_t reason; pmap_pagetable_corruption_action_t action; - pmap_t pmap; + pmap_t pmap; vm_map_offset_t vaddr; pt_entry_t pte; ppnum_t ppn; pmap_t pvpmap; vm_map_offset_t pvva; uint64_t abstime; + int adj_ptes_count; +#define PMPTCR_MAX_ADJ_PTES (2) + uint64_t adj_ptes[PMPTCR_MAX_ADJ_PTES]; } pmap_pagetable_corruption_record_t; extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; extern uint64_t pmap_pagetable_corruption_last_abstime; -extern thread_call_t pmap_pagetable_corruption_log_call; +extern thread_call_t pmap_pagetable_corruption_log_call; extern boolean_t pmap_pagetable_corruption_timeout; -static inline void -pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) { +static inline pmap_pagetable_corruption_action_t +pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, + pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, + ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva, int adj_pteps_cnt, uint64_t **adj_pteps) +{ uint32_t pmap_pagetable_corruption_log_index; + uint64_t curtime = mach_absolute_time(); + + if ((curtime - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { + pmap_pagetable_corruption_timeout = TRUE; + action = PMAP_ACTION_ASSERT; + } else { + pmap_pagetable_corruption_last_abstime = curtime; + } + pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; @@ -644,22 +721,31 @@ pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corru pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; - pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = curtime; + if (adj_pteps_cnt > 0 && adj_pteps != NULL) { + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count = MIN(adj_pteps_cnt, PMPTCR_MAX_ADJ_PTES); + for (int i = 0; i < 
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count; i++) { + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes[i] = *adj_pteps[i]; + } + } /* Asynchronously log */ thread_call_enter(pmap_pagetable_corruption_log_call); + + return action; } static inline pmap_pagetable_corruption_action_t -pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { - pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; - pmap_pagetable_corruption_t suppress_reason = PTE_VALID; - ppnum_t suppress_ppn = 0; +pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) +{ + pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; + pmap_pagetable_corruption_t suppress_reason = PTE_VALID; + ppnum_t suppress_ppn = 0; pt_entry_t cpte = *ptep; - ppnum_t cpn = pa_index(pte_to_pa(cpte)); - ppnum_t ppn = *ppnp; - pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); - pv_rooted_entry_t pv_e = pv_h; - uint32_t bitdex; + ppnum_t cpn = pa_index(pte_to_pa(cpte)); + ppnum_t ppn = *ppnp; + pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); + pv_rooted_entry_t pv_e = pv_h; + uint32_t bitdex; pmap_t pvpmap = pv_h->pmap; vm_map_offset_t pvva = PVE_VA(pv_h); vm_map_offset_t pve_flags; @@ -722,15 +808,13 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * * Check for malformed/inconsistent entries. * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0 */ - if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) { + if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PAT)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) { action = PMAP_ACTION_IGNORE; suppress_reason = PTE_INVALID_CACHEABILITY; - } - else if (cpte & INTEL_PTE_RSVD) { + } else if (cpte & INTEL_PTE_RSVD) { action = PMAP_ACTION_IGNORE; suppress_reason = PTE_RSVD; - } - else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) { + } else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) { action = PMAP_ACTION_IGNORE; suppress_reason = PTE_SUPERVISOR; } @@ -741,16 +825,49 @@ pmap_cpc_exit: action = PMAP_ACTION_ASSERT; } - if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { - action = PMAP_ACTION_ASSERT; - pmap_pagetable_corruption_timeout = TRUE; + return pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva, 0, 0); +} + +static inline boolean_t +pmap_compressed_pte_corruption_repair(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep, pmap_t pmap, + vm_map_offset_t vaddr) +{ + uint64_t *adj_pteps[2]; + int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t); + pmap_pagetable_corruption_action_t action = PMAP_ACTION_IGNORE; + + /* + * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of + * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous + * 2 PTEs). 
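Both this repair path and the classifier above lean on the same heuristic: a value that differs from its expected value in exactly one bit is more plausibly a hardware bit flip than software corruption. popcnt1() (defined earlier in this header) captures the test; in isolation:

#include <stdbool.h>
#include <stdint.h>

/* True when at most one bit is set in 'distance', i.e. the two values
 * XOR'd to produce it differ in at most one bit position. */
bool
popcnt1_sketch(uint64_t distance)
{
	return (distance & (distance - 1)) == 0;
}

/* E.g.: does a found physical frame differ from the expected one by a
 * single flipped bit? */
bool
looks_like_bitflip(uint64_t found_pa, uint64_t expected_pa)
{
	return found_pa != expected_pa &&
	       popcnt1_sketch(found_pa ^ expected_pa);
}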
+ */ + if (pteidx == 0) { + adj_pteps[0] = ptep + 1; + adj_pteps[1] = ptep + 2; + } else if (pteidx == (NPTPG - 1)) { + adj_pteps[0] = ptep - 2; + adj_pteps[1] = ptep - 1; + } else { + adj_pteps[0] = ptep - 1; + adj_pteps[1] = ptep + 1; } - else - { - pmap_pagetable_corruption_last_abstime = mach_absolute_time(); + + /* + * Since the compressed PTE no longer has a PTE associated, we cannot pass in the pv data to + * pmap_pagetable_corruption_log, so instead supply adjacent PTEs for logging. + */ + if (pmap_pagetable_corruption_log(ROOT_ABSENT, (pte & INTEL_PTE_NX) ? PTE_NXBITFLIP : PTE_BITFLIP, + action, pmap, vaddr, ptep, (ppnum_t)~0UL, 0, 0, sizeof(adj_pteps) / sizeof(adj_pteps[0]), + adj_pteps) != PMAP_ACTION_ASSERT) { + /* Correct the flipped bit(s) and continue */ + pmap_store_pte(ptep, pte & INTEL_PTE_COMPRESSED_MASK); + pmap->corrected_compressed_ptes_count++; + return TRUE; /* Returning TRUE to indicate this is a now a valid compressed PTE (we hope) */ } - pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); - return action; + + panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p", + pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]); + /*NOTREACHED*/ } /* @@ -759,18 +876,18 @@ pmap_cpc_exit: * Returns pv entry to be freed (or NULL). */ static inline __attribute__((always_inline)) pv_hashed_entry_t -pmap_pv_remove(pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t *ppnp, - pt_entry_t *pte, - boolean_t *was_altacct) +pmap_pv_remove(pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t *ppnp, + pt_entry_t *pte, + boolean_t *was_altacct) { pv_hashed_entry_t pvh_e; - pv_rooted_entry_t pv_h; - pv_hashed_entry_t *pprevh; - int pvhash_idx; + pv_rooted_entry_t pv_h; + pv_hashed_entry_t *pprevh; + uint32_t pvhash_idx; uint32_t pv_cnt; - ppnum_t ppn; + ppnum_t ppn; *was_altacct = FALSE; pmap_pv_remove_retry: @@ -780,28 +897,28 @@ pmap_pv_remove_retry: if (__improbable(pv_h->pmap == PMAP_NULL)) { pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT); - if (pac == PMAP_ACTION_IGNORE) + if (pac == PMAP_ACTION_IGNORE) { goto pmap_pv_remove_exit; - else if (pac == PMAP_ACTION_ASSERT) + } else if (pac == PMAP_ACTION_ASSERT) { panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents); - else if (pac == PMAP_ACTION_RETRY_RELOCK) { + } else if (pac == PMAP_ACTION_RETRY_RELOCK) { LOCK_PVH(ppn_to_pai(*ppnp)); pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); goto pmap_pv_remove_retry; - } - else if (pac == PMAP_ACTION_RETRY) + } else if (pac == PMAP_ACTION_RETRY) { goto pmap_pv_remove_retry; + } } if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h); /* - * Header is the pv_rooted_entry. + * Header is the pv_rooted_entry. * We can't free that. If there is a queued - * entry after this one we remove that - * from the ppn queue, we remove it from the hash chain - * and copy it to the rooted entry. Then free it instead. - */ + * entry after this one we remove that + * from the ppn queue, we remove it from the hash chain + * and copy it to the rooted entry. Then free it instead. 
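Stripped of the queue and hash bookkeeping, the promotion just described — copy the first alias into the permanent rooted slot, then free that alias's node — looks like this (a schematic with hypothetical types, not the kernel code that follows):

#include <stddef.h>
#include <stdint.h>

struct pve {
	struct pve *qnext;	/* per-ppn queue; the rooted entry anchors it */
	uint64_t    va;
	void       *pmap;
};

/*
 * Remove the mapping held in the rooted entry. The rooted slot itself is
 * permanent, so promote the first hashed entry (if any) into it and hand
 * that node back to the caller to free.
 */
struct pve *
pv_remove_rooted(struct pve *root)
{
	struct pve *alias = root->qnext;

	if (alias == NULL) {		/* no aliases: just clear the slot */
		root->pmap = NULL;
		return NULL;
	}
	/* ...unlink 'alias' from its hash chain here, then: */
	root->va    = alias->va;
	root->pmap  = alias->pmap;
	root->qnext = alias->qnext;
	return alias;			/* freed by the caller */
}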
+ */ pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); if (pv_h != (pv_rooted_entry_t) pvh_e) { /* @@ -815,7 +932,7 @@ pmap_pv_remove_retry: pprevh = pvhash(pvhash_idx); if (PV_HASHED_ENTRY_NULL == *pprevh) { panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): " - "empty hash, removing rooted, priors: %d", + "empty hash, removing rooted, priors: %d", pmap, vaddr, ppn, pmap_pagetable_corruption_incidents); } pmap_pvh_unlink(pvh_e); @@ -848,8 +965,9 @@ pmap_pv_remove_retry: pv_cnt++; if (pvh_e->pmap == pmap && PVE_VA(pvh_e) == vaddr && - pvh_e->ppn == ppn) + pvh_e->ppn == ppn) { break; + } pprevh = &pvh_e->nexth; pvh_e = pvh_e->nexth; } @@ -857,19 +975,17 @@ pmap_pv_remove_retry: if (PV_HASHED_ENTRY_NULL == pvh_e) { pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); - if (pac == PMAP_ACTION_ASSERT) + if (pac == PMAP_ACTION_ASSERT) { panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents); - else { + } else { UNLOCK_PV_HASH(pvhash_idx); if (pac == PMAP_ACTION_RETRY_RELOCK) { LOCK_PVH(ppn_to_pai(*ppnp)); pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); goto pmap_pv_remove_retry; - } - else if (pac == PMAP_ACTION_RETRY) { + } else if (pac == PMAP_ACTION_RETRY) { goto pmap_pv_remove_retry; - } - else if (pac == PMAP_ACTION_IGNORE) { + } else if (pac == PMAP_ACTION_IGNORE) { goto pmap_pv_remove_exit; } } @@ -878,8 +994,9 @@ pmap_pv_remove_retry: *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e); pmap_pv_hashlist_cnts += pv_cnt; - if (pmap_pv_hashlist_max < pv_cnt) + if (pmap_pv_hashlist_max < pv_cnt) { pmap_pv_hashlist_max = pv_cnt; + } *pprevh = pvh_e->nexth; remque(&pvh_e->qlink); UNLOCK_PV_HASH(pvhash_idx); @@ -890,14 +1007,14 @@ pmap_pv_remove_exit: static inline __attribute__((always_inline)) boolean_t pmap_pv_is_altacct( - pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t ppn) + pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t ppn) { pv_hashed_entry_t pvh_e; - pv_rooted_entry_t pv_h; - int pvhash_idx; - boolean_t is_altacct; + pv_rooted_entry_t pv_h; + uint32_t pvhash_idx; + boolean_t is_altacct; pvh_e = PV_HASHED_ENTRY_NULL; pv_h = pai_to_pvh(ppn_to_pai(ppn)); @@ -908,8 +1025,8 @@ pmap_pv_is_altacct( if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { /* - * Header is the pv_rooted_entry. - */ + * Header is the pv_rooted_entry. 
+ */ return IS_ALTACCT_PAGE(ppn, pv_h); } @@ -917,15 +1034,12 @@ pmap_pv_is_altacct( pvhash_idx = pvhashidx(pmap, vaddr); LOCK_PV_HASH(pvhash_idx); pvh_e = *(pvhash(pvhash_idx)); - if (PV_HASHED_ENTRY_NULL == pvh_e) { - panic("Possible memory corruption: pmap_pv_is_altacct(%p,0x%llx,0x%x): empty hash", - pmap, vaddr, ppn); - } while (PV_HASHED_ENTRY_NULL != pvh_e) { if (pvh_e->pmap == pmap && PVE_VA(pvh_e) == vaddr && - pvh_e->ppn == ppn) + pvh_e->ppn == ppn) { break; + } pvh_e = pvh_e->nexth; } if (PV_HASHED_ENTRY_NULL == pvh_e) { @@ -938,96 +1052,84 @@ pmap_pv_is_altacct( return is_altacct; } -extern int pt_fake_zone_index; static inline void PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) { - pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); + pmap_ledger_credit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes); } static inline void PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) { - pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes); + pmap_ledger_debit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes); } static inline void PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes) { - pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes); + pmap_ledger_credit(pmap, task_ledgers.tkm_shared, (ledger_amount_t)bytes); } static inline void PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes) { - pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes); + pmap_ledger_debit(pmap, task_ledgers.tkm_shared, (ledger_amount_t)bytes); } -extern boolean_t pmap_initialized;/* Has pmap_init completed? */ +extern boolean_t pmap_initialized;/* Has pmap_init completed? */ #define valid_page(x) (pmap_initialized && pmap_valid_page(x)) -int phys_attribute_test( - ppnum_t phys, - int bits); -void phys_attribute_clear( - ppnum_t phys, - int bits, - unsigned int options, - void *arg); +int phys_attribute_test( + ppnum_t phys, + int bits); +void phys_attribute_clear( + ppnum_t phys, + int bits, + unsigned int options, + void *arg); //#define PCID_DEBUG 1 -#if PCID_DEBUG -#define pmap_pcid_log(fmt, args...) \ - do { \ - kprintf(fmt, ##args); \ - printf(fmt, ##args); \ +#if PCID_DEBUG +#define pmap_pcid_log(fmt, args...) \ + do { \ + kprintf(fmt, ##args); \ + printf(fmt, ##args); \ } while(0) #else #define pmap_pcid_log(fmt, args...) #endif -void pmap_pcid_configure(void); +void pmap_pcid_configure(void); /* * Atomic 64-bit compare and exchange of a page table entry. */ + +#include static inline boolean_t pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) { - boolean_t ret; - - /* - * Load the old value into %rax - * Load the new value into another register - * Compare-exchange-quad at address entryp - * If the compare succeeds, the new value is stored, return TRUE. - * Otherwise, no swap is made, return FALSE. 
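This hand-rolled cmpxchgq sequence is exactly what the replacement below expresses with a compiler builtin. The read-modify-write loop built on top of it (pmap_update_pte(), further down) has this shape in portable C11 — a user-space sketch:

#include <stdatomic.h>
#include <stdint.h>

typedef uint64_t pt_entry_t;

/*
 * Atomically clear 'pclear_bits' and set 'pset_bits' in *ptep, retrying
 * if another CPU updates the entry between the load and the exchange.
 */
void
update_pte_sketch(_Atomic pt_entry_t *ptep, pt_entry_t pclear_bits,
    pt_entry_t pset_bits)
{
	pt_entry_t opte, npte;

	do {
		opte = atomic_load_explicit(ptep, memory_order_relaxed);
		if (opte == 0) {
			return;		/* entry vanished; nothing to update */
		}
		npte = (opte & ~pclear_bits) | pset_bits;
	} while (!atomic_compare_exchange_strong_explicit(ptep, &opte, npte,
	    memory_order_acq_rel, memory_order_relaxed));
}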
- */ - asm volatile( - " lock; cmpxchgq %2,(%3) \n\t" - " setz %%al \n\t" - " movzbl %%al,%0" - : "=a" (ret) - : "a" (old), - "r" (new), - "r" (entryp) - : "memory"); - return ret; + return __c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)entryp, &old, new, + memory_order_acq_rel_smp, memory_order_relaxed); } extern uint32_t pmap_update_clear_pte_count; -static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) { +static inline void +pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) +{ pt_entry_t npte, opte; do { opte = *mptep; if (__improbable(opte == 0)) { +#if DEVELOPMENT || DEBUG pmap_update_clear_pte_count++; +#endif break; } npte = opte & ~(pclear_bits); npte |= pset_bits; - } while (!pmap_cmpx_pte(mptep, opte, npte)); + } while (!pmap_cmpx_pte(mptep, opte, npte)); } /* @@ -1040,14 +1142,14 @@ pml4_entry_t * pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) { if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && - (vaddr < 0xFFFF800000000000ULL))) { - return (NULL); + (vaddr < 0xFFFF800000000000ULL))) { + return NULL; } -#if DEBUG - return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); +#if DEBUG + return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]); #else - return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; + return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]; #endif } @@ -1055,14 +1157,14 @@ static inline pml4_entry_t * pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr) { if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && - (vaddr < 0xFFFF800000000000ULL))) { - return (NULL); + (vaddr < 0xFFFF800000000000ULL))) { + return NULL; } -#if DEBUG - return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); +#if DEBUG + return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]); #else - return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; + return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]; #endif } @@ -1072,9 +1174,9 @@ pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr) static inline pdpt_entry_t * pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) { - pml4_entry_t newpf; - pml4_entry_t *pml4; - boolean_t is_ept; + pml4_entry_t newpf; + pml4_entry_t *pml4; + boolean_t is_ept; pml4 = pmap64_pml4(pmap, vaddr); is_ept = is_ept_pmap(pmap); @@ -1082,39 +1184,53 @@ pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) { newpf = *pml4 & PG_FRAME; return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) - [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; + [(vaddr >> PDPTSHIFT) & (NPDPTPG - 1)]; } - return (NULL); + return NULL; } /* * Returns the address of the requested PDE entry in the physmap. 
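Every level of the walk above repeats the same two steps: mask the next table's frame out of the current entry, then index it with nine bits of the virtual address. The index arithmetic and the canonical-address guard, stated directly (x86-64 4-level paging; PHYSMAP_PTOV is the kernel's physical-memory window and is not modeled here):

#include <stdbool.h>
#include <stdint.h>

#define PG_FRAME_SKETCH	0x000FFFFFFFFFF000ULL	/* frame mask */

static inline unsigned int pml4_index(uint64_t va) { return (va >> 39) & 511; }
static inline unsigned int pdpt_index(uint64_t va) { return (va >> 30) & 511; }
static inline unsigned int pde_index(uint64_t va)  { return (va >> 21) & 511; }
static inline unsigned int pte_index(uint64_t va)  { return (va >> 12) & 511; }

/*
 * Non-canonical addresses (the hole between the user and kernel halves)
 * must be rejected before walking, as pmap64_pml4() does above.
 */
static inline bool
va_is_canonical(uint64_t va)
{
	return va <= 0x00007FFFFFFFFFFFULL || va >= 0xFFFF800000000000ULL;
}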
*/ static inline pd_entry_t * -pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) +pmap_pde_internal1(vm_map_offset_t vaddr, boolean_t is_ept, pdpt_entry_t *pdpte) { - pdpt_entry_t newpf; - pdpt_entry_t *pdpt; - boolean_t is_ept; + if (*pdpte & PTE_VALID_MASK(is_ept)) { + pdpt_entry_t newpf = *pdpte & PG_FRAME; + return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) + [(vaddr >> PDSHIFT) & (NPDPG - 1)]; + } else { + return NULL; + } +} - pdpt = pmap64_pdpt(pmap, vaddr); - is_ept = is_ept_pmap(pmap); +static inline pd_entry_t * +pmap_pde_internal0(pmap_t pmap, vm_map_offset_t vaddr, boolean_t is_ept) +{ + pdpt_entry_t *pdpt; - if (pdpt && (*pdpt & PTE_VALID_MASK(is_ept))) { - newpf = *pdpt & PG_FRAME; - return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) - [(vaddr >> PDSHIFT) & (NPDPG-1)]; + pdpt = pmap64_pdpt(pmap, vaddr); + if (pdpt) { + return pmap_pde_internal1(vaddr, is_ept, pdpt); + } else { + return NULL; } - return (NULL); } -static inline pd_entry_t * -pmap_pde(pmap_t m, vm_map_offset_t v) + +static inline pd_entry_t * +pmap_pde(pmap_t pmap, vm_map_offset_t vaddr) { - pd_entry_t *pde; + pdpt_entry_t *pdpt; + boolean_t is_ept; - pde = pmap64_pde(m, v); + pdpt = pmap64_pdpt(pmap, vaddr); + is_ept = is_ept_pmap(pmap); - return pde; + if (pdpt) { + return pmap_pde_internal1(vaddr, is_ept, pdpt); + } else { + return NULL; + } } @@ -1124,36 +1240,51 @@ pmap_pde(pmap_t m, vm_map_offset_t v) * In case the pde maps a superpage, return the pde, which, in this case * is the actual page table entry. */ + + +static inline pt_entry_t * +pmap_pte_internal(vm_map_offset_t vaddr, boolean_t is_ept, pd_entry_t *pde) +{ + if (*pde & PTE_VALID_MASK(is_ept)) { + if (__improbable(*pde & PTE_PS)) { + return pde; + } + pd_entry_t newpf = *pde & PG_FRAME; + + return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) + [i386_btop(vaddr) & (ppnum_t)(NPTEPG - 1)]; + } else { + return NULL; + } +} + static inline pt_entry_t * pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) { - pd_entry_t *pde; - pd_entry_t newpf; - boolean_t is_ept; + pd_entry_t *pde; - assert(pmap); - pde = pmap64_pde(pmap, vaddr); + boolean_t is_ept; is_ept = is_ept_pmap(pmap); - if (pde && (*pde & PTE_VALID_MASK(is_ept))) { - if (*pde & PTE_PS) - return pde; - newpf = *pde & PG_FRAME; - return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) - [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; + pde = pmap_pde_internal0(pmap, vaddr, is_ept); + + if (pde) { + return pmap_pte_internal(vaddr, is_ept, pde); + } else { + return NULL; } - return (NULL); } -extern void pmap_alias( - vm_offset_t ava, - vm_map_offset_t start, - vm_map_offset_t end, - vm_prot_t prot, - unsigned int options); - -#if DEBUG -#define DPRINTF(x...) kprintf(x) + +extern void pmap_alias( + vm_offset_t ava, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t prot, + unsigned int options); + +#if DEBUG +#define DPRINTF(x...) kprintf(x) #else #define DPRINTF(x...) #endif
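One subtlety in pmap_pte_internal() above deserves spelling out: when the PDE has PTE_PS set, the PDE itself is the leaf mapping (a 2 MB superpage), so it is returned in place of a 4 KB PTE. Reduced to a toy with caller-provided tables (the kernel reaches them through PHYSMAP_PTOV instead):

#include <stddef.h>
#include <stdint.h>

#define PTE_PRESENT_SKETCH	0x01ULL
#define PTE_PS_SKETCH		0x80ULL	/* 2 MB page when set in a PDE */

/*
 * Return the entry that actually maps 'va': the PDE itself for a
 * superpage, otherwise the 4 KB PTE in the leaf page table.
 */
uint64_t *
leaf_entry(uint64_t *pde, uint64_t *pt, uint64_t va)
{
	if ((*pde & PTE_PRESENT_SKETCH) == 0) {
		return NULL;
	}
	if (*pde & PTE_PS_SKETCH) {
		return pde;		/* the PDE is the mapping itself */
	}
	return &pt[(va >> 12) & 511];
}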