/*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
+ *
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
- *
+ *
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
+ *
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
- *
+ *
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
-#ifndef _I386_PMAP_INTERNAL_
+#ifndef _I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_
#ifdef MACH_KERNEL_PRIVATE
* pmap locking
*/
-#define PMAP_LOCK(pmap) { \
- simple_lock(&(pmap)->lock); \
+static inline void
+PMAP_LOCK_EXCLUSIVE(pmap_t p)
+{
+ mp_disable_preemption();
+ lck_rw_lock_exclusive(&p->pmap_rwl);
}
-#define PMAP_UNLOCK(pmap) { \
- simple_unlock(&(pmap)->lock); \
+static inline void
+PMAP_LOCK_SHARED(pmap_t p)
+{
+ mp_disable_preemption();
+ lck_rw_lock_shared(&p->pmap_rwl);
}
-#define PMAP_UPDATE_TLBS(pmap, s, e) \
- pmap_flush_tlbs(pmap, s, e, 0, NULL)
-
+static inline void
+PMAP_LOCK_SHARED_TO_EXCLUSIVE(pmap_t p)
+{
+ lck_rw_lock_shared_to_exclusive(&p->pmap_rwl);
+}
-#define PMAP_DELAY_TLB_FLUSH 0x01
+static inline void
+PMAP_LOCK_EXCLUSIVE_TO_SHARED(pmap_t p)
+{
+ lck_rw_lock_exclusive_to_shared(&p->pmap_rwl);
+}
-#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \
- pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)
+static inline void
+PMAP_UNLOCK_EXCLUSIVE(pmap_t p)
+{
+ lck_rw_unlock_exclusive(&p->pmap_rwl);
+ mp_enable_preemption();
+}
+static inline void
+PMAP_UNLOCK_SHARED(pmap_t p)
+{
+ lck_rw_unlock_shared(&p->pmap_rwl);
+ mp_enable_preemption();
+}
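+
+/*
+ * Illustrative usage (editor's sketch, not part of this change):
+ *
+ *     PMAP_LOCK_SHARED(pmap);
+ *     pte = pmap_pte(pmap, vaddr);
+ *     PMAP_UNLOCK_SHARED(pmap);
+ *
+ * Preemption remains disabled for the duration of the hold, so critical
+ * sections under these wrappers must stay short.
+ */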
-#define iswired(pte) ((pte) & INTEL_PTE_WIRED)
+#define iswired(pte) ((pte) & INTEL_PTE_WIRED)
-#ifdef PMAP_TRACES
-extern boolean_t pmap_trace;
-#define PMAP_TRACE(x,a,b,c,d,e) \
- if (pmap_trace) { \
- KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \
+#ifdef PMAP_TRACES
+extern boolean_t pmap_trace;
+#define PMAP_TRACE(...) \
+ if (pmap_trace) { \
+ KDBG_RELEASE(__VA_ARGS__); \
}
#else
-#define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e)
+#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__)
#endif /* PMAP_TRACES */
-#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e) \
- KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \
+#define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__)
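+
+/*
+ * Illustrative usage (editor's sketch; the PMAP_CODE()/PMAP__ENTER kdebug
+ * codes and VM_KERNEL_ADDRHIDE() are assumed from the surrounding headers):
+ *
+ *     PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
+ *         VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn);
+ */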
-kern_return_t pmap_expand_pml4(
- pmap_t map,
- vm_map_offset_t v,
- unsigned int options);
+kern_return_t pmap_expand_pml4(
+ pmap_t map,
+ vm_map_offset_t v,
+ unsigned int options);
-kern_return_t pmap_expand_pdpt(
- pmap_t map,
- vm_map_offset_t v,
- unsigned int options);
+kern_return_t pmap_expand_pdpt(
+ pmap_t map,
+ vm_map_offset_t v,
+ unsigned int options);
-void phys_attribute_set(
- ppnum_t phys,
- int bits);
+void phys_attribute_set(
+ ppnum_t phys,
+ int bits);
-void pmap_set_reference(
- ppnum_t pn);
+void pmap_set_reference(
+ ppnum_t pn);
-boolean_t phys_page_exists(
- ppnum_t pn);
+boolean_t phys_page_exists(
+ ppnum_t pn);
void
-pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);
+ pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);
void
-pmap_update_cache_attributes_locked(ppnum_t, unsigned);
+ pmap_update_cache_attributes_locked(ppnum_t, unsigned);
+
+
+static inline void
+PMAP_UPDATE_TLBS(pmap_t fp, addr64_t s, addr64_t e)
+{
+ pmap_flush_tlbs(fp, s, e, 0, NULL);
+}
+
+#define PMAP_DELAY_TLB_FLUSH 0x01
-extern const boolean_t cpu_64bit;
+static inline void
+PMAP_UPDATE_TLBS_DELAYED(pmap_t fp, addr64_t s, addr64_t e, pmap_flush_context *pfc)
+{
+ pmap_flush_tlbs(fp, s, e, PMAP_DELAY_TLB_FLUSH, pfc);
+}
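+
+/*
+ * Ordering sketch (editor's note): a PTE update only becomes visible to
+ * other CPUs once their TLBs are shot down, e.g.
+ *
+ *     pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);
+ *     PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
+ *
+ * The delayed variant batches the shootdown IPIs in a pmap_flush_context;
+ * the caller is then responsible for a later pmap_flush() of that context.
+ */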
/*
* Private data structures.
*/
/*
-
-PV HASHING Changes - JK 1/2007
-
-Pve's establish physical to virtual mappings. These are used for aliasing of a
-physical page to (potentially many) virtual addresses within pmaps. In the
-previous implementation the structure of the pv_entries (each 16 bytes in size) was
-
-typedef struct pv_entry {
- struct pv_entry_t next;
- pmap_t pmap;
- vm_map_offset_t va;
-} *pv_entry_t;
-
-An initial array of these is created at boot time, one per physical page of
-memory, indexed by the physical page number. Additionally, a pool of entries
-is created from a pv_zone to be used as needed by pmap_enter() when it is
-creating new mappings. Originally, we kept this pool around because the code
-in pmap_enter() was unable to block if it needed an entry and none were
-available - we'd panic. Some time ago I restructured the pmap_enter() code
-so that for user pmaps it can block while zalloc'ing a pv structure and restart,
-removing a panic from the code (in the case of the kernel pmap we cannot block
-and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
-The pool has not been removed since there is a large performance gain keeping
-freed pv's around for reuse and not suffering the overhead of zalloc for every
-new pv we need.
-
-As pmap_enter() created new mappings it linked the new pve's for them off the
-fixed pv array for that ppn (off the next pointer). These pve's are accessed
-for several operations, one of them being address space teardown. In that case,
-we basically do this
-
- for (every page/pte in the space) {
- calc pve_ptr from the ppn in the pte
- for (every pv in the list for the ppn) {
- if (this pv is for this pmap/vaddr) {
- do housekeeping
- unlink/free the pv
- }
- }
- }
-
-The problem arose when we were running, say 8000 (or even 2000) apache or
-other processes and one or all terminate. The list hanging off each pv array
-entry could have thousands of entries. We were continuously linearly searching
-each of these lists as we stepped through the address space we were tearing
-down. Because of the locks we hold, likely taking a cache miss for each node,
-and interrupt disabling for MP issues the system became completely unresponsive
-for many seconds while we did this.
-
-Realizing that pve's are accessed in two distinct ways (linearly running the
-list by ppn for operations like pmap_page_protect and finding and
-modifying/removing a single pve as part of pmap_enter processing) has led to
-modifying the pve structures and databases.
-
-There are now two types of pve structures. A "rooted" structure which is
-basically the original structure accessed in an array by ppn, and a ''hashed''
-structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
-designed with the two goals of minimizing wired memory and making the lookup of
-a ppn faster. Since a vast majority of pages in the system are not aliased
-and hence represented by a single pv entry I've kept the rooted entry size as
-small as possible because there is one of these dedicated for every physical
-page of memory. The hashed pve's are larger due to the addition of the hash
-link and the ppn entry needed for matching while running the hash list to find
-the entry we are looking for. This way, only systems that have lots of
-aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
-structures have the same first three fields allowing some simplification in
-the code.
-
-They have these shapes
-
-typedef struct pv_rooted_entry {
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
-} *pv_rooted_entry_t;
-
-
-typedef struct pv_hashed_entry {
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
- ppnum_t ppn;
- struct pv_hashed_entry *nexth;
-} *pv_hashed_entry_t;
-
-The main flow difference is that the code is now aware of the rooted entry and
-the hashed entries. Code that runs the pv list still starts with the rooted
-entry and then continues down the qlink onto the hashed entries. Code that is
-looking up a specific pv entry first checks the rooted entry and then hashes
-and runs the hash list for the match. The hash list lengths are much smaller
-than the original pv lists that contained all aliases for the specific ppn.
-
-*/
+ *
+ * PV HASHING Changes - JK 1/2007
+ *
+ * Pve's establish physical to virtual mappings. These are used for aliasing of a
+ * physical page to (potentially many) virtual addresses within pmaps. In the
+ * previous implementation the structure of the pv_entries (each 16 bytes in size) was
+ *
+ * typedef struct pv_entry {
+ * struct pv_entry_t next;
+ * pmap_t pmap;
+ * vm_map_offset_t va;
+ * } *pv_entry_t;
+ *
+ * An initial array of these is created at boot time, one per physical page of
+ * memory, indexed by the physical page number. Additionally, a pool of entries
+ * is created from a pv_zone to be used as needed by pmap_enter() when it is
+ * creating new mappings. Originally, we kept this pool around because the code
+ * in pmap_enter() was unable to block if it needed an entry and none were
+ * available - we'd panic. Some time ago I restructured the pmap_enter() code
+ * so that for user pmaps it can block while zalloc'ing a pv structure and restart,
+ * removing a panic from the code (in the case of the kernel pmap we cannot block
+ * and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
+ * The pool has not been removed since there is a large performance gain keeping
+ * freed pv's around for reuse and not suffering the overhead of zalloc for every
+ * new pv we need.
+ *
+ * As pmap_enter() created new mappings it linked the new pve's for them off the
+ * fixed pv array for that ppn (off the next pointer). These pve's are accessed
+ * for several operations, one of them being address space teardown. In that case,
+ * we basically do this
+ *
+ * for (every page/pte in the space) {
+ * calc pve_ptr from the ppn in the pte
+ * for (every pv in the list for the ppn) {
+ * if (this pv is for this pmap/vaddr) {
+ * do housekeeping
+ * unlink/free the pv
+ * }
+ * }
+ * }
+ *
+ * The problem arose when we were running, say 8000 (or even 2000) apache or
+ * other processes and one or all terminate. The list hanging off each pv array
+ * entry could have thousands of entries. We were continuously linearly searching
+ * each of these lists as we stepped through the address space we were tearing
+ * down. Because of the locks we hold, likely taking a cache miss for each node,
+ * and interrupt disabling for MP issues the system became completely unresponsive
+ * for many seconds while we did this.
+ *
+ * Realizing that pve's are accessed in two distinct ways (linearly running the
+ * list by ppn for operations like pmap_page_protect and finding and
+ * modifying/removing a single pve as part of pmap_enter processing) has led to
+ * modifying the pve structures and databases.
+ *
+ * There are now two types of pve structures. A "rooted" structure which is
+ * basically the original structure accessed in an array by ppn, and a "hashed"
+ * structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
+ * designed with the two goals of minimizing wired memory and making the lookup of
+ * a ppn faster. Since a vast majority of pages in the system are not aliased
+ * and hence represented by a single pv entry I've kept the rooted entry size as
+ * small as possible because there is one of these dedicated for every physical
+ * page of memory. The hashed pve's are larger due to the addition of the hash
+ * link and the ppn entry needed for matching while running the hash list to find
+ * the entry we are looking for. This way, only systems that have lots of
+ * aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
+ * structures have the same first three fields allowing some simplification in
+ * the code.
+ *
+ * They have these shapes
+ *
+ * typedef struct pv_rooted_entry {
+ * queue_head_t qlink;
+ * vm_map_offset_t va;
+ * pmap_t pmap;
+ * } *pv_rooted_entry_t;
+ *
+ *
+ * typedef struct pv_hashed_entry {
+ * queue_head_t qlink;
+ * vm_map_offset_t va;
+ * pmap_t pmap;
+ * ppnum_t ppn;
+ * struct pv_hashed_entry *nexth;
+ * } *pv_hashed_entry_t;
+ *
+ * The main flow difference is that the code is now aware of the rooted entry and
+ * the hashed entries. Code that runs the pv list still starts with the rooted
+ * entry and then continues down the qlink onto the hashed entries. Code that is
+ * looking up a specific pv entry first checks the rooted entry and then hashes
+ * and runs the hash list for the match. The hash list lengths are much smaller
+ * than the original pv lists that contained all aliases for the specific ppn.
+ *
+ */
typedef struct pv_rooted_entry {
/* first three entries must match pv_hashed_entry_t */
- queue_head_t qlink;
- vm_map_offset_t va; /* virtual address for mapping */
- pmap_t pmap; /* pmap where mapping lies */
+ queue_head_t qlink;
+ vm_map_offset_t va_and_flags; /* virtual address for mapping */
+ pmap_t pmap; /* pmap where mapping lies */
} *pv_rooted_entry_t;
-#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
+#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
typedef struct pv_hashed_entry {
/* first three entries must match pv_rooted_entry_t */
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
- ppnum_t ppn;
- struct pv_hashed_entry *nexth;
+ queue_head_t qlink;
+ vm_map_offset_t va_and_flags;
+ pmap_t pmap;
+ ppnum_t ppn;
+ struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;
#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
+#define PVE_VA(pve) ((pve)->va_and_flags & (vm_map_offset_t)~PAGE_MASK)
+#define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK)
+#define PVE_IS_ALTACCT 0x001
+#define PVE_IS_ALTACCT_PAGE(pve) \
+ (((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE)
+
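+/*
+ * Packing sketch (editor's note): mapped VAs are page aligned, so the low
+ * PAGE_MASK bits of va_and_flags are free to carry per-PVE flags:
+ *
+ *     pve->va_and_flags = (vaddr & ~(vm_map_offset_t)PAGE_MASK) | PVE_IS_ALTACCT;
+ *     assert(PVE_VA(pve) == (vaddr & ~(vm_map_offset_t)PAGE_MASK));
+ *     assert(PVE_IS_ALTACCT_PAGE(pve));
+ */
+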
//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if(0 == npvhashmask) panic("npvhash uninitialized");
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200
-extern volatile uint32_t mappingrecurse;
+extern volatile uint32_t mappingrecurse;
extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;
/*
* PV hash locking
*/
-#define LOCK_PV_HASH(hash) lock_hash_hash(hash)
-#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
+#define LOCK_PV_HASH(hash) lock_hash_hash(hash)
+#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
extern uint32_t npvhashmask;
-extern pv_hashed_entry_t *pv_hash_table; /* hash lists */
-extern pv_hashed_entry_t pv_hashed_free_list;
-extern pv_hashed_entry_t pv_hashed_kern_free_list;
-decl_simple_lock_data(extern, pv_hashed_free_list_lock)
-decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
-decl_simple_lock_data(extern, pv_hash_table_lock)
-decl_simple_lock_data(extern, phys_backup_lock)
-
-extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry
- * structures */
-
-extern uint32_t pv_hashed_free_count;
-extern uint32_t pv_hashed_kern_free_count;
+extern pv_hashed_entry_t *pv_hash_table; /* hash lists */
+extern pv_hashed_entry_t pv_hashed_free_list;
+extern pv_hashed_entry_t pv_hashed_kern_free_list;
+decl_simple_lock_data(extern, pv_hashed_free_list_lock);
+decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock);
+decl_simple_lock_data(extern, pv_hash_table_lock);
+decl_simple_lock_data(extern, phys_backup_lock);
+
+extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry
+ * structures */
+
+extern uint32_t pv_hashed_free_count;
+extern uint32_t pv_hashed_kern_free_count;
/*
* Each entry in the pv_head_table is locked by a bit in the
* pv_lock_table. The lock bits are accessed by the address of
* the frame they lock.
*/
-#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
+#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
-extern char *pv_lock_table; /* pointer to array of bits */
-extern char *pv_hash_lock_table;
-extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
+extern char *pv_lock_table; /* pointer to array of bits */
+extern char *pv_hash_lock_table;
+extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
extern event_t mapping_replenish_event;
-static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
+static inline void
+PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep)
+{
pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
- simple_lock(&pv_hashed_free_list_lock);
+ simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL);
/* If the kernel reserved pool is low, let non-kernel mappings allocate
* synchronously, possibly subject to a throttle.
*/
simple_unlock(&pv_hashed_free_list_lock);
if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
- if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
+ if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
thread_wakeup(&mapping_replenish_event);
+ }
}
}
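+
+/*
+ * Editor's note: mappingrecurse is a one-shot gate; the cmpxchg from 0 to 1
+ * guarantees that only a single thread wakes the replenish thread, which is
+ * expected to clear the flag once the free pools have been refilled.
+ */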
-static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
- simple_lock(&pv_hashed_free_list_lock);
+static inline void
+PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
+{
+ simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL);
pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
pv_hashed_free_list = pvh_eh;
- pv_hashed_free_count += pv_cnt;
+ pv_hashed_free_count += (uint32_t)pv_cnt;
simple_unlock(&pv_hashed_free_list_lock);
}
extern unsigned pmap_kern_reserve_alloc_stat;
-static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
+static inline void
+PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e)
+{
pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
- simple_lock(&pv_hashed_kern_free_list_lock);
+ simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL);
if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
simple_unlock(&pv_hashed_kern_free_list_lock);
if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
- if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
+ if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
thread_wakeup(&mapping_replenish_event);
+ }
}
}
-static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
- simple_lock(&pv_hashed_kern_free_list_lock);
+static inline void
+PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
+{
+ simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL);
pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
pv_hashed_kern_free_list = pvh_eh;
- pv_hashed_kern_free_count += pv_cnt;
+ pv_hashed_kern_free_count += (uint32_t)pv_cnt;
simple_unlock(&pv_hashed_kern_free_list_lock);
}
extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;
-static inline void pmap_pv_throttle(__unused pmap_t p) {
+static inline void
+pmap_pv_throttle(__unused pmap_t p)
+{
pmap_assert(p != kernel_pmap);
/* Apply throttle on non-kernel mappings */
if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
* Index into pv_head table, its lock bits, and the modify/reference and managed bits
*/
-#define pa_index(pa) (i386_btop(pa))
-#define ppn_to_pai(ppn) ((int)ppn)
-
-#define pai_to_pvh(pai) (&pv_head_table[pai])
-#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
-#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
-#define pvhash(idx) (&pv_hash_table[idx])
-#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
-#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
-
-#define IS_MANAGED_PAGE(x) \
- ((unsigned int)(x) <= last_managed_page && \
- (pmap_phys_attributes[x] & PHYS_MANAGED))
-#define IS_INTERNAL_PAGE(x) \
- (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
-#define IS_REUSABLE_PAGE(x) \
- (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))
+#define pa_index(pa) (i386_btop(pa))
+#define ppn_to_pai(ppn) ((int)ppn)
+
+#define pai_to_pvh(pai) (&pv_head_table[pai])
+#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
+#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
+#define pvhash(idx) (&pv_hash_table[idx])
+#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
+#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
+
+#define IS_MANAGED_PAGE(x) \
+ ((unsigned int)(x) <= last_managed_page && \
+ ((unsigned long long)pmap_phys_attributes[x] & PHYS_MANAGED))
+#define IS_INTERNAL_PAGE(x) \
+ (IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_INTERNAL))
+#define IS_REUSABLE_PAGE(x) \
+ (IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_REUSABLE))
+#define IS_ALTACCT_PAGE(x, pve) \
+ (IS_MANAGED_PAGE((x)) && \
+ (PVE_IS_ALTACCT_PAGE((pve))))
/*
* Physical page attributes. Copy bits from PTE definition.
*/
-#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
-#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
-#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
-#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */
-#define PHYS_NCACHE INTEL_PTE_NCACHE
-#define PHYS_PTA INTEL_PTE_PTA
-#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
-#define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */
-#define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */
-
-extern const boolean_t pmap_disable_kheap_nx;
-extern const boolean_t pmap_disable_kstack_nx;
+#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
+#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
+#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
+#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */
+#define PHYS_NCACHE INTEL_PTE_NCACHE
+#define PHYS_PAT INTEL_PTE_PAT
+#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PAT | INTEL_PTE_NCACHE)
+#define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */
+#define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */
+
+#if DEVELOPMENT || DEBUG
+extern boolean_t pmap_disable_kheap_nx;
+extern boolean_t pmap_disable_kstack_nx;
+#endif
#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)
-
+#define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U)
/*
* Amount of virtual memory mapped by one
* page-directory entry.
*/
-#define PDE_MAPPED_SIZE (pdetova(1))
-
+#define PDE_MAPPED_SIZE (pdetova(1))
/*
* Locking and TLB invalidation
* PV locking
*/
-#define LOCK_PVH(index) { \
- mp_disable_preemption(); \
- lock_pvh_pai(index); \
+#define LOCK_PVH(index) { \
+ mp_disable_preemption(); \
+ lock_pvh_pai(index); \
}
-#define UNLOCK_PVH(index) { \
- unlock_pvh_pai(index); \
- mp_enable_preemption(); \
+#define UNLOCK_PVH(index) { \
+ unlock_pvh_pai(index); \
+ mp_enable_preemption(); \
}
extern uint64_t pde_mapped_size;
-extern char *pmap_phys_attributes;
-extern ppnum_t last_managed_page;
+extern char *pmap_phys_attributes;
+extern ppnum_t last_managed_page;
-extern ppnum_t lowest_lo;
-extern ppnum_t lowest_hi;
-extern ppnum_t highest_hi;
+/*
+ * Used to record high memory allocated to kernel before
+ * pmap_init() gets called.
+ */
+extern ppnum_t pmap_high_used_top;
+extern ppnum_t pmap_high_used_bottom;
+extern ppnum_t pmap_middle_used_top;
+extern ppnum_t pmap_middle_used_bottom;
/*
* when spinning through pmap_remove
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
-/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
-#ifdef DEBUGINTERRUPTS
-#define pmap_intr_assert() { \
- if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
- panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
+#if DEBUG
+#define PMAP_INTR_DEBUG (1)
+#endif
+
+#if PMAP_INTR_DEBUG
+#define pmap_intr_assert() { \
+ if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
+ panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \
}
#else
#define pmap_intr_assert()
#endif
-
-extern int nx_enabled;
+#if DEVELOPMENT || DEBUG
+extern int nx_enabled;
+#endif
extern unsigned int inuse_ptepages_count;
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
- ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
- npvhashmask;
- return hashidx;
+ ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
+ npvhashmask;
+ return hashidx;
}
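+
+/*
+ * Editor's note: npvhashmask is assumed to be of the form 2^n - 1 (0xFFF by
+ * default), so the final AND above selects one of npvhashmask + 1 buckets.
+ */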
-
/*
* unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
* properly deals with the anchor.
* must be called with the hash locked, does not unlock it
*/
-static inline void
+static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
- pv_hashed_entry_t curh;
- pv_hashed_entry_t *pprevh;
- int pvhash_idx;
+ pv_hashed_entry_t curh;
+ pv_hashed_entry_t *pprevh;
+ uint32_t pvhash_idx;
CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
+ pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh));
pprevh = pvhash(pvhash_idx);
#if PV_DEBUG
- if (NULL == *pprevh)
+ if (NULL == *pprevh) {
panic("pvh_unlink null anchor"); /* JK DEBUG */
+ }
#endif
curh = *pprevh;
while (PV_HASHED_ENTRY_NULL != curh) {
- if (pvh == curh)
+ if (pvh == curh) {
break;
+ }
pprevh = &curh->nexth;
curh = curh->nexth;
}
- if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
+ if (PV_HASHED_ENTRY_NULL == curh) {
+ panic("pmap_pvh_unlink no pvh");
+ }
*pprevh = pvh->nexth;
return;
}
static inline void
-pv_hash_add(pv_hashed_entry_t pvh_e,
- pv_rooted_entry_t pv_h)
+pv_hash_add(pv_hashed_entry_t pvh_e,
+ pv_rooted_entry_t pv_h)
{
pv_hashed_entry_t *hashp;
- int pvhash_idx;
+ uint32_t pvhash_idx;
CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
+ pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
LOCK_PV_HASH(pvhash_idx);
insque(&pvh_e->qlink, &pv_h->qlink);
hashp = pvhash(pvhash_idx);
#if PV_DEBUG
- if (NULL==hashp)
+ if (NULL == hashp) {
panic("pv_hash_add(%p) null hash bucket", pvh_e);
+ }
#endif
pvh_e->nexth = *hashp;
*hashp = pvh_e;
static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
- int pvhash_idx;
+ uint32_t pvhash_idx;
CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
+ pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
LOCK_PV_HASH(pvhash_idx);
remque(&pvh_e->qlink);
pmap_pvh_unlink(pvh_e);
UNLOCK_PV_HASH(pvhash_idx);
-}
+}
-static inline boolean_t popcnt1(uint64_t distance) {
- return ((distance & (distance - 1)) == 0);
+static inline boolean_t
+popcnt1(uint64_t distance)
+{
+ return (distance & (distance - 1)) == 0;
}
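+
+/*
+ * Editor's note: despite the name, popcnt1() returns TRUE when at most one
+ * bit is set (0x0, 0x1 and 0x8000 all qualify; 0x9 does not). The corruption
+ * classifier below uses it to recognize single-bit flips between an observed
+ * and an expected value.
+ */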
/*
*/
typedef enum {
- PTE_VALID = 0x0,
- PTE_INVALID = 0x1,
- PTE_RSVD = 0x2,
- PTE_SUPERVISOR = 0x4,
- PTE_BITFLIP = 0x8,
- PV_BITFLIP = 0x10,
- PTE_INVALID_CACHEABILITY = 0x20
+ PTE_VALID = 0x0,
+ PTE_INVALID = 0x1,
+ PTE_RSVD = 0x2,
+ PTE_SUPERVISOR = 0x4,
+ PTE_BITFLIP = 0x8,
+ PV_BITFLIP = 0x10,
+ PTE_INVALID_CACHEABILITY = 0x20,
+ PTE_NXBITFLIP = 0x40
} pmap_pagetable_corruption_t;
typedef enum {
} pmap_pv_assertion_t;
typedef enum {
- PMAP_ACTION_IGNORE = 0x0,
- PMAP_ACTION_ASSERT = 0x1,
- PMAP_ACTION_RETRY = 0x2,
+ PMAP_ACTION_IGNORE = 0x0,
+ PMAP_ACTION_ASSERT = 0x1,
+ PMAP_ACTION_RETRY = 0x2,
PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;
-#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
+#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;
extern uint32_t pmap_pagetable_corruption_incidents;
pmap_pv_assertion_t incident;
pmap_pagetable_corruption_t reason;
pmap_pagetable_corruption_action_t action;
- pmap_t pmap;
+ pmap_t pmap;
vm_map_offset_t vaddr;
pt_entry_t pte;
ppnum_t ppn;
pmap_t pvpmap;
vm_map_offset_t pvva;
uint64_t abstime;
+ int adj_ptes_count;
+#define PMPTCR_MAX_ADJ_PTES (2)
+ uint64_t adj_ptes[PMPTCR_MAX_ADJ_PTES];
} pmap_pagetable_corruption_record_t;
extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
-extern thread_call_t pmap_pagetable_corruption_log_call;
+extern thread_call_t pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;
-static inline void
-pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
+static inline pmap_pagetable_corruption_action_t
+pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason,
+ pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep,
+ ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva, int adj_pteps_cnt, uint64_t **adj_pteps)
+{
uint32_t pmap_pagetable_corruption_log_index;
+ uint64_t curtime = mach_absolute_time();
+
+ if ((curtime - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
+ pmap_pagetable_corruption_timeout = TRUE;
+ action = PMAP_ACTION_ASSERT;
+ } else {
+ pmap_pagetable_corruption_last_abstime = curtime;
+ }
+
pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
- pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = curtime;
+ if (adj_pteps_cnt > 0 && adj_pteps != NULL) {
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count = MIN(adj_pteps_cnt, PMPTCR_MAX_ADJ_PTES);
+ for (int i = 0; i < pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count; i++) {
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes[i] = *adj_pteps[i];
+ }
+ }
/* Asynchronously log */
thread_call_enter(pmap_pagetable_corruption_log_call);
+
+ return action;
}
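+
+/*
+ * Editor's note: recovery is rate limited. A second incident arriving within
+ * PMAP_PAGETABLE_CORRUPTION_INTERVAL (6 hours, pre-converted to abstime) of
+ * the previous one is escalated to PMAP_ACTION_ASSERT instead of retried.
+ */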
static inline pmap_pagetable_corruption_action_t
-pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
- pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
- pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
- ppnum_t suppress_ppn = 0;
+pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident)
+{
+ pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
+ pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
+ ppnum_t suppress_ppn = 0;
pt_entry_t cpte = *ptep;
- ppnum_t cpn = pa_index(pte_to_pa(cpte));
- ppnum_t ppn = *ppnp;
- pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
- pv_rooted_entry_t pv_e = pv_h;
- uint32_t bitdex;
+ ppnum_t cpn = pa_index(pte_to_pa(cpte));
+ ppnum_t ppn = *ppnp;
+ pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
+ pv_rooted_entry_t pv_e = pv_h;
+ uint32_t bitdex;
pmap_t pvpmap = pv_h->pmap;
- vm_map_offset_t pvva = pv_h->va;
+ vm_map_offset_t pvva = PVE_VA(pv_h);
+ vm_map_offset_t pve_flags;
boolean_t ppcd = FALSE;
+ boolean_t is_ept;
/* Ideally, we'd consult the Mach VM here to definitively determine
* the nature of the mapping for this address space and address.
/* As a precautionary measure, mark A+D */
pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
+ is_ept = is_ept_pmap(pmap);
/*
* Correct potential single bit errors in either (but not both) element
* of the PV
*/
do {
- if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
- (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
+ if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) ||
+ (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) {
+ pve_flags = PVE_FLAGS(pv_e);
pv_e->pmap = pmap;
- pv_e->va = vaddr;
+ pv_h->va_and_flags = vaddr | pve_flags;
suppress_reason = PV_BITFLIP;
action = PMAP_ACTION_RETRY;
goto pmap_cpc_exit;
ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
if (IS_MANAGED_PAGE(npn)) {
pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
- if (npv_h->va == vaddr && npv_h->pmap == pmap) {
+ if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) {
suppress_reason = PTE_BITFLIP;
suppress_ppn = npn;
action = PMAP_ACTION_RETRY_RELOCK;
goto pmap_cpc_exit;
}
- /* Check for malformed/inconsistent entries */
-
- if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
+ /*
+ * Check for malformed/inconsistent entries.
+ * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0
+ */
+ if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PAT)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) {
action = PMAP_ACTION_IGNORE;
suppress_reason = PTE_INVALID_CACHEABILITY;
- }
- else if (cpte & INTEL_PTE_RSVD) {
+ } else if (cpte & INTEL_PTE_RSVD) {
action = PMAP_ACTION_IGNORE;
suppress_reason = PTE_RSVD;
- }
- else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
+ } else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) {
action = PMAP_ACTION_IGNORE;
suppress_reason = PTE_SUPERVISOR;
}
action = PMAP_ACTION_ASSERT;
}
- if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
- action = PMAP_ACTION_ASSERT;
- pmap_pagetable_corruption_timeout = TRUE;
+ return pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva, 0, 0);
+}
+
+static inline boolean_t
+pmap_compressed_pte_corruption_repair(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep, pmap_t pmap,
+ vm_map_offset_t vaddr)
+{
+ uint64_t *adj_pteps[2];
+ int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t);
+ pmap_pagetable_corruption_action_t action = PMAP_ACTION_IGNORE;
+
+ /*
+ * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of
+ * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous
+ * 2 PTEs).
+ */
+ if (pteidx == 0) {
+ adj_pteps[0] = ptep + 1;
+ adj_pteps[1] = ptep + 2;
+ } else if (pteidx == (NPTPG - 1)) {
+ adj_pteps[0] = ptep - 2;
+ adj_pteps[1] = ptep - 1;
+ } else {
+ adj_pteps[0] = ptep - 1;
+ adj_pteps[1] = ptep + 1;
}
- else
- {
- pmap_pagetable_corruption_last_abstime = mach_absolute_time();
+
+ /*
+ * Since the compressed PTE no longer has a PTE associated, we cannot pass in the pv data to
+ * pmap_pagetable_corruption_log, so instead supply adjacent PTEs for logging.
+ */
+ if (pmap_pagetable_corruption_log(ROOT_ABSENT, (pte & INTEL_PTE_NX) ? PTE_NXBITFLIP : PTE_BITFLIP,
+ action, pmap, vaddr, ptep, (ppnum_t)~0UL, 0, 0, sizeof(adj_pteps) / sizeof(adj_pteps[0]),
+ adj_pteps) != PMAP_ACTION_ASSERT) {
+ /* Correct the flipped bit(s) and continue */
+ pmap_store_pte(ptep, pte & INTEL_PTE_COMPRESSED_MASK);
+ pmap->corrected_compressed_ptes_count++;
+ return TRUE; /* Returning TRUE to indicate this is now a valid compressed PTE (we hope) */
}
- pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
- return action;
+
+ panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p",
+ pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]);
+ /*NOTREACHED*/
}
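+
+/*
+ * Editor's note: a software-compressed PTE should carry no bits outside
+ * INTEL_PTE_COMPRESSED_MASK, so the repair simply masks the stray bit(s)
+ * away and bumps corrected_compressed_ptes_count, panicking only when the
+ * rate-limited logger demands PMAP_ACTION_ASSERT.
+ */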
/*
* Returns pv entry to be freed (or NULL).
*/
static inline __attribute__((always_inline)) pv_hashed_entry_t
-pmap_pv_remove(pmap_t pmap,
- vm_map_offset_t vaddr,
- ppnum_t *ppnp,
- pt_entry_t *pte)
+pmap_pv_remove(pmap_t pmap,
+ vm_map_offset_t vaddr,
+ ppnum_t *ppnp,
+ pt_entry_t *pte,
+ boolean_t *was_altacct)
{
pv_hashed_entry_t pvh_e;
- pv_rooted_entry_t pv_h;
- pv_hashed_entry_t *pprevh;
- int pvhash_idx;
+ pv_rooted_entry_t pv_h;
+ pv_hashed_entry_t *pprevh;
+ uint32_t pvhash_idx;
uint32_t pv_cnt;
- ppnum_t ppn;
+ ppnum_t ppn;
+ *was_altacct = FALSE;
pmap_pv_remove_retry:
ppn = *ppnp;
pvh_e = PV_HASHED_ENTRY_NULL;
if (__improbable(pv_h->pmap == PMAP_NULL)) {
pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
- if (pac == PMAP_ACTION_IGNORE)
+ if (pac == PMAP_ACTION_IGNORE) {
goto pmap_pv_remove_exit;
- else if (pac == PMAP_ACTION_ASSERT)
- panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
- else if (pac == PMAP_ACTION_RETRY_RELOCK) {
+ } else if (pac == PMAP_ACTION_ASSERT) {
+ panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents);
+ } else if (pac == PMAP_ACTION_RETRY_RELOCK) {
LOCK_PVH(ppn_to_pai(*ppnp));
pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
goto pmap_pv_remove_retry;
- }
- else if (pac == PMAP_ACTION_RETRY)
+ } else if (pac == PMAP_ACTION_RETRY) {
goto pmap_pv_remove_retry;
+ }
}
- if (pv_h->va == vaddr && pv_h->pmap == pmap) {
+ if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
+ *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h);
/*
- * Header is the pv_rooted_entry.
+ * Header is the pv_rooted_entry.
* We can't free that. If there is a queued
- * entry after this one we remove that
- * from the ppn queue, we remove it from the hash chain
- * and copy it to the rooted entry. Then free it instead.
- */
+ * entry after this one, we remove it
+ * from the ppn queue and the hash chain,
+ * copy it to the rooted entry, and free that entry instead.
+ */
pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
if (pv_h != (pv_rooted_entry_t) pvh_e) {
/*
* and install as new root.
*/
CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
+ pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
LOCK_PV_HASH(pvhash_idx);
remque(&pvh_e->qlink);
pprevh = pvhash(pvhash_idx);
if (PV_HASHED_ENTRY_NULL == *pprevh) {
panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
- "empty hash, removing rooted",
- pmap, vaddr, ppn);
+ "empty hash, removing rooted, priors: %d",
+ pmap, vaddr, ppn, pmap_pagetable_corruption_incidents);
}
pmap_pvh_unlink(pvh_e);
UNLOCK_PV_HASH(pvhash_idx);
pv_h->pmap = pvh_e->pmap;
- pv_h->va = pvh_e->va; /* dispose of pvh_e */
+ pv_h->va_and_flags = pvh_e->va_and_flags;
+ /* dispose of pvh_e */
} else {
/* none queued after rooted */
pv_h->pmap = PMAP_NULL;
LOCK_PV_HASH(pvhash_idx);
pprevh = pvhash(pvhash_idx);
if (PV_HASHED_ENTRY_NULL == *pprevh) {
- panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
- pmap, vaddr, ppn, *pte, pte);
+ panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d",
+ pmap, vaddr, ppn, *pte, pte, pmap_pagetable_corruption_incidents);
}
pvh_e = *pprevh;
pmap_pv_hashlist_walks++;
while (PV_HASHED_ENTRY_NULL != pvh_e) {
pv_cnt++;
if (pvh_e->pmap == pmap &&
- pvh_e->va == vaddr &&
- pvh_e->ppn == ppn)
+ PVE_VA(pvh_e) == vaddr &&
+ pvh_e->ppn == ppn) {
break;
+ }
pprevh = &pvh_e->nexth;
pvh_e = pvh_e->nexth;
}
if (PV_HASHED_ENTRY_NULL == pvh_e) {
pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);
- if (pac == PMAP_ACTION_ASSERT)
- panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
- else {
+ if (pac == PMAP_ACTION_ASSERT) {
+ panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents);
+ } else {
UNLOCK_PV_HASH(pvhash_idx);
if (pac == PMAP_ACTION_RETRY_RELOCK) {
LOCK_PVH(ppn_to_pai(*ppnp));
pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
goto pmap_pv_remove_retry;
- }
- else if (pac == PMAP_ACTION_RETRY) {
+ } else if (pac == PMAP_ACTION_RETRY) {
goto pmap_pv_remove_retry;
- }
- else if (pac == PMAP_ACTION_IGNORE) {
+ } else if (pac == PMAP_ACTION_IGNORE) {
goto pmap_pv_remove_exit;
}
}
}
+ *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e);
+
pmap_pv_hashlist_cnts += pv_cnt;
- if (pmap_pv_hashlist_max < pv_cnt)
+ if (pmap_pv_hashlist_max < pv_cnt) {
pmap_pv_hashlist_max = pv_cnt;
+ }
*pprevh = pvh_e->nexth;
remque(&pvh_e->qlink);
UNLOCK_PV_HASH(pvhash_idx);
return pvh_e;
}
+static inline __attribute__((always_inline)) boolean_t
+pmap_pv_is_altacct(
+ pmap_t pmap,
+ vm_map_offset_t vaddr,
+ ppnum_t ppn)
+{
+ pv_hashed_entry_t pvh_e;
+ pv_rooted_entry_t pv_h;
+ uint32_t pvhash_idx;
+ boolean_t is_altacct;
+
+ pvh_e = PV_HASHED_ENTRY_NULL;
+ pv_h = pai_to_pvh(ppn_to_pai(ppn));
+
+ if (__improbable(pv_h->pmap == PMAP_NULL)) {
+ return FALSE;
+ }
+
+ if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
+ /*
+ * Header is the pv_rooted_entry.
+ */
+ return IS_ALTACCT_PAGE(ppn, pv_h);
+ }
+
+ CHK_NPVHASH();
+ pvhash_idx = pvhashidx(pmap, vaddr);
+ LOCK_PV_HASH(pvhash_idx);
+ pvh_e = *(pvhash(pvhash_idx));
+ while (PV_HASHED_ENTRY_NULL != pvh_e) {
+ if (pvh_e->pmap == pmap &&
+ PVE_VA(pvh_e) == vaddr &&
+ pvh_e->ppn == ppn) {
+ break;
+ }
+ pvh_e = pvh_e->nexth;
+ }
+ if (PV_HASHED_ENTRY_NULL == pvh_e) {
+ is_altacct = FALSE;
+ } else {
+ is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e);
+ }
+ UNLOCK_PV_HASH(pvhash_idx);
+
+ return is_altacct;
+}
-extern int pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
- thread_t thr = current_thread();
- task_t task;
- zinfo_usage_t zinfo;
-
- pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
-
- if (pt_fake_zone_index != -1 &&
- (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
- OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
+ pmap_ledger_credit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes);
}
static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
- thread_t thr = current_thread();
- task_t task;
- zinfo_usage_t zinfo;
-
- pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
-
- if (pt_fake_zone_index != -1 &&
- (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
- OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
+ pmap_ledger_debit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes);
}
static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
- pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
+ pmap_ledger_credit(pmap, task_ledgers.tkm_shared, (ledger_amount_t)bytes);
}
static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
- pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
+ pmap_ledger_debit(pmap, task_ledgers.tkm_shared, (ledger_amount_t)bytes);
}
-extern boolean_t pmap_initialized;/* Has pmap_init completed? */
+extern boolean_t pmap_initialized;/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))
-// XXX
-#define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? */
-// XXX
-
-
-int phys_attribute_test(
- ppnum_t phys,
- int bits);
-void phys_attribute_clear(
- ppnum_t phys,
- int bits,
- unsigned int options,
- void *arg);
+int phys_attribute_test(
+ ppnum_t phys,
+ int bits);
+void phys_attribute_clear(
+ ppnum_t phys,
+ int bits,
+ unsigned int options,
+ void *arg);
//#define PCID_DEBUG 1
-#if PCID_DEBUG
-#define pmap_pcid_log(fmt, args...) \
- do { \
- kprintf(fmt, ##args); \
- printf(fmt, ##args); \
+#if PCID_DEBUG
+#define pmap_pcid_log(fmt, args...) \
+ do { \
+ kprintf(fmt, ##args); \
+ printf(fmt, ##args); \
} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
-void pmap_pcid_configure(void);
+void pmap_pcid_configure(void);
/*
* Atomic 64-bit compare and exchange of a page table entry.
*/
+
+#include <machine/atomic.h>
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
- boolean_t ret;
-
- /*
- * Load the old value into %rax
- * Load the new value into another register
- * Compare-exchange-quad at address entryp
- * If the compare succeeds, the new value is stored, return TRUE.
- * Otherwise, no swap is made, return FALSE.
- */
- asm volatile(
- " lock; cmpxchgq %2,(%3) \n\t"
- " setz %%al \n\t"
- " movzbl %%al,%0"
- : "=a" (ret)
- : "a" (old),
- "r" (new),
- "r" (entryp)
- : "memory");
- return ret;
+ return __c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)entryp, &old, new,
+ memory_order_acq_rel_smp, memory_order_relaxed);
}
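+
+/*
+ * Editor's note on the contract: atomically stores `new` into *entryp and
+ * returns TRUE only if it still held `old`; on failure nothing is written.
+ * This matches the lock-prefixed cmpxchgq sequence it replaces, with
+ * acquire/release ordering on success.
+ */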
extern uint32_t pmap_update_clear_pte_count;
-static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
+static inline void
+pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits)
+{
pt_entry_t npte, opte;
do {
opte = *mptep;
if (__improbable(opte == 0)) {
+#if DEVELOPMENT || DEBUG
pmap_update_clear_pte_count++;
+#endif
break;
}
npte = opte & ~(pclear_bits);
npte |= pset_bits;
- } while (!pmap_cmpx_pte(mptep, opte, npte));
+ } while (!pmap_cmpx_pte(mptep, opte, npte));
}
-#if defined(__x86_64__)
/*
* The single pml4 page per pmap is allocated at pmap create time and exists
* for the duration of the pmap. we allocate this page in kernel vm.
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
- (vaddr < 0xFFFF800000000000ULL))) {
- return (NULL);
+ (vaddr < 0xFFFF800000000000ULL))) {
+ return NULL;
}
-#if PMAP_ASSERT
- return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
+#if DEBUG
+ return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]);
#else
- return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
+ return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)];
+#endif
+}
+
+static inline pml4_entry_t *
+pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr)
+{
+ if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
+ (vaddr < 0xFFFF800000000000ULL))) {
+ return NULL;
+ }
+
+#if DEBUG
+ return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]);
+#else
+ return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)];
#endif
}
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
- pml4_entry_t newpf;
- pml4_entry_t *pml4;
+ pml4_entry_t newpf;
+ pml4_entry_t *pml4;
+ boolean_t is_ept;
pml4 = pmap64_pml4(pmap, vaddr);
- if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
+ is_ept = is_ept_pmap(pmap);
+
+ if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) {
newpf = *pml4 & PG_FRAME;
return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
- [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
+ [(vaddr >> PDPTSHIFT) & (NPDPTPG - 1)];
}
- return (NULL);
+ return NULL;
}
/*
* Returns the address of the requested PDE entry in the physmap.
*/
static inline pd_entry_t *
-pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
+pmap_pde_internal1(vm_map_offset_t vaddr, boolean_t is_ept, pdpt_entry_t *pdpte)
{
- pdpt_entry_t newpf;
- pdpt_entry_t *pdpt;
+ if (*pdpte & PTE_VALID_MASK(is_ept)) {
+ pdpt_entry_t newpf = *pdpte & PG_FRAME;
+ return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
+ [(vaddr >> PDSHIFT) & (NPDPG - 1)];
+ } else {
+ return NULL;
+ }
+}
- pdpt = pmap64_pdpt(pmap, vaddr);
+static inline pd_entry_t *
+pmap_pde_internal0(pmap_t pmap, vm_map_offset_t vaddr, boolean_t is_ept)
+{
+ pdpt_entry_t *pdpt;
- if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
- newpf = *pdpt & PG_FRAME;
- return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
- [(vaddr >> PDSHIFT) & (NPDPG-1)];
+ pdpt = pmap64_pdpt(pmap, vaddr);
+ if (pdpt) {
+ return pmap_pde_internal1(vaddr, is_ept, pdpt);
+ } else {
+ return NULL;
}
- return (NULL);
}
-static inline pd_entry_t *
-pmap_pde(pmap_t m, vm_map_offset_t v)
+
+static inline pd_entry_t *
+pmap_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
- pd_entry_t *pde;
+ pdpt_entry_t *pdpt;
+ boolean_t is_ept;
- pde = pmap64_pde(m, v);
+ pdpt = pmap64_pdpt(pmap, vaddr);
+ is_ept = is_ept_pmap(pmap);
- return pde;
+ if (pdpt) {
+ return pmap_pde_internal1(vaddr, is_ept, pdpt);
+ } else {
+ return NULL;
+ }
}
* In case the pde maps a superpage, return the pde, which, in this case
* is the actual page table entry.
*/
+
+
+static inline pt_entry_t *
+pmap_pte_internal(vm_map_offset_t vaddr, boolean_t is_ept, pd_entry_t *pde)
+{
+ if (*pde & PTE_VALID_MASK(is_ept)) {
+ if (__improbable(*pde & PTE_PS)) {
+ return pde;
+ }
+ pd_entry_t newpf = *pde & PG_FRAME;
+
+ return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
+ [i386_btop(vaddr) & (ppnum_t)(NPTEPG - 1)];
+ } else {
+ return NULL;
+ }
+}
+
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
- pd_entry_t *pde;
- pd_entry_t newpf;
+ pd_entry_t *pde;
- assert(pmap);
- pde = pmap64_pde(pmap, vaddr);
+ boolean_t is_ept;
- if (pde && ((*pde & INTEL_PTE_VALID))) {
- if (*pde & INTEL_PTE_PS)
- return pde;
- newpf = *pde & PG_FRAME;
- return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
- [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
+ is_ept = is_ept_pmap(pmap);
+
+ pde = pmap_pde_internal0(pmap, vaddr, is_ept);
+
+ if (pde) {
+ return pmap_pte_internal(vaddr, is_ept, pde);
+ } else {
+ return NULL;
}
- return (NULL);
}
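+
+/*
+ * Illustrative walk (editor's sketch): resolving a VA to a physical page
+ * with the helpers above, under the pmap lock:
+ *
+ *     pt_entry_t *ptep = pmap_pte(pmap, vaddr);
+ *     if (ptep != NULL && (*ptep & PTE_VALID_MASK(is_ept_pmap(pmap)))) {
+ *         ppnum_t pn = (ppnum_t)pa_index(pte_to_pa(*ptep));
+ *         ...
+ *     }
+ *
+ * Note that for a 2MB superpage pmap_pte() returns the PDE itself, so
+ * callers that can encounter superpages must check PTE_PS first.
+ */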
-#endif
-#if DEBUG
-#define DPRINTF(x...) kprintf(x)
+
+extern void pmap_alias(
+ vm_offset_t ava,
+ vm_map_offset_t start,
+ vm_map_offset_t end,
+ vm_prot_t prot,
+ unsigned int options);
+
+#if DEBUG
+#define DPRINTF(x...) kprintf(x)
#else
#define DPRINTF(x...)
#endif