/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#ifndef ASSEMBLER
-#include <platforms.h>
-
#include <mach/kern_return.h>
#include <mach/machine/vm_types.h>
#include <mach/vm_prot.h>
#include <mach/machine/vm_param.h>
#include <kern/kern_types.h>
#include <kern/thread.h>
-#include <kern/lock.h>
+#include <kern/simple_lock.h>
+#include <mach/branch_predicates.h>
#include <i386/mp.h>
#include <i386/proc_reg.h>
+#include <i386/pal_routines.h>
+
/*
* Define the generic in terms of the specific
*/
#define intel_ptob(x) i386_ptob(x)
#define intel_round_page(x) i386_round_page(x)
#define intel_trunc_page(x) i386_trunc_page(x)
-#define trunc_intel_to_vm(x) trunc_i386_to_vm(x)
-#define round_intel_to_vm(x) round_i386_to_vm(x)
-#define vm_to_intel(x) vm_to_i386(x)
/*
* i386/i486/i860 Page Table Entry
#endif /* ASSEMBLER */
-#define NPGPTD 4
-#define PDESHIFT 21
-#define PTEMASK 0x1ff
-#define PTEINDX 3
+#define NPGPTD 4ULL
+#define PDESHIFT 21ULL
+#define PTEMASK 0x1ffULL
+#define PTEINDX 3ULL
+
+#define PTESHIFT 12ULL
-#define PTESHIFT 12
+#define LOW_4GB_MASK ((vm_offset_t)0x00000000FFFFFFFFUL)
#define PDESIZE sizeof(pd_entry_t) /* for assembly files */
#define PTESIZE sizeof(pt_entry_t) /* for assembly files */
#define INTEL_OFFMASK (I386_PGBYTES - 1)
+#define INTEL_LOFFMASK (I386_LPGBYTES - 1)
#define PG_FRAME 0x000FFFFFFFFFF000ULL
#define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t)))
#define NPTDPG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NBPTD (NPGPTD << PAGE_SHIFT)
#define NPDEPTD (NBPTD / (sizeof (pd_entry_t)))
#define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t)))
-#define NBPDE (1 << PDESHIFT)
+#define NBPDE (1ULL << PDESHIFT)
#define PDEMASK (NBPDE - 1)
+#define PTE_PER_PAGE 512 /* number of PTE's per page on any level */
+
/* cleanly define parameters for all the page table levels */
typedef uint64_t pml4_entry_t;
#define NPML4PG (PAGE_SIZE/(sizeof (pml4_entry_t)))
#define NPDPTPG (PAGE_SIZE/(sizeof (pdpt_entry_t)))
#define PDPTSHIFT 30
#define PDPTPGSHIFT 9
-#define NBPDPT (1 << PDPTSHIFT)
+#define NBPDPT (1ULL << PDPTSHIFT)
#define PDPTMASK (NBPDPT-1)
#define PDPT_ENTRY_NULL ((pdpt_entry_t *) 0)
#define NPDPG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define PDSHIFT 21
#define PDPGSHIFT 9
-#define NBPD (1 << PDSHIFT)
+#define NBPD (1ULL << PDSHIFT)
#define PDMASK (NBPD-1)
#define PD_ENTRY_NULL ((pd_entry_t *) 0)
#define NPTPG (PAGE_SIZE/(sizeof (pt_entry_t)))
#define PTSHIFT 12
#define PTPGSHIFT 9
-#define NBPT (1 << PTSHIFT)
+#define NBPT (1ULL << PTSHIFT)
#define PTMASK (NBPT-1)
#define PT_ENTRY_NULL ((pt_entry_t *) 0)
typedef uint64_t pmap_paddr_t;
+#if DEVELOPMENT || DEBUG
+#define PMAP_ASSERT 1
+extern int pmap_asserts_enabled;
+extern int pmap_asserts_traced;
+#endif
+
+#if PMAP_ASSERT
+#define pmap_assert(ex) (pmap_asserts_enabled ? ((ex) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) : (void)0)
+
+#define pmap_assert2(ex, fmt, args...) \
+ do { \
+ if (__improbable(pmap_asserts_enabled && !(ex))) { \
+ if (pmap_asserts_traced) { \
+ KERNEL_DEBUG_CONSTANT(0xDEAD1000, __builtin_return_address(0), __LINE__, 0, 0, 0); \
+ kdebug_enable = 0; \
+ } else { \
+ kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \
+ panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \
+ } \
+ } \
+ } while(0)
+#else
+#define pmap_assert(ex)
+#define pmap_assert2(ex, fmt, args...)
+#endif
+
+/* superpages */
+#define SUPERPAGE_NBASEPAGES 512
+
/*
* Atomic 64-bit store of a page table entry.
*/
pmap_store_pte(pt_entry_t *entryp, pt_entry_t value)
{
/*
- * Load the new value into %ecx:%ebx
- * Load the old value into %edx:%eax
- * Compare-exchange-8bytes at address entryp (loaded in %edi)
- * If the compare succeeds, the new value will have been stored.
- * Otherwise, the old value changed and reloaded, so try again.
+ * In the 32-bit kernel a compare-and-exchange loop was
+ * required to provide atomicity. For K64, life is easier:
*/
- __asm__ volatile(
- " movl (%0), %%eax \n\t"
- " movl 4(%0), %%edx \n\t"
- "1: \n\t"
- " cmpxchg8b (%0) \n\t"
- " jnz 1b"
- :
- : "D" (entryp),
- "b" ((uint32_t)value),
- "c" ((uint32_t)(value >> 32))
- : "eax", "edx", "memory");
+ *entryp = value;
}
-/*
- * Atomic 64-bit compare and exchange of a page table entry.
- */
-static inline boolean_t
-pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
-{
- boolean_t ret;
-
- /*
- * Load the old value into %edx:%eax
- * Load the new value into %ecx:%ebx
- * Compare-exchange-8bytes at address entryp (loaded in %edi)
- * If the compare succeeds, the new value is stored, return TRUE.
- * Otherwise, no swap is made, return FALSE.
- */
- asm volatile(
- " lock; cmpxchg8b (%1) \n\t"
- " setz %%al \n\t"
- " movzbl %%al,%0"
- : "=a" (ret)
- : "D" (entryp),
- "a" ((uint32_t)old),
- "d" ((uint32_t)(old >> 32)),
- "b" ((uint32_t)new),
- "c" ((uint32_t)(new >> 32))
- : "memory");
- return ret;
-}
-
-#define pmap_update_pte(entryp, old, new) \
- while (!pmap_cmpx_pte((entryp), (old), (new)))
-
-
/* in 64 bit spaces, the number of each type of page in the page tables */
#define NPML4PGS (1ULL * (PAGE_SIZE/(sizeof (pml4_entry_t))))
#define NPDPTPGS (NPML4PGS * (PAGE_SIZE/(sizeof (pdpt_entry_t))))
#define NPDEPGS (NPDPTPGS * (PAGE_SIZE/(sizeof (pd_entry_t))))
#define NPTEPGS (NPDEPGS * (PAGE_SIZE/(sizeof (pt_entry_t))))
-/*
- * The 64-bit kernel is remapped in uber-space which is at the base
- * the highest 4th-level directory (KERNEL_UBER_PML4_INDEX). That is,
- * 512GB from the top of virtual space (or zero).
- */
-#define KERNEL_UBER_PML4_INDEX 511
-#define KERNEL_UBER_BASE (0ULL - NBPML4)
-#define KERNEL_UBER_BASE_HI32 ((uint32_t)(KERNEL_UBER_BASE >> 32))
+#define KERNEL_PML4_INDEX 511
+#define KERNEL_KEXTS_INDEX 510 /* Home of KEXTs - the basement */
+#define KERNEL_PHYSMAP_PML4_INDEX 509 /* virtual to physical map */
+#define KERNEL_KASAN_PML4_INDEX0 508
+#define KERNEL_KASAN_PML4_INDEX1 507
+#define KERNEL_DBLMAP_PML4_INDEX (506)
+#define KERNEL_BASE (0ULL - NBPML4)
+#define KERNEL_BASEMENT (KERNEL_BASE - NBPML4)
#define VM_WIMG_COPYBACK VM_MEM_COHERENT
+#define VM_WIMG_COPYBACKLW VM_WIMG_COPYBACK
#define VM_WIMG_DEFAULT VM_MEM_COHERENT
/* ?? intel ?? */
#define VM_WIMG_IO (VM_MEM_COHERENT | \
VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
+#define VM_WIMG_POSTED VM_WIMG_IO
#define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
/* write combining mode, aka store gather */
#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT)
-
+#define VM_WIMG_INNERWBACK VM_MEM_COHERENT
/*
* Pte related macros
*/
-#define VADDR(pdi, pti) ((vm_offset_t)(((pdi)<<PDESHIFT)|((pti)<<PTESHIFT)))
-#define VADDR64(pmi, pdi, pti) ((vm_offset_t)(((pmi)<<PLM4SHIFT))((pdi)<<PDESHIFT)|((pti)<<PTESHIFT))
+#define KVADDR(pmi, pdpi, pdi, pti) \
+ ((vm_offset_t) \
+ ((uint64_t) -1 << 47) | \
+ ((uint64_t)(pmi) << PML4SHIFT) | \
+ ((uint64_t)(pdpi) << PDPTSHIFT) | \
+ ((uint64_t)(pdi) << PDESHIFT) | \
+ ((uint64_t)(pti) << PTESHIFT))
/*
* Size of Kernel address space. This is the number of page table pages
#endif
-enum high_cpu_types {
- HIGH_CPU_ISS0,
- HIGH_CPU_ISS1,
- HIGH_CPU_DESC,
- HIGH_CPU_LDT_BEGIN,
- HIGH_CPU_LDT_END = HIGH_CPU_LDT_BEGIN + (LDTSZ / 512) - 1,
- HIGH_CPU_END
-};
-
-enum high_fixed_addresses {
- HIGH_FIXED_TRAMPS, /* must be first */
- HIGH_FIXED_TRAMPS_END,
- HIGH_FIXED_GDT,
- HIGH_FIXED_IDT,
- HIGH_FIXED_LDT_BEGIN,
- HIGH_FIXED_LDT_END = HIGH_FIXED_LDT_BEGIN + (LDTSZ / 512) - 1,
- HIGH_FIXED_KTSS,
- HIGH_FIXED_DFTSS,
- HIGH_FIXED_DBTSS,
- HIGH_FIXED_CPUS_BEGIN,
- HIGH_FIXED_CPUS_END = HIGH_FIXED_CPUS_BEGIN + (HIGH_CPU_END * MAX_CPUS) - 1,
-};
-
-
-/* XXX64 below PTDI values need cleanup */
-/*
- * The *PTDI values control the layout of virtual memory
- *
- */
-#define KPTDI (0x000)/* start of kernel virtual pde's */
-#define PTDPTDI (0x7F4) /* ptd entry that points to ptd! */
-#define APTDPTDI (0x7F8) /* alt ptd entry that points to APTD */
-#define UMAXPTDI (0x7F8) /* ptd entry for user space end */
-#define UMAXPTEOFF (NPTEPG) /* pte entry for user space end */
-
-#define KERNBASE VADDR(KPTDI,0)
-
-/*
- * Convert address offset to directory address
- * containing the page table pointer - legacy
- */
-/*#define pmap_pde(m,v) (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]))*/
-
-#define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? */
-#define pmap_index_to_virt(x) (HIGH_MEM_BASE | ((unsigned)(x) << PAGE_SHIFT))
/*
* Convert address offset to page descriptor index
*/
-#define pdenum(pmap, a) (((vm_offset_t)(a) >> PDESHIFT) & PDEMASK)
+#define pdptnum(pmap, a) (((vm_offset_t)(a) >> PDPTSHIFT) & PDPTMASK)
+#define pdenum(pmap, a) (((vm_offset_t)(a) >> PDESHIFT) & PDEMASK)
+#define PMAP_INVALID_PDPTNUM (~0ULL)
#define pdeidx(pmap, a) (((a) >> PDSHIFT) & ((1ULL<<(48 - PDSHIFT)) -1))
#define pdptidx(pmap, a) (((a) >> PDPTSHIFT) & ((1ULL<<(48 - PDPTSHIFT)) -1))
#define pml4idx(pmap, a) (((a) >> PML4SHIFT) & ((1ULL<<(48 - PML4SHIFT)) -1))
+
/*
* Convert page descriptor index to user virtual address
*/
* without using the bit fields).
*/
-#define INTEL_PTE_VALID 0x00000001
-#define INTEL_PTE_WRITE 0x00000002
-#define INTEL_PTE_RW 0x00000002
-#define INTEL_PTE_USER 0x00000004
-#define INTEL_PTE_WTHRU 0x00000008
-#define INTEL_PTE_NCACHE 0x00000010
-#define INTEL_PTE_REF 0x00000020
-#define INTEL_PTE_MOD 0x00000040
-#define INTEL_PTE_PS 0x00000080
-#define INTEL_PTE_GLOBAL 0x00000100
-#define INTEL_PTE_WIRED 0x00000200
+#define INTEL_PTE_VALID 0x00000001ULL
+#define INTEL_PTE_WRITE 0x00000002ULL
+#define INTEL_PTE_RW 0x00000002ULL
+#define INTEL_PTE_USER 0x00000004ULL
+#define INTEL_PTE_WTHRU 0x00000008ULL
+#define INTEL_PTE_NCACHE 0x00000010ULL
+#define INTEL_PTE_REF 0x00000020ULL
+#define INTEL_PTE_MOD 0x00000040ULL
+#define INTEL_PTE_PS 0x00000080ULL
+#define INTEL_PTE_PTA 0x00000080ULL
+#define INTEL_PTE_GLOBAL 0x00000100ULL
+#define INTEL_PTE_WIRED 0x00000400ULL
+#define INTEL_PDPTE_NESTED 0x00000800ULL
#define INTEL_PTE_PFN PG_FRAME
-#define INTEL_PTE_PTA 0x00000080
#define INTEL_PTE_NX (1ULL << 63)
#define INTEL_PTE_INVALID 0
+/* This is conservative, but suffices */
+#define INTEL_PTE_RSVD ((1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54))
+
+#define INTEL_PTE_COMPRESSED (1ULL << 62) /* marker, for invalid PTE only -- ignored by hardware for both regular/EPT entries*/
+#define INTEL_PTE_COMPRESSED_ALT (1ULL << 61) /* compressed but with "alternate accounting" */
+
+#define INTEL_PTE_COMPRESSED_MASK (INTEL_PTE_COMPRESSED | \
+ INTEL_PTE_COMPRESSED_ALT)
+#define PTE_IS_COMPRESSED(x) \
+ ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */ \
+ ((x) & INTEL_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \
+ ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
+ (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
+ &(x), (x), (x) & ~INTEL_PTE_COMPRESSED_MASK), FALSE)))
#define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */
#define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */
#define pte_kernel_rw(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_RW))
#define pte_kernel_ro(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID))
-#define pte_user_rw(p) ((pt_entry)t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER|INTEL_PTE_RW))
+#define pte_user_rw(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER|INTEL_PTE_RW))
#define pte_user_ro(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER))
+#define PMAP_INVEPT_SINGLE_CONTEXT 1
+
+
+#define INTEL_EPTP_AD 0x00000040ULL
+
+#define INTEL_EPT_READ 0x00000001ULL
+#define INTEL_EPT_WRITE 0x00000002ULL
+#define INTEL_EPT_EX 0x00000004ULL
+#define INTEL_EPT_IPTA 0x00000040ULL
+#define INTEL_EPT_PS 0x00000080ULL
+#define INTEL_EPT_REF 0x00000100ULL
+#define INTEL_EPT_MOD 0x00000200ULL
+
+#define INTEL_EPT_CACHE_MASK 0x00000038ULL
+#define INTEL_EPT_NCACHE 0x00000000ULL
+#define INTEL_EPT_WC 0x00000008ULL
+#define INTEL_EPT_WTHRU 0x00000020ULL
+#define INTEL_EPT_WP 0x00000028ULL
+#define INTEL_EPT_WB 0x00000030ULL
+
+/*
+ * Routines to filter correct bits depending on the pmap type
+ */
+
+static inline pt_entry_t
+pte_remove_ex(pt_entry_t pte, boolean_t is_ept)
+{
+ if (__probable(!is_ept)) {
+ return (pte | INTEL_PTE_NX);
+ }
+
+ return (pte & (~INTEL_EPT_EX));
+}
+
+static inline pt_entry_t
+pte_set_ex(pt_entry_t pte, boolean_t is_ept)
+{
+ if (__probable(!is_ept)) {
+ return (pte & (~INTEL_PTE_NX));
+ }
+
+ return (pte | INTEL_EPT_EX);
+}
+
+static inline pt_entry_t
+physmap_refmod_to_ept(pt_entry_t physmap_pte)
+{
+ pt_entry_t ept_pte = 0;
+
+ if (physmap_pte & INTEL_PTE_MOD) {
+ ept_pte |= INTEL_EPT_MOD;
+ }
+
+ if (physmap_pte & INTEL_PTE_REF) {
+ ept_pte |= INTEL_EPT_REF;
+ }
+
+ return ept_pte;
+}
+
+static inline pt_entry_t
+ept_refmod_to_physmap(pt_entry_t ept_pte)
+{
+ pt_entry_t physmap_pte = 0;
+
+ assert((ept_pte & ~(INTEL_EPT_REF | INTEL_EPT_MOD)) == 0);
+
+ if (ept_pte & INTEL_EPT_REF) {
+ physmap_pte |= INTEL_PTE_REF;
+ }
+
+ if (ept_pte & INTEL_EPT_MOD) {
+ physmap_pte |= INTEL_PTE_MOD;
+ }
+
+ return physmap_pte;
+}
+
+/*
+ * Note: Not all Intel processors support EPT referenced access and dirty bits.
+ * During pmap_init() we check the VMX capability for the current hardware
+ * and update this variable accordingly.
+ */
+extern boolean_t pmap_ept_support_ad;
+
+#define PTE_VALID_MASK(is_ept) ((is_ept) ? (INTEL_EPT_READ | INTEL_EPT_WRITE | INTEL_EPT_EX) : INTEL_PTE_VALID)
+#define PTE_READ(is_ept) ((is_ept) ? INTEL_EPT_READ : INTEL_PTE_VALID)
+#define PTE_WRITE(is_ept) ((is_ept) ? INTEL_EPT_WRITE : INTEL_PTE_WRITE)
+#define PTE_PS INTEL_PTE_PS
+#define PTE_COMPRESSED INTEL_PTE_COMPRESSED
+#define PTE_COMPRESSED_ALT INTEL_PTE_COMPRESSED_ALT
+#define PTE_NCACHE(is_ept) ((is_ept) ? INTEL_EPT_NCACHE : INTEL_PTE_NCACHE)
+#define PTE_WTHRU(is_ept) ((is_ept) ? INTEL_EPT_WTHRU : INTEL_PTE_WTHRU)
+#define PTE_REF(is_ept) ((is_ept) ? INTEL_EPT_REF : INTEL_PTE_REF)
+#define PTE_MOD(is_ept) ((is_ept) ? INTEL_EPT_MOD : INTEL_PTE_MOD)
+#define PTE_WIRED INTEL_PTE_WIRED
+
+
#define PMAP_DEFAULT_CACHE 0
#define PMAP_INHIBIT_CACHE 1
#define PMAP_GUARDED_CACHE 2
#define PMAP_ACTIVATE_CACHE 4
#define PMAP_NO_GUARD_CACHE 8
-
#ifndef ASSEMBLER
#include <sys/queue.h>
* and directories.
*/
-extern pt_entry_t PTmap[], APTmap[], Upte;
-extern pd_entry_t PTD[], APTD[], PTDpde[], APTDpde[], Upde;
+extern pt_entry_t *PTmap;
+extern pdpt_entry_t *IdlePDPT;
+extern pml4_entry_t *IdlePML4;
+extern boolean_t no_shared_cr3;
+extern pd_entry_t *IdlePTD; /* physical addr of "Idle" state PTD */
+
+extern uint64_t pmap_pv_hashlist_walks;
+extern uint64_t pmap_pv_hashlist_cnts;
+extern uint32_t pmap_pv_hashlist_max;
+extern uint32_t pmap_kernel_text_ps;
+
+#define ID_MAP_VTOP(x) ((void *)(((uint64_t)(x)) & LOW_4GB_MASK))
+
+extern uint64_t physmap_base, physmap_max;
+
+#define NPHYSMAP (MAX(K64_MAXMEM/GB + 4, 4))
+
+static inline boolean_t physmap_enclosed(addr64_t a) {
+ return (a < (NPHYSMAP * GB));
+}
+
+static inline void * PHYSMAP_PTOV_check(void *paddr) {
+ uint64_t pvaddr = (uint64_t)paddr + physmap_base;
+
+ if (__improbable(pvaddr >= physmap_max))
+ panic("PHYSMAP_PTOV bounds exceeded, 0x%qx, 0x%qx, 0x%qx",
+ pvaddr, physmap_base, physmap_max);
-extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */
-extern pdpt_entry_t *IdlePDPT;
+ return (void *)pvaddr;
+}
-extern pmap_paddr_t lo_kernel_cr3;
+#define PHYSMAP_PTOV(x) (PHYSMAP_PTOV_check((void*) (x)))
+#if MACH_KERNEL_PRIVATE
+extern uint64_t dblmap_base, dblmap_max, dblmap_dist;
-extern pml4_entry_t *IdlePML4;
-extern pdpt_entry_t *IdlePDPT64;
-extern addr64_t kernel64_cr3;
-extern boolean_t no_shared_cr3;
+static inline uint64_t DBLMAP_CHECK(uintptr_t x) {
+ uint64_t dbladdr = (uint64_t)x + dblmap_dist;
+ if (__improbable((dbladdr >= dblmap_max) || (dbladdr < dblmap_base))) {
+ panic("DBLMAP bounds exceeded, 0x%qx, 0x%qx 0x%qx, 0x%qx",
+ (uint64_t)x, dbladdr, dblmap_base, dblmap_max);
+ }
+ return dbladdr;
-extern uint64_t pmap_pv_hashlist_walks;
-extern uint64_t pmap_pv_hashlist_cnts;
-extern uint32_t pmap_pv_hashlist_max;
+}
+#define DBLMAP(x) (DBLMAP_CHECK((uint64_t) x))
+extern uint64_t ldt_alias_offset;
+static inline uint64_t LDTALIAS_CHECK(uintptr_t x) {
+ uint64_t dbladdr = (uint64_t)x + ldt_alias_offset;
+ if (__improbable((dbladdr >= dblmap_max) || (dbladdr < dblmap_base))) {
+ panic("LDTALIAS: bounds exceeded, 0x%qx, 0x%qx 0x%qx, 0x%qx",
+ (uint64_t)x, dbladdr, dblmap_base, dblmap_max);
+ }
+ return dbladdr;
+}
+#define LDTALIAS(x) (LDTALIAS_CHECK((uint64_t) x))
+#endif
/*
- * virtual address to page table entry and
- * to physical address. Likewise for alternate address space.
- * Note: these work recursively, thus vtopte of a pte will give
- * the corresponding pde that in turn maps it.
+ * For KASLR, we alias the master processor's IDT and GDT at fixed
+ * virtual addresses to defeat SIDT/SGDT address leakage.
+ * And non-boot processor's GDT aliases likewise (skipping LOWGLOBAL_ALIAS)
+ * The low global vector page is mapped at a fixed alias also.
*/
-#define vtopte(va) (PTmap + i386_btop((vm_offset_t)va))
-
+#define LOWGLOBAL_ALIAS (VM_MIN_KERNEL_ADDRESS + 0x2000)
-typedef volatile long cpu_set; /* set of CPUs - must be <= 32 */
- /* changed by other processors */
-struct md_page {
- int pv_list_count;
- TAILQ_HEAD(,pv_entry) pv_list;
-};
+/*
+ * This indicates (roughly) where there is free space for the VM
+ * to use for the heap; this does not need to be precise.
+ */
+#define KERNEL_PMAP_HEAP_RANGE_START VM_MIN_KERNEL_AND_KEXT_ADDRESS
#include <vm/vm_page.h>
*/
struct pmap {
- pd_entry_t *dirbase; /* page directory pointer */
- pmap_paddr_t pdirbase; /* phys. address of dirbase */
- vm_object_t pm_obj; /* object to hold pde's */
- int ref_count; /* reference count */
- int nx_enabled;
- task_map_t pm_task_map;
decl_simple_lock_data(,lock) /* lock on map */
+ pmap_paddr_t pm_cr3; /* Kernel+user shared PML4 physical*/
+ pmap_paddr_t pm_ucr3; /* Mirrored user PML4 physical */
+ task_map_t pm_task_map;
+ boolean_t pm_shared;
+ boolean_t pagezero_accessible;
+#define PMAP_PCID_MAX_CPUS MAX_CPUS /* Must be a multiple of 8 */
+ pcid_t pmap_pcid_cpus[PMAP_PCID_MAX_CPUS];
+ volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS];
struct pmap_statistics stats; /* map statistics */
- vm_offset_t pm_hold; /* true pdpt zalloc addr */
- pmap_paddr_t pm_cr3; /* physical addr */
- pdpt_entry_t *pm_pdpt; /* KVA of 3rd level page */
+ int ref_count; /* reference count */
+ int nx_enabled;
pml4_entry_t *pm_pml4; /* VKA of top level */
+ pml4_entry_t *pm_upml4; /* Shadow VKA of top level */
+ vm_object_t pm_obj; /* object to hold pde's */
vm_object_t pm_obj_pdpt; /* holds pdpt pages */
vm_object_t pm_obj_pml4; /* holds pml4 pages */
- vm_object_t pm_obj_top; /* holds single top level page */
- boolean_t pm_shared;
+ pmap_paddr_t pm_eptp; /* EPTP */
+ ledger_t ledger; /* ledger tracking phys mappings */
+#if MACH_ASSERT
+ int pmap_pid;
+ char pmap_procname[17];
+#endif /* MACH_ASSERT */
};
+static inline boolean_t
+is_ept_pmap(pmap_t p)
+{
+ if (__probable(p->pm_cr3 != 0)) {
+ assert(p->pm_eptp == 0);
+ return FALSE;
+ }
+
+ assert(p->pm_eptp != 0);
+
+ return TRUE;
+}
+void hv_ept_pmap_create(void **ept_pmap, void **eptp);
+
+#if NCOPY_WINDOWS > 0
#define PMAP_PDPT_FIRST_WINDOW 0
#define PMAP_PDPT_NWINDOWS 4
#define PMAP_PDE_FIRST_WINDOW (PMAP_PDPT_NWINDOWS)
extern mapwindow_t *pmap_get_mapwindow(pt_entry_t pentry);
extern void pmap_put_mapwindow(mapwindow_t *map);
-
+#endif
typedef struct pmap_memory_regions {
- ppnum_t base;
- ppnum_t end;
- ppnum_t alloc;
- uint32_t type;
+ ppnum_t base; /* first page of this region */
+ ppnum_t alloc_up; /* pages below this one have been "stolen" */
+ ppnum_t alloc_down; /* pages above this one have been "stolen" */
+ ppnum_t end; /* last page of this region */
+ uint32_t type;
+ uint64_t attribute;
} pmap_memory_region_t;
-unsigned pmap_memory_region_count;
-unsigned pmap_memory_region_current;
+extern unsigned pmap_memory_region_count;
+extern unsigned pmap_memory_region_current;
#define PMAP_MEMORY_REGIONS_SIZE 128
extern pmap_memory_region_t pmap_memory_regions[];
+#include <i386/pmap_pcid.h>
+
+static inline void
+set_dirbase(pmap_t tpmap, thread_t thread, int my_cpu) {
+ int ccpu = my_cpu;
+ uint64_t pcr3 = tpmap->pm_cr3, ucr3 = tpmap->pm_ucr3;
+ cpu_datap(ccpu)->cpu_task_cr3 = pcr3;
+ cpu_shadowp(ccpu)->cpu_task_cr3 = pcr3;
+
+ cpu_datap(ccpu)->cpu_ucr3 = ucr3;
+ cpu_shadowp(ccpu)->cpu_ucr3 = ucr3;
+
+ cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map;
-static inline void set_dirbase(pmap_t tpmap, __unused int tcpu) {
- current_cpu_datap()->cpu_task_cr3 = (pmap_paddr_t)((tpmap)->pm_cr3);
- current_cpu_datap()->cpu_task_map = tpmap->pm_task_map;
+ assert((get_preemption_level() > 0) || (ml_get_interrupts_enabled() == FALSE));
+ assert(ccpu == cpu_number());
+ /*
+ * Switch cr3 if necessary
+ * - unless running with no_shared_cr3 debugging mode
+ * and we're not on the kernel's cr3 (after pre-empted copyio)
+ */
+ boolean_t nopagezero = tpmap->pagezero_accessible;
+ boolean_t priorpagezero = cpu_datap(ccpu)->cpu_pagezero_mapped;
+ cpu_datap(ccpu)->cpu_pagezero_mapped = nopagezero;
+
+ if (__probable(!no_shared_cr3)) {
+ if (__improbable(nopagezero)) {
+ boolean_t copyio_active = ((thread->machine.specFlags & CopyIOActive) != 0);
+ if (pmap_pcid_ncpus) {
+ pmap_pcid_activate(tpmap, ccpu, TRUE, copyio_active);
+ } else {
+ if (copyio_active) {
+ if (get_cr3_base() != tpmap->pm_cr3) {
+ set_cr3_raw(tpmap->pm_cr3);
+ }
+ } else if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) {
+ set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3);
+ }
+ }
+ } else if ((get_cr3_base() != tpmap->pm_cr3) || priorpagezero) {
+ if (pmap_pcid_ncpus) {
+ pmap_pcid_activate(tpmap, ccpu, FALSE, FALSE);
+ } else {
+ set_cr3_raw(tpmap->pm_cr3);
+ }
+ }
+ } else {
+ if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3)
+ set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3);
+ }
}
/*
extern void process_pmap_updates(void);
extern void pmap_update_interrupt(void);
-/*
- * Machine dependent routines that are used only for i386/i486/i860.
- */
-
extern addr64_t (kvtophys)(
vm_offset_t addr);
-extern void pmap_expand(
+extern kern_return_t pmap_expand(
pmap_t pmap,
- vm_map_offset_t addr);
-
-extern pt_entry_t *pmap_pte(
- struct pmap *pmap,
- vm_map_offset_t addr);
-
-extern pd_entry_t *pmap_pde(
- struct pmap *pmap,
- vm_map_offset_t addr);
-
-extern pd_entry_t *pmap64_pde(
- struct pmap *pmap,
- vm_map_offset_t addr);
-
-extern pdpt_entry_t *pmap64_pdpt(
- struct pmap *pmap,
- vm_map_offset_t addr);
-
+ vm_map_offset_t addr,
+ unsigned int options);
extern vm_offset_t pmap_map(
vm_offset_t virt,
vm_map_offset_t start,
vm_map_offset_t end,
vm_prot_t prot,
unsigned int flags);
-
extern void pmap_bootstrap(
vm_offset_t load_start,
boolean_t IA32e);
struct pmap *pmap,
vm_offset_t *listp,
int space);
-
-extern void pmap_commpage32_init(
- vm_offset_t kernel,
- vm_offset_t user,
- int count);
-extern void pmap_commpage64_init(
- vm_offset_t kernel,
- vm_map_offset_t user,
- int count);
-
+extern void x86_filter_TLB_coherency_interrupts(boolean_t);
+/*
+ * Get cache attributes (as pagetable bits) for the specified phys page
+ */
+extern unsigned pmap_get_cache_attributes(ppnum_t, boolean_t is_ept);
+#if NCOPY_WINDOWS > 0
extern struct cpu_pmap *pmap_cpu_alloc(
boolean_t is_boot_cpu);
extern void pmap_cpu_free(
struct cpu_pmap *cp);
+#endif
-extern void pmap_map_block(
- pmap_t pmap,
+extern kern_return_t pmap_map_block(
+ pmap_t pmap,
addr64_t va,
ppnum_t pa,
uint32_t size,
vm_prot_t prot,
int attr,
unsigned int flags);
-
+
extern void invalidate_icache(vm_offset_t addr, unsigned cnt, int phys);
extern void flush_dcache(vm_offset_t addr, unsigned count, int phys);
extern ppnum_t pmap_find_phys(pmap_t map, addr64_t va);
extern void pmap_cpu_init(void);
extern void pmap_disable_NX(pmap_t pmap);
-extern void pmap_set_4GB_pagezero(pmap_t pmap);
-extern void pmap_clear_4GB_pagezero(pmap_t pmap);
-extern void pmap_load_kernel_cr3(void);
-extern vm_offset_t pmap_cpu_high_map_vaddr(int, enum high_cpu_types);
-extern vm_offset_t pmap_high_map_vaddr(enum high_cpu_types);
-extern vm_offset_t pmap_high_map(pt_entry_t, enum high_cpu_types);
-extern vm_offset_t pmap_cpu_high_shared_remap(int, enum high_cpu_types, vm_offset_t, int);
-extern vm_offset_t pmap_high_shared_remap(enum high_fixed_addresses, vm_offset_t, int);
-
-extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, int *, int *);
-
+extern void pt_fake_zone_init(int);
+extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *,
+ uint64_t *, int *, int *, int *);
+extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1,2));
/*
* Macros for speed.
#include <kern/spl.h>
-#if defined(PMAP_ACTIVATE_KERNEL)
-#undef PMAP_ACTIVATE_KERNEL
-#undef PMAP_DEACTIVATE_KERNEL
-#undef PMAP_ACTIVATE_USER
-#undef PMAP_DEACTIVATE_USER
-#endif
-
-#define PMAP_ACTIVATE_KERNEL(my_cpu) { \
- spl_t spl; \
- \
- spl = splhigh(); \
- if (current_cpu_datap()->cpu_tlb_invalid) \
- process_pmap_updates(); \
- splx(spl); \
-}
-
-#define PMAP_DEACTIVATE_KERNEL(my_cpu) { \
- spl_t spl; \
- \
- spl = splhigh(); \
- process_pmap_updates(); \
- splx(spl); \
-}
-
-
-#define PMAP_ACTIVATE_MAP(map, my_cpu) { \
- register pmap_t tpmap; \
+#define PMAP_ACTIVATE_MAP(map, thread, my_cpu) { \
+ pmap_t tpmap; \
\
tpmap = vm_map_pmap(map); \
- set_dirbase(tpmap, my_cpu); \
+ set_dirbase(tpmap, thread, my_cpu); \
}
-#define PMAP_DEACTIVATE_MAP(map, my_cpu) \
- if (vm_map_pmap(map)->pm_task_map == TASK_MAP_64BIT_SHARED) \
- pmap_load_kernel_cr3();
-
-
-#define PMAP_ACTIVATE_USER(th, my_cpu) { \
- spl_t spl; \
- \
- spl = splhigh(); \
- PMAP_ACTIVATE_MAP(th->map, my_cpu) \
- splx(spl); \
-}
-
-#define PMAP_DEACTIVATE_USER(th, my_cpu)
-
+#if defined(__x86_64__)
+#define PMAP_DEACTIVATE_MAP(map, thread, ccpu) \
+ pmap_assert2((pmap_pcid_ncpus ? (pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu) == (get_cr3_raw() & 0xFFF)) : TRUE),"PCIDs: 0x%x, active PCID: 0x%x, CR3: 0x%lx, pmap_cr3: 0x%llx, kernel_cr3: 0x%llx, kernel pmap cr3: 0x%llx, CPU active PCID: 0x%x, CPU kernel PCID: 0x%x, specflags: 0x%x, pagezero: 0x%x", pmap_pcid_ncpus, pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu), get_cr3_raw(), map->pmap->pm_cr3, cpu_datap(ccpu)->cpu_kernel_cr3, kernel_pmap->pm_cr3, cpu_datap(ccpu)->cpu_active_pcid, cpu_datap(ccpu)->cpu_kernel_pcid, thread->machine.specFlags, map->pmap->pagezero_accessible);
+#else
+#define PMAP_DEACTIVATE_MAP(map, thread)
+#endif
-#define PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) { \
+#if NCOPY_WINDOWS > 0
+#define PMAP_SWITCH_USER(th, new_map, my_cpu) { \
spl_t spl; \
- pt_entry_t *kpdp; \
- pt_entry_t *updp; \
- int i; \
- int need_flush; \
- \
- need_flush = 0; \
- spl = splhigh(); \
- if (old_th->map != new_th->map) { \
- PMAP_DEACTIVATE_MAP(old_th->map, my_cpu); \
- PMAP_ACTIVATE_MAP(new_th->map, my_cpu); \
- } \
- kpdp = current_cpu_datap()->cpu_copywindow_pdp; \
- for (i = 0; i < NCOPY_WINDOWS; i++) { \
- if (new_th->machine.copy_window[i].user_base != (user_addr_t)-1) { \
- updp = pmap_pde(new_th->map->pmap, \
- new_th->machine.copy_window[i].user_base);\
- pmap_store_pte(kpdp, updp ? *updp : 0); \
- } \
- kpdp++; \
- } \
+ \
+ spl = splhigh(); \
+ PMAP_DEACTIVATE_MAP(th->map, th); \
+ th->map = new_map; \
+ PMAP_ACTIVATE_MAP(th->map, th); \
splx(spl); \
- if (new_th->machine.copyio_state == WINDOWS_OPENED) \
- need_flush = 1; \
- else \
- new_th->machine.copyio_state = WINDOWS_DIRTY; \
- if (new_th->machine.physwindow_pte) { \
- pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), \
- new_th->machine.physwindow_pte); \
- if (need_flush == 0) \
- invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);\
- } \
- if (need_flush) \
- flush_tlb(); \
+ inval_copy_windows(th); \
}
-
+#else
#define PMAP_SWITCH_USER(th, new_map, my_cpu) { \
spl_t spl; \
\
spl = splhigh(); \
- PMAP_DEACTIVATE_MAP(th->map, my_cpu); \
+ PMAP_DEACTIVATE_MAP(th->map, th, my_cpu); \
th->map = new_map; \
- PMAP_ACTIVATE_MAP(th->map, my_cpu); \
+ PMAP_ACTIVATE_MAP(th->map, th, my_cpu); \
splx(spl); \
- inval_copy_windows(th); \
}
+#endif
/*
* Marking the current cpu's cr3 inactive is achieved by setting its lsb.
* Marking the current cpu's cr3 active once more involves clearng this bit.
* Note that valid page tables are page-aligned and so the bottom 12 bits
- * are noramlly zero.
+ * are normally zero, modulo PCID.
* We can only mark the current cpu active/inactive but we can test any cpu.
*/
#define CPU_CR3_MARK_INACTIVE() \
#define CPU_GET_ACTIVE_CR3(cpu) \
(cpu_datap(cpu)->cpu_active_cr3 & ~1)
+#define CPU_GET_TASK_CR3(cpu) \
+ (cpu_datap(cpu)->cpu_task_cr3)
+
+/*
+ * Mark this cpu idle, and remove it from the active set,
+ * since it is not actively using any pmap. Signal_cpus
+ * will notice that it is idle, and avoid signaling it,
+ * but will queue the update request for when the cpu
+ * becomes active.
+ */
#define MARK_CPU_IDLE(my_cpu) { \
- /* \
- * Mark this cpu idle, and remove it from the active set, \
- * since it is not actively using any pmap. Signal_cpus \
- * will notice that it is idle, and avoid signaling it, \
- * but will queue the update request for when the cpu \
- * becomes active. \
- */ \
- int s = splhigh(); \
- if (!cpu_mode_is64bit() || no_shared_cr3) \
- process_pmap_updates(); \
- else \
- pmap_load_kernel_cr3(); \
+ assert(ml_get_interrupts_enabled() == FALSE); \
CPU_CR3_MARK_INACTIVE(); \
- __asm__ volatile("mfence"); \
- splx(s); \
+ mfence(); \
}
#define MARK_CPU_ACTIVE(my_cpu) { \
- \
- int s = splhigh(); \
+ assert(ml_get_interrupts_enabled() == FALSE); \
/* \
* If a kernel_pmap update was requested while this cpu \
* was idle, process it as if we got the interrupt. \
* interrupt if this happens. \
*/ \
CPU_CR3_MARK_ACTIVE(); \
- __asm__ volatile("mfence"); \
+ mfence(); \
\
if (current_cpu_datap()->cpu_tlb_invalid) \
process_pmap_updates(); \
- splx(s); \
}
#define PMAP_CONTEXT(pmap, thread)
(((vm_offset_t) (VA)) <= vm_max_kernel_address))
+#define pmap_compressed(pmap) ((pmap)->stats.compressed)
#define pmap_resident_count(pmap) ((pmap)->stats.resident_count)
#define pmap_resident_max(pmap) ((pmap)->stats.resident_max)
#define pmap_copy(dst_pmap,src_pmap,dst_addr,len,src_addr)
#define pmap_attribute_cache_sync(addr,size,attr,value) \
(KERN_INVALID_ADDRESS)
-#define MACHINE_PMAP_IS_EMPTY 1
+#define MACHINE_PMAP_IS_EMPTY 1
extern boolean_t pmap_is_empty(pmap_t pmap,
vm_map_offset_t start,
vm_map_offset_t end);
-#endif /* ASSEMBLER */
+#define MACHINE_BOOTSTRAPPTD 1 /* Static bootstrap page-tables */
+kern_return_t
+pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t);
+#if MACH_ASSERT
+extern int pmap_stats_assert;
+#define PMAP_STATS_ASSERTF(args) \
+ MACRO_BEGIN \
+ if (pmap_stats_assert) assertf args; \
+ MACRO_END
+#else /* MACH_ASSERT */
+#define PMAP_STATS_ASSERTF(args)
+#endif /* MACH_ASSERT */
+#endif /* ASSEMBLER */
#endif /* _PMAP_MACHINE_ */
-
-
#endif /* KERNEL_PRIVATE */