X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/b0d623f7f2ae71ed96e60569f61f9a9a27016e80..a39ff7e25e19b3a8c3020042a3872ca9ec9659f1:/osfmk/i386/pmap.h diff --git a/osfmk/i386/pmap.h b/osfmk/i386/pmap.h index 9e6d65d20..3458ce7cf 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,8 +70,6 @@ #ifndef ASSEMBLER -#include - #include #include #include @@ -79,11 +77,14 @@ #include #include #include -#include +#include +#include #include #include +#include + /* * Define the generic in terms of the specific */ @@ -94,9 +95,6 @@ #define intel_ptob(x) i386_ptob(x) #define intel_round_page(x) i386_round_page(x) #define intel_trunc_page(x) i386_trunc_page(x) -#define trunc_intel_to_vm(x) trunc_i386_to_vm(x) -#define round_intel_to_vm(x) round_i386_to_vm(x) -#define vm_to_intel(x) vm_to_i386(x) /* * i386/i486/i860 Page Table Entry @@ -104,21 +102,14 @@ #endif /* ASSEMBLER */ -#define NPGPTD 4 -#define PDESHIFT 21 -#define PTEMASK 0x1ff -#define PTEINDX 3 - -#define PTESHIFT 12 - +#define NPGPTD 4ULL +#define PDESHIFT 21ULL +#define PTEMASK 0x1ffULL +#define PTEINDX 3ULL -#define INITPT_SEG_BASE 0x100000 -#define INITGDT_SEG_BASE 0x106000 -#define SLEEP_SEG_BASE 0x107000 +#define PTESHIFT 12ULL -#ifdef __x86_64__ #define LOW_4GB_MASK ((vm_offset_t)0x00000000FFFFFFFFUL) -#endif #define PDESIZE sizeof(pd_entry_t) /* for assembly files */ #define PTESIZE sizeof(pt_entry_t) /* for assembly files */ @@ -132,7 +123,7 @@ #define NBPTD (NPGPTD << PAGE_SHIFT) #define NPDEPTD (NBPTD / (sizeof (pd_entry_t))) #define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t))) -#define NBPDE (1 << PDESHIFT) +#define NBPDE (1ULL << PDESHIFT) #define PDEMASK (NBPDE - 1) #define PTE_PER_PAGE 512 /* number of PTE's per page on any level */ @@ -150,7 +141,7 @@ typedef uint64_t pdpt_entry_t; #define NPDPTPG (PAGE_SIZE/(sizeof (pdpt_entry_t))) #define PDPTSHIFT 30 #define PDPTPGSHIFT 9 -#define NBPDPT (1 << PDPTSHIFT) +#define NBPDPT (1ULL << PDPTSHIFT) #define PDPTMASK (NBPDPT-1) #define PDPT_ENTRY_NULL ((pdpt_entry_t *) 0) @@ -158,7 +149,7 @@ typedef uint64_t pd_entry_t; #define NPDPG (PAGE_SIZE/(sizeof (pd_entry_t))) #define PDSHIFT 21 #define PDPGSHIFT 9 -#define NBPD (1 << PDSHIFT) +#define NBPD (1ULL << PDSHIFT) #define PDMASK (NBPD-1) #define PD_ENTRY_NULL ((pd_entry_t *) 0) @@ -166,144 +157,83 @@ typedef uint64_t pt_entry_t; #define NPTPG (PAGE_SIZE/(sizeof (pt_entry_t))) #define PTSHIFT 12 #define PTPGSHIFT 9 -#define NBPT (1 << PTSHIFT) +#define NBPT (1ULL << PTSHIFT) #define PTMASK (NBPT-1) #define PT_ENTRY_NULL ((pt_entry_t *) 0) typedef uint64_t pmap_paddr_t; -/* superpages */ -#ifdef __x86_64__ -#define SUPERPAGE_NBASEPAGES 512 +#if DEVELOPMENT || DEBUG +#define PMAP_ASSERT 1 +extern int pmap_asserts_enabled; +extern int pmap_asserts_traced; +#endif + +#if PMAP_ASSERT +#define pmap_assert(ex) (pmap_asserts_enabled ? ((ex) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) : (void)0) + +#define pmap_assert2(ex, fmt, args...) 
\ + do { \ + if (__improbable(pmap_asserts_enabled && !(ex))) { \ + if (pmap_asserts_traced) { \ + KERNEL_DEBUG_CONSTANT(0xDEAD1000, __builtin_return_address(0), __LINE__, 0, 0, 0); \ + kdebug_enable = 0; \ + } else { \ + kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + } \ + } \ + } while(0) #else -#define SUPERPAGE_NBASEPAGES 1 /* we don't support superpages on i386 */ +#define pmap_assert(ex) +#define pmap_assert2(ex, fmt, args...) #endif +/* superpages */ +#define SUPERPAGE_NBASEPAGES 512 + /* * Atomic 64-bit store of a page table entry. */ static inline void pmap_store_pte(pt_entry_t *entryp, pt_entry_t value) { -#ifdef __i386__ - /* - * Load the new value into %ecx:%ebx - * Load the old value into %edx:%eax - * Compare-exchange-8bytes at address entryp (loaded in %edi) - * If the compare succeeds, the new value will have been stored. - * Otherwise, the old value changed and reloaded, so try again. - */ - __asm__ volatile( - " movl (%0), %%eax \n\t" - " movl 4(%0), %%edx \n\t" - "1: \n\t" - " cmpxchg8b (%0) \n\t" - " jnz 1b" - : - : "D" (entryp), - "b" ((uint32_t)value), - "c" ((uint32_t)(value >> 32)) - : "eax", "edx", "memory"); -#else /* * In the 32-bit kernel a compare-and-exchange loop was * required to provide atomicity. For K64, life is easier: */ *entryp = value; -#endif } -/* - * Atomic 64-bit compare and exchange of a page table entry. - */ -static inline boolean_t -pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) -{ - boolean_t ret; - -#ifdef __i386__ - /* - * Load the old value into %edx:%eax - * Load the new value into %ecx:%ebx - * Compare-exchange-8bytes at address entryp (loaded in %edi) - * If the compare succeeds, the new value is stored, return TRUE. - * Otherwise, no swap is made, return FALSE. - */ - asm volatile( - " lock; cmpxchg8b (%1) \n\t" - " setz %%al \n\t" - " movzbl %%al,%0" - : "=a" (ret) - : "D" (entryp), - "a" ((uint32_t)old), - "d" ((uint32_t)(old >> 32)), - "b" ((uint32_t)new), - "c" ((uint32_t)(new >> 32)) - : "memory"); -#else - /* - * Load the old value into %rax - * Load the new value into another register - * Compare-exchange-quad at address entryp - * If the compare succeeds, the new value is stored, return TRUE. - * Otherwise, no swap is made, return FALSE. - */ - asm volatile( - " lock; cmpxchgq %2,(%3) \n\t" - " setz %%al \n\t" - " movzbl %%al,%0" - : "=a" (ret) - : "a" (old), - "r" (new), - "r" (entryp) - : "memory"); -#endif - return ret; -} - -#define pmap_update_pte(entryp, old, new) \ - while (!pmap_cmpx_pte((entryp), (old), (new))) - - /* in 64 bit spaces, the number of each type of page in the page tables */ #define NPML4PGS (1ULL * (PAGE_SIZE/(sizeof (pml4_entry_t)))) #define NPDPTPGS (NPML4PGS * (PAGE_SIZE/(sizeof (pdpt_entry_t)))) #define NPDEPGS (NPDPTPGS * (PAGE_SIZE/(sizeof (pd_entry_t)))) #define NPTEPGS (NPDEPGS * (PAGE_SIZE/(sizeof (pt_entry_t)))) -#ifdef __i386__ -/* - * The 64-bit kernel is remapped in uber-space which is at the base - * the highest 4th-level directory (KERNEL_UBER_PML4_INDEX). That is, - * 512GB from the top of virtual space (or zero). 
- */
-#define KERNEL_UBER_PML4_INDEX 511
-#define KERNEL_UBER_BASE (0ULL - NBPML4)
-#define KERNEL_UBER_BASE_HI32 ((uint32_t)(KERNEL_UBER_BASE >> 32))
-#else
-#define KERNEL_PML4_INDEX 511
-#define KERNEL_KEXTS_INDEX 510 /* Home of KEXTs - the basement */
-#define KERNEL_PHYSMAP_INDEX 509 /* virtual to physical map */
+#define KERNEL_PML4_INDEX 511
+#define KERNEL_KEXTS_INDEX 510 /* Home of KEXTs - the basement */
+#define KERNEL_PHYSMAP_PML4_INDEX 509 /* virtual to physical map */
+#define KERNEL_KASAN_PML4_INDEX0 508
+#define KERNEL_KASAN_PML4_INDEX1 507
+#define KERNEL_DBLMAP_PML4_INDEX (506)
 #define KERNEL_BASE (0ULL - NBPML4)
 #define KERNEL_BASEMENT (KERNEL_BASE - NBPML4)
-#endif
 
 #define VM_WIMG_COPYBACK VM_MEM_COHERENT
+#define VM_WIMG_COPYBACKLW VM_WIMG_COPYBACK
 #define VM_WIMG_DEFAULT VM_MEM_COHERENT
 /* ?? intel ?? */
 #define VM_WIMG_IO (VM_MEM_COHERENT | \
 VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
+#define VM_WIMG_POSTED VM_WIMG_IO
 #define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
 /* write combining mode, aka store gather */
 #define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT)
-
+#define VM_WIMG_INNERWBACK VM_MEM_COHERENT
 /*
 * Pte related macros
 */
-#ifdef __i386__
-#define VADDR(pdi, pti) ((vm_offset_t)(((pdi)<<PDESHIFT)|((pti)<<PTESHIFT)))
-
-/*#define pmap_pde(m, v) (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]))*/
-
-#define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? */
-#define pmap_index_to_virt(x) (HIGH_MEM_BASE | ((unsigned)(x) << PAGE_SHIFT))
-#endif
 
 /*
 * Convert address offset to page descriptor index
@@ -385,19 +267,10 @@ enum high_fixed_addresses {
 #define pdenum(pmap, a) (((vm_offset_t)(a) >> PDESHIFT) & PDEMASK)
 #define PMAP_INVALID_PDPTNUM (~0ULL)
 
-#ifdef __i386__
 #define pdeidx(pmap, a) (((a) >> PDSHIFT) & ((1ULL<<(48 - PDSHIFT)) -1))
 #define pdptidx(pmap, a) (((a) >> PDPTSHIFT) & ((1ULL<<(48 - PDPTSHIFT)) -1))
 #define pml4idx(pmap, a) (((a) >> PML4SHIFT) & ((1ULL<<(48 - PML4SHIFT)) -1))
-#else
-#define VAMASK ((1ULL<<48)-1)
-#define pml4idx(pmap, a) ((((a) & VAMASK) >> PML4SHIFT) & \
- ((1ULL<<(48 - PML4SHIFT))-1))
-#define pdptidx(pmap, a) ((((a) & PML4MASK) >> PDPTSHIFT) & \
- ((1ULL<<(48 - PDPTSHIFT))-1))
-#define pdeidx(pmap, a) ((((a) & PML4MASK) >> PDSHIFT) & \
- ((1ULL<<(48 - PDSHIFT)) - 1))
-#endif
+
 
 /*
 * Convert page descriptor index to user virtual address
@@ -414,24 +287,38 @@ enum high_fixed_addresses {
 * Hardware pte bit definitions (to be used directly on the ptes
 * without using the bit fields).
 */
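(Orientation, reconstructed from the constants that follow -- the bit positions
below are implied by the INTEL_PTE_* values themselves, not stated anywhere in
the diff; the PFN field width comes from PG_FRAME, which is defined elsewhere.)

/*
 * x86 4-KiB PTE, as encoded by the INTEL_PTE_* constants below:
 *   bit 0  VALID (P)        bit 1  WRITE/RW       bit 2  USER
 *   bit 3  WTHRU (PWT)      bit 4  NCACHE (PCD)   bit 5  REF (A)
 *   bit 6  MOD (D)          bit 7  PS / PTA       bit 8  GLOBAL
 *   bit 10 WIRED            bit 11 PDPTE_NESTED   bits 12.. PFN (PG_FRAME)
 *   bit 61 COMPRESSED_ALT   bit 62 COMPRESSED     bit 63 NX
 * WIRED, PDPTE_NESTED and the two COMPRESSED markers are software-defined
 * bits that the MMU ignores; the remainder are architectural.
 */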
 
-#define INTEL_PTE_VALID 0x00000001
-#define INTEL_PTE_WRITE 0x00000002
-#define INTEL_PTE_RW 0x00000002
-#define INTEL_PTE_USER 0x00000004
-#define INTEL_PTE_WTHRU 0x00000008
-#define INTEL_PTE_NCACHE 0x00000010
-#define INTEL_PTE_REF 0x00000020
-#define INTEL_PTE_MOD 0x00000040
-#define INTEL_PTE_PS 0x00000080
-#define INTEL_PTE_PTA 0x00000080
-#define INTEL_PTE_GLOBAL 0x00000100
-#define INTEL_PTE_WIRED 0x00000200
-#define INTEL_PDPTE_NESTED 0x00000400
+#define INTEL_PTE_VALID 0x00000001ULL
+#define INTEL_PTE_WRITE 0x00000002ULL
+#define INTEL_PTE_RW 0x00000002ULL
+#define INTEL_PTE_USER 0x00000004ULL
+#define INTEL_PTE_WTHRU 0x00000008ULL
+#define INTEL_PTE_NCACHE 0x00000010ULL
+#define INTEL_PTE_REF 0x00000020ULL
+#define INTEL_PTE_MOD 0x00000040ULL
+#define INTEL_PTE_PS 0x00000080ULL
+#define INTEL_PTE_PTA 0x00000080ULL
+#define INTEL_PTE_GLOBAL 0x00000100ULL
+#define INTEL_PTE_WIRED 0x00000400ULL
+#define INTEL_PDPTE_NESTED 0x00000800ULL
 #define INTEL_PTE_PFN PG_FRAME
 #define INTEL_PTE_NX (1ULL << 63)
 #define INTEL_PTE_INVALID 0
+/* This is conservative, but suffices */
+#define INTEL_PTE_RSVD ((1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54))
+
+#define INTEL_PTE_COMPRESSED (1ULL << 62) /* marker, for invalid PTE only -- ignored by hardware for both regular/EPT entries*/
+#define INTEL_PTE_COMPRESSED_ALT (1ULL << 61) /* compressed but with "alternate accounting" */
+
+#define INTEL_PTE_COMPRESSED_MASK (INTEL_PTE_COMPRESSED | \
+ INTEL_PTE_COMPRESSED_ALT)
+#define PTE_IS_COMPRESSED(x) \
+ ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */ \
+ ((x) & INTEL_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
+ ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
+ (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
+ &(x), (x), (x) & ~INTEL_PTE_COMPRESSED_MASK), FALSE)))
 
 #define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */
 #define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */
@@ -439,16 +326,113 @@ enum high_fixed_addresses {
 
 #define pte_kernel_rw(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_RW))
 #define pte_kernel_ro(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID))
-#define pte_user_rw(p) ((pt_entry)t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER|INTEL_PTE_RW))
+#define pte_user_rw(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER|INTEL_PTE_RW))
 #define pte_user_ro(p) ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER))
 
+#define PMAP_INVEPT_SINGLE_CONTEXT 1
+
+
+#define INTEL_EPTP_AD 0x00000040ULL
+
+#define INTEL_EPT_READ 0x00000001ULL
+#define INTEL_EPT_WRITE 0x00000002ULL
+#define INTEL_EPT_EX 0x00000004ULL
+#define INTEL_EPT_IPTA 0x00000040ULL
+#define INTEL_EPT_PS 0x00000080ULL
+#define INTEL_EPT_REF 0x00000100ULL
+#define INTEL_EPT_MOD 0x00000200ULL
+
+#define INTEL_EPT_CACHE_MASK 0x00000038ULL
+#define INTEL_EPT_NCACHE 0x00000000ULL
+#define INTEL_EPT_WC 0x00000008ULL
+#define INTEL_EPT_WTHRU 0x00000020ULL
+#define INTEL_EPT_WP 0x00000028ULL
+#define INTEL_EPT_WB 0x00000030ULL
+
+/*
+ * Routines to filter correct bits depending on the pmap type
+ */
+
+static inline pt_entry_t
+pte_remove_ex(pt_entry_t pte, boolean_t is_ept)
+{
+ if (__probable(!is_ept)) {
+ return (pte | INTEL_PTE_NX);
+ }
+
+ return (pte & (~INTEL_EPT_EX));
+}
+
+static inline pt_entry_t
+pte_set_ex(pt_entry_t pte, boolean_t is_ept)
+{
+ if (__probable(!is_ept)) {
+ return (pte & (~INTEL_PTE_NX));
+ }
+
+ return (pte | INTEL_EPT_EX);
+}
+
+static inline pt_entry_t
+physmap_refmod_to_ept(pt_entry_t physmap_pte)
+{
+ pt_entry_t ept_pte = 0;
+
+ if (physmap_pte & INTEL_PTE_MOD) {
+ ept_pte |= INTEL_EPT_MOD;
+ }
+
+ if (physmap_pte & INTEL_PTE_REF) {
+ ept_pte |= INTEL_EPT_REF;
+ }
+
+ return ept_pte;
+}
+
+static inline pt_entry_t
+ept_refmod_to_physmap(pt_entry_t ept_pte)
+{
+ pt_entry_t physmap_pte = 0;
+
+ assert((ept_pte & ~(INTEL_EPT_REF | INTEL_EPT_MOD)) == 0);
+
+ if (ept_pte & INTEL_EPT_REF) {
+ physmap_pte |= INTEL_PTE_REF;
+ }
+
+ if (ept_pte & INTEL_EPT_MOD) {
+ physmap_pte |= INTEL_PTE_MOD;
+ }
+
+ return physmap_pte;
+}
+
+/*
+ * Note: Not all Intel processors support EPT referenced access and dirty bits.
+ * During pmap_init() we check the VMX capability for the current hardware
+ * and update this variable accordingly.
+ */
+extern boolean_t pmap_ept_support_ad;
+
+#define PTE_VALID_MASK(is_ept) ((is_ept) ? (INTEL_EPT_READ | INTEL_EPT_WRITE | INTEL_EPT_EX) : INTEL_PTE_VALID)
+#define PTE_READ(is_ept) ((is_ept) ? INTEL_EPT_READ : INTEL_PTE_VALID)
+#define PTE_WRITE(is_ept) ((is_ept) ? INTEL_EPT_WRITE : INTEL_PTE_WRITE)
+#define PTE_PS INTEL_PTE_PS
+#define PTE_COMPRESSED INTEL_PTE_COMPRESSED
+#define PTE_COMPRESSED_ALT INTEL_PTE_COMPRESSED_ALT
+#define PTE_NCACHE(is_ept) ((is_ept) ? INTEL_EPT_NCACHE : INTEL_PTE_NCACHE)
+#define PTE_WTHRU(is_ept) ((is_ept) ? INTEL_EPT_WTHRU : INTEL_PTE_WTHRU)
+#define PTE_REF(is_ept) ((is_ept) ? INTEL_EPT_REF : INTEL_PTE_REF)
+#define PTE_MOD(is_ept) ((is_ept) ? INTEL_EPT_MOD : INTEL_PTE_MOD)
+#define PTE_WIRED INTEL_PTE_WIRED
+
+
 #define PMAP_DEFAULT_CACHE 0
 #define PMAP_INHIBIT_CACHE 1
 #define PMAP_GUARDED_CACHE 2
 #define PMAP_ACTIVATE_CACHE 4
 #define PMAP_NO_GUARD_CACHE 8
 
-
 #ifndef ASSEMBLER
 
 #include 
 
@@ -458,50 +442,76 @@ enum high_fixed_addresses {
 * and directories.
 */
 
-#ifdef __i386__
-extern pt_entry_t PTmap[], APTmap[], Upte;
-extern pd_entry_t PTD[], APTD[], PTDpde[], APTDpde[], Upde;
-extern pmap_paddr_t lo_kernel_cr3;
-extern pdpt_entry_t *IdlePDPT64;
-#else
 extern pt_entry_t *PTmap;
-#endif
+extern pdpt_entry_t *IdlePDPT;
+extern pml4_entry_t *IdlePML4;
 extern boolean_t no_shared_cr3;
-extern addr64_t kernel64_cr3;
 extern pd_entry_t *IdlePTD; /* physical addr of "Idle" state PTD */
-extern pdpt_entry_t IdlePDPT[];
-extern pml4_entry_t IdlePML4[];
 
 extern uint64_t pmap_pv_hashlist_walks;
 extern uint64_t pmap_pv_hashlist_cnts;
 extern uint32_t pmap_pv_hashlist_max;
 extern uint32_t pmap_kernel_text_ps;
 
-#ifdef __i386__
-/*
- * ** i386 **
- * virtual address to page table entry and
- * to physical address. Likewise for alternate address space.
- * Note: these work recursively, thus vtopte of a pte will give
- * the corresponding pde that in turn maps it.
- */ +#define ID_MAP_VTOP(x) ((void *)(((uint64_t)(x)) & LOW_4GB_MASK)) -#define vtopte(va) (PTmap + i386_btop((vm_offset_t)va)) -#endif +extern uint64_t physmap_base, physmap_max; -#ifdef __x86_64__ -#define ID_MAP_VTOP(x) ((void *)(((uint64_t)(x)) & LOW_4GB_MASK)) +#define NPHYSMAP (MAX(K64_MAXMEM/GB + 4, 4)) + +static inline boolean_t physmap_enclosed(addr64_t a) { + return (a < (NPHYSMAP * GB)); +} + +static inline void * PHYSMAP_PTOV_check(void *paddr) { + uint64_t pvaddr = (uint64_t)paddr + physmap_base; + + if (__improbable(pvaddr >= physmap_max)) + panic("PHYSMAP_PTOV bounds exceeded, 0x%qx, 0x%qx, 0x%qx", + pvaddr, physmap_base, physmap_max); -#define PHYSMAP_BASE KVADDR(KERNEL_PHYSMAP_INDEX,0,0,0) -#define PHYSMAP_PTOV(x) ((void *)(((uint64_t)(x)) + PHYSMAP_BASE)) + return (void *)pvaddr; +} + +#define PHYSMAP_PTOV(x) (PHYSMAP_PTOV_check((void*) (x))) +#if MACH_KERNEL_PRIVATE +extern uint64_t dblmap_base, dblmap_max, dblmap_dist; + +static inline uint64_t DBLMAP_CHECK(uintptr_t x) { + uint64_t dbladdr = (uint64_t)x + dblmap_dist; + if (__improbable((dbladdr >= dblmap_max) || (dbladdr < dblmap_base))) { + panic("DBLMAP bounds exceeded, 0x%qx, 0x%qx 0x%qx, 0x%qx", + (uint64_t)x, dbladdr, dblmap_base, dblmap_max); + } + return dbladdr; + +} +#define DBLMAP(x) (DBLMAP_CHECK((uint64_t) x)) +extern uint64_t ldt_alias_offset; +static inline uint64_t LDTALIAS_CHECK(uintptr_t x) { + uint64_t dbladdr = (uint64_t)x + ldt_alias_offset; + if (__improbable((dbladdr >= dblmap_max) || (dbladdr < dblmap_base))) { + panic("LDTALIAS: bounds exceeded, 0x%qx, 0x%qx 0x%qx, 0x%qx", + (uint64_t)x, dbladdr, dblmap_base, dblmap_max); + } + return dbladdr; +} +#define LDTALIAS(x) (LDTALIAS_CHECK((uint64_t) x)) #endif -typedef volatile long cpu_set; /* set of CPUs - must be <= 32 */ - /* changed by other processors */ -struct md_page { - int pv_list_count; - TAILQ_HEAD(,pv_entry) pv_list; -}; +/* + * For KASLR, we alias the master processor's IDT and GDT at fixed + * virtual addresses to defeat SIDT/SGDT address leakage. + * And non-boot processor's GDT aliases likewise (skipping LOWGLOBAL_ALIAS) + * The low global vector page is mapped at a fixed alias also. + */ +#define LOWGLOBAL_ALIAS (VM_MIN_KERNEL_ADDRESS + 0x2000) + +/* + * This indicates (roughly) where there is free space for the VM + * to use for the heap; this does not need to be precise. + */ +#define KERNEL_PMAP_HEAP_RANGE_START VM_MIN_KERNEL_AND_KEXT_ADDRESS #include @@ -512,28 +522,45 @@ struct md_page { */ struct pmap { - pd_entry_t *dirbase; /* page directory pointer */ -#ifdef __i386__ - pmap_paddr_t pdirbase; /* phys. 
address of dirbase */ -#endif - vm_object_t pm_obj; /* object to hold pde's */ - int ref_count; /* reference count */ - int nx_enabled; - task_map_t pm_task_map; decl_simple_lock_data(,lock) /* lock on map */ + pmap_paddr_t pm_cr3; /* Kernel+user shared PML4 physical*/ + pmap_paddr_t pm_ucr3; /* Mirrored user PML4 physical */ + task_map_t pm_task_map; + boolean_t pm_shared; + boolean_t pagezero_accessible; +#define PMAP_PCID_MAX_CPUS MAX_CPUS /* Must be a multiple of 8 */ + pcid_t pmap_pcid_cpus[PMAP_PCID_MAX_CPUS]; + volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS]; struct pmap_statistics stats; /* map statistics */ -#ifdef __i386__ - vm_offset_t pm_hold; /* true pdpt zalloc addr */ -#endif - pmap_paddr_t pm_cr3; /* physical addr */ - pdpt_entry_t *pm_pdpt; /* KVA of 3rd level page */ + int ref_count; /* reference count */ + int nx_enabled; pml4_entry_t *pm_pml4; /* VKA of top level */ + pml4_entry_t *pm_upml4; /* Shadow VKA of top level */ + vm_object_t pm_obj; /* object to hold pde's */ vm_object_t pm_obj_pdpt; /* holds pdpt pages */ vm_object_t pm_obj_pml4; /* holds pml4 pages */ - vm_object_t pm_obj_top; /* holds single top level page */ - boolean_t pm_shared; + pmap_paddr_t pm_eptp; /* EPTP */ + ledger_t ledger; /* ledger tracking phys mappings */ +#if MACH_ASSERT + int pmap_pid; + char pmap_procname[17]; +#endif /* MACH_ASSERT */ }; +static inline boolean_t +is_ept_pmap(pmap_t p) +{ + if (__probable(p->pm_cr3 != 0)) { + assert(p->pm_eptp == 0); + return FALSE; + } + + assert(p->pm_eptp != 0); + + return TRUE; +} + +void hv_ept_pmap_create(void **ept_pmap, void **eptp); #if NCOPY_WINDOWS > 0 #define PMAP_PDPT_FIRST_WINDOW 0 @@ -565,10 +592,12 @@ extern void pmap_put_mapwindow(mapwindow_t *map); #endif typedef struct pmap_memory_regions { - ppnum_t base; - ppnum_t end; - ppnum_t alloc; - uint32_t type; + ppnum_t base; /* first page of this region */ + ppnum_t alloc_up; /* pages below this one have been "stolen" */ + ppnum_t alloc_down; /* pages above this one have been "stolen" */ + ppnum_t end; /* last page of this region */ + uint32_t type; + uint64_t attribute; } pmap_memory_region_t; extern unsigned pmap_memory_region_count; @@ -577,25 +606,56 @@ extern unsigned pmap_memory_region_current; #define PMAP_MEMORY_REGIONS_SIZE 128 extern pmap_memory_region_t pmap_memory_regions[]; +#include static inline void -set_dirbase(pmap_t tpmap, __unused thread_t thread) { - current_cpu_datap()->cpu_task_cr3 = tpmap->pm_cr3; - current_cpu_datap()->cpu_task_map = tpmap->pm_task_map; -#ifndef __i386__ +set_dirbase(pmap_t tpmap, thread_t thread, int my_cpu) { + int ccpu = my_cpu; + uint64_t pcr3 = tpmap->pm_cr3, ucr3 = tpmap->pm_ucr3; + cpu_datap(ccpu)->cpu_task_cr3 = pcr3; + cpu_shadowp(ccpu)->cpu_task_cr3 = pcr3; + + cpu_datap(ccpu)->cpu_ucr3 = ucr3; + cpu_shadowp(ccpu)->cpu_ucr3 = ucr3; + + cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map; + + assert((get_preemption_level() > 0) || (ml_get_interrupts_enabled() == FALSE)); + assert(ccpu == cpu_number()); /* * Switch cr3 if necessary * - unless running with no_shared_cr3 debugging mode * and we're not on the kernel's cr3 (after pre-empted copyio) */ - if (!no_shared_cr3) { - if (get_cr3() != tpmap->pm_cr3) - set_cr3(tpmap->pm_cr3); + boolean_t nopagezero = tpmap->pagezero_accessible; + boolean_t priorpagezero = cpu_datap(ccpu)->cpu_pagezero_mapped; + cpu_datap(ccpu)->cpu_pagezero_mapped = nopagezero; + + if (__probable(!no_shared_cr3)) { + if (__improbable(nopagezero)) { + boolean_t copyio_active = ((thread->machine.specFlags & 
CopyIOActive) != 0); + if (pmap_pcid_ncpus) { + pmap_pcid_activate(tpmap, ccpu, TRUE, copyio_active); + } else { + if (copyio_active) { + if (get_cr3_base() != tpmap->pm_cr3) { + set_cr3_raw(tpmap->pm_cr3); + } + } else if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) { + set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3); + } + } + } else if ((get_cr3_base() != tpmap->pm_cr3) || priorpagezero) { + if (pmap_pcid_ncpus) { + pmap_pcid_activate(tpmap, ccpu, FALSE, FALSE); + } else { + set_cr3_raw(tpmap->pm_cr3); + } + } } else { - if (get_cr3() != current_cpu_datap()->cpu_kernel_cr3) - set_cr3(current_cpu_datap()->cpu_kernel_cr3); + if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) + set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3); } -#endif } /* @@ -605,33 +665,13 @@ set_dirbase(pmap_t tpmap, __unused thread_t thread) { extern void process_pmap_updates(void); extern void pmap_update_interrupt(void); -/* - * Machine dependent routines that are used only for i386/i486/i860. - */ - extern addr64_t (kvtophys)( vm_offset_t addr); -extern void pmap_expand( +extern kern_return_t pmap_expand( pmap_t pmap, - vm_map_offset_t addr); - -extern pt_entry_t *pmap_pte( - struct pmap *pmap, - vm_map_offset_t addr); - -extern pd_entry_t *pmap_pde( - struct pmap *pmap, - vm_map_offset_t addr); - -extern pd_entry_t *pmap64_pde( - struct pmap *pmap, - vm_map_offset_t addr); - -extern pdpt_entry_t *pmap64_pdpt( - struct pmap *pmap, - vm_map_offset_t addr); - + vm_map_offset_t addr, + unsigned int options); extern vm_offset_t pmap_map( vm_offset_t virt, vm_map_offset_t start, @@ -645,7 +685,6 @@ extern vm_offset_t pmap_map_bd( vm_map_offset_t end, vm_prot_t prot, unsigned int flags); - extern void pmap_bootstrap( vm_offset_t load_start, boolean_t IA32e); @@ -657,19 +696,11 @@ extern int pmap_list_resident_pages( struct pmap *pmap, vm_offset_t *listp, int space); - -#ifdef __i386__ -extern void pmap_commpage32_init( - vm_offset_t kernel, - vm_offset_t user, - int count); -extern void pmap_commpage64_init( - vm_offset_t kernel, - vm_map_offset_t user, - int count); - -#endif - +extern void x86_filter_TLB_coherency_interrupts(boolean_t); +/* + * Get cache attributes (as pagetable bits) for the specified phys page + */ +extern unsigned pmap_get_cache_attributes(ppnum_t, boolean_t is_ept); #if NCOPY_WINDOWS > 0 extern struct cpu_pmap *pmap_cpu_alloc( boolean_t is_boot_cpu); @@ -677,35 +708,26 @@ extern void pmap_cpu_free( struct cpu_pmap *cp); #endif -extern void pmap_map_block( - pmap_t pmap, +extern kern_return_t pmap_map_block( + pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags); - + extern void invalidate_icache(vm_offset_t addr, unsigned cnt, int phys); extern void flush_dcache(vm_offset_t addr, unsigned count, int phys); extern ppnum_t pmap_find_phys(pmap_t map, addr64_t va); extern void pmap_cpu_init(void); extern void pmap_disable_NX(pmap_t pmap); -#ifdef __i386__ -extern void pmap_set_4GB_pagezero(pmap_t pmap); -extern void pmap_clear_4GB_pagezero(pmap_t pmap); -extern void pmap_load_kernel_cr3(void); -extern vm_offset_t pmap_cpu_high_map_vaddr(int, enum high_cpu_types); -extern vm_offset_t pmap_high_map_vaddr(enum high_cpu_types); -extern vm_offset_t pmap_high_map(pt_entry_t, enum high_cpu_types); -extern vm_offset_t pmap_cpu_high_shared_remap(int, enum high_cpu_types, vm_offset_t, int); -extern vm_offset_t pmap_high_shared_remap(enum high_fixed_addresses, vm_offset_t, int); -#endif - -extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t 
*, vm_size_t *, int *, int *);
-
+extern void pt_fake_zone_init(int);
+extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *,
+ uint64_t *, int *, int *, int *);
+extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1,2));
 
 /*
 * Macros for speed.
@@ -714,75 +736,22 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
 
 #include 
 
-
-#define PMAP_ACTIVATE_MAP(map, thread) { \
- register pmap_t tpmap; \
+ 
+#define PMAP_ACTIVATE_MAP(map, thread, my_cpu) { \
+ pmap_t tpmap; \
 \
 tpmap = vm_map_pmap(map); \
- set_dirbase(tpmap, thread); \
+ set_dirbase(tpmap, thread, my_cpu); \
 }
 
-#ifdef __i386__
-#define PMAP_DEACTIVATE_MAP(map, thread) \
- if (vm_map_pmap(map)->pm_task_map == TASK_MAP_64BIT_SHARED) \
- pmap_load_kernel_cr3();
+#if defined(__x86_64__)
+#define PMAP_DEACTIVATE_MAP(map, thread, ccpu) \
+ pmap_assert2((pmap_pcid_ncpus ? (pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu) == (get_cr3_raw() & 0xFFF)) : TRUE),"PCIDs: 0x%x, active PCID: 0x%x, CR3: 0x%lx, pmap_cr3: 0x%llx, kernel_cr3: 0x%llx, kernel pmap cr3: 0x%llx, CPU active PCID: 0x%x, CPU kernel PCID: 0x%x, specflags: 0x%x, pagezero: 0x%x", pmap_pcid_ncpus, pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu), get_cr3_raw(), map->pmap->pm_cr3, cpu_datap(ccpu)->cpu_kernel_cr3, kernel_pmap->pm_cr3, cpu_datap(ccpu)->cpu_active_pcid, cpu_datap(ccpu)->cpu_kernel_pcid, thread->machine.specFlags, map->pmap->pagezero_accessible);
 #else
-#define PMAP_DEACTIVATE_MAP(map, my_cpu)
+#define PMAP_DEACTIVATE_MAP(map, thread)
 #endif
 
-#if defined(__i386__)
-
-#define PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) { \
- spl_t spl; \
- pt_entry_t *kpdp; \
- pt_entry_t *updp; \
- int i; \
- int need_flush; \
- \
- need_flush = 0; \
- spl = splhigh(); \
- if ((old_th->map != new_th->map) || (new_th->task != old_th->task)) { \
- PMAP_DEACTIVATE_MAP(old_th->map, old_th); \
- PMAP_ACTIVATE_MAP(new_th->map, new_th); \
- } \
- kpdp = current_cpu_datap()->cpu_copywindow_pdp; \
- for (i = 0; i < NCOPY_WINDOWS; i++) { \
- if (new_th->machine.copy_window[i].user_base != (user_addr_t)-1) { \
- updp = pmap_pde(new_th->map->pmap, \
- new_th->machine.copy_window[i].user_base);\
- pmap_store_pte(kpdp, updp ? *updp : 0); \
- } \
- kpdp++; \
- } \
- splx(spl); \
- if (new_th->machine.copyio_state == WINDOWS_OPENED) \
- need_flush = 1; \
- else \
- new_th->machine.copyio_state = WINDOWS_DIRTY; \
- if (new_th->machine.physwindow_pte) { \
- pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), \
- new_th->machine.physwindow_pte); \
- if (need_flush == 0) \
- invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);\
- } \
- if (need_flush) \
- flush_tlb(); \
-}
-
-#else /* __x86_64__ */
-#define PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) { \
- spl_t spl; \
- \
- spl = splhigh(); \
- if (old_th->map != new_th->map) { \
- PMAP_DEACTIVATE_MAP(old_th->map, old_th); \
- PMAP_ACTIVATE_MAP(new_th->map, new_th); \
- } \
- splx(spl); \
-}
-#endif /* __i386__ */
-
-#ifdef __i386__
+#if NCOPY_WINDOWS > 0
 #define PMAP_SWITCH_USER(th, new_map, my_cpu) { \
 spl_t spl; \
 \
@@ -791,16 +760,16 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
 th->map = new_map; \
 PMAP_ACTIVATE_MAP(th->map, th); \
 splx(spl); \
- inval_copy_windows(th); \
+ inval_copy_windows(th); \
 }
 #else
 #define PMAP_SWITCH_USER(th, new_map, my_cpu) { \
 spl_t spl; \
 \
 spl = splhigh(); \
- PMAP_DEACTIVATE_MAP(th->map, th); \
+ PMAP_DEACTIVATE_MAP(th->map, th, my_cpu); \
 th->map = new_map; \
- PMAP_ACTIVATE_MAP(th->map, th); \
+ PMAP_ACTIVATE_MAP(th->map, th, my_cpu); \
 splx(spl); \
 }
 #endif
 
@@ -809,7 +778,7 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
 * Marking the current cpu's cr3 inactive is achieved by setting its lsb.
 * Marking the current cpu's cr3 active once more involves clearing this bit.
 * Note that valid page tables are page-aligned and so the bottom 12 bits
- * are noramlly zero.
+ * are normally zero, modulo PCID.
 * We can only mark the current cpu active/inactive but we can test any cpu.
 */
 #define CPU_CR3_MARK_INACTIVE() \
@@ -834,36 +803,14 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
 * but will queue the update request for when the cpu
 * becomes active.
 */
-#if defined(__x86_64__)
-#define MARK_CPU_IDLE(my_cpu) { \
- int s = splhigh(); \
- CPU_CR3_MARK_INACTIVE(); \
- __asm__ volatile("mfence"); \
- splx(s); \
-}
-#else /* __i386__ native */
 #define MARK_CPU_IDLE(my_cpu) { \
- /* \
- * Mark this cpu idle, and remove it from the active set, \
- * since it is not actively using any pmap. Signal_cpus \
- * will notice that it is idle, and avoid signaling it, \
- * but will queue the update request for when the cpu \
- * becomes active. \
- */ \
- int s = splhigh(); \
- if (!cpu_mode_is64bit() || no_shared_cr3) \
- process_pmap_updates(); \
- else \
- pmap_load_kernel_cr3(); \
+ assert(ml_get_interrupts_enabled() == FALSE); \
 CPU_CR3_MARK_INACTIVE(); \
- __asm__ volatile("mfence"); \
- splx(s); \
+ mfence(); \
 }
-#endif /* __i386__ */
 
 #define MARK_CPU_ACTIVE(my_cpu) { \
- \
- int s = splhigh(); \
+ assert(ml_get_interrupts_enabled() == FALSE); \
 /* \
 * If a kernel_pmap update was requested while this cpu \
 * was idle, process it as if we got the interrupt. \
@@ -875,11 +822,10 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
 * interrupt if this happens. \
 */ \
 CPU_CR3_MARK_ACTIVE(); \
- __asm__ volatile("mfence"); \
+ mfence(); \
 \
 if (current_cpu_datap()->cpu_tlb_invalid) \
 process_pmap_updates(); \
- splx(s); \
 }
 
 #define PMAP_CONTEXT(pmap, thread)
 
@@ -889,6 +835,7 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
 (((vm_offset_t) (VA)) <= vm_max_kernel_address))
 
 
+#define pmap_compressed(pmap) ((pmap)->stats.compressed)
 #define pmap_resident_count(pmap) ((pmap)->stats.resident_count)
 #define pmap_resident_max(pmap) ((pmap)->stats.resident_max)
 #define pmap_copy(dst_pmap,src_pmap,dst_addr,len,src_addr)
 #define pmap_attribute(pmap,addr,size,attr,value) \
 (KERN_INVALID_ADDRESS)
 
 #define pmap_attribute_cache_sync(addr,size,attr,value) \
 (KERN_INVALID_ADDRESS)
 
-#define MACHINE_PMAP_IS_EMPTY 1
+#define MACHINE_PMAP_IS_EMPTY 1
 extern boolean_t pmap_is_empty(pmap_t pmap,
 vm_map_offset_t start,
 vm_map_offset_t end);
 
+#define MACHINE_BOOTSTRAPPTD 1 /* Static bootstrap page-tables */
 
-#endif /* ASSEMBLER */
-
+kern_return_t
+pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t);
 
+#if MACH_ASSERT
+extern int pmap_stats_assert;
+#define PMAP_STATS_ASSERTF(args) \
+ MACRO_BEGIN \
+ if (pmap_stats_assert) assertf args; \
+ MACRO_END
+#else /* MACH_ASSERT */
+#define PMAP_STATS_ASSERTF(args)
+#endif /* MACH_ASSERT */
+#endif /* ASSEMBLER */
 #endif /* _PMAP_MACHINE_ */
-
-
 #endif /* KERNEL_PRIVATE */
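Appended sketch (editorial, not part of the diff): how the is_ept-parameterized
selectors and pmap_store_pte() introduced above are meant to compose. The
function name and surrounding usage here are illustrative assumptions only;
the real clients of these macros live in osfmk/i386/pmap_internal.h and
osfmk/i386/pmap.c.

/* Sketch: clear the ref/mod bits of a live PTE for either pmap flavor. */
static inline void
sketch_pte_clear_refmod(pt_entry_t *ptep, boolean_t is_ept)
{
	pt_entry_t pte = *ptep;

	/* Invalid entries carry no hardware ref/mod state. */
	if ((pte & PTE_VALID_MASK(is_ept)) == 0)
		return;

	/* EPT A/D bits exist only when pmap_ept_support_ad is set (see above). */
	if (is_ept && !pmap_ept_support_ad)
		return;

	/* Select the accessed/dirty bits appropriate to the pmap type... */
	pte &= ~(PTE_REF(is_ept) | PTE_MOD(is_ept));
	/* ...and publish the entry with the atomic 64-bit store defined earlier. */
	pmap_store_pte(ptep, pte);
	/* A real caller must still shoot down stale TLB entries,
	   cf. pmap_update_interrupt() above. */
}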