]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/i386/pmap.h
xnu-3248.40.184.tar.gz
[apple/xnu.git] / osfmk / i386 / pmap.h
index f2a7c95116f0ade3f1b871b8d225211fb784e0b0..939e47174605d98a377a3a33963f02104a7b63e4 100644 (file)
@@ -1,23 +1,29 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
  * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
  * 
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  *
  *     Machine-dependent structures for the physical map module.
  */
-
+#ifdef KERNEL_PRIVATE
 #ifndef        _PMAP_MACHINE_
 #define _PMAP_MACHINE_ 1
 
 #ifndef        ASSEMBLER
 
-#include <platforms.h>
-#include <mp_v1_1.h>
 
 #include <mach/kern_return.h>
 #include <mach/machine/vm_types.h>
 #include <mach/vm_statistics.h>
 #include <mach/machine/vm_param.h>
 #include <kern/kern_types.h>
-#include <kern/thread_act.h>
-#include <kern/lock.h>
+#include <kern/thread.h>
+#include <kern/simple_lock.h>
+#include <mach/branch_predicates.h>
+
+#include <i386/mp.h>
+#include <i386/proc_reg.h>
+
+#include <i386/pal_routines.h>
 
 /*
  *     Define the generic in terms of the specific
  *     i386/i486/i860 Page Table Entry
  */
 
-typedef unsigned int   pt_entry_t;
+#endif /* ASSEMBLER */
+
+#define NPGPTD          4ULL
+#define PDESHIFT        21ULL
+#define PTEMASK         0x1ffULL
+#define PTEINDX         3ULL
+
+#define PTESHIFT        12ULL
+
+
+#ifdef __x86_64__
+#define LOW_4GB_MASK   ((vm_offset_t)0x00000000FFFFFFFFUL)
+#endif
+
+#define PDESIZE                sizeof(pd_entry_t) /* for assembly files */
+#define PTESIZE                sizeof(pt_entry_t) /* for assembly files */
+
+#define INTEL_OFFMASK  (I386_PGBYTES - 1)
+#define INTEL_LOFFMASK (I386_LPGBYTES - 1)
+#define PG_FRAME        0x000FFFFFFFFFF000ULL
+#define NPTEPG          (PAGE_SIZE/(sizeof (pt_entry_t)))
+#define NPTDPG          (PAGE_SIZE/(sizeof (pd_entry_t)))
+
+#define NBPTD           (NPGPTD << PAGE_SHIFT)
+#define NPDEPTD         (NBPTD / (sizeof (pd_entry_t)))
+#define NPDEPG          (PAGE_SIZE/(sizeof (pd_entry_t)))
+#define NBPDE           (1ULL << PDESHIFT)
+#define PDEMASK         (NBPDE - 1)
+
+#define PTE_PER_PAGE   512 /* number of PTE's per page on any level */
+
+ /* cleanly define parameters for all the page table levels */
+typedef uint64_t        pml4_entry_t;
+#define NPML4PG         (PAGE_SIZE/(sizeof (pml4_entry_t)))
+#define PML4SHIFT       39
+#define PML4PGSHIFT     9
+#define NBPML4          (1ULL << PML4SHIFT)
+#define PML4MASK        (NBPML4-1)
+#define PML4_ENTRY_NULL ((pml4_entry_t *) 0)
+
+typedef uint64_t        pdpt_entry_t;
+#define NPDPTPG         (PAGE_SIZE/(sizeof (pdpt_entry_t)))
+#define PDPTSHIFT       30
+#define PDPTPGSHIFT     9
+#define NBPDPT          (1ULL << PDPTSHIFT)
+#define PDPTMASK        (NBPDPT-1)
+#define PDPT_ENTRY_NULL ((pdpt_entry_t *) 0)
+
+typedef uint64_t        pd_entry_t;
+#define NPDPG           (PAGE_SIZE/(sizeof (pd_entry_t)))
+#define PDSHIFT         21
+#define PDPGSHIFT       9
+#define NBPD            (1ULL << PDSHIFT)
+#define PDMASK          (NBPD-1)
+#define PD_ENTRY_NULL   ((pd_entry_t *) 0)
+
+typedef uint64_t        pt_entry_t;
+#define NPTPG           (PAGE_SIZE/(sizeof (pt_entry_t)))
+#define PTSHIFT         12
+#define PTPGSHIFT       9
+#define NBPT            (1ULL << PTSHIFT)
+#define PTMASK          (NBPT-1)
 #define PT_ENTRY_NULL  ((pt_entry_t *) 0)
 
-#endif /* ASSEMBLER */
+typedef uint64_t  pmap_paddr_t;
+
+#if    DEBUG
+#define PMAP_ASSERT 1
+#endif
+#if PMAP_ASSERT
+#define        pmap_assert(ex) ((ex) ? (void)0 : Assert(__FILE__, __LINE__, # ex))
+
+#define pmap_assert2(ex, fmt, args...)                                 \
+       do {                                                            \
+               if (!(ex)) {                                            \
+                       kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0),  ##args); \
+                       panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0),  ##args);           \
+               }                                                       \
+       } while(0)
+#else
+#define pmap_assert(ex)
+#define pmap_assert2(ex, fmt, args...)
+#endif
+
+/* superpages */
+#ifdef __x86_64__
+#define SUPERPAGE_NBASEPAGES 512
+#else
+#define SUPERPAGE_NBASEPAGES 1 /* we don't support superpages on i386 */
+#endif
 
-#define INTEL_OFFMASK  0xfff   /* offset within page */
-#define PDESHIFT       22      /* page descriptor shift */
-#define PDEMASK                0x3ff   /* mask for page descriptor index */
-#define PTESHIFT       12      /* page table shift */
-#define PTEMASK                0x3ff   /* mask for page table index */
+/*
+ * Atomic 64-bit store of a page table entry.
+ *
+ * Writes `value` to the entry at `entryp` with a single plain assignment;
+ * no lock or compare-and-exchange is used in this routine.
+ * NOTE(review): atomicity therefore depends on the compiler emitting one
+ * aligned 64-bit store for the assignment below -- confirm.
+ */
+static inline void
+pmap_store_pte(pt_entry_t *entryp, pt_entry_t value)
+{
+       /*
+        * In the 32-bit kernel a compare-and-exchange loop was
+        * required to provide atomicity. For K64, life is easier:
+        */
+       *entryp = value;
+}
 
+/* in 64 bit spaces, the number of each type of page in the page tables */
+#define NPML4PGS        (1ULL * (PAGE_SIZE/(sizeof (pml4_entry_t))))
+#define NPDPTPGS        (NPML4PGS * (PAGE_SIZE/(sizeof (pdpt_entry_t))))
+#define NPDEPGS         (NPDPTPGS * (PAGE_SIZE/(sizeof (pd_entry_t))))
+#define NPTEPGS         (NPDEPGS * (PAGE_SIZE/(sizeof (pt_entry_t))))
+
+#define KERNEL_PML4_INDEX              511
+#define KERNEL_KEXTS_INDEX     510     /* Home of KEXTs - the basement */
+#define KERNEL_PHYSMAP_PML4_INDEX      509     /* virtual to physical map */ 
+#define KERNEL_BASE            (0ULL - NBPML4)
+#define KERNEL_BASEMENT                (KERNEL_BASE - NBPML4)
+
+#define        VM_WIMG_COPYBACK        VM_MEM_COHERENT
+#define        VM_WIMG_COPYBACKLW      VM_WIMG_COPYBACK
+#define        VM_WIMG_DEFAULT         VM_MEM_COHERENT
+/* ?? intel ?? */
+#define VM_WIMG_IO             (VM_MEM_COHERENT |      \
+                               VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
+#define VM_WIMG_WTHRU          (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
+/* write combining mode, aka store gather */
+#define VM_WIMG_WCOMB          (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) 
+#define        VM_WIMG_INNERWBACK      VM_MEM_COHERENT
 /*
- *     Convert kernel virtual address to linear address
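+ * Pte related macros
+ *
+ * KVADDR() builds a kernel virtual address from a PML4 index, a PDPT
+ * index, a page directory index and a page table index; bits 47 and
+ * above of the result are forced to one.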
  */
+#define KVADDR(pmi, pdpi, pdi, pti)              \
+        ((vm_offset_t)                   \
+               ((uint64_t) -1    << 47)        | \
+               ((uint64_t)(pmi)  << PML4SHIFT) | \
+               ((uint64_t)(pdpi) << PDPTSHIFT) | \
+               ((uint64_t)(pdi)  << PDESHIFT)  | \
+               ((uint64_t)(pti)  << PTESHIFT))
+
+/*
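+ * Size of Kernel address space.  This is the number of page table pages
+ * (4MB each) to use for the kernel.  256 pages == 1 Gigabyte.
+ * NOTE(review): the "4MB each" figure corresponds to a 22-bit PDESHIFT;
+ * with PDESHIFT defined as 21 above (NBPDE == 2MB), one page table page
+ * spans 2MB -- confirm and update the figures.
+ * This **MUST** be a multiple of 4 (eg: 252, 256, 260, etc).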
+ */
+#ifndef KVA_PAGES
+#define KVA_PAGES      1024
+#endif
+
+#ifndef NKPT
+#define        NKPT            500     /* actual number of kernel page tables */
+#endif
+#ifndef NKPDE
+#define NKPDE  (KVA_PAGES - 1) /* addressable number of page tables/pde's */
+#endif
+
 
-#define kvtolinear(a)  ((a)+LINEAR_KERNEL_ADDRESS)
 
 /*
  *     Convert address offset to page descriptor index
  */
-#define pdenum(pmap, a)        (((((pmap) == kernel_pmap) ?    \
-                          kvtolinear(a) : (a))         \
-                         >> PDESHIFT) & PDEMASK)
+#define pdptnum(pmap, a) (((vm_offset_t)(a) >> PDPTSHIFT) & PDPTMASK)
+#define pdenum(pmap, a)        (((vm_offset_t)(a) >> PDESHIFT) & PDEMASK)
+#define PMAP_INVALID_PDPTNUM (~0ULL)
+
+#define pdeidx(pmap, a)    (((a) >> PDSHIFT)   & ((1ULL<<(48 - PDSHIFT)) -1))
+#define pdptidx(pmap, a)   (((a) >> PDPTSHIFT) & ((1ULL<<(48 - PDPTSHIFT)) -1))
+#define pml4idx(pmap, a)   (((a) >> PML4SHIFT) & ((1ULL<<(48 - PML4SHIFT)) -1))
+
 
 /*
  *     Convert page descriptor index to user virtual address
@@ -126,320 +279,505 @@ typedef unsigned int    pt_entry_t;
 /*
  *     Convert address offset to page table index
  */
-#define ptenum(a)      (((a) >> PTESHIFT) & PTEMASK)
-
-#define NPTES  (intel_ptob(1)/sizeof(pt_entry_t))
-#define NPDES  (intel_ptob(1)/sizeof(pt_entry_t))
+#define ptenum(a)      (((vm_offset_t)(a) >> PTESHIFT) & PTEMASK)
 
 /*
  *     Hardware pte bit definitions (to be used directly on the ptes
  *     without using the bit fields).
  */
 
-#define INTEL_PTE_VALID                0x00000001
-#define INTEL_PTE_WRITE                0x00000002
-#define INTEL_PTE_USER         0x00000004
-#define INTEL_PTE_WTHRU                0x00000008
-#define INTEL_PTE_NCACHE       0x00000010
-#define INTEL_PTE_REF          0x00000020
-#define INTEL_PTE_MOD          0x00000040
-#define INTEL_PTE_WIRED                0x00000200
-#define INTEL_PTE_PFN          0xfffff000
-
-#define        pa_to_pte(a)            ((a) & INTEL_PTE_PFN)
-#define        pte_to_pa(p)            ((p) & INTEL_PTE_PFN)
+#define INTEL_PTE_VALID                0x00000001ULL
+#define INTEL_PTE_WRITE                0x00000002ULL
+#define INTEL_PTE_RW           0x00000002ULL
+#define INTEL_PTE_USER         0x00000004ULL
+#define INTEL_PTE_WTHRU                0x00000008ULL
+#define INTEL_PTE_NCACHE       0x00000010ULL
+#define INTEL_PTE_REF          0x00000020ULL
+#define INTEL_PTE_MOD          0x00000040ULL
+#define INTEL_PTE_PS           0x00000080ULL
+#define INTEL_PTE_PTA          0x00000080ULL
+#define INTEL_PTE_GLOBAL       0x00000100ULL
+#define INTEL_PTE_WIRED                0x00000400ULL
+#define INTEL_PDPTE_NESTED     0x00000800ULL
+#define INTEL_PTE_PFN          PG_FRAME
+
+#define INTEL_PTE_NX           (1ULL << 63)
+
+#define INTEL_PTE_INVALID       0
+/* This is conservative, but suffices */
+#define INTEL_PTE_RSVD         ((1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54))
+
+#define INTEL_COMPRESSED       (1ULL << 62) /* marker, for invalid PTE only -- ignored by hardware for both regular/EPT entries*/
+
+#define        pa_to_pte(a)            ((a) & INTEL_PTE_PFN) /* XXX */
+#define        pte_to_pa(p)            ((p) & INTEL_PTE_PFN) /* XXX */
 #define        pte_increment_pa(p)     ((p) += INTEL_OFFMASK+1)
 
+#define pte_kernel_rw(p)          ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_RW))
+#define pte_kernel_ro(p)          ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID))
+#define pte_user_rw(p)            ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER|INTEL_PTE_RW))
+#define pte_user_ro(p)            ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER))
+
+#define PMAP_INVEPT_SINGLE_CONTEXT     1
+
+
+#define INTEL_EPTP_AD          0x00000040ULL
+
+#define INTEL_EPT_READ         0x00000001ULL
+#define INTEL_EPT_WRITE        0x00000002ULL
+#define INTEL_EPT_EX           0x00000004ULL
+#define INTEL_EPT_IPTA         0x00000040ULL
+#define INTEL_EPT_PS           0x00000080ULL
+#define INTEL_EPT_REF          0x00000100ULL
+#define INTEL_EPT_MOD          0x00000200ULL
+
+#define INTEL_EPT_CACHE_MASK   0x00000038ULL
+#define INTEL_EPT_NCACHE       0x00000000ULL
+#define INTEL_EPT_WC           0x00000008ULL
+#define INTEL_EPT_WTHRU        0x00000020ULL
+#define INTEL_EPT_WP           0x00000028ULL
+#define INTEL_EPT_WB           0x00000030ULL
+
 /*
- *     Convert page table entry to kernel virtual address
+ * Routines to filter correct bits depending on the pmap type
  */
-#define ptetokv(a)     (phystokv(pte_to_pa(a)))
 
-#ifndef        ASSEMBLER
-typedef        volatile long   cpu_set;        /* set of CPUs - must be <= 32 */
-                                       /* changed by other processors */
+static inline pt_entry_t
+pte_remove_ex(pt_entry_t pte, boolean_t is_ept)
+{      /*
+        * Return `pte` with execute permission removed.  The encoding
+        * depends on the table type: a regular entry gets INTEL_PTE_NX
+        * set, an EPT entry gets INTEL_EPT_EX cleared.  `pte` is passed
+        * by value; nothing is written back to a page table here.
+        */
+       if (__probable(!is_ept)) {
+               return (pte | INTEL_PTE_NX);
+       }
 
-struct pmap {
-       pt_entry_t      *dirbase;       /* page directory pointer register */
-       vm_offset_t     pdirbase;       /* phys. address of dirbase */
-       int             ref_count;      /* reference count */
-       decl_simple_lock_data(,lock)    /* lock on map */
-       struct pmap_statistics  stats;  /* map statistics */
-       cpu_set         cpus_using;     /* bitmap of cpus using pmap */
-};
+       return (pte & (~INTEL_EPT_EX));
+}
 
-/* 
- * Optimization avoiding some TLB flushes when switching to
- * kernel-loaded threads.  This is effective only for i386:
- * Since user task, kernel task and kernel loaded tasks share the
- * same virtual space (with appropriate protections), any pmap
- * allows mapping kernel and kernel loaded tasks. 
- *
- * The idea is to avoid switching to another pmap unnecessarily when
- * switching to a kernel-loaded task, or when switching to the kernel
- * itself.
- *
- * We store the pmap we are really using (from which we fetched the
- * dirbase value) in real_pmap[cpu_number()].
- *
- * Invariant:
- * current_pmap() == real_pmap[cpu_number()] || current_pmap() == kernel_pmap.
+static inline pt_entry_t
+pte_set_ex(pt_entry_t pte, boolean_t is_ept)
+{      /*
+        * Return `pte` with execute permission granted; the inverse of
+        * pte_remove_ex().  A regular entry gets INTEL_PTE_NX cleared,
+        * an EPT entry gets INTEL_EPT_EX set.  `pte` is passed by value;
+        * nothing is written back to a page table here.
+        */
+       if (__probable(!is_ept)) {
+               return (pte & (~INTEL_PTE_NX));
+       }
+
+       return (pte | INTEL_EPT_EX);
+}
+
+static inline pt_entry_t
+physmap_refmod_to_ept(pt_entry_t physmap_pte)
+{      /*
+        * Translate the referenced/modified bits of a regular entry
+        * (INTEL_PTE_REF / INTEL_PTE_MOD) into their EPT positions
+        * (INTEL_EPT_REF / INTEL_EPT_MOD).  The result starts from zero,
+        * so every other bit of `physmap_pte` is dropped.
+        */
+       pt_entry_t ept_pte = 0;
+
+       if (physmap_pte & INTEL_PTE_MOD) {
+               ept_pte |= INTEL_EPT_MOD;
+       }
+
+       if (physmap_pte & INTEL_PTE_REF) {
+               ept_pte |= INTEL_EPT_REF;
+       }
+
+       return ept_pte;
+}
+
+static inline pt_entry_t
+ept_refmod_to_physmap(pt_entry_t ept_pte)
+{      /*
+        * Translate EPT referenced/modified bits (INTEL_EPT_REF /
+        * INTEL_EPT_MOD) into the regular entry positions
+        * (INTEL_PTE_REF / INTEL_PTE_MOD).  Unlike
+        * physmap_refmod_to_ept(), the input must already be reduced
+        * to those two bits; the assert below enforces that.
+        */
+       pt_entry_t physmap_pte = 0;
+
+       assert((ept_pte & ~(INTEL_EPT_REF | INTEL_EPT_MOD)) == 0); /* only REF/MOD may be set */
+
+       if (ept_pte & INTEL_EPT_REF) {
+               physmap_pte |= INTEL_PTE_REF;
+       }
+
+       if (ept_pte & INTEL_EPT_MOD) {
+               physmap_pte |= INTEL_PTE_MOD;
+       }
+
+       return physmap_pte;
+}
+
+/*
+ * Note: Not all Intel processors support EPT referenced access and dirty bits.
+ *      During pmap_init() we check the VMX capability for the current hardware
+ *      and update this variable accordingly.
  */
+extern boolean_t pmap_ept_support_ad;
+
+#define PTE_VALID_MASK(is_ept) ((is_ept) ? (INTEL_EPT_READ | INTEL_EPT_WRITE | INTEL_EPT_EX) : INTEL_PTE_VALID)
+#define PTE_READ(is_ept)       ((is_ept) ? INTEL_EPT_READ : INTEL_PTE_VALID)
+#define PTE_WRITE(is_ept)      ((is_ept) ? INTEL_EPT_WRITE : INTEL_PTE_WRITE)
+#define PTE_PS                 INTEL_PTE_PS
+#define PTE_COMPRESSED         INTEL_COMPRESSED
+#define PTE_NCACHE(is_ept)     ((is_ept) ? INTEL_EPT_NCACHE : INTEL_PTE_NCACHE)
+#define PTE_WTHRU(is_ept)      ((is_ept) ? INTEL_EPT_WTHRU : INTEL_PTE_WTHRU)
+#define PTE_REF(is_ept)        ((is_ept) ? INTEL_EPT_REF : INTEL_PTE_REF)
+#define PTE_MOD(is_ept)        ((is_ept) ? INTEL_EPT_MOD : INTEL_PTE_MOD)
+#define PTE_WIRED              INTEL_PTE_WIRED
+
+
+#define PMAP_DEFAULT_CACHE     0
+#define PMAP_INHIBIT_CACHE     1
+#define PMAP_GUARDED_CACHE     2
+#define PMAP_ACTIVATE_CACHE    4
+#define PMAP_NO_GUARD_CACHE    8
 
-extern struct pmap     *real_pmap[NCPUS];
+#ifndef        ASSEMBLER
+
+#include <sys/queue.h>
 
-#include <i386/proc_reg.h>
 /*
- * If switching to the kernel pmap, don't incur the TLB cost of switching
- * to its page tables, since all maps include the kernel map as a subset.
- * Simply record that this CPU is logically on the kernel pmap (see
- * pmap_destroy).
- * 
- * Similarly, if switching to a pmap (other than kernel_pmap that is already
- * in use, don't do anything to the hardware, to avoid a TLB flush.
+ * Address of current and alternate address space page table maps
+ * and directories.
  */
 
-#if    NCPUS > 1
-#define        PMAP_CPU_SET(pmap, my_cpu) i_bit_set(my_cpu, &((pmap)->cpus_using))
-#define        PMAP_CPU_CLR(pmap, my_cpu) i_bit_clear(my_cpu, &((pmap)->cpus_using))
-#else  /* NCPUS > 1 */
-#define        PMAP_CPU_SET(pmap,my_cpu)    (pmap)->cpus_using = TRUE  
-#define        PMAP_CPU_CLR(pmap,my_cpu)    (pmap)->cpus_using = FALSE
-#endif /* NCPUS > 1 */
+extern pt_entry_t      *PTmap;
+extern pdpt_entry_t    *IdlePDPT;
+extern pml4_entry_t    *IdlePML4;
+extern boolean_t       no_shared_cr3;
+extern addr64_t                kernel64_cr3;
+extern pd_entry_t      *IdlePTD;       /* physical addr of "Idle" state PTD */
 
+extern uint64_t                pmap_pv_hashlist_walks;
+extern uint64_t                pmap_pv_hashlist_cnts;
+extern uint32_t                pmap_pv_hashlist_max;
+extern uint32_t                pmap_kernel_text_ps;
 
-#define        set_dirbase(mypmap, my_cpu) {                                   \
-       struct pmap     **ppmap = &real_pmap[my_cpu];                   \
-       vm_offset_t     pdirbase = (mypmap)->pdirbase;                  \
-                                                                       \
-       if (*ppmap == (vm_offset_t)NULL) {                              \
-               *ppmap = (mypmap);                                      \
-               PMAP_CPU_SET((mypmap), my_cpu);                         \
-               set_cr3(pdirbase);                                      \
-       } else if ((mypmap) != kernel_pmap && (mypmap) != *ppmap ) {    \
-               if (*ppmap != kernel_pmap)                              \
-                       PMAP_CPU_CLR(*ppmap, my_cpu);                   \
-               *ppmap = (mypmap);                                      \
-               PMAP_CPU_SET((mypmap), my_cpu);                         \
-               set_cr3(pdirbase);                                      \
-       }                                                               \
-       assert((mypmap) == *ppmap || (mypmap) == kernel_pmap);          \
+
+
+#ifdef __x86_64__
+#define ID_MAP_VTOP(x) ((void *)(((uint64_t)(x)) & LOW_4GB_MASK))
+
+extern uint64_t physmap_base, physmap_max;
+
+#define NPHYSMAP (MAX(K64_MAXMEM/GB + 4, 4))
+
+static inline boolean_t physmap_enclosed(addr64_t a) { /* TRUE when `a` is below NPHYSMAP * GB */
+       return (a < (NPHYSMAP * GB));
 }
 
-#if    NCPUS > 1
+static inline void * PHYSMAP_PTOV_check(void *paddr) { /*
+        * Backing routine for PHYSMAP_PTOV(): adds physmap_base to the
+        * address carried in `paddr` and returns the sum as a pointer.
+        * Panics when the sum is at or beyond physmap_max.
+        */
+       uint64_t pvaddr = (uint64_t)paddr + physmap_base;
+
+       if (__improbable(pvaddr >= physmap_max))
+               panic("PHYSMAP_PTOV bounds exceeded, 0x%qx, 0x%qx, 0x%qx",
+                     pvaddr, physmap_base, physmap_max);
+
+       return (void *)pvaddr;
+}
+
+#define PHYSMAP_PTOV(x)        (PHYSMAP_PTOV_check((void*) (x)))
+
 /*
- *     List of cpus that are actively using mapped memory.  Any
- *     pmap update operation must wait for all cpus in this list.
- *     Update operations must still be queued to cpus not in this
- *     list.
+ * For KASLR, we alias the master processor's IDT and GDT at fixed
+ * virtual addresses to defeat SIDT/SGDT address leakage.
+ * Non-boot processors' GDTs are aliased likewise (skipping LOWGLOBAL_ALIAS).
+ * The low global vector page is mapped at a fixed alias also.
  */
-extern cpu_set         cpus_active;
+#define MASTER_IDT_ALIAS       (VM_MIN_KERNEL_ADDRESS + 0x0000)
+#define MASTER_GDT_ALIAS       (VM_MIN_KERNEL_ADDRESS + 0x1000)
+#define LOWGLOBAL_ALIAS                (VM_MIN_KERNEL_ADDRESS + 0x2000)
+#define CPU_GDT_ALIAS(_cpu)    (LOWGLOBAL_ALIAS + (0x1000*(_cpu)))
+
+#endif /*__x86_64__ */
+
+#include <vm/vm_page.h>
 
 /*
- *     List of cpus that are idle, but still operating, and will want
- *     to see any kernel pmap updates when they become active.
+ *     For each vm_page_t, there is a list of all currently
+ *     valid virtual mappings of that page.  An entry is
+ *     a pv_entry_t; the list is the pv_table.
  */
-extern cpu_set         cpus_idle;
 
+struct pmap {  /* machine-dependent physical map (pmap) state */
+       decl_simple_lock_data(,lock)    /* lock on map */
+       pmap_paddr_t    pm_cr3;         /* physical addr; compared with and
+                                        * loaded into CR3 by set_dirbase();
+                                        * is_ept_pmap() treats 0 as "EPT pmap" */
+       pmap_paddr_t    pm_eptp;        /* EPTP; is_ept_pmap() asserts it is
+                                        * nonzero exactly when pm_cr3 is 0 */
+       boolean_t       pm_shared;
+        pd_entry_t      *dirbase;        /* page directory pointer */
+        vm_object_t     pm_obj;         /* object to hold pde's */
+        task_map_t      pm_task_map;   /* copied to the per-cpu cpu_task_map by set_dirbase() */
+        pdpt_entry_t    *pm_pdpt;       /* KVA of 3rd level page */
+       pml4_entry_t    *pm_pml4;       /* KVA of top level */
+       vm_object_t     pm_obj_pdpt;    /* holds pdpt pages */
+       vm_object_t     pm_obj_pml4;    /* holds pml4 pages */
+#define        PMAP_PCID_MAX_CPUS      MAX_CPUS        /* Must be a multiple of 8 */
+       pcid_t          pmap_pcid_cpus[PMAP_PCID_MAX_CPUS];
+       volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS];
+       struct pmap_statistics  stats;  /* map statistics */
+       int             ref_count;      /* reference count */
+        int            nx_enabled;     /* NOTE(review): presumably cleared by pmap_disable_NX() -- confirm */
+       ledger_t        ledger;         /* ledger tracking phys mappings */
+};
+
+static inline boolean_t
+is_ept_pmap(pmap_t p)
+{      /*
+        * Classify `p` by which root pointer is populated: a nonzero
+        * pm_cr3 yields FALSE (regular pmap), a zero pm_cr3 yields TRUE
+        * (EPT pmap).  The asserts check that exactly one of pm_cr3 and
+        * pm_eptp is nonzero.
+        */
+       if (__probable(p->pm_cr3 != 0)) {
+               assert(p->pm_eptp == 0);
+               return FALSE;
+       }
+
+       assert(p->pm_eptp != 0);
+
+       return TRUE;
+}
+
+void hv_ept_pmap_create(void **ept_pmap, void **eptp);
+
+#if NCOPY_WINDOWS > 0
+#define PMAP_PDPT_FIRST_WINDOW 0
+#define PMAP_PDPT_NWINDOWS 4
+#define PMAP_PDE_FIRST_WINDOW (PMAP_PDPT_NWINDOWS)
+#define PMAP_PDE_NWINDOWS 4
+#define PMAP_PTE_FIRST_WINDOW (PMAP_PDE_FIRST_WINDOW + PMAP_PDE_NWINDOWS)
+#define PMAP_PTE_NWINDOWS 4
+
+#define PMAP_NWINDOWS_FIRSTFREE (PMAP_PTE_FIRST_WINDOW + PMAP_PTE_NWINDOWS)
+#define PMAP_WINDOW_SIZE 8
+#define PMAP_NWINDOWS (PMAP_NWINDOWS_FIRSTFREE + PMAP_WINDOW_SIZE)
+
+typedef struct {
+       pt_entry_t      *prv_CMAP;
+       caddr_t         prv_CADDR;
+} mapwindow_t;
+
+typedef struct cpu_pmap {
+        int                     pdpt_window_index;
+        int                     pde_window_index;
+        int                     pte_window_index;
+       mapwindow_t             mapwindow[PMAP_NWINDOWS];
+} cpu_pmap_t;
+
+
+extern mapwindow_t *pmap_get_mapwindow(pt_entry_t pentry);
+extern void         pmap_put_mapwindow(mapwindow_t *map);
+#endif
+
+typedef struct pmap_memory_regions {
+       ppnum_t base;           /* first page of this region */
+       ppnum_t alloc_up;       /* pages below this one have been "stolen" */
+       ppnum_t alloc_down;     /* pages above this one have been "stolen" */
+       ppnum_t end;            /* last page of this region */
+       uint32_t type;
+       uint64_t attribute;
+} pmap_memory_region_t;
+
+extern unsigned pmap_memory_region_count;
+extern unsigned pmap_memory_region_current;
+
+#define PMAP_MEMORY_REGIONS_SIZE 128
+
+extern pmap_memory_region_t pmap_memory_regions[];
+#include <i386/pmap_pcid.h>
+
+static inline void
+set_dirbase(pmap_t tpmap, __unused thread_t thread, int my_cpu) { /*
+        * Make `tpmap` the active pmap on cpu `my_cpu`.
+        *
+        * First records tpmap's pm_cr3 and pm_task_map in that cpu's
+        * per-cpu data (cpu_task_cr3 / cpu_task_map), then switches the
+        * hardware CR3 only if it differs from the wanted value.
+        * `thread` is unused.
+        */
+       int ccpu = my_cpu;
+       cpu_datap(ccpu)->cpu_task_cr3 = tpmap->pm_cr3;
+       cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map;
+       /*
+        * Switch cr3 if necessary
+        * - unless running with no_shared_cr3 debugging mode
+        *   and we're not on the kernel's cr3 (after pre-empted copyio)
+        *
+        * Normal case (!no_shared_cr3): if the current CR3 base is not
+        * tpmap->pm_cr3, go through pmap_pcid_activate() when
+        * pmap_pcid_ncpus is nonzero, otherwise load pm_cr3 directly.
+        * no_shared_cr3 case: tpmap's cr3 is not loaded here; instead
+        * make sure this cpu is on its cpu_kernel_cr3.
+        */
+       if (__probable(!no_shared_cr3)) {
+               if (get_cr3_base() != tpmap->pm_cr3) {
+                       if (pmap_pcid_ncpus) {
+                               pmap_pcid_activate(tpmap, ccpu);
+                       }
+                       else
+                               set_cr3_raw(tpmap->pm_cr3);
+               }
+       } else {
+               if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3)
+                       set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3);
+       }
+}
 
 /*
  *     External declarations for PMAP_ACTIVATE.
  */
 
-extern void            process_pmap_updates(struct pmap *pmap);
+extern void            process_pmap_updates(void);
 extern void            pmap_update_interrupt(void);
 
-#endif /* NCPUS > 1 */
-
 /*
  *     Machine dependent routines that are used only for i386/i486/i860.
  */
-extern vm_offset_t     (phystokv)(
-                               vm_offset_t     pa);
 
-extern vm_offset_t     (kvtophys)(
+extern addr64_t                (kvtophys)(
                                vm_offset_t     addr);
 
+extern kern_return_t   pmap_expand(
+                               pmap_t          pmap,
+                               vm_map_offset_t addr,
+                               unsigned int options);
+#if    !defined(__x86_64__)
 extern pt_entry_t      *pmap_pte(
                                struct pmap     *pmap,
-                               vm_offset_t     addr);
+                               vm_map_offset_t addr);
 
+extern pd_entry_t      *pmap_pde(
+                               struct pmap     *pmap,
+                               vm_map_offset_t addr);
+
+extern pd_entry_t      *pmap64_pde(
+                               struct pmap     *pmap,
+                               vm_map_offset_t addr);
+
+extern pdpt_entry_t    *pmap64_pdpt(
+                               struct pmap     *pmap,
+                               vm_map_offset_t addr);
+#endif
 extern vm_offset_t     pmap_map(
                                vm_offset_t     virt,
-                               vm_offset_t     start,
-                               vm_offset_t     end,
-                               vm_prot_t       prot);
+                               vm_map_offset_t start,
+                               vm_map_offset_t end,
+                               vm_prot_t       prot,
+                               unsigned int    flags);
 
 extern vm_offset_t     pmap_map_bd(
                                vm_offset_t     virt,
-                               vm_offset_t     start,
-                               vm_offset_t     end,
-                               vm_prot_t       prot);
+                               vm_map_offset_t start,
+                               vm_map_offset_t end,
+                               vm_prot_t       prot,
+                               unsigned int    flags);
 
 extern void            pmap_bootstrap(
-                               vm_offset_t     load_start);
+                               vm_offset_t     load_start,
+                               boolean_t       IA32e);
 
 extern boolean_t       pmap_valid_page(
-                               vm_offset_t     pa);
+                               ppnum_t pn);
 
 extern int             pmap_list_resident_pages(
                                struct pmap     *pmap,
                                vm_offset_t     *listp,
                                int             space);
-
-extern void            flush_tlb(void);
+extern void            x86_filter_TLB_coherency_interrupts(boolean_t);
+/*
+ * Get cache attributes (as pagetable bits) for the specified phys page
+ */
+extern unsigned        pmap_get_cache_attributes(ppnum_t, boolean_t is_ept);
+#if NCOPY_WINDOWS > 0
+extern struct cpu_pmap *pmap_cpu_alloc(
+                               boolean_t       is_boot_cpu);
+extern void            pmap_cpu_free(
+                               struct cpu_pmap *cp);
+#endif
+
+extern void            pmap_map_block(
+                               pmap_t pmap, 
+                               addr64_t va,
+                               ppnum_t pa,
+                               uint32_t size,
+                               vm_prot_t prot,
+                               int attr,
+                               unsigned int flags);
+                               
 extern void invalidate_icache(vm_offset_t addr, unsigned cnt, int phys);
 extern void flush_dcache(vm_offset_t addr, unsigned count, int phys);
+extern ppnum_t          pmap_find_phys(pmap_t map, addr64_t va);
 
+extern void pmap_cpu_init(void);
+extern void pmap_disable_NX(pmap_t pmap);
+
+extern void pt_fake_zone_init(int);
+extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, 
+                             uint64_t *, int *, int *, int *);
+extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1,2));
 
 /*
  *     Macros for speed.
  */
 
-#if    NCPUS > 1
 
 #include <kern/spl.h>
 
-/*
- *     For multiple CPUS, PMAP_ACTIVATE and PMAP_DEACTIVATE must manage
- *     fields to control TLB invalidation on other CPUS.
- */
-
-#define        PMAP_ACTIVATE_KERNEL(my_cpu)    {                               \
-                                                                       \
-       /*                                                              \
-        *      Let pmap updates proceed while we wait for this pmap.   \
-        */                                                             \
-       i_bit_clear((my_cpu), &cpus_active);                            \
-                                                                       \
-       /*                                                              \
-        *      Lock the pmap to put this cpu in its active set.        \
-        *      Wait for updates here.                                  \
-        */                                                             \
-       simple_lock(&kernel_pmap->lock);                                \
-                                                                       \
-       /*                                                              \
-        *      Mark that this cpu is using the pmap.                   \
-        */                                                             \
-       i_bit_set((my_cpu), &kernel_pmap->cpus_using);                  \
-                                                                       \
-       /*                                                              \
-        *      Mark this cpu active - IPL will be lowered by           \
-        *      load_context().                                         \
-        */                                                             \
-       i_bit_set((my_cpu), &cpus_active);                              \
-                                                                       \
-       simple_unlock(&kernel_pmap->lock);                              \
+                                 
+#define PMAP_ACTIVATE_MAP(map, thread, my_cpu) {                       \
+       /* Hand the pmap behind 'map' to set_dirbase() for this cpu. */ \
+       pmap_t  map_pmap = vm_map_pmap(map);                            \
+                                                                       \
+       set_dirbase(map_pmap, thread, my_cpu);                          \
 }
 
-#define        PMAP_DEACTIVATE_KERNEL(my_cpu)  {                               \
-       /*                                                              \
-        *      Mark pmap no longer in use by this cpu even if          \
-        *      pmap is locked against updates.                         \
-        */                                                             \
-       i_bit_clear((my_cpu), &kernel_pmap->cpus_using);                \
-}
+/*
+ * PMAP_DEACTIVATE_MAP performs only a pmap_assert() check.  On x86_64,
+ * when pmap_pcid_ncpus is non-zero, it asserts that the PCID returned by
+ * pcid_for_pmap_cpu_tuple() for (map's pmap, ccpu) equals the low 12 bits
+ * of the raw CR3; the 'thread' argument is not referenced.  On other
+ * builds the macro takes only (map, thread) and expands to nothing.
+ * The x86_64 expansion already ends in a semicolon.
+ */
+#if   defined(__x86_64__)
+#define PMAP_DEACTIVATE_MAP(map, thread, ccpu)                         \
+       pmap_assert(pmap_pcid_ncpus ? (pcid_for_pmap_cpu_tuple((map)->pmap, ccpu) == (get_cr3_raw() & 0xFFF)) : TRUE);
+#else
+#define PMAP_DEACTIVATE_MAP(map, thread)
+#endif
 
-#define PMAP_ACTIVATE_MAP(map, my_cpu) {                               \
-       register struct pmap    *tpmap;                                 \
-                                                                       \
-       tpmap = vm_map_pmap(map);                                       \
-       if (tpmap == kernel_pmap) {                                     \
-           /*                                                          \
-            *  If this is the kernel pmap, switch to its page tables.  \
-            */                                                         \
-           set_dirbase(kernel_pmap, my_cpu);                           \
-       }                                                               \
-       else {                                                          \
-           /*                                                          \
-            *  Let pmap updates proceed while we wait for this pmap.   \
-            */                                                         \
-           i_bit_clear((my_cpu), &cpus_active);                        \
-                                                                       \
-           /*                                                          \
-            *  Lock the pmap to put this cpu in its active set.        \
-            *  Wait for updates here.                                  \
-            */                                                         \
-           simple_lock(&tpmap->lock);                                  \
-                                                                       \
-           /*                                                          \
-            *  No need to invalidate the TLB - the entire user pmap    \
-            *  will be invalidated by reloading dirbase.               \
-            */                                                         \
-           set_dirbase(tpmap, my_cpu);                                 \
-                                                                       \
-           /*                                                          \
-            *  Mark this cpu active - IPL will be lowered by           \
-            *  load_context().                                         \
-            */                                                         \
-           i_bit_set((my_cpu), &cpus_active);                          \
-                                                                       \
-           simple_unlock(&tpmap->lock);                                \
+/*
+ * Switch address-space context from old_th to new_th on my_cpu.
+ * Interrupts must already be disabled (checked with pmap_assert).
+ * When both threads share the same map nothing is done; otherwise the
+ * old thread's map is deactivated and the new thread's map activated.
+ * The thread arguments are evaluated more than once.
+ */
+#define        PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) {                   \
+                                                                        \
+       pmap_assert(ml_get_interrupts_enabled() == FALSE);              \
+       if ((old_th)->map != (new_th)->map) {                           \
+               PMAP_DEACTIVATE_MAP((old_th)->map, (old_th), my_cpu);   \
+               PMAP_ACTIVATE_MAP((new_th)->map, (new_th), my_cpu);     \
         }                                                               \
 }
 
-#define PMAP_DEACTIVATE_MAP(map, my_cpu)
-
-#define PMAP_ACTIVATE_USER(th, my_cpu) {                               \
-       spl_t           spl;                                            \
-                                                                       \
-       spl = splhigh();                                                        \
-       PMAP_ACTIVATE_MAP(th->map, my_cpu)                              \
-       splx(spl);                                                      \
-}
-
-#define PMAP_DEACTIVATE_USER(th, my_cpu)       {                       \
+#if NCOPY_WINDOWS > 0
+/*
+ * Copy-window variant (NCOPY_WINDOWS > 0): at splhigh, deactivate the
+ * thread's current map, store new_map in th->map and activate it on
+ * my_cpu, then restore the previous level and call inval_copy_windows(th).
+ * PMAP_DEACTIVATE_MAP is invoked with two arguments here, matching its
+ * non-x86_64 definition; PMAP_ACTIVATE_MAP always takes three.
+ */
+#define        PMAP_SWITCH_USER(th, new_map, my_cpu) {                         \
         spl_t           spl;                                            \
                                                                         \
-       spl = splhigh();                                                        \
-       PMAP_DEACTIVATE_MAP(th->map, my_cpu)                            \
+       spl = splhigh();                                                \
+       PMAP_DEACTIVATE_MAP(th->map, th);                               \
+       th->map = new_map;                                              \
+       PMAP_ACTIVATE_MAP(th->map, th, my_cpu);                         \
         splx(spl);                                                      \
+       inval_copy_windows(th);                                         \
 }
-
-#define        PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) {                   \
-       spl_t           spl;                                            \
-                                                                       \
-       if (old_th->map != new_th->map) {                               \
-               spl = splhigh();                                                \
-               PMAP_DEACTIVATE_MAP(old_th->map, my_cpu);               \
-               PMAP_ACTIVATE_MAP(new_th->map, my_cpu);                 \
-               splx(spl);                                              \
-       }                                                               \
-}
-
+#else
 #define        PMAP_SWITCH_USER(th, new_map, my_cpu) {                         \
        spl_t           spl;                                            \
                                                                        \
-       spl = splhigh();                                                        \
-       PMAP_DEACTIVATE_MAP(th->map, my_cpu);                           \
+       spl = splhigh();                                                \
+       PMAP_DEACTIVATE_MAP(th->map, th, my_cpu);                               \
        th->map = new_map;                                              \
-       PMAP_ACTIVATE_MAP(th->map, my_cpu);                             \
+       PMAP_ACTIVATE_MAP(th->map, th, my_cpu);                         \
        splx(spl);                                                      \
 }
+#endif
 
-#if    MP_V1_1
-#define        set_led(cpu)
-#define clear_led(cpu)
-#endif /* MP_V1_1  */
+/*
+ * Marking the current cpu's cr3 inactive is achieved by setting its lsb.
+ * Marking the current cpu's cr3 active once more involves clearing this bit.
+ * Note that valid page tables are page-aligned and so the bottom 12 bits
+ * are normally zero, modulo PCID.
+ * We can only mark the current cpu active/inactive but we can test any cpu.
+ *
+ * CPU_CR3_IS_ACTIVE(cpu) is true when that cpu's lsb is clear,
+ * CPU_GET_ACTIVE_CR3(cpu) yields cpu_active_cr3 with the lsb masked off,
+ * and CPU_GET_TASK_CR3(cpu) yields cpu_task_cr3 unmodified.
+ */
+#define CPU_CR3_MARK_INACTIVE()                                                \
+       (current_cpu_datap()->cpu_active_cr3 |= 1)
+
+#define CPU_CR3_MARK_ACTIVE()                                          \
+       (current_cpu_datap()->cpu_active_cr3 &= ~1)
+
+#define CPU_CR3_IS_ACTIVE(cpu)                                         \
+       ((cpu_datap(cpu)->cpu_active_cr3 & 1) == 0)
+
+#define CPU_GET_ACTIVE_CR3(cpu)                                                \
+       (cpu_datap(cpu)->cpu_active_cr3 & ~1)
+
+#define CPU_GET_TASK_CR3(cpu)                                          \
+       (cpu_datap(cpu)->cpu_task_cr3)
 
+/*
+ *     Mark this cpu idle, and remove it from the active set,
+ *     since it is not actively using any pmap.  Signal_cpus
+ *     will notice that it is idle, and avoid signaling it,
+ *     but will queue the update request for when the cpu
+ *     becomes active.
+ *
+ *     Interrupts must already be disabled (asserted).  The cpu
+ *     is marked by CPU_CR3_MARK_INACTIVE(), followed by an
+ *     mfence(); the my_cpu argument is not referenced by the
+ *     new body.
+ */
 #define MARK_CPU_IDLE(my_cpu)  {                                       \
-       /*                                                              \
-        *      Mark this cpu idle, and remove it from the active set,  \
-        *      since it is not actively using any pmap.  Signal_cpus   \
-        *      will notice that it is idle, and avoid signaling it,    \
-        *      but will queue the update request for when the cpu      \
-        *      becomes active.                                         \
-        */                                                             \
-       int     s = splhigh();                                          \
-       i_bit_set((my_cpu), &cpus_idle);                                \
-       i_bit_clear((my_cpu), &cpus_active);                            \
-       splx(s);                                                        \
-       set_led(my_cpu);                                                \
+       assert(ml_get_interrupts_enabled() == FALSE);                   \
+       CPU_CR3_MARK_INACTIVE();                                        \
+       mfence();                                                                       \
 }
 
-#define MARK_CPU_ACTIVE(my_cpu)        {                                       \
-                                                                       \
-       int     s = splhigh();                                          \
+#define MARK_CPU_ACTIVE(my_cpu) {                                      \
+       assert(ml_get_interrupts_enabled() == FALSE);                   \
        /*                                                              \
         *      If a kernel_pmap update was requested while this cpu    \
         *      was idle, process it as if we got the interrupt.        \
@@ -450,68 +788,43 @@ extern void flush_dcache(vm_offset_t addr, unsigned count, int phys);
         *      set assures that we will receive another update         \
         *      interrupt if this happens.                              \
         */                                                             \
-       i_bit_clear((my_cpu), &cpus_idle);                              \
+       CPU_CR3_MARK_ACTIVE();                                          \
+       mfence();                                                                       \
                                                                        \
-       /*                                                              \
-        *      Mark that this cpu is now active.                       \
-        */                                                             \
-       i_bit_set((my_cpu), &cpus_active);                              \
-       splx(s);                                                        \
-       clear_led(my_cpu);                                              \
-}
-
-#else  /* NCPUS > 1 */
-
-/*
- *     With only one CPU, we just have to indicate whether the pmap is
- *     in use.
- */
-
-#define        PMAP_ACTIVATE_KERNEL(my_cpu)    {                               \
-       kernel_pmap->cpus_using = TRUE;                                 \
-}
-
-#define        PMAP_DEACTIVATE_KERNEL(my_cpu)  {                               \
-       kernel_pmap->cpus_using = FALSE;                                \
-}
-
-#define        PMAP_ACTIVATE_MAP(map, my_cpu)                                  \
-       set_dirbase(vm_map_pmap(map), my_cpu)
-
-#define PMAP_DEACTIVATE_MAP(map, my_cpu)
-
-#define PMAP_ACTIVATE_USER(th, my_cpu)                                 \
-       PMAP_ACTIVATE_MAP(th->map, my_cpu)
-
-#define PMAP_DEACTIVATE_USER(th, my_cpu)                               \
-       PMAP_DEACTIVATE_MAP(th->map, my_cpu)
-
-#define        PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) {                   \
-       if (old_th->map != new_th->map) {                               \
-               PMAP_DEACTIVATE_MAP(old_th->map, my_cpu);               \
-               PMAP_ACTIVATE_MAP(new_th->map, my_cpu);                 \
-       }                                                               \
-}
-
-#define        PMAP_SWITCH_USER(th, new_map, my_cpu) {                         \
-       PMAP_DEACTIVATE_MAP(th->map, my_cpu);                           \
-       th->map = new_map;                                              \
-       PMAP_ACTIVATE_MAP(th->map, my_cpu);                             \
+       if (current_cpu_datap()->cpu_tlb_invalid)                       \
+           process_pmap_updates();                                     \
 }
 
-#endif /* NCPUS > 1 */
-
 #define PMAP_CONTEXT(pmap, thread)
 
 #define pmap_kernel_va(VA)     \
-       (((VA) >= VM_MIN_KERNEL_ADDRESS) && ((VA) <= VM_MAX_KERNEL_ADDRESS))
+       ((((vm_offset_t) (VA)) >= vm_min_kernel_address) &&     \
+        (((vm_offset_t) (VA)) <= vm_max_kernel_address))
+
 
+#define pmap_compressed(pmap)          ((pmap)->stats.compressed)
 #define pmap_resident_count(pmap)      ((pmap)->stats.resident_count)
-#define pmap_phys_address(frame)       ((vm_offset_t) (intel_ptob(frame)))
-#define pmap_phys_to_frame(phys)       ((int) (intel_btop(phys)))
+#define pmap_resident_max(pmap)                ((pmap)->stats.resident_max)
 #define        pmap_copy(dst_pmap,src_pmap,dst_addr,len,src_addr)
 #define        pmap_attribute(pmap,addr,size,attr,value) \
                                        (KERN_INVALID_ADDRESS)
+#define        pmap_attribute_cache_sync(addr,size,attr,value) \
+                                       (KERN_INVALID_ADDRESS)
+
+#define MACHINE_PMAP_IS_EMPTY  1
+extern boolean_t pmap_is_empty(pmap_t          pmap,
+                              vm_map_offset_t  start,
+                              vm_map_offset_t  end);
+
+#define MACHINE_BOOTSTRAPPTD   1       /* Static bootstrap page-tables */
+
+kern_return_t
+pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t);
+
 #endif /* ASSEMBLER */
 
+
 #endif /* _PMAP_MACHINE_ */
+
+
+#endif  /* KERNEL_PRIVATE */