osfmk/i386/pmap.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58
  59 /*
  60  *      File:   pmap.c
  61  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  62  *      (These guys wrote the Vax version)
  63  *
  64  *      Physical Map management code for Intel i386, i486, and i860.
  65  *
  66  *      Manages physical address maps.
  67  *
  68  *      In addition to hardware address maps, this
  69  *      module is called upon to provide software-use-only
  70  *      maps which may or may not be stored in the same
  71  *      form as hardware maps.  These pseudo-maps are
  72  *      used to store intermediate results from copy
  73  *      operations to and from address spaces.
  74  *
  75  *      Since the information managed by this module is
  76  *      also stored by the logical address mapping module,
  77  *      this module may throw away valid virtual-to-physical
  78  *      mappings at almost any time.  However, invalidations
  79  *      of virtual-to-physical mappings must be done as
  80  *      requested.
  81  *
  82  *      In order to cope with hardware architectures which
  83  *      make virtual-to-physical map invalidates expensive,
  84  *      this module may delay invalidate or reduced protection
  85  *      operations until such time as they are actually
  86  *      necessary.  This module is given full information as
  87  *      to which processors are currently using which maps,
  88  *      and to when physical maps must be made correct.
  89  */
  90
  91 #include <string.h>
  92 #include <norma_vm.h>
  93 #include <mach_kdb.h>
  94 #include <mach_ldebug.h>
  95
  96 #include <libkern/OSAtomic.h>
  97
  98 #include <mach/machine/vm_types.h>
  99
 100 #include <mach/boolean.h>
 101 #include <kern/thread.h>
 102 #include <kern/zalloc.h>
 103 #include <kern/queue.h>
 104
 105 #include <kern/lock.h>
 106 #include <kern/kalloc.h>
 107 #include <kern/spl.h>
 108
 109 #include <vm/pmap.h>
 110 #include <vm/vm_map.h>
 111 #include <vm/vm_kern.h>
 112 #include <mach/vm_param.h>
 113 #include <mach/vm_prot.h>
 114 #include <vm/vm_object.h>
 115 #include <vm/vm_page.h>
 116
 117 #include <mach/machine/vm_param.h>
 118 #include <machine/thread.h>
 119
 120 #include <kern/misc_protos.h>                   /* prototyping */
 121 #include <i386/misc_protos.h>
 122
 123 #include <i386/cpuid.h>
 124 #include <i386/cpu_data.h>
 125 #include <i386/cpu_number.h>
 126 #include <i386/machine_cpu.h>
 127 #include <i386/mp_slave_boot.h>
 128 #include <i386/seg.h>
 129 #include <i386/serial_io.h>
 130 #include <i386/cpu_capabilities.h>
 131 #include <i386/machine_routines.h>
 132 #include <i386/proc_reg.h>
 133 #include <i386/tsc.h>
 134
 135 #if     MACH_KDB
 136 #include <ddb/db_command.h>
 137 #include <ddb/db_output.h>
 138 #include <ddb/db_sym.h>
 139 #include <ddb/db_print.h>
 140 #endif  /* MACH_KDB */
 141
 142 #include <vm/vm_protos.h>
 143
 144 #include <i386/mp.h>
 145 #include <i386/mp_desc.h>
 146
 147 #include <sys/kdebug.h>
 148
 149 /* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
 150 #ifdef DEBUGINTERRUPTS
 151 #define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
 152 #else
 153 #define pmap_intr_assert()
 154 #endif
 155
 156 #ifdef IWANTTODEBUG
 157 #undef  DEBUG
 158 #define DEBUG 1
 159 #define POSTCODE_DELAY 1
 160 #include <i386/postcode.h>
 161 #endif /* IWANTTODEBUG */
 162
 163 //#define PMAP_TRACES 1
 164 #ifdef  PMAP_TRACES
 165 boolean_t       pmap_trace = FALSE;
 166 #define PMAP_TRACE(x,a,b,c,d,e)                                         \
 167         if (pmap_trace) {                                               \
 168                 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);                     \
 169         }
 170 #else
 171 #define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e)
 172 #endif /* PMAP_TRACES */
 173
 174 /*
 175  * Forward declarations for internal functions.
 176  */
 177 void            pmap_expand_pml4(
 178                         pmap_t          map,
 179                         vm_map_offset_t v);
 180
 181 void            pmap_expand_pdpt(
 182                         pmap_t          map,
 183                         vm_map_offset_t v);
 184
 185 void    pmap_remove_range(
 186                         pmap_t          pmap,
 187                         vm_map_offset_t va,
 188                         pt_entry_t      *spte,
 189                         pt_entry_t      *epte);
 190
 191 void            phys_attribute_clear(
 192                         ppnum_t         phys,
 193                         int             bits);
 194
 195 int             phys_attribute_test(
 196                         ppnum_t         phys,
 197                         int             bits);
 198
 199 void            phys_attribute_set(
 200                         ppnum_t         phys,
 201                         int             bits);
 202
 203 void            pmap_set_reference(
 204                         ppnum_t pn);
 205
 206 void            pmap_movepage(
 207                         unsigned long   from,
 208                         unsigned long   to,
 209                         vm_size_t       size);
 210
 211 boolean_t       phys_page_exists(
 212                         ppnum_t pn);
 213
 214
 215 #ifdef PMAP_DEBUG
 216 void dump_pmap(pmap_t);
 217 void dump_4GB_pdpt(pmap_t p);
 218 void dump_4GB_pdpt_thread(thread_t tp);
 219 #endif
 220
 221 #define iswired(pte)    ((pte) & INTEL_PTE_WIRED)
 222
 223 int nx_enabled = 1;                     /* enable no-execute protection */
 224 #ifdef CONFIG_EMBEDDED
 225 int allow_data_exec  = 0;       /* no exec from data, embedded is hardcore like that */
 226 #else
 227 int allow_data_exec  = VM_ABI_32;       /* 32-bit apps may execute data by default, 64-bit apps may not */
 228 #endif
 229 int allow_stack_exec = 0;               /* No apps may execute from the stack by default */
 230
 231 int cpu_64bit  = 0;
 232
 233 /*
 234  * when spinning through pmap_remove
 235  * ensure that we don't spend too much
 236  * time with preemption disabled.
 237  * I'm setting the current threshold
 238  * to 20us
 239  */
 240 #define MAX_PREEMPTION_LATENCY_NS 20000
 241
 242 uint64_t max_preemption_latency_tsc = 0;
 243
 244
 245 /*
 246  *      Private data structures.
 247  */
 248
 249 /*
 250  *      For each vm_page_t, there is a list of all currently
 251  *      valid virtual mappings of that page.  An entry is
 252  *      a pv_rooted_entry_t; the list is the pv_table.
 253  *
 254  *      N.B.  with the new combo rooted/hashed scheme it is
 255  *      only possibly to remove individual non-rooted entries
 256  *      if they are found via the hashed chains as there is no
 257  *      way to unlink the singly linked hashed entries if navigated to
 258  *      via the queue list off the rooted entries.  Think of it as
 259  *      hash/walk/pull, keeping track of the prev pointer while walking
 260  *      the singly linked hash list.  All of this is to save memory and
 261  *      keep both types of pv_entries as small as possible.
 262  */
 263
 264 /*
 265
 266 PV HASHING Changes - JK 1/2007
 267
 268 Pve's establish physical to virtual mappings.  These are used for aliasing of a
 269 physical page to (potentially many) virtual addresses within pmaps. In the previous
 270 implementation the structure of the pv_entries (each 16 bytes in size) was
 271
 272 typedef struct pv_entry {
 273     struct pv_entry_t    next;
 274     pmap_t                    pmap;
 275     vm_map_offset_t   va;
 276 } *pv_entry_t;
 277
 278 An initial array of these is created at boot time, one per physical page of memory,
 279 indexed by the physical page number. Additionally, a pool of entries is created from a
 280 pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
 281 Originally, we kept this pool around because the code in pmap_enter() was unable to
 282 block if it needed an entry and none were available - we'd panic.  Some time ago I
 283 restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
 284 a pv structure and restart, removing a panic from the code (in the case of the kernel
 285 pmap we cannot block and still panic, so, we keep a separate hot pool for use only on
 286 kernel pmaps).  The pool has not been removed since there is a large performance gain
 287 keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
 288
 289 As pmap_enter() created new mappings it linked the new pve's for them off the fixed
 290 pv array for that ppn (off the next pointer).  These pve's are accessed for several
 291 operations, one of them being address space teardown.  In that case, we basically do this
 292
 293         for (every page/pte in the space) {
 294                 calc pve_ptr from the ppn in the pte
 295                 for (every pv in the list for the ppn) {
 296                         if (this pv is for this pmap/vaddr) {
 297                                 do housekeeping
 298                                 unlink/free the pv
 299                         }
 300                 }
 301         }
 302
 303 The problem arose when we were running, say 8000 (or even 2000) apache or other processes
 304 and one or all terminate. The list hanging off each pv array entry could have thousands of
 305 entries.  We were continuously linearly searching each of these lists as we stepped through
 306 the address space we were tearing down.  Because of the locks we hold, likely taking a cache
 307 miss for each node,  and interrupt disabling for MP issues the system became completely
 308 unresponsive for many seconds while we did this.
 309
 310 Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
 311 for operations like pmap_page_protect and finding and modifying/removing a single pve as
 312 part of pmap_enter processing) has led to modifying the pve structures and databases.
 313
 314 There are now two types of pve structures.  A "rooted" structure which is basically the
 315 original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a
 316 hash list via a hash of [pmap, vaddr].  These have been designed with the two goals of
 317 minimizing wired memory and making the lookup of a ppn faster.  Since a vast majority of
 318 pages in the system are not aliased and hence represented by a single pv entry I've kept
 319 the rooted entry size as small as possible because there is one of these dedicated for
 320 every physical page of memory.  The hashed pve's are larger due to the addition of the hash
 321 link and the ppn entry needed for matching while running the hash list to find the entry we
 322 are looking for.  This way, only systems that have lots of aliasing (like 2000+ httpd procs)
 323 will pay the extra memory price. Both structures have the same first three fields allowing
 324 some simplification in the code.
 325
 326 They have these shapes
 327
 328 typedef struct pv_rooted_entry {
 329         queue_head_t qlink;
 330         vm_map_offset_t va;
 331         pmap_t          pmap;
 332 } *pv_rooted_entry_t;
 333
 334
 335 typedef struct pv_hashed_entry {
 336   queue_head_t qlink;
 337   vm_map_offset_t va;
 338   pmap_t        pmap;
 339   ppnum_t ppn;
 340   struct pv_hashed_entry *nexth;
 341 } *pv_hashed_entry_t;
 342
 343 The main flow difference is that the code is now aware of the rooted entry and the hashed
 344 entries.  Code that runs the pv list still starts with the rooted entry and then continues
 345 down the qlink onto the hashed entries.  Code that is looking up a specific pv entry first
 346 checks the rooted entry and then hashes and runs the hash list for the match. The hash list
 347 lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
 348
 349 */
 350
 351 typedef struct pv_rooted_entry {     /* first three entries must match pv_hashed_entry_t */
 352         queue_head_t qlink;
 353         vm_map_offset_t va;             /* virtual address for mapping */
 354         pmap_t          pmap;           /* pmap where mapping lies */
 355 } *pv_rooted_entry_t;
 356
 357 #define PV_ROOTED_ENTRY_NULL    ((pv_rooted_entry_t) 0)
 358
 359 pv_rooted_entry_t       pv_head_table;          /* array of entries, one per page */
 360
 361 typedef struct pv_hashed_entry {     /* first three entries must match pv_rooted_entry_t */
 362   queue_head_t qlink;
 363   vm_map_offset_t va;
 364   pmap_t        pmap;
 365   ppnum_t ppn;
 366   struct pv_hashed_entry *nexth;
 367 } *pv_hashed_entry_t;
 368
 369 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
 370
 371 #define NPVHASH 4095   /* MUST BE 2^N - 1 */
 372 pv_hashed_entry_t     *pv_hash_table;  /* hash lists */
 373
 374 uint32_t npvhash = 0;
 375
 376 /* #define PV_DEBUG 1   uncomment to enable some PV debugging code */
 377 #ifdef PV_DEBUG
 378 #define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
 379 #else
 380 #define CHK_NPVHASH()
 381 #endif
 382
 383 /*
 384  *      pv_list entries are kept on a list that can only be accessed
 385  *      with the pmap system locked (at SPLVM, not in the cpus_active set).
 386  *      The list is refilled from the pv_hashed_list_zone if it becomes empty.
 387  */
 388 pv_rooted_entry_t       pv_free_list = PV_ROOTED_ENTRY_NULL;            /* free list at SPLVM */
 389 pv_hashed_entry_t       pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
 390 pv_hashed_entry_t      pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
 391 decl_simple_lock_data(,pv_hashed_free_list_lock)
 392 decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
 393 decl_simple_lock_data(,pv_hash_table_lock)
 394
 395 int pv_free_count = 0;
 396 int pv_hashed_free_count = 0;
 397 int pv_kern_free_count = 0;
 398 int pv_hashed_kern_free_count = 0;
 399 #define PV_HASHED_LOW_WATER_MARK 5000
 400 #define PV_HASHED_KERN_LOW_WATER_MARK 100
 401 #define PV_HASHED_ALLOC_CHUNK 2000
 402 #define PV_HASHED_KERN_ALLOC_CHUNK 50
 403 thread_call_t  mapping_adjust_call;
 404 static thread_call_data_t  mapping_adjust_call_data;
 405 uint32_t mappingrecurse = 0;
 406
 407 #define PV_HASHED_ALLOC(pvh_e) { \
 408         simple_lock(&pv_hashed_free_list_lock); \
 409         if ((pvh_e = pv_hashed_free_list) != 0) { \
 410           pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next;   \
 411             pv_hashed_free_count--; \
 412             if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
 413               if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
 414                 thread_call_enter(mapping_adjust_call); \
 415         } \
 416         simple_unlock(&pv_hashed_free_list_lock); \
 417 }
 418
 419 #define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {   \
 420         simple_lock(&pv_hashed_free_list_lock); \
 421         pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;        \
 422         pv_hashed_free_list = pvh_eh; \
 423         pv_hashed_free_count += pv_cnt; \
 424         simple_unlock(&pv_hashed_free_list_lock); \
 425 }
 426
 427 #define PV_HASHED_KERN_ALLOC(pvh_e) { \
 428         simple_lock(&pv_hashed_kern_free_list_lock); \
 429         if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
 430           pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next;      \
 431             pv_hashed_kern_free_count--; \
 432             if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
 433               if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
 434                 thread_call_enter(mapping_adjust_call); \
 435         } \
 436         simple_unlock(&pv_hashed_kern_free_list_lock); \
 437 }
 438
 439 #define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {       \
 440         simple_lock(&pv_hashed_kern_free_list_lock); \
 441         pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;   \
 442         pv_hashed_kern_free_list = pvh_eh; \
 443         pv_hashed_kern_free_count += pv_cnt; \
 444         simple_unlock(&pv_hashed_kern_free_list_lock); \
 445 }
 446
 447 zone_t          pv_hashed_list_zone;    /* zone of pv_hashed_entry structures */
 448
 449 static zone_t pdpt_zone;
 450
 451 /*
 452  *      Each entry in the pv_head_table is locked by a bit in the
 453  *      pv_lock_table.  The lock bits are accessed by the physical
 454  *      address of the page they lock.
 455  */
 456
 457 char    *pv_lock_table;         /* pointer to array of bits */
 458 #define pv_lock_table_size(n)   (((n)+BYTE_SIZE-1)/BYTE_SIZE)
 459
 460 char    *pv_hash_lock_table;
 461 #define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
 462
 463 /*
 464  *      First and last physical addresses that we maintain any information
 465  *      for.  Initialized to zero so that pmap operations done before
 466  *      pmap_init won't touch any non-existent structures.
 467  */
 468 boolean_t       pmap_initialized = FALSE;/* Has pmap_init completed? */
 469
 470 static struct vm_object kptobj_object_store;
 471 static vm_object_t kptobj;
 472
 473 /*
 474  *      Index into pv_head table, its lock bits, and the modify/reference and managed bits
 475  */
 476
 477 #define pa_index(pa)    (i386_btop(pa))
 478 #define ppn_to_pai(ppn) ((int)ppn)
 479
 480 #define pai_to_pvh(pai)         (&pv_head_table[pai])
 481 #define lock_pvh_pai(pai)       bit_lock(pai, (void *)pv_lock_table)
 482 #define unlock_pvh_pai(pai)     bit_unlock(pai, (void *)pv_lock_table)
 483
 484 #define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
 485 #define pvhash(idx)         (&pv_hash_table[idx])
 486
 487 #define lock_hash_hash(hash)            bit_lock(hash, (void *)pv_hash_lock_table)
 488 #define unlock_hash_hash(hash)  bit_unlock(hash, (void *)pv_hash_lock_table)
 489
 490 /*
 491  *      Array of physical page attribites for managed pages.
 492  *      One byte per physical page.
 493  */
 494 char    *pmap_phys_attributes;
 495 unsigned int    last_managed_page = 0;
 496
 497 /*
 498  *      Physical page attributes.  Copy bits from PTE definition.
 499  */
 500 #define PHYS_MODIFIED   INTEL_PTE_MOD   /* page modified */
 501 #define PHYS_REFERENCED INTEL_PTE_REF   /* page referenced */
 502 #define PHYS_MANAGED    INTEL_PTE_VALID /* page is managed */
 503
 504 /*
 505  *      Amount of virtual memory mapped by one
 506  *      page-directory entry.
 507  */
 508 #define PDE_MAPPED_SIZE         (pdetova(1))
 509 uint64_t pde_mapped_size;
 510
 511 /*
 512  *      Locking and TLB invalidation
 513  */
 514
 515 /*
 516  *      Locking Protocols: (changed 2/2007 JK)
 517  *
 518  *      There are two structures in the pmap module that need locking:
 519  *      the pmaps themselves, and the per-page pv_lists (which are locked
 520  *      by locking the pv_lock_table entry that corresponds to the pv_head
 521  *      for the list in question.)  Most routines want to lock a pmap and
 522  *      then do operations in it that require pv_list locking -- however
 523  *      pmap_remove_all and pmap_copy_on_write operate on a physical page
 524  *      basis and want to do the locking in the reverse order, i.e. lock
 525  *      a pv_list and then go through all the pmaps referenced by that list.
 526  *
 527  *      The system wide pmap lock has been removed. Now, paths take a lock
 528  *      on the pmap before changing its 'shape' and the reverse order lockers
 529  *      (coming in by phys ppn) take a lock on the corresponding pv and then
 530  *      retest to be sure nothing changed during the window before they locked
 531  *      and can then run up/down the pv lists holding the list lock. This also
 532  *      lets the pmap layer run (nearly completely) interrupt enabled, unlike
 533  *      previously.
 534  */
 535
 536 /*
 537  * pmap locking
 538  */
 539
 540 #define PMAP_LOCK(pmap) {       \
 541         simple_lock(&(pmap)->lock);     \
 542 }
 543
 544 #define PMAP_UNLOCK(pmap) {             \
 545         simple_unlock(&(pmap)->lock);           \
 546 }
 547
 548 /*
 549  * PV locking
 550  */
 551
 552 #define LOCK_PVH(index)         {       \
 553     mp_disable_preemption();           \
 554     lock_pvh_pai(index);               \
 555 }
 556
 557 #define UNLOCK_PVH(index)  {      \
 558     unlock_pvh_pai(index);        \
 559     mp_enable_preemption();       \
 560 }
 561
 562 /*
 563  * PV hash locking
 564  */
 565
 566 #define LOCK_PV_HASH(hash)         lock_hash_hash(hash)
 567
 568 #define UNLOCK_PV_HASH(hash)       unlock_hash_hash(hash)
 569
 570 #if     USLOCK_DEBUG
 571 extern int      max_lock_loops;
 572 #define LOOP_VAR                                                        \
 573         unsigned int    loop_count;                                     \
 574         loop_count = disable_serial_output ? max_lock_loops             \
 575                                         : max_lock_loops*100
 576 #define LOOP_CHECK(msg, pmap)                                           \
 577         if (--loop_count == 0) {                                        \
 578                 mp_disable_preemption();                                \
 579                 kprintf("%s: cpu %d pmap %x\n",                         \
 580                           msg, cpu_number(), pmap);                     \
 581                 Debugger("deadlock detection");                         \
 582                 mp_enable_preemption();                                 \
 583                 loop_count = max_lock_loops;                            \
 584         }
 585 #else   /* USLOCK_DEBUG */
 586 #define LOOP_VAR
 587 #define LOOP_CHECK(msg, pmap)
 588 #endif  /* USLOCK_DEBUG */
 589
 590
 591 static void pmap_flush_tlbs(pmap_t pmap);
 592
 593 #define PMAP_UPDATE_TLBS(pmap, s, e)                                    \
 594         pmap_flush_tlbs(pmap)
 595
 596
 597 #define MAX_TBIS_SIZE   32              /* > this -> TBIA */ /* XXX */
 598
 599
 600 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
 601
 602 /*
 603  *      Other useful macros.
 604  */
 605 #define current_pmap()          (vm_map_pmap(current_thread()->map))
 606
 607 struct pmap     kernel_pmap_store;
 608 pmap_t          kernel_pmap;
 609
 610 pd_entry_t    high_shared_pde;
 611 pd_entry_t    commpage64_pde;
 612
 613 struct zone     *pmap_zone;             /* zone of pmap structures */
 614
 615 int             pmap_debug = 0;         /* flag for debugging prints */
 616
 617 unsigned int    inuse_ptepages_count = 0;
 618
 619 addr64_t        kernel64_cr3;
 620 boolean_t       no_shared_cr3 = FALSE;  /* -no_shared_cr3 boot arg */
 621
 622 /*
 623  *      Pmap cache.  Cache is threaded through ref_count field of pmap.
 624  *      Max will eventually be constant -- variable for experimentation.
 625  */
 626 int             pmap_cache_max = 32;
 627 int             pmap_alloc_chunk = 8;
 628 pmap_t          pmap_cache_list;
 629 int             pmap_cache_count;
 630 decl_simple_lock_data(,pmap_cache_lock)
 631
 632 extern char end;
 633
 634 static int nkpt;
 635 extern uint32_t lowGlo;
 636
 637 pt_entry_t     *DMAP1, *DMAP2;
 638 caddr_t         DADDR1;
 639 caddr_t         DADDR2;
 640
 641 static inline
 642 void pmap_pvh_unlink(pv_hashed_entry_t pv);
 643
 644 /*
 645  * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 646  * properly deals with the anchor.
 647  * must be called with the hash locked, does not unlock it
 648  */
 649
 650 static inline
 651 void pmap_pvh_unlink(pv_hashed_entry_t pvh)
 652 {
 653   pv_hashed_entry_t curh;
 654   pv_hashed_entry_t *pprevh;
 655   int pvhash_idx;
 656
 657   CHK_NPVHASH();
 658   pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
 659
 660   pprevh = pvhash(pvhash_idx);
 661
 662 #if PV_DEBUG
 663   if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */
 664 #endif
 665   curh = *pprevh;
 666
 667   while (PV_HASHED_ENTRY_NULL != curh) {
 668     if (pvh == curh)
 669       break;
 670     pprevh = &curh->nexth;
 671     curh = curh->nexth;
 672   }
 673   if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
 674   *pprevh = pvh->nexth;
 675   return;
 676 }
 677
 678 /*
 679  * for legacy, returns the address of the pde entry.
 680  * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
 681  * then returns the mapped address of the pde entry in that page
 682  */
 683 pd_entry_t *
 684 pmap_pde(pmap_t m, vm_map_offset_t v)
 685 {
 686   pd_entry_t *pde;
 687         if (!cpu_64bit || (m == kernel_pmap)) {
 688           pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
 689         } else {
 690           assert(m);
 691           assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
 692           pde = pmap64_pde(m, v);
 693         }
 694         return pde;
 695 }
 696
 697
 698 /*
 699  * the single pml4 page per pmap is allocated at pmap create time and exists
 700  * for the duration of the pmap. we allocate this page in kernel vm (to save us one
 701  * level of page table dynamic mapping.
 702  * this returns the address of the requested pml4 entry in the top level page.
 703  */
 704 static inline
 705 pml4_entry_t *
 706 pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
 707 {
 708   return ((pml4_entry_t *)pmap->pm_hold + ((vm_offset_t)((vaddr>>PML4SHIFT)&(NPML4PG-1))));
 709 }
 710
 711 /*
 712  * maps in the pml4 page, if any, containing the pdpt entry requested
 713  * and returns the address of the pdpt entry in that mapped page
 714  */
 715 pdpt_entry_t *
 716 pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
 717 {
 718   pml4_entry_t newpf;
 719   pml4_entry_t *pml4;
 720   int i;
 721
 722   assert(pmap);
 723   assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
 724   if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
 725     return(0);
 726   }
 727
 728   pml4 = pmap64_pml4(pmap, vaddr);
 729
 730         if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
 731
 732                 newpf = *pml4 & PG_FRAME;
 733
 734
 735                 for (i=PMAP_PDPT_FIRST_WINDOW; i < PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS; i++) {
 736                   if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
 737                   return((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
 738                          ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
 739                   }
 740                 }
 741
 742                   current_cpu_datap()->cpu_pmap->pdpt_window_index++;
 743                   if (current_cpu_datap()->cpu_pmap->pdpt_window_index > (PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS-1))
 744                     current_cpu_datap()->cpu_pmap->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
 745                   pmap_store_pte(
 746                                  (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CMAP),
 747                                  newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
 748                   invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR));
 749                   return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR) +
 750                           ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
 751         }
 752
 753         return (NULL);
 754 }
 755
 756 /*
 757  * maps in the pdpt page, if any, containing the pde entry requested
 758  * and returns the address of the pde entry in that mapped page
 759  */
 760 pd_entry_t *
 761 pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
 762 {
 763   pdpt_entry_t newpf;
 764   pdpt_entry_t *pdpt;
 765   int i;
 766
 767   assert(pmap);
 768   assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
 769   if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
 770     return(0);
 771   }
 772
 773   /*  if (vaddr & (1ULL << 63)) panic("neg addr");*/
 774   pdpt = pmap64_pdpt(pmap, vaddr);
 775
 776           if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
 777
 778                 newpf = *pdpt & PG_FRAME;
 779
 780                 for (i=PMAP_PDE_FIRST_WINDOW; i < PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS; i++) {
 781                   if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
 782                   return((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
 783                          ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
 784                   }
 785                 }
 786
 787                   current_cpu_datap()->cpu_pmap->pde_window_index++;
 788                   if (current_cpu_datap()->cpu_pmap->pde_window_index > (PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS-1))
 789                     current_cpu_datap()->cpu_pmap->pde_window_index = PMAP_PDE_FIRST_WINDOW;
 790                   pmap_store_pte(
 791                                  (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CMAP),
 792                                  newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
 793                   invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR));
 794                   return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR) +
 795                           ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
 796         }
 797
 798         return (NULL);
 799 }
 800
 801 /*
 802  * Because the page tables (top 3 levels) are mapped into per cpu windows,
 803  * callers must either disable interrupts or disable preemption before calling
 804  * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
 805  * is in one of those mapped windows and that cannot be allowed to change until
 806  * the caller is done using the returned pte pointer. When done, the caller
 807  * restores interrupts or preemption to its previous state after which point the
 808  * vaddr for the returned pte can no longer be used
 809  */
 810
 811
 812 /*
 813  * return address of mapped pte for vaddr va in pmap pmap.
 814  * must be called with pre-emption or interrupts disabled
 815  * if targeted pmap is not the kernel pmap
 816  * since we may be passing back a virtual address that is
 817  * associated with this cpu... pre-emption or interrupts
 818  * must remain disabled until the caller is done using
 819  * the pointer that was passed back .
 820  *
 821  * maps the pde page, if any, containing the pte in and returns
 822  * the address of the pte in that mapped page
 823  */
 824 pt_entry_t     *
 825 pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
 826 {
 827         pd_entry_t     *pde;
 828         pd_entry_t     newpf;
 829         int i;
 830
 831         assert(pmap);
 832         pde = pmap_pde(pmap,vaddr);
 833
 834         if (pde && ((*pde & INTEL_PTE_VALID))) {
 835             if (pmap == kernel_pmap)
 836                 return (vtopte(vaddr)); /* compat kernel still has pte's mapped */
 837 #if TESTING
 838             if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
 839                 panic("pmap_pte: unsafe call");
 840 #endif
 841                 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
 842
 843                 newpf = *pde & PG_FRAME;
 844
 845                 for (i=PMAP_PTE_FIRST_WINDOW; i < PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS; i++) {
 846                   if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
 847                   return((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
 848                          ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
 849                   }
 850                 }
 851
 852                   current_cpu_datap()->cpu_pmap->pte_window_index++;
 853                   if (current_cpu_datap()->cpu_pmap->pte_window_index > (PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS-1))
 854                     current_cpu_datap()->cpu_pmap->pte_window_index = PMAP_PTE_FIRST_WINDOW;
 855                   pmap_store_pte(
 856                                  (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CMAP),
 857                                  newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
 858                   invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR));
 859                   return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR) +
 860                           ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
 861         }
 862
 863         return(NULL);
 864 }
 865
 866
 867 /*
 868  *      Map memory at initialization.  The physical addresses being
 869  *      mapped are not managed and are never unmapped.
 870  *
 871  *      For now, VM is already on, we only need to map the
 872  *      specified memory.
 873  */
 874 vm_offset_t
 875 pmap_map(
 876         vm_offset_t     virt,
 877         vm_map_offset_t start_addr,
 878         vm_map_offset_t end_addr,
 879         vm_prot_t       prot,
 880         unsigned int    flags)
 881 {
 882         int             ps;
 883
 884         ps = PAGE_SIZE;
 885         while (start_addr < end_addr) {
 886                 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
 887                            (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
 888                 virt += ps;
 889                 start_addr += ps;
 890         }
 891         return(virt);
 892 }
 893
 894 /*
 895  *      Back-door routine for mapping kernel VM at initialization.
 896  *      Useful for mapping memory outside the range
 897  *      Sets no-cache, A, D.
 898  *      Otherwise like pmap_map.
 899  */
 900 vm_offset_t
 901 pmap_map_bd(
 902         vm_offset_t     virt,
 903         vm_map_offset_t start_addr,
 904         vm_map_offset_t end_addr,
 905         vm_prot_t       prot,
 906         unsigned int    flags)
 907 {
 908         pt_entry_t      template;
 909         pt_entry_t      *pte;
 910         spl_t           spl;
 911
 912         template = pa_to_pte(start_addr)
 913                 | INTEL_PTE_REF
 914                 | INTEL_PTE_MOD
 915                 | INTEL_PTE_WIRED
 916                 | INTEL_PTE_VALID;
 917
 918         if(flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
 919             template |= INTEL_PTE_NCACHE;
 920             if(!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
 921                     template |= INTEL_PTE_PTA;
 922         }
 923
 924         if (prot & VM_PROT_WRITE)
 925             template |= INTEL_PTE_WRITE;
 926
 927         while (start_addr < end_addr) {
 928                 spl = splhigh();
 929                 pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
 930                 if (pte == PT_ENTRY_NULL) {
 931                         panic("pmap_map_bd: Invalid kernel address\n");
 932                 }
 933                 pmap_store_pte(pte, template);
 934                 splx(spl);
 935                 pte_increment_pa(template);
 936                 virt += PAGE_SIZE;
 937                 start_addr += PAGE_SIZE;
 938         }
 939
 940         flush_tlb();
 941         return(virt);
 942 }
 943
 944 extern  char            *first_avail;
 945 extern  vm_offset_t     virtual_avail, virtual_end;
 946 extern  pmap_paddr_t    avail_start, avail_end;
 947 extern  vm_offset_t     etext;
 948 extern  void            *sectHIBB;
 949 extern  int             sectSizeHIB;
 950
 951 void
 952 pmap_cpu_init(void)
 953 {
 954         /*
 955          * Here early in the life of a processor (from cpu_mode_init()).
 956          * If we're not in 64-bit mode, enable the global TLB feature.
 957          * Note: regardless of mode we continue to set the global attribute
 958          * bit in ptes for all (32-bit) global pages such as the commpage.
 959          */
 960         if (!cpu_64bit) {
 961                 set_cr4(get_cr4() | CR4_PGE);
 962         }
 963
 964         /*
 965          * Initialize the per-cpu, TLB-related fields.
 966          */
 967         current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
 968         current_cpu_datap()->cpu_tlb_invalid = FALSE;
 969 }
 970
 971 vm_offset_t
 972 pmap_high_shared_remap(enum high_fixed_addresses e, vm_offset_t va, int sz)
 973 {
 974   vm_offset_t ve = pmap_index_to_virt(e);
 975   pt_entry_t *ptep;
 976   pmap_paddr_t pa;
 977   int i;
 978   spl_t s;
 979
 980   assert(0 == (va & PAGE_MASK));  /* expecting page aligned */
 981   s = splhigh();
 982   ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ve);
 983
 984   for (i=0; i< sz; i++) {
 985     pa = (pmap_paddr_t) kvtophys(va);
 986     pmap_store_pte(ptep, (pa & PG_FRAME)
 987                                 | INTEL_PTE_VALID
 988                                 | INTEL_PTE_GLOBAL
 989                                 | INTEL_PTE_RW
 990                                 | INTEL_PTE_REF
 991                                 | INTEL_PTE_MOD);
 992     va+= PAGE_SIZE;
 993     ptep++;
 994   }
 995   splx(s);
 996   return ve;
 997 }
 998
 999 vm_offset_t
1000 pmap_cpu_high_shared_remap(int cpu, enum high_cpu_types e, vm_offset_t va, int sz)
1001 {
1002   enum high_fixed_addresses     a = e + HIGH_CPU_END * cpu;
1003   return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN + a, va, sz);
1004 }
1005
1006 void pmap_init_high_shared(void);
1007
1008 extern vm_offset_t gdtptr, idtptr;
1009
1010 extern uint32_t low_intstack;
1011
1012 extern struct fake_descriptor ldt_desc_pattern;
1013 extern struct fake_descriptor tss_desc_pattern;
1014
1015 extern char hi_remap_text, hi_remap_etext;
1016 extern char t_zero_div;
1017
1018 pt_entry_t *pte_unique_base;
1019
1020 void
1021 pmap_init_high_shared(void)
1022 {
1023
1024         vm_offset_t haddr;
1025         struct __gdt_desc_struct gdt_desc = {0,0,0};
1026         struct __idt_desc_struct idt_desc = {0,0,0};
1027         spl_t s;
1028 #if MACH_KDB
1029         struct i386_tss *ttss;
1030 #endif
1031
1032         kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
1033                 HIGH_MEM_BASE,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
1034         s = splhigh();
1035         pte_unique_base = pmap_pte(kernel_pmap, (vm_map_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
1036         splx(s);
1037
1038         if (i386_btop(&hi_remap_etext - &hi_remap_text + 1) >
1039                                 HIGH_FIXED_TRAMPS_END - HIGH_FIXED_TRAMPS + 1)
1040                 panic("tramps too large");
1041         haddr = pmap_high_shared_remap(HIGH_FIXED_TRAMPS,
1042                                         (vm_offset_t) &hi_remap_text, 3);
1043         kprintf("tramp: 0x%x, ",haddr);
1044         printf("hi mem tramps at 0x%x\n",haddr);
1045         /* map gdt up high and update ptr for reload */
1046         haddr = pmap_high_shared_remap(HIGH_FIXED_GDT,
1047                                         (vm_offset_t) master_gdt, 1);
1048         __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory");
1049         gdt_desc.address = haddr;
1050         kprintf("GDT: 0x%x, ",haddr);
1051         /* map ldt up high */
1052         haddr = pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN,
1053                                         (vm_offset_t) master_ldt,
1054                                         HIGH_FIXED_LDT_END - HIGH_FIXED_LDT_BEGIN + 1);
1055         kprintf("LDT: 0x%x, ",haddr);
1056         /* put new ldt addr into gdt */
1057         master_gdt[sel_idx(KERNEL_LDT)] = ldt_desc_pattern;
1058         master_gdt[sel_idx(KERNEL_LDT)].offset = (vm_offset_t) haddr;
1059         fix_desc(&master_gdt[sel_idx(KERNEL_LDT)], 1);
1060         master_gdt[sel_idx(USER_LDT)] = ldt_desc_pattern;
1061         master_gdt[sel_idx(USER_LDT)].offset = (vm_offset_t) haddr;
1062         fix_desc(&master_gdt[sel_idx(USER_LDT)], 1);
1063
1064         /* map idt up high */
1065         haddr = pmap_high_shared_remap(HIGH_FIXED_IDT,
1066                                         (vm_offset_t) master_idt, 1);
1067         __asm__ __volatile__("sidt %0" : "=m" (idt_desc));
1068         idt_desc.address = haddr;
1069         kprintf("IDT: 0x%x, ", haddr);
1070         /* remap ktss up high and put new high addr into gdt */
1071         haddr = pmap_high_shared_remap(HIGH_FIXED_KTSS,
1072                                         (vm_offset_t) &master_ktss, 1);
1073         master_gdt[sel_idx(KERNEL_TSS)] = tss_desc_pattern;
1074         master_gdt[sel_idx(KERNEL_TSS)].offset = (vm_offset_t) haddr;
1075         fix_desc(&master_gdt[sel_idx(KERNEL_TSS)], 1);
1076         kprintf("KTSS: 0x%x, ",haddr);
1077 #if MACH_KDB
1078         /* remap dbtss up high and put new high addr into gdt */
1079         haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS,
1080                                         (vm_offset_t) &master_dbtss, 1);
1081         master_gdt[sel_idx(DEBUG_TSS)] = tss_desc_pattern;
1082         master_gdt[sel_idx(DEBUG_TSS)].offset = (vm_offset_t) haddr;
1083         fix_desc(&master_gdt[sel_idx(DEBUG_TSS)], 1);
1084         ttss = (struct i386_tss *)haddr;
1085         kprintf("DBTSS: 0x%x, ",haddr);
1086 #endif  /* MACH_KDB */
1087
1088         /* remap dftss up high and put new high addr into gdt */
1089         haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1090                                         (vm_offset_t) &master_dftss, 1);
1091         master_gdt[sel_idx(DF_TSS)] = tss_desc_pattern;
1092         master_gdt[sel_idx(DF_TSS)].offset = (vm_offset_t) haddr;
1093         fix_desc(&master_gdt[sel_idx(DF_TSS)], 1);
1094         kprintf("DFTSS: 0x%x\n",haddr);
1095
1096         /* remap mctss up high and put new high addr into gdt */
1097         haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1098                                         (vm_offset_t) &master_mctss, 1);
1099         master_gdt[sel_idx(MC_TSS)] = tss_desc_pattern;
1100         master_gdt[sel_idx(MC_TSS)].offset = (vm_offset_t) haddr;
1101         fix_desc(&master_gdt[sel_idx(MC_TSS)], 1);
1102         kprintf("MCTSS: 0x%x\n",haddr);
1103
1104         __asm__ __volatile__("lgdt %0": "=m" (gdt_desc));
1105         __asm__ __volatile__("lidt %0": "=m" (idt_desc));
1106         kprintf("gdt/idt reloaded, ");
1107         set_tr(KERNEL_TSS);
1108         kprintf("tr reset to KERNEL_TSS\n");
1109 }
1110
1111
1112 /*
1113  *      Bootstrap the system enough to run with virtual memory.
1114  *      Map the kernel's code and data, and allocate the system page table.
1115  *      Called with mapping OFF.  Page_size must already be set.
1116  *
1117  *      Parameters:
1118  *      load_start:     PA where kernel was loaded
1119  *      avail_start     PA of first available physical page -
1120  *                         after kernel page tables
1121  *      avail_end       PA of last available physical page
1122  *      virtual_avail   VA of first available page -
1123  *                         after kernel page tables
1124  *      virtual_end     VA of last available page -
1125  *                         end of kernel address space
1126  *
1127  *      &start_text     start of kernel text
1128  *      &etext          end of kernel text
1129  */
1130
1131 void
1132 pmap_bootstrap(
1133         __unused vm_offset_t    load_start,
1134         boolean_t               IA32e)
1135 {
1136         vm_offset_t     va;
1137         pt_entry_t      *pte;
1138         int i;
1139         int wpkernel, boot_arg;
1140         pdpt_entry_t *pdpt;
1141         spl_t s;
1142
1143         vm_last_addr = VM_MAX_KERNEL_ADDRESS;   /* Set the highest address
1144                                                  * known to VM */
1145         /*
1146          *      The kernel's pmap is statically allocated so we don't
1147          *      have to use pmap_create, which is unlikely to work
1148          *      correctly at this part of the boot sequence.
1149          */
1150
1151
1152         kernel_pmap = &kernel_pmap_store;
1153         kernel_pmap->ref_count = 1;
1154         kernel_pmap->nx_enabled = FALSE;
1155         kernel_pmap->pm_task_map = TASK_MAP_32BIT;
1156         kernel_pmap->pm_obj = (vm_object_t) NULL;
1157         kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
1158         kernel_pmap->pdirbase = (pmap_paddr_t)((int)IdlePTD);
1159         pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE );
1160         kernel_pmap->pm_pdpt = pdpt;
1161         kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePDPT);
1162
1163         va = (vm_offset_t)kernel_pmap->dirbase;
1164         /* setup self referential mapping(s) */
1165         for (i = 0; i< NPGPTD; i++, pdpt++) {
1166           pmap_paddr_t pa;
1167           pa = (pmap_paddr_t) kvtophys(va + i386_ptob(i));
1168           pmap_store_pte(
1169             (pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i),
1170             (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
1171               INTEL_PTE_MOD | INTEL_PTE_WIRED) ;
1172           pmap_store_pte(pdpt, pa | INTEL_PTE_VALID);
1173         }
1174
1175         cpu_64bit = IA32e;
1176
1177         lo_kernel_cr3 = kernel_pmap->pm_cr3;
1178         current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1179
1180         /* save the value we stuff into created pmaps to share the gdts etc */
1181         high_shared_pde = *pmap_pde(kernel_pmap, HIGH_MEM_BASE);
1182         /* make sure G bit is on for high shared pde entry */
1183         high_shared_pde |= INTEL_PTE_GLOBAL;
1184         s = splhigh();
1185         pmap_store_pte(pmap_pde(kernel_pmap, HIGH_MEM_BASE), high_shared_pde);
1186         splx(s);
1187
1188         nkpt = NKPT;
1189         inuse_ptepages_count += NKPT;
1190
1191         virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
1192         virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1193
1194         /*
1195          * Reserve some special page table entries/VA space for temporary
1196          * mapping of pages.
1197          */
1198 #define SYSMAP(c, p, v, n)      \
1199         v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
1200
1201         va = virtual_avail;
1202         pte = vtopte(va);
1203
1204         for (i=0; i<PMAP_NWINDOWS; i++) {
1205             SYSMAP(caddr_t,
1206                    (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1207                    (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1208                    1);
1209             *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1210         }
1211
1212         /* DMAP user for debugger */
1213         SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1214         SYSMAP(caddr_t, DMAP2, DADDR2, 1);  /* XXX temporary - can remove */
1215
1216         virtual_avail = va;
1217
1218         if (PE_parse_boot_arg("npvhash", &npvhash)) {
1219           if (0 != ((npvhash+1) & npvhash)) {
1220             kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash,NPVHASH);
1221             npvhash = NPVHASH;
1222           }
1223         } else {
1224           npvhash = NPVHASH;
1225         }
1226         printf("npvhash=%d\n",npvhash);
1227
1228         wpkernel = 1;
1229         if (PE_parse_boot_arg("wpkernel", &boot_arg)) {
1230                 if (boot_arg == 0)
1231                         wpkernel = 0;
1232         }
1233
1234         s = splhigh();
1235
1236         /* Remap kernel text readonly unless the "wpkernel" boot-arg is present
1237          * and set to 0.
1238          */
1239         if (wpkernel)
1240         {
1241                 vm_offset_t     myva;
1242                 pt_entry_t     *ptep;
1243
1244                 for (myva = i386_round_page(MP_BOOT + MP_BOOTSTACK); myva < etext; myva += PAGE_SIZE) {
1245                         if (myva >= (vm_offset_t)sectHIBB && myva < ((vm_offset_t)sectHIBB + sectSizeHIB))
1246                                 continue;
1247                         ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1248                         if (ptep)
1249                                 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_RW);
1250                 }
1251         }
1252
1253         /* no matter what,  kernel page zero is not accessible */
1254         pte = pmap_pte(kernel_pmap, 0);
1255         pmap_store_pte(pte, INTEL_PTE_INVALID);
1256
1257         /* map lowmem global page into fixed addr 0x2000 */
1258         if (0 == (pte = pmap_pte(kernel_pmap,0x2000))) panic("lowmem pte");
1259         assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); /* make sure it is defined on page boundary */
1260         pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)|INTEL_PTE_VALID|INTEL_PTE_REF|INTEL_PTE_MOD|INTEL_PTE_WIRED|INTEL_PTE_RW);
1261         splx(s);
1262         flush_tlb();
1263
1264         simple_lock_init(&kernel_pmap->lock, 0);
1265         simple_lock_init(&pv_hashed_free_list_lock, 0);
1266         simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1267         simple_lock_init(&pv_hash_table_lock,0);
1268
1269         pmap_init_high_shared();
1270
1271         pde_mapped_size = PDE_MAPPED_SIZE;
1272
1273         if (cpu_64bit) {
1274           pdpt_entry_t *ppdpt   = (pdpt_entry_t *)IdlePDPT;
1275           pdpt_entry_t *ppdpt64 = (pdpt_entry_t *)IdlePDPT64;
1276           pdpt_entry_t *ppml4   = (pdpt_entry_t *)IdlePML4;
1277           int istate = ml_set_interrupts_enabled(FALSE);
1278
1279           /*
1280            * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1281            * with page bits set for the correct IA-32e operation and so that
1282            * the legacy-mode IdlePDPT is retained for slave processor start-up.
1283            * This is necessary due to the incompatible use of page bits between
1284            * 64-bit and legacy modes.
1285            */
1286           kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePML4); /* setup in start.s for us */
1287           kernel_pmap->pm_pml4 = IdlePML4;
1288           kernel_pmap->pm_pdpt = (pd_entry_t *)
1289                                         ((unsigned int)IdlePDPT64 | KERNBASE );
1290 #define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1291           pmap_store_pte(kernel_pmap->pm_pml4,
1292                          (uint32_t)IdlePDPT64 | PAGE_BITS);
1293           pmap_store_pte((ppdpt64+0), *(ppdpt+0) | PAGE_BITS);
1294           pmap_store_pte((ppdpt64+1), *(ppdpt+1) | PAGE_BITS);
1295           pmap_store_pte((ppdpt64+2), *(ppdpt+2) | PAGE_BITS);
1296           pmap_store_pte((ppdpt64+3), *(ppdpt+3) | PAGE_BITS);
1297
1298           /*
1299            * The kernel is also mapped in the uber-sapce at the 4GB starting
1300            * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1301            */
1302           pmap_store_pte((ppml4+KERNEL_UBER_PML4_INDEX), *(ppml4+0));
1303
1304           kernel64_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1305
1306           /* Re-initialize descriptors and prepare to switch modes */
1307           cpu_desc_init64(&cpu_data_master, TRUE);
1308           current_cpu_datap()->cpu_is64bit = TRUE;
1309           current_cpu_datap()->cpu_active_cr3 = kernel64_cr3;
1310
1311           pde_mapped_size = 512*4096 ;
1312
1313           ml_set_interrupts_enabled(istate);
1314         }
1315
1316         /* Set 64-bit mode if required. */
1317         cpu_mode_init(&cpu_data_master);
1318
1319         kernel_pmap->pm_hold = (vm_offset_t)kernel_pmap->pm_pml4;
1320
1321         kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1322                         VADDR(KPTDI,0), virtual_end);
1323         printf("PAE enabled\n");
1324         if (cpu_64bit){
1325           printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1326
1327         kprintf("Available physical space from 0x%llx to 0x%llx\n",
1328                         avail_start, avail_end);
1329
1330         /*
1331          * By default for 64-bit users loaded at 4GB, share kernel mapping.
1332          * But this may be overridden by the -no_shared_cr3 boot-arg.
1333          */
1334         if (PE_parse_boot_arg("-no_shared_cr3", &no_shared_cr3)) {
1335                 kprintf("Shared kernel address space disabled\n");
1336         }
1337
1338 #ifdef  PMAP_TRACES
1339         if (PE_parse_boot_arg("-pmap_trace", &pmap_trace)) {
1340                 kprintf("Kernel traces for pmap operations enabled\n");
1341         }
1342 #endif  /* PMAP_TRACES */
1343 }
1344
1345 void
1346 pmap_virtual_space(
1347         vm_offset_t *startp,
1348         vm_offset_t *endp)
1349 {
1350         *startp = virtual_avail;
1351         *endp = virtual_end;
1352 }
1353
1354 /*
1355  *      Initialize the pmap module.
1356  *      Called by vm_init, to initialize any structures that the pmap
1357  *      system needs to map virtual memory.
1358  */
1359 void
1360 pmap_init(void)
1361 {
1362         register long           npages;
1363         vm_offset_t             addr;
1364         register vm_size_t      s;
1365         vm_map_offset_t         vaddr;
1366         ppnum_t ppn;
1367
1368         /*
1369          *      Allocate memory for the pv_head_table and its lock bits,
1370          *      the modify bit array, and the pte_page table.
1371          */
1372
1373         /*
1374          * zero bias all these arrays now instead of off avail_start
1375          * so we cover all memory
1376          */
1377
1378         npages = i386_btop(avail_end);
1379         s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1380                          + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
1381                          + pv_lock_table_size(npages)
1382                          + pv_hash_lock_table_size((npvhash+1))
1383                                 + npages);
1384
1385         s = round_page(s);
1386         if (kmem_alloc_wired(kernel_map, &addr, s) != KERN_SUCCESS)
1387                 panic("pmap_init");
1388
1389         memset((char *)addr, 0, s);
1390
1391 #if PV_DEBUG
1392         if (0 == npvhash) panic("npvhash not initialized");
1393 #endif
1394
1395         /*
1396          *      Allocate the structures first to preserve word-alignment.
1397          */
1398         pv_head_table = (pv_rooted_entry_t) addr;
1399         addr = (vm_offset_t) (pv_head_table + npages);
1400
1401         pv_hash_table = (pv_hashed_entry_t *)addr;
1402         addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1403
1404         pv_lock_table = (char *) addr;
1405         addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1406
1407         pv_hash_lock_table = (char *) addr;
1408         addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1409
1410         pmap_phys_attributes = (char *) addr;
1411         {
1412                 unsigned int i;
1413                 unsigned int pn;
1414                 ppnum_t  last_pn;
1415                 pmap_memory_region_t *pmptr = pmap_memory_regions;
1416
1417                 last_pn = i386_btop(avail_end);
1418
1419                 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1420                         if (pmptr->type == kEfiConventionalMemory) {
1421                                 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1422                                         if (pn < last_pn) {
1423                                                 pmap_phys_attributes[pn] |= PHYS_MANAGED;
1424
1425                                                 if (pn > last_managed_page)
1426                                                         last_managed_page = pn;
1427                                         }
1428                                 }
1429                         }
1430                 }
1431         }
1432
1433         /*
1434          *      Create the zone of physical maps,
1435          *      and of the physical-to-virtual entries.
1436          */
1437         s = (vm_size_t) sizeof(struct pmap);
1438         pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
1439         s = (vm_size_t) sizeof(struct pv_hashed_entry);
1440         pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
1441         s = 63;
1442         pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
1443
1444         kptobj = &kptobj_object_store;
1445         _vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG), kptobj);
1446         kernel_pmap->pm_obj = kptobj;
1447
1448         /* create pv entries for kernel pages mapped by low level
1449            startup code.  these have to exist so we can pmap_remove()
1450            e.g. kext pages from the middle of our addr space */
1451
1452         vaddr = (vm_map_offset_t)0;
1453         for (ppn = 0; ppn < i386_btop(avail_start) ; ppn++ ) {
1454           pv_rooted_entry_t     pv_e;
1455
1456           pv_e = pai_to_pvh(ppn);
1457           pv_e->va = vaddr;
1458           vaddr += PAGE_SIZE;
1459           pv_e->pmap = kernel_pmap;
1460           queue_init(&pv_e->qlink);
1461         }
1462
1463         pmap_initialized = TRUE;
1464
1465         /*
1466          *      Initialize pmap cache.
1467          */
1468         pmap_cache_list = PMAP_NULL;
1469         pmap_cache_count = 0;
1470         simple_lock_init(&pmap_cache_lock, 0);
1471
1472         max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1473
1474 }
1475
1476 void
1477 x86_lowmem_free(void)
1478 {
1479         /* free lowmem pages back to the vm system. we had to defer doing this
1480            until the vm system was fully up.
1481            the actual pages that are released are determined by which
1482            pages the memory sizing code puts into the region table */
1483
1484         ml_static_mfree((vm_offset_t) i386_ptob(pmap_memory_regions[0].base),
1485                         (vm_size_t) i386_ptob(pmap_memory_regions[0].end - pmap_memory_regions[0].base));
1486 }
1487
1488
1489 #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1490
1491 /*
1492  * this function is only used for debugging fron the vm layer
1493  */
1494 boolean_t
1495 pmap_verify_free(
1496                  ppnum_t pn)
1497 {
1498         pv_rooted_entry_t       pv_h;
1499         int             pai;
1500         boolean_t       result;
1501
1502         assert(pn != vm_page_fictitious_addr);
1503
1504         if (!pmap_initialized)
1505                 return(TRUE);
1506
1507         if (pn == vm_page_guard_addr)
1508                 return TRUE;
1509
1510         pai = ppn_to_pai(pn);
1511         if (!managed_page(pai))
1512                 return(FALSE);
1513         pv_h = pai_to_pvh(pn);
1514         result = (pv_h->pmap == PMAP_NULL);
1515         return(result);
1516 }
1517
1518 boolean_t
1519 pmap_is_empty(
1520        pmap_t          pmap,
1521        vm_map_offset_t vstart,
1522        vm_map_offset_t vend)
1523 {
1524         vm_map_offset_t offset;
1525         ppnum_t         phys_page;
1526
1527         if (pmap == PMAP_NULL) {
1528                 return TRUE;
1529         }
1530         for (offset = vstart;
1531              offset < vend;
1532              offset += PAGE_SIZE_64) {
1533                 phys_page = pmap_find_phys(pmap, offset);
1534                 if (phys_page) {
1535                         if (pmap != kernel_pmap &&
1536                             pmap->pm_task_map == TASK_MAP_32BIT &&
1537                             offset >= HIGH_MEM_BASE) {
1538                                 /*
1539                                  * The "high_shared_pde" is used to share
1540                                  * the entire top-most 2MB of address space
1541                                  * between the kernel and all 32-bit tasks.
1542                                  * So none of this can be removed from 32-bit
1543                                  * tasks.
1544                                  * Let's pretend there's nothing up
1545                                  * there...
1546                                  */
1547                                 return TRUE;
1548                         }
1549                         kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1550                                 "page %d at 0x%llx\n",
1551                                 pmap, vstart, vend, phys_page, offset);
1552                         return FALSE;
1553                 }
1554         }
1555
1556         return TRUE;
1557 }
1558
1559
1560 /*
1561  *      Create and return a physical map.
1562  *
1563  *      If the size specified for the map
1564  *      is zero, the map is an actual physical
1565  *      map, and may be referenced by the
1566  *      hardware.
1567  *
1568  *      If the size specified is non-zero,
1569  *      the map will be used in software only, and
1570  *      is bounded by that size.
1571  */
1572 pmap_t
1573 pmap_create(
1574             vm_map_size_t       sz,
1575             boolean_t           is_64bit)
1576 {
1577         pmap_t                  p;
1578         int             i;
1579         vm_offset_t     va;
1580         vm_size_t       size;
1581         pdpt_entry_t    *pdpt;
1582         pml4_entry_t    *pml4p;
1583         pd_entry_t      *pdp;
1584         int template;
1585         spl_t s;
1586
1587         PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1588                    (int) (sz>>32), (int) sz, (int) is_64bit, 0, 0);
1589
1590         size = (vm_size_t) sz;
1591
1592         /*
1593          *      A software use-only map doesn't even need a map.
1594          */
1595
1596         if (size != 0) {
1597                 return(PMAP_NULL);
1598         }
1599
1600         p = (pmap_t) zalloc(pmap_zone);
1601         if (PMAP_NULL == p)
1602                 panic("pmap_create zalloc");
1603
1604         /* init counts now since we'll be bumping some */
1605         simple_lock_init(&p->lock, 0);
1606         p->stats.resident_count = 0;
1607         p->stats.resident_max = 0;
1608         p->stats.wired_count = 0;
1609         p->ref_count = 1;
1610         p->nx_enabled = 1;
1611         p->pm_shared = FALSE;
1612
1613         assert(!is_64bit || cpu_64bit);
1614         p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;;
1615
1616         if (!cpu_64bit) {
1617                 /* legacy 32 bit setup */
1618                 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1619                  * entry covers 1GB of addr space */
1620                 if (KERN_SUCCESS != kmem_alloc_wired(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
1621                         panic("pmap_create kmem_alloc_wired");
1622                 p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
1623                 if ((vm_offset_t)NULL == p->pm_hold) {
1624                         panic("pdpt zalloc");
1625                 }
1626                 pdpt = (pdpt_entry_t *) (( p->pm_hold + 31) & ~31);
1627                 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)pdpt);
1628                 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG))))
1629                         panic("pmap_create vm_object_allocate");
1630
1631                 memset((char *)p->dirbase, 0, NBPTD);
1632
1633                 va = (vm_offset_t)p->dirbase;
1634                 p->pdirbase = kvtophys(va);
1635
1636                 template = cpu_64bit ? INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID;
1637                 for (i = 0; i< NPGPTD; i++, pdpt++ ) {
1638                         pmap_paddr_t pa;
1639                         pa = (pmap_paddr_t) kvtophys(va + i386_ptob(i));
1640                         pmap_store_pte(pdpt, pa | template);
1641                 }
1642
1643                 /* map the high shared pde */
1644                 s = splhigh();
1645                 pmap_store_pte(pmap_pde(p, HIGH_MEM_BASE), high_shared_pde);
1646                 splx(s);
1647
1648         } else {
1649                 /* 64 bit setup  */
1650
1651                 /* alloc the pml4 page in kernel vm */
1652                 if (KERN_SUCCESS != kmem_alloc_wired(kernel_map, (vm_offset_t *)(&p->pm_hold), PAGE_SIZE))
1653                         panic("pmap_create kmem_alloc_wired pml4");
1654
1655                 memset((char *)p->pm_hold, 0, PAGE_SIZE);
1656                 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold);
1657
1658                 vm_page_lock_queues();
1659                 inuse_ptepages_count++;
1660                 vm_page_unlock_queues();
1661
1662                 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1663
1664                 if (NULL == (p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS))))
1665                         panic("pmap_create pdpt obj");
1666
1667                 if (NULL == (p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS))))
1668                         panic("pmap_create pdpt obj");
1669
1670                 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS))))
1671                         panic("pmap_create pte obj");
1672
1673                 /* uber space points to uber mapped kernel */
1674                 s = splhigh();
1675                 pml4p = pmap64_pml4(p, 0ULL);
1676                 pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4);
1677
1678
1679                 if (!is_64bit) {
1680                         while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) {
1681                                 splx(s);
1682                                 pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another pde entry */
1683                                 s = splhigh();
1684                         }
1685                         pmap_store_pte(pdp, high_shared_pde);
1686                 }
1687                 splx(s);
1688         }
1689
1690         PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1691                    (int) p, is_64bit, 0, 0, 0);
1692
1693         return(p);
1694 }
1695
1696 /*
1697  * The following routines implement the shared address optmization for 64-bit
1698  * users with a 4GB page zero.
1699  *
1700  * pmap_set_4GB_pagezero()
1701  *      is called in the exec and fork paths to mirror the kernel's
1702  *      mapping in the bottom 4G of the user's pmap. The task mapping changes
1703  *      from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1704  *      without doing anything if the -no_shared_cr3 boot-arg is set.
1705  *
1706  * pmap_clear_4GB_pagezero()
1707  *      is called in the exec/exit paths to undo this mirror. The task mapping
1708  *      reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1709  *      CR3 by calling pmap_load_kernel_cr3().
1710  *
1711  * pmap_load_kernel_cr3()
1712  *      loads cr3 with the kernel's page table. In addition to being called
1713  *      by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1714  *      when we go idle in the context of a shared map.
1715  *
1716  * Further notes on per-cpu data used:
1717  *
1718  *      cpu_kernel_cr3  is the cr3 for the kernel's pmap.
1719  *                      This is loaded in a trampoline on entering the kernel
1720  *                      from a 32-bit user (or non-shared-cr3 64-bit user).
1721  *      cpu_task_cr3    is the cr3 for the current thread.
1722  *                      This is loaded in a trampoline as we exit the kernel.
1723  *      cpu_active_cr3  reflects the cr3 currently loaded.
1724  *                      However, the low order bit is set when the
1725  *                      processor is idle or interrupts are disabled
1726  *                      while the system pmap lock is held. It is used by
1727  *                      tlb shoot-down.
1728  *      cpu_task_map    indicates whether the task cr3 belongs to
1729  *                      a 32-bit, a 64-bit or a 64-bit shared map.
1730  *                      The latter allows the avoidance of the cr3 load
1731  *                      on kernel entry and exit.
1732  *      cpu_tlb_invalid set TRUE when a tlb flush is requested.
1733  *                      If the cr3 is "inactive" (the cpu is idle or the
1734  *                      system-wide pmap lock is held) this not serviced by
1735  *                      an IPI but at time when the cr3 becomes "active".
1736  */
1737
1738 void
1739 pmap_set_4GB_pagezero(pmap_t p)
1740 {
1741         pdpt_entry_t    *user_pdptp;
1742         pdpt_entry_t    *kern_pdptp;
1743
1744         assert(p->pm_task_map != TASK_MAP_32BIT);
1745
1746         /* Kernel-shared cr3 may be disabled by boot arg. */
1747         if (no_shared_cr3)
1748                 return;
1749
1750         /*
1751          * Set the bottom 4 3rd-level pte's to be the kernel's.
1752          */
1753         PMAP_LOCK(p);
1754         while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
1755                 PMAP_UNLOCK(p);
1756                 pmap_expand_pml4(p, 0x0);
1757                 PMAP_LOCK(p);
1758         }
1759         kern_pdptp = kernel_pmap->pm_pdpt;
1760         pmap_store_pte(user_pdptp+0, *(kern_pdptp+0));
1761         pmap_store_pte(user_pdptp+1, *(kern_pdptp+1));
1762         pmap_store_pte(user_pdptp+2, *(kern_pdptp+2));
1763         pmap_store_pte(user_pdptp+3, *(kern_pdptp+3));
1764         p->pm_task_map = TASK_MAP_64BIT_SHARED;
1765         PMAP_UNLOCK(p);
1766 }
1767
1768 void
1769 pmap_clear_4GB_pagezero(pmap_t p)
1770 {
1771         pdpt_entry_t    *user_pdptp;
1772
1773         if (p->pm_task_map != TASK_MAP_64BIT_SHARED)
1774                 return;
1775
1776         PMAP_LOCK(p);
1777
1778         p->pm_task_map = TASK_MAP_64BIT;
1779
1780         pmap_load_kernel_cr3();
1781
1782         user_pdptp = pmap64_pdpt(p, 0x0);
1783         pmap_store_pte(user_pdptp+0, 0);
1784         pmap_store_pte(user_pdptp+1, 0);
1785         pmap_store_pte(user_pdptp+2, 0);
1786         pmap_store_pte(user_pdptp+3, 0);
1787
1788         PMAP_UNLOCK(p);
1789 }
1790
1791 void
1792 pmap_load_kernel_cr3(void)
1793 {
1794         uint64_t        kernel_cr3;
1795
1796         assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1797
1798         /*
1799          * Reload cr3 with the true kernel cr3.
1800          */
1801         kernel_cr3 = current_cpu_datap()->cpu_kernel_cr3;
1802         set64_cr3(kernel_cr3);
1803         current_cpu_datap()->cpu_active_cr3 = kernel_cr3;
1804         current_cpu_datap()->cpu_tlb_invalid = FALSE;
1805         __asm__ volatile("mfence");
1806 }
1807
1808 /*
1809  *      Retire the given physical map from service.
1810  *      Should only be called if the map contains
1811  *      no valid mappings.
1812  */
1813
1814 void
1815 pmap_destroy(
1816         register pmap_t p)
1817 {
1818         register int            c;
1819
1820         if (p == PMAP_NULL)
1821                 return;
1822
1823         PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1824                    (int) p, 0, 0, 0, 0);
1825
1826         PMAP_LOCK(p);
1827
1828         c = --p->ref_count;
1829
1830         if (c == 0) {
1831                 /*
1832                  * If some cpu is not using the physical pmap pointer that it
1833                  * is supposed to be (see set_dirbase), we might be using the
1834                  * pmap that is being destroyed! Make sure we are
1835                  * physically on the right pmap:
1836                  */
1837                 PMAP_UPDATE_TLBS(p,
1838                                  0x0ULL,
1839                                  0xFFFFFFFFFFFFF000ULL);
1840         }
1841
1842         PMAP_UNLOCK(p);
1843
1844         if (c != 0) {
1845                 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1846                            (int) p, 1, 0, 0, 0);
1847                 return; /* still in use */
1848         }
1849
1850         /*
1851          *      Free the memory maps, then the
1852          *      pmap structure.
1853          */
1854         if (!cpu_64bit) {
1855                 vm_page_lock_queues();
1856                 inuse_ptepages_count -= p->pm_obj->resident_page_count;
1857                 vm_page_unlock_queues();
1858
1859                 kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
1860                 zfree(pdpt_zone, (void *)p->pm_hold);
1861
1862                 vm_object_deallocate(p->pm_obj);
1863         } else {
1864                 /* 64 bit */
1865                 int inuse_ptepages = 0;
1866
1867                 /* free 64 bit mode structs */
1868                 inuse_ptepages++;
1869                 kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE);
1870
1871                 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1872                 vm_object_deallocate(p->pm_obj_pml4);
1873
1874                 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1875                 vm_object_deallocate(p->pm_obj_pdpt);
1876
1877                 inuse_ptepages += p->pm_obj->resident_page_count;
1878                 vm_object_deallocate(p->pm_obj);
1879
1880                 vm_page_lock_queues();
1881                 inuse_ptepages_count -= inuse_ptepages;
1882                 vm_page_unlock_queues();
1883         }
1884         zfree(pmap_zone, p);
1885
1886         PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1887                    0, 0, 0, 0, 0);
1888
1889 }
1890
1891 /*
1892  *      Add a reference to the specified pmap.
1893  */
1894
1895 void
1896 pmap_reference(
1897         register pmap_t p)
1898 {
1899
1900         if (p != PMAP_NULL) {
1901                 PMAP_LOCK(p);
1902                 p->ref_count++;
1903                 PMAP_UNLOCK(p);;
1904         }
1905 }
1906
1907 /*
1908  *      Remove a range of hardware page-table entries.
1909  *      The entries given are the first (inclusive)
1910  *      and last (exclusive) entries for the VM pages.
1911  *      The virtual address is the va for the first pte.
1912  *
1913  *      The pmap must be locked.
1914  *      If the pmap is not the kernel pmap, the range must lie
1915  *      entirely within one pte-page.  This is NOT checked.
1916  *      Assumes that the pte-page exists.
1917  */
1918
1919 void
1920 pmap_remove_range(
1921         pmap_t                  pmap,
1922         vm_map_offset_t         start_vaddr,
1923         pt_entry_t              *spte,
1924         pt_entry_t              *epte)
1925 {
1926         register pt_entry_t     *cpte;
1927         pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1928         pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1929         pv_hashed_entry_t       pvh_e;
1930         int                     pvh_cnt = 0;
1931         int                     num_removed, num_unwired, num_found;
1932         int                     pai;
1933         pmap_paddr_t            pa;
1934         vm_map_offset_t         vaddr;
1935         int                     pvhash_idx;
1936         uint32_t                pv_cnt;
1937
1938         num_removed = 0;
1939         num_unwired = 0;
1940         num_found   = 0;
1941
1942         if (pmap != kernel_pmap &&
1943             pmap->pm_task_map == TASK_MAP_32BIT &&
1944             start_vaddr >= HIGH_MEM_BASE) {
1945                 /*
1946                  * The range is in the "high_shared_pde" which is shared
1947                  * between the kernel and all 32-bit tasks.  It holds
1948                  * the 32-bit commpage but also the trampolines, GDT, etc...
1949                  * so we can't let user tasks remove anything from it.
1950                  */
1951                 return;
1952         }
1953
1954         /* invalidate the PTEs first to "freeze" them */
1955         for (cpte = spte, vaddr = start_vaddr;
1956              cpte < epte;
1957              cpte++, vaddr += PAGE_SIZE_64) {
1958
1959             pa = pte_to_pa(*cpte);
1960             if (pa == 0)
1961                 continue;
1962             num_found++;
1963
1964             if (iswired(*cpte))
1965                 num_unwired++;
1966
1967             pai = pa_index(pa);
1968
1969             if (!managed_page(pai)) {
1970                 /*
1971                  *      Outside range of managed physical memory.
1972                  *      Just remove the mappings.
1973                  */
1974                 pmap_store_pte(cpte, 0);
1975                 continue;
1976             }
1977
1978             /* invalidate the PTE */
1979             pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1980         }
1981
1982         if (num_found == 0) {
1983                 /* nothing was changed: we're done */
1984                 goto update_counts;
1985         }
1986
1987         /* propagate the invalidates to other CPUs */
1988
1989         PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1990
1991         for (cpte = spte, vaddr = start_vaddr;
1992              cpte < epte;
1993              cpte++, vaddr += PAGE_SIZE_64) {
1994
1995             pa = pte_to_pa(*cpte);
1996             if (pa == 0)
1997                 continue;
1998
1999             pai = pa_index(pa);
2000
2001             LOCK_PVH(pai);
2002
2003             pa = pte_to_pa(*cpte);
2004             if (pa == 0) {
2005               UNLOCK_PVH(pai);
2006               continue;
2007             }
2008
2009             num_removed++;
2010
2011             /*
2012              *  Get the modify and reference bits, then
2013              *  nuke the entry in the page table
2014              */
2015             /* remember reference and change */
2016             pmap_phys_attributes[pai] |=
2017                     (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
2018             /* completely invalidate the PTE */
2019             pmap_store_pte(cpte, 0);
2020
2021             /*
2022              *  Remove the mapping from the pvlist for
2023              *  this physical page.
2024              */
2025             {
2026               pv_rooted_entry_t pv_h;
2027               pv_hashed_entry_t *pprevh;
2028               ppnum_t ppn = (ppnum_t)pai;
2029
2030                 pv_h = pai_to_pvh(pai);
2031                 pvh_e = PV_HASHED_ENTRY_NULL;
2032                 if (pv_h->pmap == PMAP_NULL)
2033                     panic("pmap_remove_range: null pv_list!");
2034
2035                 if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */
2036                     /*
2037                      * Header is the pv_rooted_entry. We can't free that. If there is a queued
2038                      * entry after this one we remove that
2039                      * from the ppn queue, we remove it from the hash chain
2040                      * and copy it to the rooted entry. Then free it instead.
2041                      */
2042
2043                   pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2044                   if (pv_h != (pv_rooted_entry_t)pvh_e) {  /* any queued after rooted? */
2045                     CHK_NPVHASH();
2046                     pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2047                     LOCK_PV_HASH(pvhash_idx);
2048                     remque(&pvh_e->qlink);
2049                     {
2050                       pprevh = pvhash(pvhash_idx);
2051                       if (PV_HASHED_ENTRY_NULL == *pprevh) {
2052                         panic("pmap_remove_range empty hash removing rooted pv");
2053                       }
2054                     }
2055                     pmap_pvh_unlink(pvh_e);
2056                     UNLOCK_PV_HASH(pvhash_idx);
2057                     pv_h->pmap = pvh_e->pmap;
2058                     pv_h->va = pvh_e->va;   /* dispose of pvh_e */
2059                   } else {  /* none queued after rooted */
2060                     pv_h->pmap = PMAP_NULL;
2061                     pvh_e = PV_HASHED_ENTRY_NULL;
2062                   }   /* any queued after rooted */
2063
2064                 } else { /* rooted or not */
2065                   /* not removing rooted pv. find it on hash chain, remove from ppn queue and
2066                    * hash chain and free it */
2067                   CHK_NPVHASH();
2068                   pvhash_idx = pvhashidx(pmap,vaddr);
2069                   LOCK_PV_HASH(pvhash_idx);
2070                   pprevh = pvhash(pvhash_idx);
2071                   if (PV_HASHED_ENTRY_NULL == *pprevh) {
2072                     panic("pmap_remove_range empty hash removing hashed pv");
2073                     }
2074                   pvh_e = *pprevh;
2075                   pmap_pv_hashlist_walks++;
2076                   pv_cnt = 0;
2077                   while (PV_HASHED_ENTRY_NULL != pvh_e) {
2078                         pv_cnt++;
2079                         if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break;
2080                         pprevh = &pvh_e->nexth;
2081                         pvh_e = pvh_e->nexth;
2082                   }
2083                   pmap_pv_hashlist_cnts += pv_cnt;
2084                   if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2085                   if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash");
2086                   *pprevh = pvh_e->nexth;
2087                   remque(&pvh_e->qlink);
2088                   UNLOCK_PV_HASH(pvhash_idx);
2089
2090                 } /* rooted or not */
2091
2092                 UNLOCK_PVH(pai);
2093
2094                 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2095                   pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2096                   pvh_eh = pvh_e;
2097
2098                   if (pvh_et == PV_HASHED_ENTRY_NULL) {
2099                     pvh_et = pvh_e;
2100                   }
2101
2102                   pvh_cnt++;
2103                 }
2104
2105             } /* removing mappings for this phy page */
2106         } /* for loop */
2107
2108         if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2109             PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2110         }
2111
2112 update_counts:
2113         /*
2114          *      Update the counts
2115          */
2116 #if TESTING
2117         if (pmap->stats.resident_count < num_removed)
2118                 panic("pmap_remove_range: resident_count");
2119 #endif
2120         assert(pmap->stats.resident_count >= num_removed);
2121         OSAddAtomic(-num_removed, (SInt32 *) &pmap->stats.resident_count);
2122
2123 #if TESTING
2124         if (pmap->stats.wired_count < num_unwired)
2125                 panic("pmap_remove_range: wired_count");
2126 #endif
2127         assert(pmap->stats.wired_count >= num_unwired);
2128         OSAddAtomic(-num_unwired, (SInt32 *) &pmap->stats.wired_count);
2129
2130         return;
2131 }
2132
2133 /*
2134  *      Remove phys addr if mapped in specified map
2135  *
2136  */
2137 void
2138 pmap_remove_some_phys(
2139         __unused pmap_t         map,
2140         __unused ppnum_t         pn)
2141 {
2142
2143 /* Implement to support working set code */
2144
2145 }
2146
2147 /*
2148  *      Remove the given range of addresses
2149  *      from the specified map.
2150  *
2151  *      It is assumed that the start and end are properly
2152  *      rounded to the hardware page size.
2153  */
2154
2155
2156 void
2157 pmap_remove(
2158         pmap_t          map,
2159         addr64_t        s64,
2160         addr64_t        e64)
2161 {
2162         pt_entry_t      *pde;
2163         pt_entry_t      *spte, *epte;
2164         addr64_t        l64;
2165         addr64_t        orig_s64;
2166         uint64_t        deadline;
2167
2168         pmap_intr_assert();
2169
2170         if (map == PMAP_NULL || s64 == e64)
2171                 return;
2172
2173         PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
2174                    (int) map,
2175                    (int) (s64>>32), (int) s64,
2176                    (int) (e64>>32), (int) e64);
2177
2178         PMAP_LOCK(map);
2179
2180 #if 0
2181         /*
2182          * Check that address range in the kernel does not overlap the stacks.
2183          * We initialize local static min/max variables once to avoid making
2184          * 2 function calls for every remove. Note also that these functions
2185          * both return 0 before kernel stacks have been initialized, and hence
2186          * the panic is not triggered in this case.
2187          */
2188         if (map == kernel_pmap) {
2189                 static vm_offset_t      kernel_stack_min = 0;
2190                 static vm_offset_t      kernel_stack_max = 0;
2191
2192                 if (kernel_stack_min == 0) {
2193                         kernel_stack_min = min_valid_stack_address();
2194                         kernel_stack_max = max_valid_stack_address();
2195                 }
2196                 if  ((kernel_stack_min <= s64 && s64 <  kernel_stack_max) ||
2197                      (kernel_stack_min <  e64 && e64 <= kernel_stack_max))
2198                         panic("pmap_remove() attempted in kernel stack");
2199         }
2200 #else
2201
2202         /*
2203          * The values of kernel_stack_min and kernel_stack_max are no longer
2204          * relevant now that we allocate kernel stacks anywhere in the kernel map,
2205          * so the old code above no longer applies.  If we wanted to check that
2206          * we weren't removing a mapping of a page in a kernel stack we'd have to
2207          * mark the PTE with an unused bit and check that here.
2208          */
2209
2210 #endif
2211
2212         deadline = rdtsc64() + max_preemption_latency_tsc;
2213
2214         orig_s64 = s64;
2215
2216         while (s64 < e64) {
2217
2218             l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1);
2219             if (l64 > e64)
2220                 l64 = e64;
2221             pde = pmap_pde(map, s64);
2222
2223             if (pde && (*pde & INTEL_PTE_VALID)) {
2224                 spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1)));
2225                 spte = &spte[ptenum(s64)];
2226                 epte = &spte[intel_btop(l64-s64)];
2227
2228                 pmap_remove_range(map, s64, spte, epte);
2229             }
2230             s64 = l64;
2231             pde++;
2232
2233             if (s64 < e64 && rdtsc64() >= deadline) {
2234               PMAP_UNLOCK(map)
2235                 PMAP_LOCK(map)
2236
2237               deadline = rdtsc64() + max_preemption_latency_tsc;
2238             }
2239
2240         }
2241
2242         PMAP_UNLOCK(map);
2243
2244         PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
2245                    (int) map, 0, 0, 0, 0);
2246
2247 }
2248
2249 /*
2250  *      Routine:        pmap_page_protect
2251  *
2252  *      Function:
2253  *              Lower the permission for all mappings to a given
2254  *              page.
2255  */
2256 void
2257 pmap_page_protect(
2258         ppnum_t         pn,
2259         vm_prot_t       prot)
2260 {
2261         pv_hashed_entry_t               pvh_eh = PV_HASHED_ENTRY_NULL;
2262         pv_hashed_entry_t               pvh_et = PV_HASHED_ENTRY_NULL;
2263         pv_hashed_entry_t       nexth;
2264         int                     pvh_cnt = 0;
2265         pv_rooted_entry_t               pv_h;
2266         pv_rooted_entry_t               pv_e;
2267         pv_hashed_entry_t       pvh_e;
2268         pt_entry_t              *pte;
2269         int                     pai;
2270         register pmap_t         pmap;
2271         boolean_t               remove;
2272         int                     pvhash_idx;
2273
2274         pmap_intr_assert();
2275         assert(pn != vm_page_fictitious_addr);
2276         if (pn == vm_page_guard_addr)
2277                 return;
2278
2279         pai = ppn_to_pai(pn);
2280
2281         if (!managed_page(pai)) {
2282             /*
2283              *  Not a managed page.
2284              */
2285             return;
2286         }
2287
2288         PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
2289                    (int) pn, (int) prot, 0, 0, 0);
2290
2291         /*
2292          * Determine the new protection.
2293          */
2294         switch (prot) {
2295             case VM_PROT_READ:
2296             case VM_PROT_READ|VM_PROT_EXECUTE:
2297                 remove = FALSE;
2298                 break;
2299             case VM_PROT_ALL:
2300                 return; /* nothing to do */
2301             default:
2302                 remove = TRUE;
2303                 break;
2304         }
2305
2306         pv_h = pai_to_pvh(pai);
2307
2308         LOCK_PVH(pai);
2309
2310         /*
2311          * Walk down PV list, changing or removing all mappings.
2312          */
2313         if (pv_h->pmap != PMAP_NULL) {
2314
2315             pv_e = pv_h;
2316             pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */
2317
2318             do {
2319                 register vm_map_offset_t vaddr;
2320                 pmap = pv_e->pmap;
2321
2322                 vaddr = pv_e->va;
2323                 pte = pmap_pte(pmap, vaddr);
2324
2325                 if (0 == pte) {
2326                     kprintf("pmap_page_protect pmap %p pn 0x%x vaddr 0x%llx\n",pmap, pn, vaddr);
2327                     panic("pmap_page_protect");
2328                 }
2329
2330                 nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);  /* if there is one */
2331
2332                 /*
2333                  * Remove the mapping if new protection is NONE
2334                  * or if write-protecting a kernel mapping.
2335                  */
2336                 if (remove || pmap == kernel_pmap) {
2337                     /*
2338                      * Remove the mapping, collecting any modify bits.
2339                      */
2340                     pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2341
2342                     PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2343
2344                     pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
2345
2346                     pmap_store_pte(pte, 0);
2347
2348 #if TESTING
2349                     if (pmap->stats.resident_count < 1)
2350                         panic("pmap_page_protect: resident_count");
2351 #endif
2352                     assert(pmap->stats.resident_count >= 1);
2353                     OSAddAtomic(-1, (SInt32 *) &pmap->stats.resident_count);
2354
2355                     /*
2356                      * Deal with the pv_rooted_entry.
2357                      */
2358
2359                     if (pv_e == pv_h) {
2360                         /*
2361                          * Fix up head later.
2362                          */
2363                         pv_h->pmap = PMAP_NULL;
2364                     }
2365                     else {
2366                         /*
2367                          * Delete this entry.
2368                          */
2369                       CHK_NPVHASH();
2370                       pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2371                       LOCK_PV_HASH(pvhash_idx);
2372                       remque(&pvh_e->qlink);
2373                       pmap_pvh_unlink(pvh_e);
2374                       UNLOCK_PV_HASH(pvhash_idx);
2375
2376                       pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2377                         pvh_eh = pvh_e;
2378
2379                         if (pvh_et == PV_HASHED_ENTRY_NULL)
2380                             pvh_et = pvh_e;
2381                         pvh_cnt++;
2382                     }
2383                 } else {
2384                     /*
2385                      * Write-protect.
2386                      */
2387                     pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
2388                     PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2389                 }
2390
2391                 pvh_e = nexth;
2392             } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
2393
2394             /*
2395              * If pv_head mapping was removed, fix it up.
2396              */
2397
2398             if (pv_h->pmap == PMAP_NULL) {
2399               pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2400
2401               if (pvh_e != (pv_hashed_entry_t)pv_h) {
2402                 CHK_NPVHASH();
2403                 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2404                 LOCK_PV_HASH(pvhash_idx);
2405                 remque(&pvh_e->qlink);
2406                 pmap_pvh_unlink(pvh_e);
2407                 UNLOCK_PV_HASH(pvhash_idx);
2408                   pv_h->pmap = pvh_e->pmap;
2409                   pv_h->va = pvh_e->va;
2410                   pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2411                     pvh_eh = pvh_e;
2412
2413                     if (pvh_et == PV_HASHED_ENTRY_NULL)
2414                         pvh_et = pvh_e;
2415                     pvh_cnt++;
2416                 }
2417             }
2418         }
2419         if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2420             PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2421         }
2422
2423         UNLOCK_PVH(pai);
2424
2425         PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
2426                    0, 0, 0, 0, 0);
2427
2428 }
2429
2430
2431 /*
2432  *      Routine:
2433  *              pmap_disconnect
2434  *
2435  *      Function:
2436  *              Disconnect all mappings for this page and return reference and change status
2437  *              in generic format.
2438  *
2439  */
2440 unsigned int pmap_disconnect(
2441         ppnum_t pa)
2442 {
2443         pmap_page_protect(pa, 0);                       /* disconnect the page */
2444         return (pmap_get_refmod(pa));                   /* return ref/chg status */
2445 }
2446
2447 /*
2448  *      Set the physical protection on the
2449  *      specified range of this map as requested.
2450  *      Will not increase permissions.
2451  */
2452 void
2453 pmap_protect(
2454         pmap_t          map,
2455         vm_map_offset_t sva,
2456         vm_map_offset_t eva,
2457         vm_prot_t       prot)
2458 {
2459         register pt_entry_t     *pde;
2460         register pt_entry_t     *spte, *epte;
2461         vm_map_offset_t         lva;
2462         vm_map_offset_t         orig_sva;
2463         boolean_t       set_NX;
2464         int             num_found = 0;
2465
2466         pmap_intr_assert();
2467
2468         if (map == PMAP_NULL)
2469                 return;
2470
2471         if (prot == VM_PROT_NONE) {
2472                 pmap_remove(map, sva, eva);
2473                 return;
2474         }
2475
2476         PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
2477                    (int) map,
2478                    (int) (sva>>32), (int) sva,
2479                    (int) (eva>>32), (int) eva);
2480
2481         if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled )
2482                 set_NX = FALSE;
2483         else
2484                 set_NX = TRUE;
2485
2486         PMAP_LOCK(map);
2487
2488         orig_sva = sva;
2489         while (sva < eva) {
2490             lva = (sva + pde_mapped_size) & ~(pde_mapped_size-1);
2491             if (lva > eva)
2492                 lva = eva;
2493             pde = pmap_pde(map, sva);
2494             if (pde && (*pde & INTEL_PTE_VALID)) {
2495                 spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size-1)));
2496                 spte = &spte[ptenum(sva)];
2497                 epte = &spte[intel_btop(lva-sva)];
2498
2499                 while (spte < epte) {
2500
2501                     if (*spte & INTEL_PTE_VALID) {
2502
2503                         if (prot & VM_PROT_WRITE)
2504                             pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
2505                         else
2506                             pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));
2507
2508                         if (set_NX == TRUE)
2509                             pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
2510                         else
2511                             pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));
2512
2513                         num_found++;
2514                     }
2515                     spte++;
2516                 }
2517             }
2518             sva = lva;
2519         }
2520         if (num_found)
2521             PMAP_UPDATE_TLBS(map, orig_sva, eva);
2522
2523         PMAP_UNLOCK(map);
2524
2525         PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
2526                    0, 0, 0, 0, 0);
2527
2528 }
2529
2530 /* Map a (possibly) autogenned block */
2531 void
2532 pmap_map_block(
2533         pmap_t          pmap,
2534         addr64_t        va,
2535         ppnum_t         pa,
2536         uint32_t        size,
2537         vm_prot_t       prot,
2538         int             attr,
2539         __unused unsigned int   flags)
2540 {
2541     uint32_t page;
2542
2543     for (page = 0; page < size; page++) {
2544         pmap_enter(pmap, va, pa, prot, attr, TRUE);
2545         va += PAGE_SIZE;
2546         pa++;
2547     }
2548 }
2549
2550
2551 /*
2552  *      Insert the given physical page (p) at
2553  *      the specified virtual address (v) in the
2554  *      target physical map with the protection requested.
2555  *
2556  *      If specified, the page will be wired down, meaning
2557  *      that the related pte cannot be reclaimed.
2558  *
2559  *      NB:  This is the only routine which MAY NOT lazy-evaluate
2560  *      or lose information.  That is, this routine must actually
2561  *      insert this page into the given map NOW.
2562  */
2563 void
2564 pmap_enter(
2565         register pmap_t         pmap,
2566         vm_map_offset_t         vaddr,
2567         ppnum_t                 pn,
2568         vm_prot_t               prot,
2569         unsigned int            flags,
2570         boolean_t               wired)
2571 {
2572         register pt_entry_t     *pte;
2573         register pv_rooted_entry_t      pv_h;
2574         register int            pai;
2575         pv_hashed_entry_t               pvh_e;
2576         pv_hashed_entry_t               pvh_new;
2577         pv_hashed_entry_t       *hashp;
2578         pt_entry_t              template;
2579         pmap_paddr_t            old_pa;
2580         pmap_paddr_t             pa = (pmap_paddr_t)i386_ptob(pn);
2581         boolean_t               need_tlbflush = FALSE;
2582         boolean_t               set_NX;
2583         char                    oattr;
2584         int                     pvhash_idx;
2585         uint32_t                pv_cnt;
2586         boolean_t               old_pa_locked;
2587
2588         pmap_intr_assert();
2589         assert(pn != vm_page_fictitious_addr);
2590         if (pmap_debug)
2591                 printf("pmap(%qx, %x)\n", vaddr, pn);
2592         if (pmap == PMAP_NULL)
2593                 return;
2594         if (pn == vm_page_guard_addr)
2595                 return;
2596
2597         PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2598                    (int) pmap,
2599                    (int) (vaddr>>32), (int) vaddr,
2600                    (int) pn, prot);
2601
2602         if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled )
2603                 set_NX = FALSE;
2604         else
2605                 set_NX = TRUE;
2606
2607         /*
2608          *      Must allocate a new pvlist entry while we're unlocked;
2609          *      zalloc may cause pageout (which will lock the pmap system).
2610          *      If we determine we need a pvlist entry, we will unlock
2611          *      and allocate one.  Then we will retry, throughing away
2612          *      the allocated entry later (if we no longer need it).
2613          */
2614
2615         pvh_new = PV_HASHED_ENTRY_NULL;
2616 Retry:
2617         pvh_e = PV_HASHED_ENTRY_NULL;
2618
2619         PMAP_LOCK(pmap);
2620
2621         /*
2622          *      Expand pmap to include this pte.  Assume that
2623          *      pmap is always expanded to include enough hardware
2624          *      pages to map one VM page.
2625          */
2626
2627         while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
2628                 /*
2629                  *      Must unlock to expand the pmap.
2630                  */
2631                 PMAP_UNLOCK(pmap);
2632                 pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */
2633                 PMAP_LOCK(pmap);
2634         }
2635
2636         old_pa = pte_to_pa(*pte);
2637         pai = pa_index(old_pa);
2638         old_pa_locked = FALSE;
2639
2640         /*
2641          * if we have a previous managed page, lock the pv entry now. after
2642          * we lock it, check to see if someone beat us to the lock and if so
2643          * drop the lock
2644          */
2645
2646         if ((0 != old_pa) && managed_page(pai)) {
2647           LOCK_PVH(pai);
2648           old_pa_locked = TRUE;
2649           old_pa = pte_to_pa(*pte);
2650           if (0 == old_pa) {
2651             UNLOCK_PVH(pai);  /* some other path beat us to it */
2652             old_pa_locked = FALSE;
2653           }
2654         }
2655
2656
2657         /*
2658          *      Special case if the incoming physical page is already mapped
2659          *      at this address.
2660          */
2661         if (old_pa == pa) {
2662
2663             /*
2664              *  May be changing its wired attribute or protection
2665              */
2666
2667             template = pa_to_pte(pa) | INTEL_PTE_VALID;
2668
2669             if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
2670                 if(!(flags & VM_MEM_GUARDED))
2671                         template |= INTEL_PTE_PTA;
2672                 template |= INTEL_PTE_NCACHE;
2673             }
2674
2675             if (pmap != kernel_pmap)
2676                 template |= INTEL_PTE_USER;
2677             if (prot & VM_PROT_WRITE)
2678                 template |= INTEL_PTE_WRITE;
2679
2680             if (set_NX == TRUE)
2681                 template |= INTEL_PTE_NX;
2682
2683             if (wired) {
2684                 template |= INTEL_PTE_WIRED;
2685                 if (!iswired(*pte))
2686                     OSAddAtomic(+1, (SInt32 *) &pmap->stats.wired_count);
2687             }
2688             else {
2689                 if (iswired(*pte)) {
2690                     assert(pmap->stats.wired_count >= 1);
2691                     OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
2692                 }
2693             }
2694
2695             /* store modified PTE and preserve RC bits */
2696             pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2697             if (old_pa_locked) {
2698               UNLOCK_PVH(pai);
2699               old_pa_locked = FALSE;
2700             }
2701             need_tlbflush = TRUE;
2702             goto Done;
2703         }
2704
2705         /*
2706          *      Outline of code from here:
2707          *         1) If va was mapped, update TLBs, remove the mapping
2708          *            and remove old pvlist entry.
2709          *         2) Add pvlist entry for new mapping
2710          *         3) Enter new mapping.
2711          *
2712          *      If the old physical page is not managed step 1) is skipped
2713          *      (except for updating the TLBs), and the mapping is
2714          *      overwritten at step 3).  If the new physical page is not
2715          *      managed, step 2) is skipped.
2716          */
2717
2718         if (old_pa != (pmap_paddr_t) 0) {
2719
2720             /*
2721              *  Don't do anything to pages outside valid memory here.
2722              *  Instead convince the code that enters a new mapping
2723              *  to overwrite the old one.
2724              */
2725
2726             /* invalidate the PTE */
2727             pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2728             /* propagate invalidate everywhere */
2729             PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2730             /* remember reference and change */
2731             oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2732             /* completely invalidate the PTE */
2733             pmap_store_pte(pte, 0);
2734
2735             if (managed_page(pai)) {
2736
2737 #if TESTING
2738                 if (pmap->stats.resident_count < 1)
2739                     panic("pmap_enter: resident_count");
2740 #endif
2741                 assert(pmap->stats.resident_count >= 1);
2742                 OSAddAtomic(-1, (SInt32 *) &pmap->stats.resident_count);
2743
2744                 if (iswired(*pte)) {
2745
2746 #if TESTING
2747                     if (pmap->stats.wired_count < 1)
2748                         panic("pmap_enter: wired_count");
2749 #endif
2750                     assert(pmap->stats.wired_count >= 1);
2751                     OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
2752                 }
2753
2754                 pmap_phys_attributes[pai] |= oattr;
2755                 /*
2756                  *      Remove the mapping from the pvlist for
2757                  *      this physical page.
2758                  *      We'll end up with either a rooted pv or a
2759                  *      hashed pv
2760                  */
2761                 {
2762
2763                     pv_h = pai_to_pvh(pai);
2764
2765                     if (pv_h->pmap == PMAP_NULL) {
2766                         panic("pmap_enter: null pv_list!");
2767                     }
2768
2769                     if (pv_h->va == vaddr && pv_h->pmap == pmap) {
2770                         /*
2771                          * Header is the pv_rooted_entry.
2772                          * If there is a next one, copy it to the
2773                          * header and free the next one (we cannot
2774                          * free the header)
2775                          */
2776                       pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2777                       if (pvh_e != (pv_hashed_entry_t)pv_h) {
2778                         pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
2779                         LOCK_PV_HASH(pvhash_idx);
2780                           remque(&pvh_e->qlink);
2781                           pmap_pvh_unlink(pvh_e);
2782                           UNLOCK_PV_HASH(pvhash_idx);
2783                           pv_h->pmap = pvh_e->pmap;
2784                           pv_h->va = pvh_e->va;
2785                         }
2786                       else {
2787                         pv_h->pmap = PMAP_NULL;
2788                         pvh_e = PV_HASHED_ENTRY_NULL;
2789                       }
2790                     }
2791                     else {
2792                       pv_hashed_entry_t *pprevh;
2793                       ppnum_t old_ppn;
2794                       /* wasn't the rooted pv - hash, find it, and unlink it */
2795                       old_ppn = (ppnum_t)pa_index(old_pa);
2796                       CHK_NPVHASH();
2797                       pvhash_idx = pvhashidx(pmap,vaddr);
2798                       LOCK_PV_HASH(pvhash_idx);
2799                       pprevh = pvhash(pvhash_idx);
2800 #if PV_DEBUG
2801                       if (NULL==pprevh)panic("pmap enter 1");
2802 #endif
2803                       pvh_e = *pprevh;
2804                       pmap_pv_hashlist_walks++;
2805                       pv_cnt = 0;
2806                       while (PV_HASHED_ENTRY_NULL != pvh_e) {
2807                         pv_cnt++;
2808                         if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break;
2809                         pprevh = &pvh_e->nexth;
2810                         pvh_e = pvh_e->nexth;
2811                       }
2812                       pmap_pv_hashlist_cnts += pv_cnt;
2813                       if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2814                       if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list");
2815                       if(NULL==pprevh)panic("pmap enter 2");
2816                       *pprevh = pvh_e->nexth;
2817                       remque(&pvh_e->qlink);
2818                       UNLOCK_PV_HASH(pvhash_idx);
2819                     }
2820                 }
2821             }
2822             else {
2823
2824                 /*
2825                  *      old_pa is not managed.
2826                  *      Do removal part of accounting.
2827                  */
2828
2829                 if (iswired(*pte)) {
2830                     assert(pmap->stats.wired_count >= 1);
2831                     OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
2832                 }
2833             }
2834         }
2835
2836         /*
2837          * if we had a previously managed paged locked, unlock it now
2838          */
2839
2840         if (old_pa_locked) {
2841           UNLOCK_PVH(pai);
2842           old_pa_locked = FALSE;
2843         }
2844
2845         pai = pa_index(pa);     /* now working with new incoming phys page */
2846         if (managed_page(pai)) {
2847
2848             /*
2849              *  Step 2) Enter the mapping in the PV list for this
2850              *  physical page.
2851              */
2852             pv_h = pai_to_pvh(pai);
2853
2854             LOCK_PVH(pai);
2855
2856             if (pv_h->pmap == PMAP_NULL) {
2857                 /*
2858                  *      No mappings yet, use  rooted pv
2859                  */
2860                 pv_h->va = vaddr;
2861                 pv_h->pmap = pmap;
2862                 queue_init(&pv_h->qlink);
2863             }
2864             else {
2865                 /*
2866                  *      Add new pv_hashed_entry after header.
2867                  */
2868                 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2869                   pvh_e = pvh_new;
2870                   pvh_new = PV_HASHED_ENTRY_NULL;  /* show we used it */
2871                 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2872                   PV_HASHED_ALLOC(pvh_e);
2873                   if (PV_HASHED_ENTRY_NULL == pvh_e) {
2874                     /* the pv list is empty.
2875                      * if we are on the kernel pmap we'll use one of the special private
2876                      * kernel pv_e's, else, we need to unlock everything, zalloc a pv_e,
2877                      * and restart bringing in the pv_e with us.
2878                      */
2879                     if (kernel_pmap == pmap) {
2880                       PV_HASHED_KERN_ALLOC(pvh_e);
2881                     } else {
2882                       UNLOCK_PVH(pai);
2883                       PMAP_UNLOCK(pmap);
2884                       pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2885                       goto Retry;
2886                     }
2887                   }
2888                 }
2889
2890                 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion");
2891                 pvh_e->va = vaddr;
2892                 pvh_e->pmap = pmap;
2893                 pvh_e->ppn = pn;
2894                 CHK_NPVHASH();
2895                 pvhash_idx = pvhashidx(pmap,vaddr);
2896                 LOCK_PV_HASH(pvhash_idx);
2897                 insque(&pvh_e->qlink, &pv_h->qlink);
2898                 hashp = pvhash(pvhash_idx);
2899 #if PV_DEBUG
2900                 if(NULL==hashp)panic("pmap_enter 4");
2901 #endif
2902                 pvh_e->nexth = *hashp;
2903                 *hashp = pvh_e;
2904                 UNLOCK_PV_HASH(pvhash_idx);
2905
2906                 /*
2907                  *      Remember that we used the pvlist entry.
2908                  */
2909                 pvh_e = PV_HASHED_ENTRY_NULL;
2910             }
2911
2912             /*
2913              * only count the mapping
2914              * for 'managed memory'
2915              */
2916             OSAddAtomic(+1, (SInt32 *) &pmap->stats.resident_count);
2917             if (pmap->stats.resident_count > pmap->stats.resident_max) {
2918                     pmap->stats.resident_max = pmap->stats.resident_count;
2919             }
2920         }
2921
2922         /*
2923          * Step 3) Enter the mapping.
2924          *
2925          *      Build a template to speed up entering -
2926          *      only the pfn changes.
2927          */
2928         template = pa_to_pte(pa) | INTEL_PTE_VALID;
2929
2930         if (flags & VM_MEM_NOT_CACHEABLE) {
2931                 if(!(flags & VM_MEM_GUARDED))
2932                         template |= INTEL_PTE_PTA;
2933                 template |= INTEL_PTE_NCACHE;
2934         }
2935
2936         if (pmap != kernel_pmap)
2937                 template |= INTEL_PTE_USER;
2938         if (prot & VM_PROT_WRITE)
2939                 template |= INTEL_PTE_WRITE;
2940
2941         if (set_NX == TRUE)
2942                 template |= INTEL_PTE_NX;
2943
2944         if (wired) {
2945                 template |= INTEL_PTE_WIRED;
2946                 OSAddAtomic(+1, (SInt32 *) &pmap->stats.wired_count);
2947         }
2948         pmap_store_pte(pte, template);
2949
2950         /* if this was a managed page we delayed unlocking the pv until here
2951          * to prevent pmap_page_protect et al from finding it until the pte
2952          * has been stored */
2953
2954         if (managed_page(pai)) {
2955           UNLOCK_PVH(pai);
2956         }
2957
2958 Done:
2959         if (need_tlbflush == TRUE)
2960                 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2961
2962         if (pvh_e != PV_HASHED_ENTRY_NULL) {
2963                 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
2964         }
2965
2966         if (pvh_new != PV_HASHED_ENTRY_NULL) {
2967           PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2968         }
2969
2970         PMAP_UNLOCK(pmap);
2971         PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
2972 }
2973
2974 /*
2975  *      Routine:        pmap_change_wiring
2976  *      Function:       Change the wiring attribute for a map/virtual-address
2977  *                      pair.
2978  *      In/out conditions:
2979  *                      The mapping must already exist in the pmap.
2980  */
2981 void
2982 pmap_change_wiring(
2983         register pmap_t map,
2984         vm_map_offset_t vaddr,
2985         boolean_t       wired)
2986 {
2987         register pt_entry_t     *pte;
2988
2989         /*
2990          *      We must grab the pmap system lock because we may
2991          *      change a pte_page queue.
2992          */
2993         PMAP_LOCK(map);
2994
2995         if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
2996                 panic("pmap_change_wiring: pte missing");
2997
2998         if (wired && !iswired(*pte)) {
2999             /*
3000              *  wiring down mapping
3001              */
3002             OSAddAtomic(+1, (SInt32 *) &map->stats.wired_count);
3003             pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
3004         }
3005         else if (!wired && iswired(*pte)) {
3006             /*
3007              *  unwiring mapping
3008              */
3009             assert(map->stats.wired_count >= 1);
3010             OSAddAtomic(-1, (SInt32 *) &map->stats.wired_count);
3011             pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
3012         }
3013
3014         PMAP_UNLOCK(map);
3015 }
3016
3017 ppnum_t
3018 pmap_find_phys(pmap_t pmap, addr64_t va)
3019 {
3020         pt_entry_t     *ptp;
3021         ppnum_t         ppn;
3022
3023         mp_disable_preemption();
3024
3025         ptp = pmap_pte(pmap, va);
3026         if (PT_ENTRY_NULL == ptp) {
3027                 ppn = 0;
3028         } else {
3029                 ppn = (ppnum_t) i386_btop(pte_to_pa(*ptp));
3030         }
3031         mp_enable_preemption();
3032
3033         return ppn;
3034 }
3035
3036 /*
3037  *      Routine:        pmap_extract
3038  *      Function:
3039  *              Extract the physical page address associated
3040  *              with the given map/virtual_address pair.
3041  *     Change to shim for backwards compatibility but will not
3042  *     work for 64 bit systems.  Some old drivers that we cannot
3043  *     change need this.
3044  */
3045
3046 vm_offset_t
3047 pmap_extract(
3048         register pmap_t pmap,
3049         vm_map_offset_t vaddr)
3050 {
3051         ppnum_t ppn;
3052         vm_offset_t paddr;
3053
3054         paddr = (vm_offset_t)0;
3055         ppn = pmap_find_phys(pmap, vaddr);
3056
3057         if (ppn) {
3058                 paddr = ((vm_offset_t)i386_ptob(ppn)) | (vaddr & INTEL_OFFMASK);
3059         }
3060         return (paddr);
3061 }
3062
3063 void
3064 pmap_expand_pml4(
3065                  pmap_t map,
3066                  vm_map_offset_t vaddr)
3067 {
3068         register vm_page_t      m;
3069         register pmap_paddr_t   pa;
3070         uint64_t                i;
3071         spl_t                   spl;
3072         ppnum_t                 pn;
3073         pml4_entry_t            *pml4p;
3074
3075         if (kernel_pmap == map) panic("expand kernel pml4");
3076
3077         spl = splhigh();
3078         pml4p = pmap64_pml4(map, vaddr);
3079         splx(spl);
3080         if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");
3081
3082         /*
3083          *      Allocate a VM page for the pml4 page
3084          */
3085         while ((m = vm_page_grab()) == VM_PAGE_NULL)
3086                 VM_PAGE_WAIT();
3087
3088         /*
3089          *      put the page into the pmap's obj list so it
3090          *      can be found later.
3091          */
3092         pn = m->phys_page;
3093         pa = i386_ptob(pn);
3094         i = pml4idx(map, vaddr);
3095
3096         /*
3097          *      Zero the page.
3098          */
3099         pmap_zero_page(pn);
3100
3101         vm_page_lock_queues();
3102         vm_page_wire(m);
3103         inuse_ptepages_count++;
3104         vm_page_unlock_queues();
3105
3106         /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */
3107         vm_object_lock(map->pm_obj_pml4);
3108
3109         PMAP_LOCK(map);
3110         /*
3111          *      See if someone else expanded us first
3112          */
3113         if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
3114                 PMAP_UNLOCK(map);
3115                 vm_object_unlock(map->pm_obj_pml4);
3116
3117                 vm_page_lock_queues();
3118                 vm_page_free(m);
3119                 inuse_ptepages_count--;
3120                 vm_page_unlock_queues();
3121
3122                 return;
3123         }
3124
3125 #if 0 /* DEBUG */
3126        if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
3127                panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3128                      map, map->pm_obj_pml4, vaddr, i);
3129        }
3130 #endif
3131         vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
3132         vm_object_unlock(map->pm_obj_pml4);
3133
3134         /*
3135          *      Set the page directory entry for this page table.
3136          */
3137         pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
3138
3139         pmap_store_pte(pml4p, pa_to_pte(pa)
3140                                 | INTEL_PTE_VALID
3141                                 | INTEL_PTE_USER
3142                                 | INTEL_PTE_WRITE);
3143
3144         PMAP_UNLOCK(map);
3145
3146         return;
3147
3148 }
3149
3150 void
3151 pmap_expand_pdpt(
3152                  pmap_t map,
3153                  vm_map_offset_t vaddr)
3154 {
3155         register vm_page_t      m;
3156         register pmap_paddr_t   pa;
3157         uint64_t                i;
3158         spl_t                   spl;
3159         ppnum_t                 pn;
3160         pdpt_entry_t            *pdptp;
3161
3162         if (kernel_pmap == map) panic("expand kernel pdpt");
3163
3164         spl = splhigh();
3165         while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
3166                 splx(spl);
3167                 pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */
3168                 spl = splhigh();
3169         }
3170         splx(spl);
3171
3172         /*
3173          *      Allocate a VM page for the pdpt page
3174          */
3175         while ((m = vm_page_grab()) == VM_PAGE_NULL)
3176                 VM_PAGE_WAIT();
3177
3178         /*
3179          *      put the page into the pmap's obj list so it
3180          *      can be found later.
3181          */
3182         pn = m->phys_page;
3183         pa = i386_ptob(pn);
3184         i = pdptidx(map, vaddr);
3185
3186         /*
3187          *      Zero the page.
3188          */
3189         pmap_zero_page(pn);
3190
3191         vm_page_lock_queues();
3192         vm_page_wire(m);
3193         inuse_ptepages_count++;
3194         vm_page_unlock_queues();
3195
3196         /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */
3197         vm_object_lock(map->pm_obj_pdpt);
3198
3199         PMAP_LOCK(map);
3200         /*
3201          *      See if someone else expanded us first
3202          */
3203         if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
3204                 PMAP_UNLOCK(map);
3205                 vm_object_unlock(map->pm_obj_pdpt);
3206
3207                 vm_page_lock_queues();
3208                 vm_page_free(m);
3209                 inuse_ptepages_count--;
3210                 vm_page_unlock_queues();
3211
3212                 return;
3213         }
3214
3215 #if 0 /* DEBUG */
3216        if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
3217                panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3218                      map, map->pm_obj_pdpt, vaddr, i);
3219        }
3220 #endif
3221         vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
3222         vm_object_unlock(map->pm_obj_pdpt);
3223
3224         /*
3225          *      Set the page directory entry for this page table.
3226          */
3227         pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
3228
3229         pmap_store_pte(pdptp, pa_to_pte(pa)
3230                                 | INTEL_PTE_VALID
3231                                 | INTEL_PTE_USER
3232                                 | INTEL_PTE_WRITE);
3233
3234         PMAP_UNLOCK(map);
3235
3236         return;
3237
3238 }
3239
3240
3241
3242 /*
3243  *      Routine:        pmap_expand
3244  *
3245  *      Expands a pmap to be able to map the specified virtual address.
3246  *
3247  *      Allocates new virtual memory for the P0 or P1 portion of the
3248  *      pmap, then re-maps the physical pages that were in the old
3249  *      pmap to be in the new pmap.
3250  *
3251  *      Must be called with the pmap system and the pmap unlocked,
3252  *      since these must be unlocked to use vm_allocate or vm_deallocate.
3253  *      Thus it must be called in a loop that checks whether the map
3254  *      has been expanded enough.
3255  *      (We won't loop forever, since page tables aren't shrunk.)
3256  */
3257 void
3258 pmap_expand(
3259         pmap_t          map,
3260         vm_map_offset_t vaddr)
3261 {
3262         pt_entry_t              *pdp;
3263         register vm_page_t      m;
3264         register pmap_paddr_t   pa;
3265         uint64_t                 i;
3266         spl_t                   spl;
3267         ppnum_t                 pn;
3268
3269         /*
3270          * if not the kernel map (while we are still compat kernel mode)
3271          * and we are 64 bit, propagate expand upwards
3272          */
3273
3274         if (cpu_64bit && (map != kernel_pmap)) {
3275                 spl = splhigh();
3276                 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
3277                         splx(spl);
3278                         pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */
3279                         spl = splhigh();
3280                 }
3281                 splx(spl);
3282         }
3283
3284         /*
3285          *      Allocate a VM page for the pde entries.
3286          */
3287         while ((m = vm_page_grab()) == VM_PAGE_NULL)
3288                 VM_PAGE_WAIT();
3289
3290         /*
3291          *      put the page into the pmap's obj list so it
3292          *      can be found later.
3293          */
3294         pn = m->phys_page;
3295         pa = i386_ptob(pn);
3296         i = pdeidx(map, vaddr);
3297
3298         /*
3299          *      Zero the page.
3300          */
3301         pmap_zero_page(pn);
3302
3303         vm_page_lock_queues();
3304         vm_page_wire(m);
3305         inuse_ptepages_count++;
3306         vm_page_unlock_queues();
3307
3308         /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */
3309         vm_object_lock(map->pm_obj);
3310
3311         PMAP_LOCK(map);
3312         /*
3313          *      See if someone else expanded us first
3314          */
3315
3316         if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
3317                 PMAP_UNLOCK(map);
3318                 vm_object_unlock(map->pm_obj);
3319
3320                 vm_page_lock_queues();
3321                 vm_page_free(m);
3322                 inuse_ptepages_count--;
3323                 vm_page_unlock_queues();
3324
3325                 return;
3326         }
3327
3328 #if 0 /* DEBUG */
3329        if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
3330                panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
3331                      map, map->pm_obj, vaddr, i);
3332        }
3333 #endif
3334         vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
3335         vm_object_unlock(map->pm_obj);
3336
3337         /*
3338          * refetch while locked
3339          */
3340
3341         pdp = pmap_pde(map, vaddr);
3342
3343         /*
3344          *      Set the page directory entry for this page table.
3345          */
3346         pmap_store_pte(pdp, pa_to_pte(pa)
3347                                 | INTEL_PTE_VALID
3348                                 | INTEL_PTE_USER
3349                                 | INTEL_PTE_WRITE);
3350
3351         PMAP_UNLOCK(map);
3352
3353         return;
3354 }
3355
3356
3357 /*
3358  * pmap_sync_page_data_phys(ppnum_t pa)
3359  *
3360  * Invalidates all of the instruction cache on a physical page and
3361  * pushes any dirty data from the data cache for the same physical page
3362  * Not required in i386.
3363  */
3364 void
3365 pmap_sync_page_data_phys(__unused ppnum_t pa)
3366 {
3367         return;
3368 }
3369
3370 /*
3371  * pmap_sync_page_attributes_phys(ppnum_t pa)
3372  *
3373  * Write back and invalidate all cachelines on a physical page.
3374  */
3375 void
3376 pmap_sync_page_attributes_phys(ppnum_t pa)
3377 {
3378         cache_flush_page_phys(pa);
3379 }
3380
3381
3382
3383 #ifdef CURRENTLY_UNUSED_AND_UNTESTED
3384
3385 int     collect_ref;
3386 int     collect_unref;
3387
3388 /*
3389  *      Routine:        pmap_collect
3390  *      Function:
3391  *              Garbage collects the physical map system for
3392  *              pages which are no longer used.
3393  *              Success need not be guaranteed -- that is, there
3394  *              may well be pages which are not referenced, but
3395  *              others may be collected.
3396  *      Usage:
3397  *              Called by the pageout daemon when pages are scarce.
3398  */
3399 void
3400 pmap_collect(
3401         pmap_t          p)
3402 {
3403         register pt_entry_t     *pdp, *ptp;
3404         pt_entry_t              *eptp;
3405         int                     wired;
3406
3407         if (p == PMAP_NULL)
3408                 return;
3409
3410         if (p == kernel_pmap)
3411                 return;
3412
3413         /*
3414          *      Garbage collect map.
3415          */
3416         PMAP_LOCK(p);
3417
3418         for (pdp = (pt_entry_t *)p->dirbase;
3419              pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
3420              pdp++)
3421         {
3422            if (*pdp & INTEL_PTE_VALID) {
3423               if(*pdp & INTEL_PTE_REF) {
3424                 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
3425                 collect_ref++;
3426               } else {
3427                 collect_unref++;
3428                 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
3429                 eptp = ptp + NPTEPG;
3430
3431                 /*
3432                  * If the pte page has any wired mappings, we cannot
3433                  * free it.
3434                  */
3435                 wired = 0;
3436                 {
3437                     register pt_entry_t *ptep;
3438                     for (ptep = ptp; ptep < eptp; ptep++) {
3439                         if (iswired(*ptep)) {
3440                             wired = 1;
3441                             break;
3442                         }
3443                     }
3444                 }
3445                 if (!wired) {
3446                     /*
3447                      * Remove the virtual addresses mapped by this pte page.
3448                      */
3449                     pmap_remove_range(p,
3450                                 pdetova(pdp - (pt_entry_t *)p->dirbase),
3451                                 ptp,
3452                                 eptp);
3453
3454                     /*
3455                      * Invalidate the page directory pointer.
3456                      */
3457                     pmap_store_pte(pdp, 0x0);
3458
3459                     PMAP_UNLOCK(p);
3460
3461                     /*
3462                      * And free the pte page itself.
3463                      */
3464                     {
3465                         register vm_page_t m;
3466
3467                         vm_object_lock(p->pm_obj);
3468
3469                         m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
3470                         if (m == VM_PAGE_NULL)
3471                             panic("pmap_collect: pte page not in object");
3472
3473                         vm_page_lock_queues();
3474                         vm_page_free(m);
3475                         inuse_ptepages_count--;
3476                         vm_page_unlock_queues();
3477
3478                         vm_object_unlock(p->pm_obj);
3479                     }
3480
3481                     PMAP_LOCK(p);
3482                 }
3483               }
3484            }
3485         }
3486
3487         PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
3488         PMAP_UNLOCK(p);
3489         return;
3490
3491 }
3492 #endif
3493
3494
3495 void
3496 pmap_copy_page(ppnum_t src, ppnum_t dst)
3497 {
3498   bcopy_phys((addr64_t)i386_ptob(src),
3499              (addr64_t)i386_ptob(dst),
3500              PAGE_SIZE);
3501 }
3502
3503
3504 /*
3505  *      Routine:        pmap_pageable
3506  *      Function:
3507  *              Make the specified pages (by pmap, offset)
3508  *              pageable (or not) as requested.
3509  *
3510  *              A page which is not pageable may not take
3511  *              a fault; therefore, its page table entry
3512  *              must remain valid for the duration.
3513  *
3514  *              This routine is merely advisory; pmap_enter
3515  *              will specify that these pages are to be wired
3516  *              down (or not) as appropriate.
3517  */
3518 void
3519 pmap_pageable(
3520         __unused pmap_t         pmap,
3521         __unused vm_map_offset_t        start_addr,
3522         __unused vm_map_offset_t        end_addr,
3523         __unused boolean_t      pageable)
3524 {
3525 #ifdef  lint
3526         pmap++; start_addr++; end_addr++; pageable++;
3527 #endif  /* lint */
3528 }
3529
3530 /*
3531  *      Clear specified attribute bits.
3532  */
3533 void
3534 phys_attribute_clear(
3535         ppnum_t         pn,
3536         int             bits)
3537 {
3538         pv_rooted_entry_t               pv_h;
3539         register pv_hashed_entry_t      pv_e;
3540         register pt_entry_t     *pte;
3541         int                     pai;
3542         register pmap_t         pmap;
3543
3544         pmap_intr_assert();
3545         assert(pn != vm_page_fictitious_addr);
3546         if (pn == vm_page_guard_addr)
3547                 return;
3548
3549         pai = ppn_to_pai(pn);
3550
3551         if (!managed_page(pai)) {
3552             /*
3553              *  Not a managed page.
3554              */
3555             return;
3556         }
3557
3558         PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
3559                    (int) pn, bits, 0, 0, 0);
3560
3561         pv_h = pai_to_pvh(pai);
3562
3563         LOCK_PVH(pai);
3564
3565         /*
3566          * Walk down PV list, clearing all modify or reference bits.
3567          * We do not have to lock the pv_list because we have
3568          * the entire pmap system locked.
3569          */
3570         if (pv_h->pmap != PMAP_NULL) {
3571             /*
3572              * There are some mappings.
3573              */
3574
3575           pv_e = (pv_hashed_entry_t)pv_h;
3576
3577           do {
3578                 pmap = pv_e->pmap;
3579
3580                 {
3581                     vm_map_offset_t va;
3582
3583                     va = pv_e->va;
3584                     /*
3585                      * first make sure any processor actively
3586                      * using this pmap, flushes its TLB state
3587                      */
3588
3589                     PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3590
3591                     /*
3592                      * Clear modify and/or reference bits.
3593                      */
3594
3595                     pte = pmap_pte(pmap, va);
3596                     pmap_update_pte(pte, *pte, (*pte & ~bits));
3597
3598                 }
3599
3600                 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3601
3602           } while (pv_e != (pv_hashed_entry_t)pv_h);
3603         }
3604         pmap_phys_attributes[pai] &= ~bits;
3605
3606         UNLOCK_PVH(pai);
3607
3608         PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3609                    0, 0, 0, 0, 0);
3610
3611 }
3612
3613 /*
3614  *      Check specified attribute bits.
3615  */
3616 int
3617 phys_attribute_test(
3618         ppnum_t         pn,
3619         int             bits)
3620 {
3621         pv_rooted_entry_t               pv_h;
3622         register pv_hashed_entry_t      pv_e;
3623         register pt_entry_t     *pte;
3624         int                     pai;
3625         register pmap_t         pmap;
3626         int                     attributes = 0;
3627
3628         pmap_intr_assert();
3629         assert(pn != vm_page_fictitious_addr);
3630         if (pn == vm_page_guard_addr)
3631                 return 0;
3632
3633         pai = ppn_to_pai(pn);
3634
3635         if (!managed_page(pai)) {
3636             /*
3637              *  Not a managed page.
3638              */
3639             return (0);
3640         }
3641
3642         /*
3643          * super fast check...  if bits already collected
3644          * no need to take any locks...
3645          * if not set, we need to recheck after taking
3646          * the lock in case they got pulled in while
3647          * we were waiting for the lock
3648          */
3649         if ( (pmap_phys_attributes[pai] & bits) == bits)
3650             return (bits);
3651
3652         pv_h = pai_to_pvh(pai);
3653
3654         LOCK_PVH(pai);
3655
3656         attributes = pmap_phys_attributes[pai] & bits;
3657
3658         /*
3659          * Walk down PV list, checking the mappings until we
3660          * reach the end or we've found the attributes we've asked for
3661          * We do not have to lock the pv_list because we have
3662          * the entire pmap system locked.
3663          */
3664         if (pv_h->pmap != PMAP_NULL) {
3665             /*
3666              * There are some mappings.
3667              */
3668           pv_e = (pv_hashed_entry_t)pv_h;
3669           if (attributes != bits) do {
3670
3671                 pmap = pv_e->pmap;
3672
3673                 {
3674                     vm_map_offset_t va;
3675
3676                     va = pv_e->va;
3677                     /*
3678                      * first make sure any processor actively
3679                      * using this pmap, flushes its TLB state
3680                      */
3681                     PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3682
3683                     /*
3684                      * pick up modify and/or reference bits from this mapping
3685                      */
3686
3687                     pte = pmap_pte(pmap, va);
3688                     attributes |= *pte & bits;
3689
3690                 }
3691
3692                 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3693
3694             } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h));
3695         }
3696
3697         UNLOCK_PVH(pai);
3698         return (attributes);
3699 }
3700
3701 /*
3702  *      Set specified attribute bits.
3703  */
3704 void
3705 phys_attribute_set(
3706         ppnum_t         pn,
3707         int             bits)
3708 {
3709         int             pai;
3710
3711         pmap_intr_assert();
3712         assert(pn != vm_page_fictitious_addr);
3713         if (pn == vm_page_guard_addr)
3714                 return;
3715
3716         pai = ppn_to_pai(pn);
3717
3718         if (!managed_page(pai)) {
3719             /*
3720              *  Not a managed page.
3721              */
3722             return;
3723         }
3724
3725         LOCK_PVH(pai);
3726
3727         pmap_phys_attributes[pai] |= bits;
3728
3729         UNLOCK_PVH(pai);
3730 }
3731
3732 /*
3733  *      Set the modify bit on the specified physical page.
3734  */
3735
3736 void pmap_set_modify(
3737                      ppnum_t pn)
3738 {
3739         phys_attribute_set(pn, PHYS_MODIFIED);
3740 }
3741
3742 /*
3743  *      Clear the modify bits on the specified physical page.
3744  */
3745
3746 void
3747 pmap_clear_modify(
3748                   ppnum_t pn)
3749 {
3750         phys_attribute_clear(pn, PHYS_MODIFIED);
3751 }
3752
3753 /*
3754  *      pmap_is_modified:
3755  *
3756  *      Return whether or not the specified physical page is modified
3757  *      by any physical maps.
3758  */
3759
3760 boolean_t
3761 pmap_is_modified(
3762                  ppnum_t pn)
3763 {
3764         if (phys_attribute_test(pn, PHYS_MODIFIED))
3765                 return TRUE;
3766
3767         return FALSE;
3768 }
3769
3770 /*
3771  *      pmap_clear_reference:
3772  *
3773  *      Clear the reference bit on the specified physical page.
3774  */
3775
3776 void
3777 pmap_clear_reference(
3778                      ppnum_t pn)
3779 {
3780         phys_attribute_clear(pn, PHYS_REFERENCED);
3781 }
3782
3783 void
3784 pmap_set_reference(ppnum_t pn)
3785 {
3786         phys_attribute_set(pn, PHYS_REFERENCED);
3787 }
3788
3789 /*
3790  *      pmap_is_referenced:
3791  *
3792  *      Return whether or not the specified physical page is referenced
3793  *      by any physical maps.
3794  */
3795
3796 boolean_t
3797 pmap_is_referenced(
3798                    ppnum_t pn)
3799 {
3800         if (phys_attribute_test(pn, PHYS_REFERENCED))
3801                 return TRUE;
3802
3803         return FALSE;
3804 }
3805
3806 /*
3807  * pmap_get_refmod(phys)
3808  *  returns the referenced and modified bits of the specified
3809  *  physical page.
3810  */
3811 unsigned int
3812 pmap_get_refmod(ppnum_t pa)
3813 {
3814         int     refmod;
3815         unsigned int retval = 0;
3816
3817         refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);
3818
3819         if (refmod & PHYS_MODIFIED)
3820                 retval |= VM_MEM_MODIFIED;
3821         if (refmod & PHYS_REFERENCED)
3822                 retval |= VM_MEM_REFERENCED;
3823
3824         return (retval);
3825 }
3826
3827 /*
3828  * pmap_clear_refmod(phys, mask)
3829  *  clears the referenced and modified bits as specified by the mask
3830  *  of the specified physical page.
3831  */
3832 void
3833 pmap_clear_refmod(ppnum_t pa, unsigned int mask)
3834 {
3835         unsigned int  x86Mask;
3836
3837         x86Mask = (   ((mask &   VM_MEM_MODIFIED)?   PHYS_MODIFIED : 0)
3838                     | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3839         phys_attribute_clear(pa, x86Mask);
3840 }
3841
3842 void
3843 invalidate_icache(__unused vm_offset_t  addr,
3844                   __unused unsigned     cnt,
3845                   __unused int          phys)
3846 {
3847         return;
3848 }
3849 void
3850 flush_dcache(__unused vm_offset_t       addr,
3851              __unused unsigned          count,
3852              __unused int               phys)
3853 {
3854         return;
3855 }
3856
3857 #if CONFIG_DTRACE
3858 /*
3859  * Constrain DTrace copyin/copyout actions
3860  */
3861 extern kern_return_t dtrace_copyio_preflight(addr64_t);
3862 extern kern_return_t dtrace_copyio_postflight(addr64_t);
3863
3864 kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3865 {
3866         thread_t thread = current_thread();
3867
3868         if (current_map() == kernel_map)
3869                 return KERN_FAILURE;
3870         else if (thread->machine.specFlags & CopyIOActive)
3871                 return KERN_FAILURE;
3872         else
3873                 return KERN_SUCCESS;
3874 }
3875
3876 kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3877 {
3878         return KERN_SUCCESS;
3879 }
3880 #endif /* CONFIG_DTRACE */
3881
3882 #if     MACH_KDB
3883
3884 /* show phys page mappings and attributes */
3885
3886 extern void     db_show_page(pmap_paddr_t pa);
3887
3888 #if 0
3889 void
3890 db_show_page(pmap_paddr_t pa)
3891 {
3892         pv_entry_t      pv_h;
3893         int             pai;
3894         char            attr;
3895
3896         pai = pa_index(pa);
3897         pv_h = pai_to_pvh(pai);
3898
3899         attr = pmap_phys_attributes[pai];
3900         printf("phys page %llx ", pa);
3901         if (attr & PHYS_MODIFIED)
3902                 printf("modified, ");
3903         if (attr & PHYS_REFERENCED)
3904                 printf("referenced, ");
3905         if (pv_h->pmap || pv_h->next)
3906                 printf(" mapped at\n");
3907         else
3908                 printf(" not mapped\n");
3909         for (; pv_h; pv_h = pv_h->next)
3910                 if (pv_h->pmap)
3911                         printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
3912 }
3913 #endif
3914
3915 #endif /* MACH_KDB */
3916
3917 #if     MACH_KDB
3918 #if 0
3919 void db_kvtophys(vm_offset_t);
3920 void db_show_vaddrs(pt_entry_t  *);
3921
3922 /*
3923  *      print out the results of kvtophys(arg)
3924  */
3925 void
3926 db_kvtophys(
3927         vm_offset_t     vaddr)
3928 {
3929         db_printf("0x%qx", kvtophys(vaddr));
3930 }
3931
3932 /*
3933  *      Walk the pages tables.
3934  */
3935 void
3936 db_show_vaddrs(
3937         pt_entry_t      *dirbase)
3938 {
3939         pt_entry_t      *ptep, *pdep, tmp;
3940         unsigned int    x, y, pdecnt, ptecnt;
3941
3942         if (dirbase == 0) {
3943                 dirbase = kernel_pmap->dirbase;
3944         }
3945         if (dirbase == 0) {
3946                 db_printf("need a dirbase...\n");
3947                 return;
3948         }
3949         dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);
3950
3951         db_printf("dirbase: 0x%x\n", dirbase);
3952
3953         pdecnt = ptecnt = 0;
3954         pdep = &dirbase[0];
3955         for (y = 0; y < NPDEPG; y++, pdep++) {
3956                 if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
3957                         continue;
3958                 }
3959                 pdecnt++;
3960                 ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
3961                 db_printf("dir[%4d]: 0x%x\n", y, *pdep);
3962                 for (x = 0; x < NPTEPG; x++, ptep++) {
3963                         if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
3964                                 continue;
3965                         }
3966                         ptecnt++;
3967                         db_printf("   tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
3968                                 x,
3969                                 *ptep,
3970                                 (y << 22) | (x << 12),
3971                                 *ptep & ~INTEL_OFFMASK);
3972                 }
3973         }
3974
3975         db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
3976
3977 }
3978 #endif
3979 #endif  /* MACH_KDB */
3980
3981 #include <mach_vm_debug.h>
3982 #if     MACH_VM_DEBUG
3983 #include <vm/vm_debug.h>
3984
3985 int
3986 pmap_list_resident_pages(
3987         __unused pmap_t         pmap,
3988         __unused vm_offset_t    *listp,
3989         __unused int            space)
3990 {
3991         return 0;
3992 }
3993 #endif  /* MACH_VM_DEBUG */
3994
3995
3996
3997 /* temporary workaround */
3998 boolean_t
3999 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
4000 {
4001 #if 0
4002         pt_entry_t     *ptep;
4003
4004         ptep = pmap_pte(map->pmap, va);
4005         if (0 == ptep)
4006                 return FALSE;
4007         return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
4008 #else
4009         return TRUE;
4010 #endif
4011 }
4012
4013
4014 boolean_t
4015 phys_page_exists(
4016                  ppnum_t pn)
4017 {
4018         assert(pn != vm_page_fictitious_addr);
4019
4020         if (!pmap_initialized)
4021                 return (TRUE);
4022
4023         if (pn == vm_page_guard_addr)
4024                 return FALSE;
4025
4026         if (!managed_page(ppn_to_pai(pn)))
4027                 return (FALSE);
4028
4029         return TRUE;
4030 }
4031
4032 void
4033 mapping_free_prime(void)
4034 {
4035         int             i;
4036         pv_hashed_entry_t      pvh_e;
4037         pv_hashed_entry_t      pvh_eh;
4038         pv_hashed_entry_t      pvh_et;
4039         int             pv_cnt;
4040
4041         pv_cnt = 0;
4042         pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4043         for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
4044                 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4045
4046                 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4047                 pvh_eh = pvh_e;
4048
4049                 if (pvh_et == PV_HASHED_ENTRY_NULL)
4050                         pvh_et = pvh_e;
4051                 pv_cnt++;
4052         }
4053         PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4054
4055         pv_cnt = 0;
4056         pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4057         for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
4058                 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4059
4060                 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4061                 pvh_eh = pvh_e;
4062
4063                 if (pvh_et == PV_HASHED_ENTRY_NULL)
4064                         pvh_et = pvh_e;
4065                 pv_cnt++;
4066         }
4067         PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4068
4069 }
4070
4071 void
4072 mapping_adjust(void)
4073 {
4074         pv_hashed_entry_t      pvh_e;
4075         pv_hashed_entry_t      pvh_eh;
4076         pv_hashed_entry_t      pvh_et;
4077         int             pv_cnt;
4078         int             i;
4079
4080         if (mapping_adjust_call == NULL) {
4081                 thread_call_setup(&mapping_adjust_call_data,
4082                                   (thread_call_func_t) mapping_adjust,
4083                                   (thread_call_param_t) NULL);
4084                 mapping_adjust_call = &mapping_adjust_call_data;
4085         }
4086
4087         pv_cnt = 0;
4088         pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4089         if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
4090                 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
4091                         pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4092
4093                         pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4094                         pvh_eh = pvh_e;
4095
4096                         if (pvh_et == PV_HASHED_ENTRY_NULL)
4097                                 pvh_et = pvh_e;
4098                         pv_cnt++;
4099                 }
4100                 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4101         }
4102
4103         pv_cnt = 0;
4104         pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4105         if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
4106                 for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
4107                         pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4108
4109                         pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4110                         pvh_eh = pvh_e;
4111
4112                         if (pvh_et == PV_HASHED_ENTRY_NULL)
4113                                 pvh_et = pvh_e;
4114                         pv_cnt++;
4115                 }
4116                 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4117         }
4118         mappingrecurse = 0;
4119 }
4120
4121 void
4122 pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
4123 {
4124         int i;
4125         pt_entry_t *opte, *npte;
4126         pt_entry_t pte;
4127         spl_t s;
4128
4129         for (i = 0; i < cnt; i++) {
4130                 s = splhigh();
4131                 opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
4132                 if (0 == opte)
4133                         panic("kernel_commpage");
4134                 pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
4135                 pte &= ~INTEL_PTE_WRITE; // ensure read only
4136                 npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
4137                 if (0 == npte)
4138                         panic("user_commpage");
4139                 pmap_store_pte(npte, pte);
4140                 splx(s);
4141                 kernel_commpage += INTEL_PGBYTES;
4142                 user_commpage += INTEL_PGBYTES;
4143         }
4144 }
4145
4146
4147 #define PMAP_COMMPAGE64_CNT  (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
4148 pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];
4149
4150 void
4151 pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
4152 {
4153     int i;
4154     pt_entry_t *kptep;
4155
4156     PMAP_LOCK(kernel_pmap);
4157
4158     for (i = 0; i < cnt; i++) {
4159         kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
4160         if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
4161             panic("pmap_commpage64_init pte");
4162         pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
4163     }
4164     PMAP_UNLOCK(kernel_pmap);
4165 }
4166
4167
4168 static cpu_pmap_t               cpu_pmap_master;
4169
4170 struct cpu_pmap *
4171 pmap_cpu_alloc(boolean_t is_boot_cpu)
4172 {
4173         int                     ret;
4174         int                     i;
4175         cpu_pmap_t              *cp;
4176         vm_offset_t             address;
4177         vm_map_address_t        mapaddr;
4178         vm_map_entry_t          entry;
4179         pt_entry_t              *pte;
4180
4181         if (is_boot_cpu) {
4182                 cp = &cpu_pmap_master;
4183         } else {
4184                 /*
4185                  * The per-cpu pmap data structure itself.
4186                  */
4187                 ret = kmem_alloc(kernel_map,
4188                                  (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
4189                 if (ret != KERN_SUCCESS) {
4190                         printf("pmap_cpu_alloc() failed ret=%d\n", ret);
4191                         return NULL;
4192                 }
4193                 bzero((void *)cp, sizeof(cpu_pmap_t));
4194
4195                 /*
4196                  * The temporary windows used for copy/zero - see loose_ends.c
4197                  */
4198                 ret = vm_map_find_space(kernel_map,
4199                     &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
4200                 if (ret != KERN_SUCCESS) {
4201                         printf("pmap_cpu_alloc() "
4202                                 "vm_map_find_space ret=%d\n", ret);
4203                         pmap_cpu_free(cp);
4204                         return NULL;
4205                 }
4206                 address = (vm_offset_t)mapaddr;
4207
4208                 for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
4209                   spl_t s;
4210                         s = splhigh();
4211                         while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
4212                                 pmap_expand(kernel_pmap, (vm_map_offset_t)address);
4213                         * (int *) pte = 0;
4214                         cp->mapwindow[i].prv_CADDR = (caddr_t) address;
4215                         cp->mapwindow[i].prv_CMAP = pte;
4216                         splx(s);
4217                 }
4218                 vm_map_unlock(kernel_map);
4219         }
4220
4221         cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
4222         cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
4223         cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;
4224
4225         return cp;
4226 }
4227
4228 void
4229 pmap_cpu_free(struct cpu_pmap *cp)
4230 {
4231         if (cp != NULL && cp != &cpu_pmap_master) {
4232                 kfree((void *) cp, sizeof(cpu_pmap_t));
4233         }
4234 }
4235
4236
4237 mapwindow_t *
4238 pmap_get_mapwindow(pt_entry_t pentry)
4239 {
4240     mapwindow_t *mp;
4241     int i;
4242
4243     assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4244
4245     /*
4246      * Note: 0th map reserved for pmap_pte()
4247      */
4248     for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
4249             mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];
4250
4251             if (*mp->prv_CMAP == 0) {
4252                     pmap_store_pte(mp->prv_CMAP, pentry);
4253
4254                     invlpg((uintptr_t)mp->prv_CADDR);
4255
4256                     return (mp);
4257             }
4258     }
4259     panic("pmap_get_mapwindow: no windows available");
4260
4261     return NULL;
4262 }
4263
4264
4265 void
4266 pmap_put_mapwindow(mapwindow_t *mp)
4267 {
4268     pmap_store_pte(mp->prv_CMAP, 0);
4269 }
4270
4271
4272 /*
4273  * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
4274  * on a NBPDE boundary.
4275  */
4276 uint64_t pmap_nesting_size_min = NBPDE;
4277 uint64_t pmap_nesting_size_max = 0 - (uint64_t)NBPDE; /* no limit, really... */
4278
4279 /*
4280  *      kern_return_t pmap_nest(grand, subord, vstart, size)
4281  *
4282  *      grand  = the pmap that we will nest subord into
4283  *      subord = the pmap that goes into the grand
4284  *      vstart  = start of range in pmap to be inserted
4285  *      nstart  = start of range in pmap nested pmap
4286  *      size   = Size of nest area (up to 16TB)
4287  *
4288  *      Inserts a pmap into another.  This is used to implement shared segments.
4289  *
4290  *      on x86 this is very limited right now.  must be exactly 1 segment.
4291  *
4292  *      Note that we depend upon higher level VM locks to insure that things don't change while
4293  *      we are doing this.  For example, VM should not be doing any pmap enters while it is nesting
4294  *      or do 2 nests at once.
4295  */
4296
4297
4298 kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t vstart, addr64_t nstart, uint64_t size) {
4299
4300         vm_map_offset_t vaddr, nvaddr;
4301         pd_entry_t      *pde,*npde;
4302         unsigned int    i;
4303         uint64_t        num_pde;
4304
4305         // do validity tests
4306         if (size & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
4307         if(vstart & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
4308         if(nstart & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
4309         if((size >> 28) > 65536)  return KERN_INVALID_VALUE;    /* Max size we can nest is 16TB */
4310         if(size == 0) {
4311                 panic("pmap_nest: size is invalid - %016llX\n", size);
4312         }
4313
4314         PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
4315                    (int) grand, (int) subord,
4316                    (int) (vstart>>32), (int) vstart, 0);
4317
4318         subord->pm_shared = TRUE;
4319         nvaddr = (vm_map_offset_t)nstart;
4320         num_pde = size >> PDESHIFT;
4321
4322         PMAP_LOCK(subord);
4323         for (i = 0; i < num_pde; i++) {
4324           npde = pmap_pde(subord, nvaddr);
4325           while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
4326             PMAP_UNLOCK(subord);
4327             pmap_expand(subord, nvaddr); // pmap_expand handles races
4328             PMAP_LOCK(subord);
4329             npde = pmap_pde(subord, nvaddr);
4330           }
4331           nvaddr += NBPDE;
4332         }
4333
4334         PMAP_UNLOCK(subord);
4335
4336         vaddr = (vm_map_offset_t)vstart;
4337
4338         PMAP_LOCK(grand);
4339
4340         for (i = 0;i < num_pde; i++) {
4341           pd_entry_t tpde;
4342
4343           npde = pmap_pde(subord, nstart);
4344           if (npde == 0)
4345                   panic("pmap_nest: no npde, subord %p nstart 0x%llx", subord, nstart);
4346           tpde = *npde;
4347           nstart += NBPDE;
4348           pde = pmap_pde(grand, vaddr);
4349 /* Legacy mode does not require expansion.
4350  * DRK: consider a debug mode test to verify that no PTEs are extant within
4351  * this range.
4352  */
4353           if ((0 == pde) && cpu_64bit) {
4354             PMAP_UNLOCK(grand);
4355             pmap_expand_pdpt(grand, vaddr);
4356             PMAP_LOCK(grand);
4357             pde = pmap_pde(grand, vaddr);
4358           }
4359
4360           if (pde == 0)
4361                   panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
4362           vaddr += NBPDE;
4363           pmap_store_pte(pde, tpde);
4364         }
4365
4366         /* XXX FBDP: why do we need to flush here ? */
4367         PMAP_UPDATE_TLBS(grand, vstart, vstart + size - 1);
4368
4369         PMAP_UNLOCK(grand);
4370
4371         PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
4372
4373         return KERN_SUCCESS;
4374 }
4375
4376 /*
4377  *      kern_return_t pmap_unnest(grand, vaddr)
4378  *
4379  *      grand  = the pmap that we will nest subord into
4380  *      vaddr  = start of range in pmap to be unnested
4381  *
4382  *      Removes a pmap from another.  This is used to implement shared segments.
4383  *      On the current PPC processors, this is limited to segment (256MB) aligned
4384  *      segment sized ranges.
4385  */
4386
4387 kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {
4388
4389         pd_entry_t *pde;
4390         unsigned int i;
4391         unsigned int num_pde;
4392         addr64_t vstart, vend;
4393
4394         PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
4395                    (int) grand,
4396                    (int) (vaddr>>32), (int) vaddr, 0, 0);
4397
4398         if ((size & (pmap_nesting_size_min-1)) ||
4399             (vaddr & (pmap_nesting_size_min-1))) {
4400                 panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...\n",
4401                       grand, vaddr, size);
4402         }
4403
4404         /* align everything to PDE boundaries */
4405         vstart = vaddr & ~(NBPDE-1);
4406         vend = (vaddr + size + NBPDE - 1) & ~(NBPDE-1);
4407         size = vend - vstart;
4408
4409         PMAP_LOCK(grand);
4410
4411         // invalidate all pdes for segment at vaddr in pmap grand
4412
4413         num_pde = size >> PDESHIFT;
4414
4415         vaddr = vstart;
4416         for (i=0;i<num_pde;i++,pde++) {
4417           pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
4418           if (pde == 0) panic("pmap_unnest: no pde, grand %p vaddr 0x%llx\n", grand, vaddr);
4419           pmap_store_pte(pde, (pd_entry_t)0);
4420           vaddr += NBPDE;
4421         }
4422         PMAP_UPDATE_TLBS(grand, vstart, vend);
4423
4424         PMAP_UNLOCK(grand);
4425
4426         PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
4427
4428         return KERN_SUCCESS;
4429 }
4430
4431 void
4432 pmap_switch(pmap_t tpmap)
4433 {
4434         spl_t   s;
4435         int     my_cpu;
4436
4437         s = splhigh();          /* Make sure interruptions are disabled */
4438         my_cpu = cpu_number();
4439
4440         set_dirbase(tpmap, my_cpu);
4441
4442         splx(s);
4443 }
4444
4445
4446 /*
4447  * disable no-execute capability on
4448  * the specified pmap
4449  */
4450 void pmap_disable_NX(pmap_t pmap) {
4451
4452         pmap->nx_enabled = 0;
4453 }
4454
4455 void
4456 pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
4457                   vm_size_t *alloc_size, int *collectable, int *exhaustable)
4458 {
4459         *count      = inuse_ptepages_count;
4460         *cur_size   = PAGE_SIZE * inuse_ptepages_count;
4461         *max_size   = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
4462         *elem_size  = PAGE_SIZE;
4463         *alloc_size = PAGE_SIZE;
4464
4465         *collectable = 1;
4466         *exhaustable = 0;
4467 }
4468
4469 vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
4470 {
4471   enum high_fixed_addresses a;
4472   a = e + HIGH_CPU_END * cpu;
4473   return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4474 }
4475
4476 vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
4477 {
4478   return pmap_cpu_high_map_vaddr(cpu_number(), e);
4479 }
4480
4481 vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
4482 {
4483   enum high_fixed_addresses a;
4484   vm_offset_t vaddr;
4485
4486   a = e + HIGH_CPU_END * cpu_number();
4487   vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4488   pmap_store_pte(pte_unique_base + a, pte);
4489
4490   /* TLB flush for this page for this  cpu */
4491   invlpg((uintptr_t)vaddr);
4492
4493   return  vaddr;
4494 }
4495
4496
4497 /*
4498  * Called with pmap locked, we:
4499  *  - scan through per-cpu data to see which other cpus need to flush
4500  *  - send an IPI to each non-idle cpu to be flushed
4501  *  - wait for all to signal back that they are inactive or we see that
4502  *    they are in an interrupt handler or at a safe point
4503  *  - flush the local tlb is active for this pmap
4504  *  - return ... the caller will unlock the pmap
4505  */
4506 void
4507 pmap_flush_tlbs(pmap_t  pmap)
4508 {
4509         unsigned int    cpu;
4510         unsigned int    cpu_bit;
4511         cpu_set         cpus_to_signal;
4512         unsigned int    my_cpu = cpu_number();
4513         pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
4514         boolean_t       flush_self = FALSE;
4515         uint64_t        deadline;
4516
4517         assert((processor_avail_count < 2) ||
4518                (ml_get_interrupts_enabled() && get_preemption_level() != 0));
4519
4520         /*
4521          * Scan other cpus for matching active or task CR3.
4522          * For idle cpus (with no active map) we mark them invalid but
4523          * don't signal -- they'll check as they go busy.
4524          * Note: for the kernel pmap we look for 64-bit shared address maps.
4525          */
4526         cpus_to_signal = 0;
4527         for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4528                 if (!cpu_datap(cpu)->cpu_running)
4529                         continue;
4530                 if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
4531                     (CPU_GET_ACTIVE_CR3(cpu)      == pmap_cr3) ||
4532                     (pmap->pm_shared) ||
4533                     ((pmap == kernel_pmap) &&
4534                      (!CPU_CR3_IS_ACTIVE(cpu) ||
4535                       cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
4536                         if (cpu == my_cpu) {
4537                                 flush_self = TRUE;
4538                                 continue;
4539                         }
4540                         cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
4541                         __asm__ volatile("mfence");
4542
4543                         if (CPU_CR3_IS_ACTIVE(cpu)) {
4544                                 cpus_to_signal |= cpu_bit;
4545                                 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
4546                         }
4547                 }
4548         }
4549
4550         PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
4551                    (int) pmap, cpus_to_signal, flush_self, 0, 0);
4552
4553         if (cpus_to_signal) {
4554                 deadline = mach_absolute_time() + LockTimeOut;
4555                 /*
4556                  * Wait for those other cpus to acknowledge
4557                  */
4558                 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4559                         while ((cpus_to_signal & cpu_bit) != 0) {
4560                                 if (!cpu_datap(cpu)->cpu_running ||
4561                                     cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
4562                                     !CPU_CR3_IS_ACTIVE(cpu)) {
4563                                         cpus_to_signal &= ~cpu_bit;
4564                                         break;
4565                                 }
4566                                 if (mach_absolute_time() > deadline) {
4567                                         force_immediate_debugger_NMI = TRUE;
4568                                         panic("pmap_flush_tlbs() timeout: "
4569                                                                 "cpu %d failing to respond to interrupts, pmap=%p cpus_to_signal=%lx",
4570                                                                 cpu, pmap, cpus_to_signal);
4571                                 }
4572                                 cpu_pause();
4573                         }
4574                         if (cpus_to_signal == 0)
4575                                 break;
4576                 }
4577         }
4578
4579         /*
4580          * Flush local tlb if required.
4581          * We need this flush even if the pmap being changed
4582          * is the user map... in case we do a copyin/out
4583          * before returning to user mode.
4584          */
4585         if (flush_self)
4586                 flush_tlb();
4587
4588
4589         PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
4590                    (int) pmap, cpus_to_signal, flush_self, 0, 0);
4591 }
4592
4593 void
4594 process_pmap_updates(void)
4595 {
4596         assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4597
4598         flush_tlb();
4599
4600         current_cpu_datap()->cpu_tlb_invalid = FALSE;
4601         __asm__ volatile("mfence");
4602 }
4603
4604 void
4605 pmap_update_interrupt(void)
4606 {
4607         PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
4608                    0, 0, 0, 0, 0);
4609
4610         process_pmap_updates();
4611
4612         PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
4613                    0, 0, 0, 0, 0);
4614 }
4615
4616
4617 unsigned int pmap_cache_attributes(ppnum_t pn) {
4618
4619         if (!managed_page(ppn_to_pai(pn)))
4620                 return (VM_WIMG_IO);
4621
4622         return (VM_WIMG_COPYBACK);
4623 }
4624
4625 #ifdef PMAP_DEBUG
4626 void
4627 pmap_dump(pmap_t p)
4628 {
4629   int i;
4630
4631   kprintf("pmap 0x%x\n",p);
4632
4633   kprintf("  pm_cr3 0x%llx\n",p->pm_cr3);
4634   kprintf("  pm_pml4 0x%x\n",p->pm_pml4);
4635   kprintf("  pm_pdpt 0x%x\n",p->pm_pdpt);
4636
4637   kprintf("    pml4[0] 0x%llx\n",*p->pm_pml4);
4638   for (i=0;i<8;i++)
4639     kprintf("    pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
4640 }
4641
4642 void pmap_dump_wrap(void)
4643 {
4644   pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
4645 }
4646
4647 void
4648 dump_4GB_pdpt(pmap_t p)
4649 {
4650         int             spl;
4651         pdpt_entry_t    *user_pdptp;
4652         pdpt_entry_t    *kern_pdptp;
4653         pdpt_entry_t    *pml4p;
4654
4655         spl = splhigh();
4656         while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
4657                 splx(spl);
4658                 pmap_expand_pml4(p, 0x0);
4659                 spl = splhigh();
4660         }
4661         kern_pdptp = kernel_pmap->pm_pdpt;
4662         if (kern_pdptp == NULL)
4663                 panic("kern_pdptp == NULL");
4664         kprintf("dump_4GB_pdpt(%p)\n"
4665                 "kern_pdptp=%p (phys=0x%016llx)\n"
4666                 "\t 0x%08x: 0x%016llx\n"
4667                 "\t 0x%08x: 0x%016llx\n"
4668                 "\t 0x%08x: 0x%016llx\n"
4669                 "\t 0x%08x: 0x%016llx\n"
4670                 "\t 0x%08x: 0x%016llx\n"
4671                 "user_pdptp=%p (phys=0x%016llx)\n"
4672                 "\t 0x%08x: 0x%016llx\n"
4673                 "\t 0x%08x: 0x%016llx\n"
4674                 "\t 0x%08x: 0x%016llx\n"
4675                 "\t 0x%08x: 0x%016llx\n"
4676                 "\t 0x%08x: 0x%016llx\n",
4677                 p, kern_pdptp, kvtophys(kern_pdptp),
4678                 kern_pdptp+0, *(kern_pdptp+0),
4679                 kern_pdptp+1, *(kern_pdptp+1),
4680                 kern_pdptp+2, *(kern_pdptp+2),
4681                 kern_pdptp+3, *(kern_pdptp+3),
4682                 kern_pdptp+4, *(kern_pdptp+4),
4683                 user_pdptp, kvtophys(user_pdptp),
4684                 user_pdptp+0, *(user_pdptp+0),
4685                 user_pdptp+1, *(user_pdptp+1),
4686                 user_pdptp+2, *(user_pdptp+2),
4687                 user_pdptp+3, *(user_pdptp+3),
4688                 user_pdptp+4, *(user_pdptp+4));
4689         kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4690                 p->pm_cr3, p->pm_hold, p->pm_pml4);
4691         pml4p = (pdpt_entry_t *)p->pm_hold;
4692         if (pml4p == NULL)
4693                 panic("user pml4p == NULL");
4694         kprintf("\t 0x%08x: 0x%016llx\n"
4695                 "\t 0x%08x: 0x%016llx\n",
4696                 pml4p+0, *(pml4p),
4697                 pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
4698         kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4699                 kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
4700         pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
4701         if (pml4p == NULL)
4702                 panic("kern pml4p == NULL");
4703         kprintf("\t 0x%08x: 0x%016llx\n"
4704                 "\t 0x%08x: 0x%016llx\n",
4705                 pml4p+0, *(pml4p),
4706                 pml4p+511, *(pml4p+511));
4707         splx(spl);
4708 }
4709
4710 void dump_4GB_pdpt_thread(thread_t tp)
4711 {
4712         dump_4GB_pdpt(tp->map->pmap);
4713 }
4714
4715
4716 #endif