1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82  * In order to cope with hardware architectures which
83  * make virtual-to-physical map invalidations expensive,
84  * this module may delay invalidation or protection-reduction
85  * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <norma_vm.h>
93 #include <mach_kdb.h>
94 #include <mach_ldebug.h>
95
96 #include <libkern/OSAtomic.h>
97
98 #include <mach/machine/vm_types.h>
99
100 #include <mach/boolean.h>
101 #include <kern/thread.h>
102 #include <kern/zalloc.h>
103 #include <kern/queue.h>
104
105 #include <kern/lock.h>
106 #include <kern/kalloc.h>
107 #include <kern/spl.h>
108
109 #include <vm/pmap.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
116
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
122
123 #include <i386/cpuid.h>
124 #include <i386/cpu_data.h>
125 #include <i386/cpu_number.h>
126 #include <i386/machine_cpu.h>
127 #include <i386/seg.h>
128 #include <i386/serial_io.h>
129 #include <i386/cpu_capabilities.h>
130 #include <i386/machine_routines.h>
131 #include <i386/proc_reg.h>
132 #include <i386/tsc.h>
133 #include <i386/acpi.h>
134 #include <i386/pmap_internal.h>
135
136 #if MACH_KDB
137 #include <ddb/db_command.h>
138 #include <ddb/db_output.h>
139 #include <ddb/db_sym.h>
140 #include <ddb/db_print.h>
141 #endif /* MACH_KDB */
142
143 #include <vm/vm_protos.h>
144
145 #include <i386/mp.h>
146 #include <i386/mp_desc.h>
147 #include <i386/i386_lowmem.h>
148
149
150 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151 #ifdef DEBUGINTERRUPTS
152 #define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
153 #else
154 #define pmap_intr_assert()
155 #endif
156
157 #ifdef IWANTTODEBUG
158 #undef DEBUG
159 #define DEBUG 1
160 #define POSTCODE_DELAY 1
161 #include <i386/postcode.h>
162 #endif /* IWANTTODEBUG */
163
164 /*
165 * Forward declarations for internal functions.
166 */
167
168 void pmap_remove_range(
169 pmap_t pmap,
170 vm_map_offset_t va,
171 pt_entry_t *spte,
172 pt_entry_t *epte);
173
174 void phys_attribute_clear(
175 ppnum_t phys,
176 int bits);
177
178 int phys_attribute_test(
179 ppnum_t phys,
180 int bits);
181
182 void phys_attribute_set(
183 ppnum_t phys,
184 int bits);
185
186 void pmap_set_reference(
187 ppnum_t pn);
188
189 boolean_t phys_page_exists(
190 ppnum_t pn);
191
192
193 #ifdef PMAP_DEBUG
194 void dump_pmap(pmap_t);
195 void dump_4GB_pdpt(pmap_t p);
196 void dump_4GB_pdpt_thread(thread_t tp);
197 #endif
198
199 int nx_enabled = 1; /* enable no-execute protection */
200 #ifdef CONFIG_EMBEDDED
201 int allow_data_exec = 0; /* no exec from data, embedded is hardcore like that */
202 #else
203 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
204 #endif
205 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
206
207 boolean_t cpu_64bit = FALSE;
208 boolean_t pmap_trace = FALSE;
209
210 /*
211 * when spinning through pmap_remove
212 * ensure that we don't spend too much
213 * time with preemption disabled.
214 * I'm setting the current threshold
215 * to 20us
216 */
217 #define MAX_PREEMPTION_LATENCY_NS 20000
218
219 uint64_t max_preemption_latency_tsc = 0;
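
/*
 * Illustrative sketch (not part of the build) of how the 20us budget above
 * is applied: pmap_init() converts MAX_PREEMPTION_LATENCY_NS into TSC ticks
 * via tmrCvt(..., tscFCvtn2t), and a long-running loop such as pmap_remove()
 * can then compare TSC deltas against max_preemption_latency_tsc and briefly
 * drop its lock to allow preemption, roughly:
 *
 *	uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;
 *	while ( ...more mappings to remove... ) {
 *		...remove one chunk...
 *		if (rdtsc64() > deadline) {
 *			PMAP_UNLOCK(pmap);	// window for preemption
 *			PMAP_LOCK(pmap);
 *			deadline = rdtsc64() + max_preemption_latency_tsc;
 *		}
 *	}
 */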
220
221
222 /*
223 * Private data structures.
224 */
225
226 /*
227 * For each vm_page_t, there is a list of all currently
228 * valid virtual mappings of that page. An entry is
229 * a pv_rooted_entry_t; the list is the pv_table.
230 *
231 * N.B. with the new combo rooted/hashed scheme it is
232  * only possible to remove individual non-rooted entries
233 * if they are found via the hashed chains as there is no
234 * way to unlink the singly linked hashed entries if navigated to
235 * via the queue list off the rooted entries. Think of it as
236 * hash/walk/pull, keeping track of the prev pointer while walking
237 * the singly linked hash list. All of this is to save memory and
238 * keep both types of pv_entries as small as possible.
239 */
240
241 /*
242
243 PV HASHING Changes - JK 1/2007
244
245 Pve's establish physical to virtual mappings. These are used for aliasing of a
246 physical page to (potentially many) virtual addresses within pmaps. In the previous
247 implementation the structure of the pv_entries (each 16 bytes in size) was
248
249 typedef struct pv_entry {
250 struct pv_entry *next;
251 pmap_t pmap;
252 vm_map_offset_t va;
253 } *pv_entry_t;
254
255 An initial array of these is created at boot time, one per physical page of memory,
256 indexed by the physical page number. Additionally, a pool of entries is created from a
257 pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
258 Originally, we kept this pool around because the code in pmap_enter() was unable to
259 block if it needed an entry and none were available - we'd panic. Some time ago I
260 restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
261 a pv structure and restart, removing a panic from the code (in the case of the kernel
262 pmap we cannot block and still panic, so we keep a separate hot pool for use only on
263 kernel pmaps). The pool has not been removed since there is a large performance gain
264 keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
265
266 As pmap_enter() created new mappings it linked the new pve's for them off the fixed
267 pv array for that ppn (off the next pointer). These pve's are accessed for several
268 operations, one of them being address space teardown. In that case, we basically do this
269
270 for (every page/pte in the space) {
271 calc pve_ptr from the ppn in the pte
272 for (every pv in the list for the ppn) {
273 if (this pv is for this pmap/vaddr) {
274 do housekeeping
275 unlink/free the pv
276 }
277 }
278 }
279
280 The problem arose when we were running, say 8000 (or even 2000) apache or other processes
281 and one or all terminate. The list hanging off each pv array entry could have thousands of
282 entries. We were continuously linearly searching each of these lists as we stepped through
283 the address space we were tearing down. Because of the locks we hold, likely taking a cache
284 the address space we were tearing down. Because of the locks we hold, the likely cache
285 miss for each node, and the interrupt disabling needed for MP safety, the system became completely
286
287 Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
288 for operations like pmap_page_protect and finding and modifying/removing a single pve as
289 part of pmap_enter processing) has led to modifying the pve structures and databases.
290
291 There are now two types of pve structures. A "rooted" structure which is basically the
292 original structure accessed in an array by ppn, and a "hashed" structure accessed on a
293 hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
294 minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
295 pages in the system are not aliased and hence represented by a single pv entry I've kept
296 the rooted entry size as small as possible because there is one of these dedicated for
297 every physical page of memory. The hashed pve's are larger due to the addition of the hash
298 link and the ppn entry needed for matching while running the hash list to find the entry we
299 are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
300 will pay the extra memory price. Both structures have the same first three fields allowing
301 some simplification in the code.
302
303 They have these shapes
304
305 typedef struct pv_rooted_entry {
306 queue_head_t qlink;
307 vm_map_offset_t va;
308 pmap_t pmap;
309 } *pv_rooted_entry_t;
310
311
312 typedef struct pv_hashed_entry {
313 queue_head_t qlink;
314 vm_map_offset_t va;
315 pmap_t pmap;
316 ppnum_t ppn;
317 struct pv_hashed_entry *nexth;
318 } *pv_hashed_entry_t;
319
320 The main flow difference is that the code is now aware of the rooted entry and the hashed
321 entries. Code that runs the pv list still starts with the rooted entry and then continues
322 down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
323 checks the rooted entry and then hashes and runs the hash list for the match. The hash list
324 lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
325
326 */
327
328 typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */
329 queue_head_t qlink;
330 vm_map_offset_t va; /* virtual address for mapping */
331 pmap_t pmap; /* pmap where mapping lies */
332 } *pv_rooted_entry_t;
333
334 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
335
336 pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
337
338 typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */
339 queue_head_t qlink;
340 vm_map_offset_t va;
341 pmap_t pmap;
342 ppnum_t ppn;
343 struct pv_hashed_entry *nexth;
344 } *pv_hashed_entry_t;
345
346 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
347
348 #define NPVHASH 4095 /* MUST BE 2^N - 1 */
349 pv_hashed_entry_t *pv_hash_table; /* hash lists */
350
351 uint32_t npvhash = 0;
352
353 /* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
354 #ifdef PV_DEBUG
355 #define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
356 #else
357 #define CHK_NPVHASH()
358 #endif
359
360 /*
361 * pv_list entries are kept on a list that can only be accessed
362 * with the pmap system locked (at SPLVM, not in the cpus_active set).
363 * The list is refilled from the pv_hashed_list_zone if it becomes empty.
364 */
365 pv_rooted_entry_t pv_free_list = PV_ROOTED_ENTRY_NULL; /* free list at SPLVM */
366 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
367 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
368 decl_simple_lock_data(,pv_hashed_free_list_lock)
369 decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
370 decl_simple_lock_data(,pv_hash_table_lock)
371
372 int pv_free_count = 0;
373 int pv_hashed_free_count = 0;
374 int pv_kern_free_count = 0;
375 int pv_hashed_kern_free_count = 0;
376 #define PV_HASHED_LOW_WATER_MARK 5000
377 #define PV_HASHED_KERN_LOW_WATER_MARK 100
378 #define PV_HASHED_ALLOC_CHUNK 2000
379 #define PV_HASHED_KERN_ALLOC_CHUNK 50
380 thread_call_t mapping_adjust_call;
381 static thread_call_data_t mapping_adjust_call_data;
382 uint32_t mappingrecurse = 0;
383
384 #define PV_HASHED_ALLOC(pvh_e) { \
385 simple_lock(&pv_hashed_free_list_lock); \
386 if ((pvh_e = pv_hashed_free_list) != 0) { \
387 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
388 pv_hashed_free_count--; \
389 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
390 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
391 thread_call_enter(mapping_adjust_call); \
392 } \
393 simple_unlock(&pv_hashed_free_list_lock); \
394 }
395
396 #define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
397 simple_lock(&pv_hashed_free_list_lock); \
398 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
399 pv_hashed_free_list = pvh_eh; \
400 pv_hashed_free_count += pv_cnt; \
401 simple_unlock(&pv_hashed_free_list_lock); \
402 }
403
404 #define PV_HASHED_KERN_ALLOC(pvh_e) { \
405 simple_lock(&pv_hashed_kern_free_list_lock); \
406 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
407 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
408 pv_hashed_kern_free_count--; \
409 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
410 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
411 thread_call_enter(mapping_adjust_call); \
412 } \
413 simple_unlock(&pv_hashed_kern_free_list_lock); \
414 }
415
416 #define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
417 simple_lock(&pv_hashed_kern_free_list_lock); \
418 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
419 pv_hashed_kern_free_list = pvh_eh; \
420 pv_hashed_kern_free_count += pv_cnt; \
421 simple_unlock(&pv_hashed_kern_free_list_lock); \
422 }
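
/*
 * Illustrative sketch (not part of the build): typical use of the allocation
 * macros above. A mapping path such as pmap_enter() grabs one entry, falling
 * back to the pv_hashed_list_zone (declared just below) if the free list is
 * empty, while teardown paths collect freed entries into a local chain
 * (head pvh_eh, tail pvh_et, count pv_cnt) and return it in one locked call:
 *
 *	pv_hashed_entry_t pvh_e;
 *
 *	PV_HASHED_ALLOC(pvh_e);
 *	if (PV_HASHED_ENTRY_NULL == pvh_e)
 *		pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *	...link pvh_e onto the rooted queue and its hash chain...
 *
 *	...later, after unlinking a batch of entries...
 *	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
 */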
423
424 zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
425
426 static zone_t pdpt_zone;
427
428 /*
429 * Each entry in the pv_head_table is locked by a bit in the
430 * pv_lock_table. The lock bits are accessed by the physical
431 * address of the page they lock.
432 */
433
434 char *pv_lock_table; /* pointer to array of bits */
435 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
436
437 char *pv_hash_lock_table;
438 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
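
/*
 * Sizing example: one lock bit per entry, rounded up to whole bytes. For 1GB
 * of physical memory (262,144 4K pages) pv_lock_table_size() yields
 * 262144/8 = 32KB; with the default NPVHASH of 4095 the hash lock table is
 * (4095+1)/8 = 512 bytes.
 */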
439
440 /*
441 * First and last physical addresses that we maintain any information
442 * for. Initialized to zero so that pmap operations done before
443 * pmap_init won't touch any non-existent structures.
444 */
445 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
446
447 static struct vm_object kptobj_object_store;
448 static vm_object_t kptobj;
449
450 /*
451 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
452 */
453
454 #define pa_index(pa) (i386_btop(pa))
455 #define ppn_to_pai(ppn) ((int)ppn)
456
457 #define pai_to_pvh(pai) (&pv_head_table[pai])
458 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
459 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
460
461 #define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
462 #define pvhash(idx) (&pv_hash_table[idx])
463
464 #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
465 #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
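
/*
 * Illustrative sketch (not part of the build): looking up a specific
 * [pmap, va] mapping on the hash chains with the macros above, as described
 * in the PV HASHING notes earlier in this file:
 *
 *	int pvhash_idx = pvhashidx(pmap, va);
 *	pv_hashed_entry_t pvh_e;
 *
 *	CHK_NPVHASH();
 *	lock_hash_hash(pvhash_idx);
 *	for (pvh_e = *pvhash(pvhash_idx);
 *	     pvh_e != PV_HASHED_ENTRY_NULL;
 *	     pvh_e = pvh_e->nexth) {
 *		if (pvh_e->pmap == pmap && pvh_e->va == va)
 *			break;			// found the hashed entry
 *	}
 *	unlock_hash_hash(pvhash_idx);
 */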
466
467 /*
468  * Array of physical page attributes for managed pages.
469 * One byte per physical page.
470 */
471 char *pmap_phys_attributes;
472 unsigned int last_managed_page = 0;
473
474 /*
475 * Physical page attributes. Copy bits from PTE definition.
476 */
477 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
478 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
479 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
480
481 /*
482 * Amount of virtual memory mapped by one
483 * page-directory entry.
484 */
485 #define PDE_MAPPED_SIZE (pdetova(1))
486 uint64_t pde_mapped_size;
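
/*
 * Worked example: with PAE (this pmap always runs PAE; see the "PAE enabled"
 * message in pmap_bootstrap()), a page table page holds 512 8-byte entries,
 * so pdetova(1) == 512 * 4096 == 2MB. The 64-bit path in pmap_bootstrap()
 * later sets pde_mapped_size to the same 512*4096.
 */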
487
488 /*
489 * Locking and TLB invalidation
490 */
491
492 /*
493 * Locking Protocols: (changed 2/2007 JK)
494 *
495 * There are two structures in the pmap module that need locking:
496 * the pmaps themselves, and the per-page pv_lists (which are locked
497 * by locking the pv_lock_table entry that corresponds to the pv_head
498 * for the list in question.) Most routines want to lock a pmap and
499 * then do operations in it that require pv_list locking -- however
500 * pmap_remove_all and pmap_copy_on_write operate on a physical page
501 * basis and want to do the locking in the reverse order, i.e. lock
502 * a pv_list and then go through all the pmaps referenced by that list.
503 *
504 * The system wide pmap lock has been removed. Now, paths take a lock
505 * on the pmap before changing its 'shape' and the reverse order lockers
506 * (coming in by phys ppn) take a lock on the corresponding pv and then
507 * retest to be sure nothing changed during the window before they locked
508 * and can then run up/down the pv lists holding the list lock. This also
509 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
510  * lets the pmap layer run (nearly completely) with interrupts enabled, unlike
511 */
512
513
514 /*
515 * PV locking
516 */
517
518 #define LOCK_PVH(index) { \
519 mp_disable_preemption(); \
520 lock_pvh_pai(index); \
521 }
522
523 #define UNLOCK_PVH(index) { \
524 unlock_pvh_pai(index); \
525 mp_enable_preemption(); \
526 }
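
/*
 * Illustrative sketch (not part of the build): the reverse-order retest
 * pattern described in the Locking Protocols notes above. A physical-page
 * path (e.g. pmap_page_protect) locks the pv list first, then re-checks each
 * mapping before acting on it, since the pmap's shape could have changed in
 * the window before the pv lock was taken:
 *
 *	LOCK_PVH(pai);
 *	for ( ...each pv entry pv_e on the list for pai... ) {
 *		pt_entry_t *pte = pmap_pte(pv_e->pmap, pv_e->va);
 *		if (pte == PT_ENTRY_NULL ||
 *		    (*pte & PG_FRAME) != (pmap_paddr_t)i386_ptob(pai))
 *			continue;		// mapping changed in the window
 *		...modify or remove the mapping...
 *	}
 *	UNLOCK_PVH(pai);
 */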
527
528 /*
529 * PV hash locking
530 */
531
532 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
533
534 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
535
536 #if USLOCK_DEBUG
537 extern int max_lock_loops;
538 #define LOOP_VAR \
539 unsigned int loop_count; \
540 loop_count = disable_serial_output ? max_lock_loops \
541 : max_lock_loops*100
542 #define LOOP_CHECK(msg, pmap) \
543 if (--loop_count == 0) { \
544 mp_disable_preemption(); \
545 kprintf("%s: cpu %d pmap %x\n", \
546 msg, cpu_number(), pmap); \
547 Debugger("deadlock detection"); \
548 mp_enable_preemption(); \
549 loop_count = max_lock_loops; \
550 }
551 #else /* USLOCK_DEBUG */
552 #define LOOP_VAR
553 #define LOOP_CHECK(msg, pmap)
554 #endif /* USLOCK_DEBUG */
555
556 unsigned pmap_memory_region_count;
557 unsigned pmap_memory_region_current;
558
559 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
560
561 /*
562 * Other useful macros.
563 */
564 #define current_pmap() (vm_map_pmap(current_thread()->map))
565
566 struct pmap kernel_pmap_store;
567 pmap_t kernel_pmap;
568
569 pd_entry_t high_shared_pde;
570 pd_entry_t commpage64_pde;
571
572 struct zone *pmap_zone; /* zone of pmap structures */
573
574 int pmap_debug = 0; /* flag for debugging prints */
575
576 unsigned int inuse_ptepages_count = 0;
577
578 addr64_t kernel64_cr3;
579 boolean_t no_shared_cr3 = FALSE; /* -no_shared_cr3 boot arg */
580
581
582 /*
583 * Pmap cache. Cache is threaded through ref_count field of pmap.
584 * Max will eventually be constant -- variable for experimentation.
585 */
586 int pmap_cache_max = 32;
587 int pmap_alloc_chunk = 8;
588 pmap_t pmap_cache_list;
589 int pmap_cache_count;
590 decl_simple_lock_data(,pmap_cache_lock)
591
592 extern char end;
593
594 static int nkpt;
595
596 pt_entry_t *DMAP1, *DMAP2;
597 caddr_t DADDR1;
598 caddr_t DADDR2;
599
600 static inline
601 void pmap_pvh_unlink(pv_hashed_entry_t pv);
602
603 /*
604  * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain,
605  * properly dealing with the anchor.
606  * Must be called with the hash locked; it does not unlock it.
607 */
608
609 static inline
610 void pmap_pvh_unlink(pv_hashed_entry_t pvh)
611 {
612 pv_hashed_entry_t curh;
613 pv_hashed_entry_t *pprevh;
614 int pvhash_idx;
615
616 CHK_NPVHASH();
617 pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
618
619 pprevh = pvhash(pvhash_idx);
620
621 #if PV_DEBUG
622 if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */
623 #endif
624 curh = *pprevh;
625
626 while (PV_HASHED_ENTRY_NULL != curh) {
627 if (pvh == curh)
628 break;
629 pprevh = &curh->nexth;
630 curh = curh->nexth;
631 }
632 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
633 *pprevh = pvh->nexth;
634 return;
635 }
636
637 /*
638  * For legacy (32-bit) pmaps, returns the address of the pde entry.
639  * For 64-bit pmaps, causes the pdpt page containing the pde entry to be mapped,
640  * then returns the mapped address of the pde entry in that page.
641 */
642 pd_entry_t *
643 pmap_pde(pmap_t m, vm_map_offset_t v)
644 {
645 pd_entry_t *pde;
646 if (!cpu_64bit || (m == kernel_pmap)) {
647 pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
648 } else {
649 assert(m);
650 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
651 pde = pmap64_pde(m, v);
652 }
653 return pde;
654 }
655
656
657 /*
658  * The single pml4 page per pmap is allocated at pmap create time and exists
659  * for the duration of the pmap. We allocate this page in kernel vm (to save us one
660  * level of dynamic page table mapping).
661  * This returns the address of the requested pml4 entry in the top-level page.
662 */
663 static inline
664 pml4_entry_t *
665 pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
666 {
667 return ((pml4_entry_t *)pmap->pm_hold + ((vm_offset_t)((vaddr>>PML4SHIFT)&(NPML4PG-1))));
668 }
669
670 /*
671 * maps in the pml4 page, if any, containing the pdpt entry requested
672 * and returns the address of the pdpt entry in that mapped page
673 */
674 pdpt_entry_t *
675 pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
676 {
677 pml4_entry_t newpf;
678 pml4_entry_t *pml4;
679 int i;
680
681 assert(pmap);
682 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
683 if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
684 return(0);
685 }
686
687 pml4 = pmap64_pml4(pmap, vaddr);
688
689 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
690
691 newpf = *pml4 & PG_FRAME;
692
693
694 for (i=PMAP_PDPT_FIRST_WINDOW; i < PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS; i++) {
695 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
696 return((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
697 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
698 }
699 }
700
701 current_cpu_datap()->cpu_pmap->pdpt_window_index++;
702 if (current_cpu_datap()->cpu_pmap->pdpt_window_index > (PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS-1))
703 current_cpu_datap()->cpu_pmap->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
704 pmap_store_pte(
705 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CMAP),
706 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
707 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR));
708 return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR) +
709 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
710 }
711
712 return (NULL);
713 }
714
715 /*
716 * maps in the pdpt page, if any, containing the pde entry requested
717 * and returns the address of the pde entry in that mapped page
718 */
719 pd_entry_t *
720 pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
721 {
722 pdpt_entry_t newpf;
723 pdpt_entry_t *pdpt;
724 int i;
725
726 assert(pmap);
727 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
728 if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
729 return(0);
730 }
731
732 /* if (vaddr & (1ULL << 63)) panic("neg addr");*/
733 pdpt = pmap64_pdpt(pmap, vaddr);
734
735 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
736
737 newpf = *pdpt & PG_FRAME;
738
739 for (i=PMAP_PDE_FIRST_WINDOW; i < PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS; i++) {
740 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
741 return((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
742 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
743 }
744 }
745
746 current_cpu_datap()->cpu_pmap->pde_window_index++;
747 if (current_cpu_datap()->cpu_pmap->pde_window_index > (PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS-1))
748 current_cpu_datap()->cpu_pmap->pde_window_index = PMAP_PDE_FIRST_WINDOW;
749 pmap_store_pte(
750 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CMAP),
751 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
752 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR));
753 return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR) +
754 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
755 }
756
757 return (NULL);
758 }
759
760 /*
761 * Because the page tables (top 3 levels) are mapped into per cpu windows,
762 * callers must either disable interrupts or disable preemption before calling
763 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
764 * is in one of those mapped windows and that cannot be allowed to change until
765 * the caller is done using the returned pte pointer. When done, the caller
766  * restores interrupts or preemption to its previous state, after which point the
767  * vaddr for the returned pte can no longer be used.
768 */
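
/*
 * Illustrative sketch (not part of the build): the calling convention
 * described above, using preemption disabling around a pte lookup:
 *
 *	pt_entry_t *ptep;
 *	pt_entry_t pte = 0;
 *
 *	mp_disable_preemption();
 *	ptep = pmap_pte(pmap, vaddr);
 *	if (ptep != PT_ENTRY_NULL)
 *		pte = *ptep;			// use it while still pinned
 *	mp_enable_preemption();
 *	// ptep must not be dereferenced past this point
 */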
769
770
771 /*
772 * return address of mapped pte for vaddr va in pmap pmap.
773 * must be called with pre-emption or interrupts disabled
774 * if targeted pmap is not the kernel pmap
775 * since we may be passing back a virtual address that is
776 * associated with this cpu... pre-emption or interrupts
777 * must remain disabled until the caller is done using
778  * the pointer that was passed back.
779  *
780  * Maps in the pde page, if any, containing the pte, and returns
781  * the address of the pte in that mapped page.
782 */
783 pt_entry_t *
784 pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
785 {
786 pd_entry_t *pde;
787 pd_entry_t newpf;
788 int i;
789
790 assert(pmap);
791 pde = pmap_pde(pmap,vaddr);
792
793 if (pde && ((*pde & INTEL_PTE_VALID))) {
794 if (*pde & INTEL_PTE_PS)
795 return pde;
796 if (pmap == kernel_pmap)
797 return (vtopte(vaddr)); /* compat kernel still has pte's mapped */
798 #if TESTING
799 if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
800 panic("pmap_pte: unsafe call");
801 #endif
802 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
803
804 newpf = *pde & PG_FRAME;
805
806 for (i=PMAP_PTE_FIRST_WINDOW; i < PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS; i++) {
807 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
808 return((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
809 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
810 }
811 }
812
813 current_cpu_datap()->cpu_pmap->pte_window_index++;
814 if (current_cpu_datap()->cpu_pmap->pte_window_index > (PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS-1))
815 current_cpu_datap()->cpu_pmap->pte_window_index = PMAP_PTE_FIRST_WINDOW;
816 pmap_store_pte(
817 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CMAP),
818 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
819 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR));
820 return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR) +
821 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
822 }
823
824 return(NULL);
825 }
826
827
828 /*
829 * Map memory at initialization. The physical addresses being
830 * mapped are not managed and are never unmapped.
831 *
832 * For now, VM is already on, we only need to map the
833 * specified memory.
834 */
835 vm_offset_t
836 pmap_map(
837 vm_offset_t virt,
838 vm_map_offset_t start_addr,
839 vm_map_offset_t end_addr,
840 vm_prot_t prot,
841 unsigned int flags)
842 {
843 int ps;
844
845 ps = PAGE_SIZE;
846 while (start_addr < end_addr) {
847 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
848 (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
849 virt += ps;
850 start_addr += ps;
851 }
852 return(virt);
853 }
854
855 /*
856 * Back-door routine for mapping kernel VM at initialization.
857 * Useful for mapping memory outside the range
858 * Sets no-cache, A, D.
859 * Otherwise like pmap_map.
860 */
861 vm_offset_t
862 pmap_map_bd(
863 vm_offset_t virt,
864 vm_map_offset_t start_addr,
865 vm_map_offset_t end_addr,
866 vm_prot_t prot,
867 unsigned int flags)
868 {
869 pt_entry_t template;
870 pt_entry_t *pte;
871 spl_t spl;
872
873 template = pa_to_pte(start_addr)
874 | INTEL_PTE_REF
875 | INTEL_PTE_MOD
876 | INTEL_PTE_WIRED
877 | INTEL_PTE_VALID;
878
879 if(flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
880 template |= INTEL_PTE_NCACHE;
881 if(!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
882 template |= INTEL_PTE_PTA;
883 }
884
885 if (prot & VM_PROT_WRITE)
886 template |= INTEL_PTE_WRITE;
887
888
889 while (start_addr < end_addr) {
890 spl = splhigh();
891 pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
892 if (pte == PT_ENTRY_NULL) {
893 panic("pmap_map_bd: Invalid kernel address\n");
894 }
895 pmap_store_pte(pte, template);
896 splx(spl);
897 pte_increment_pa(template);
898 virt += PAGE_SIZE;
899 start_addr += PAGE_SIZE;
900 }
901
902
903 flush_tlb();
904 return(virt);
905 }
906
907 extern char *first_avail;
908 extern vm_offset_t virtual_avail, virtual_end;
909 extern pmap_paddr_t avail_start, avail_end;
910
911 void
912 pmap_cpu_init(void)
913 {
914 /*
915 * Here early in the life of a processor (from cpu_mode_init()).
916 * If we're not in 64-bit mode, enable the global TLB feature.
917 * Note: regardless of mode we continue to set the global attribute
918 * bit in ptes for all (32-bit) global pages such as the commpage.
919 */
920 if (!cpu_64bit) {
921 set_cr4(get_cr4() | CR4_PGE);
922 }
923
924 /*
925 * Initialize the per-cpu, TLB-related fields.
926 */
927 current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
928 current_cpu_datap()->cpu_tlb_invalid = FALSE;
929 }
930
931 vm_offset_t
932 pmap_high_shared_remap(enum high_fixed_addresses e, vm_offset_t va, int sz)
933 {
934 vm_offset_t ve = pmap_index_to_virt(e);
935 pt_entry_t *ptep;
936 pmap_paddr_t pa;
937 int i;
938 spl_t s;
939
940 assert(0 == (va & PAGE_MASK)); /* expecting page aligned */
941 s = splhigh();
942 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ve);
943
944 for (i=0; i< sz; i++) {
945 pa = (pmap_paddr_t) kvtophys(va);
946 pmap_store_pte(ptep, (pa & PG_FRAME)
947 | INTEL_PTE_VALID
948 | INTEL_PTE_GLOBAL
949 | INTEL_PTE_RW
950 | INTEL_PTE_REF
951 | INTEL_PTE_MOD);
952 va+= PAGE_SIZE;
953 ptep++;
954 }
955 splx(s);
956 return ve;
957 }
958
959 vm_offset_t
960 pmap_cpu_high_shared_remap(int cpu, enum high_cpu_types e, vm_offset_t va, int sz)
961 {
962 enum high_fixed_addresses a = e + HIGH_CPU_END * cpu;
963 return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN + a, va, sz);
964 }
965
966 void pmap_init_high_shared(void);
967
968 extern vm_offset_t gdtptr, idtptr;
969
970 extern uint32_t low_intstack;
971
972 extern struct fake_descriptor ldt_desc_pattern;
973 extern struct fake_descriptor tss_desc_pattern;
974
975 extern char hi_remap_text, hi_remap_etext;
976 extern char t_zero_div;
977
978 pt_entry_t *pte_unique_base;
979
980 void
981 pmap_init_high_shared(void)
982 {
983
984 vm_offset_t haddr;
985 spl_t s;
986 #if MACH_KDB
987 struct i386_tss *ttss;
988 #endif
989
990 cpu_desc_index_t * cdi = &cpu_data_master.cpu_desc_index;
991
992 kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
993 HIGH_MEM_BASE,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
994 s = splhigh();
995 pte_unique_base = pmap_pte(kernel_pmap, (vm_map_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
996 splx(s);
997
998 if (i386_btop(&hi_remap_etext - &hi_remap_text + 1) >
999 HIGH_FIXED_TRAMPS_END - HIGH_FIXED_TRAMPS + 1)
1000 panic("tramps too large");
1001 haddr = pmap_high_shared_remap(HIGH_FIXED_TRAMPS,
1002 (vm_offset_t) &hi_remap_text, 3);
1003 kprintf("tramp: 0x%x, ",haddr);
1004 /* map gdt up high and update ptr for reload */
1005 haddr = pmap_high_shared_remap(HIGH_FIXED_GDT,
1006 (vm_offset_t) master_gdt, 1);
1007 cdi->cdi_gdt.ptr = (void *)haddr;
1008 kprintf("GDT: 0x%x, ",haddr);
1009 /* map ldt up high */
1010 haddr = pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN,
1011 (vm_offset_t) master_ldt,
1012 HIGH_FIXED_LDT_END - HIGH_FIXED_LDT_BEGIN + 1);
1013 cdi->cdi_ldt = (struct fake_descriptor *)haddr;
1014 kprintf("LDT: 0x%x, ",haddr);
1015 /* put new ldt addr into gdt */
1016 struct fake_descriptor temp_fake_desc;
1017 temp_fake_desc = ldt_desc_pattern;
1018 temp_fake_desc.offset = (vm_offset_t) haddr;
1019 fix_desc(&temp_fake_desc, 1);
1020
1021 *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_LDT)] = temp_fake_desc;
1022 *(struct fake_descriptor *) &master_gdt[sel_idx(USER_LDT)] = temp_fake_desc;
1023
1024 /* map idt up high */
1025 haddr = pmap_high_shared_remap(HIGH_FIXED_IDT,
1026 (vm_offset_t) master_idt, 1);
1027 cdi->cdi_idt.ptr = (void *)haddr;
1028 kprintf("IDT: 0x%x, ", haddr);
1029 /* remap ktss up high and put new high addr into gdt */
1030 haddr = pmap_high_shared_remap(HIGH_FIXED_KTSS,
1031 (vm_offset_t) &master_ktss, 1);
1032
1033 temp_fake_desc = tss_desc_pattern;
1034 temp_fake_desc.offset = (vm_offset_t) haddr;
1035 fix_desc(&temp_fake_desc, 1);
1036 *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = temp_fake_desc;
1037 kprintf("KTSS: 0x%x, ",haddr);
1038 #if MACH_KDB
1039 /* remap dbtss up high and put new high addr into gdt */
1040 haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS,
1041 (vm_offset_t) &master_dbtss, 1);
1042 temp_fake_desc = tss_desc_pattern;
1043 temp_fake_desc.offset = (vm_offset_t) haddr;
1044 fix_desc(&temp_fake_desc, 1);
1045 *(struct fake_descriptor *)&master_gdt[sel_idx(DEBUG_TSS)] = temp_fake_desc;
1046 ttss = (struct i386_tss *)haddr;
1047 kprintf("DBTSS: 0x%x, ",haddr);
1048 #endif /* MACH_KDB */
1049
1050 /* remap dftss up high and put new high addr into gdt */
1051 haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1052 (vm_offset_t) &master_dftss, 1);
1053 temp_fake_desc = tss_desc_pattern;
1054 temp_fake_desc.offset = (vm_offset_t) haddr;
1055 fix_desc(&temp_fake_desc, 1);
1056 *(struct fake_descriptor *) &master_gdt[sel_idx(DF_TSS)] = temp_fake_desc;
1057 kprintf("DFTSS: 0x%x\n",haddr);
1058
1059 /* remap mctss up high and put new high addr into gdt */
1060 haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1061 (vm_offset_t) &master_mctss, 1);
1062 temp_fake_desc = tss_desc_pattern;
1063 temp_fake_desc.offset = (vm_offset_t) haddr;
1064 fix_desc(&temp_fake_desc, 1);
1065 *(struct fake_descriptor *) &master_gdt[sel_idx(MC_TSS)] = temp_fake_desc;
1066 kprintf("MCTSS: 0x%x\n",haddr);
1067
1068 cpu_desc_load(&cpu_data_master);
1069 }
1070
1071
1072 /*
1073 * Bootstrap the system enough to run with virtual memory.
1074 * Map the kernel's code and data, and allocate the system page table.
1075 * Called with mapping OFF. Page_size must already be set.
1076 */
1077
1078 void
1079 pmap_bootstrap(
1080 __unused vm_offset_t load_start,
1081 boolean_t IA32e)
1082 {
1083 vm_offset_t va;
1084 pt_entry_t *pte;
1085 int i;
1086 pdpt_entry_t *pdpt;
1087 spl_t s;
1088
1089 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
1090 * known to VM */
1091 /*
1092 * The kernel's pmap is statically allocated so we don't
1093 * have to use pmap_create, which is unlikely to work
1094 * correctly at this part of the boot sequence.
1095 */
1096
1097
1098 kernel_pmap = &kernel_pmap_store;
1099 kernel_pmap->ref_count = 1;
1100 kernel_pmap->nx_enabled = FALSE;
1101 kernel_pmap->pm_task_map = TASK_MAP_32BIT;
1102 kernel_pmap->pm_obj = (vm_object_t) NULL;
1103 kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
1104 kernel_pmap->pdirbase = (pmap_paddr_t)((int)IdlePTD);
1105 pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE );
1106 kernel_pmap->pm_pdpt = pdpt;
1107 kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePDPT);
1108
1109
1110 va = (vm_offset_t)kernel_pmap->dirbase;
1111 /* setup self referential mapping(s) */
1112 for (i = 0; i< NPGPTD; i++, pdpt++) {
1113 pmap_paddr_t pa;
1114 pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
1115 pmap_store_pte(
1116 (pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i),
1117 (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
1118 INTEL_PTE_MOD | INTEL_PTE_WIRED) ;
1119 pmap_store_pte(pdpt, pa | INTEL_PTE_VALID);
1120 }
1121
1122 cpu_64bit = IA32e;
1123
1124 lo_kernel_cr3 = kernel_pmap->pm_cr3;
1125 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1126
1127 /* save the value we stuff into created pmaps to share the gdts etc */
1128 high_shared_pde = *pmap_pde(kernel_pmap, HIGH_MEM_BASE);
1129 /* make sure G bit is on for high shared pde entry */
1130 high_shared_pde |= INTEL_PTE_GLOBAL;
1131 s = splhigh();
1132 pmap_store_pte(pmap_pde(kernel_pmap, HIGH_MEM_BASE), high_shared_pde);
1133 splx(s);
1134
1135 nkpt = NKPT;
1136 OSAddAtomic(NKPT, &inuse_ptepages_count);
1137
1138 virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
1139 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1140
1141 /*
1142 * Reserve some special page table entries/VA space for temporary
1143 * mapping of pages.
1144 */
1145 #define SYSMAP(c, p, v, n) \
1146 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
1147
1148 va = virtual_avail;
1149 pte = vtopte(va);
1150
1151 for (i=0; i<PMAP_NWINDOWS; i++) {
1152 SYSMAP(caddr_t,
1153 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1154 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1155 1);
1156 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1157 }
1158
1159 /* DMAP use for debugger */
1160 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1161 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
1162
1163 virtual_avail = va;
1164
1165 if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
1166 if (0 != ((npvhash+1) & npvhash)) {
1167 kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash,NPVHASH);
1168 npvhash = NPVHASH;
1169 }
1170 } else {
1171 npvhash = NPVHASH;
1172 }
1173 printf("npvhash=%d\n",npvhash);
1174
1175 simple_lock_init(&kernel_pmap->lock, 0);
1176 simple_lock_init(&pv_hashed_free_list_lock, 0);
1177 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1178 simple_lock_init(&pv_hash_table_lock,0);
1179
1180 pmap_init_high_shared();
1181
1182 pde_mapped_size = PDE_MAPPED_SIZE;
1183
1184 if (cpu_64bit) {
1185 pdpt_entry_t *ppdpt = IdlePDPT;
1186 pdpt_entry_t *ppdpt64 = (pdpt_entry_t *)IdlePDPT64;
1187 pdpt_entry_t *ppml4 = (pdpt_entry_t *)IdlePML4;
1188 int istate = ml_set_interrupts_enabled(FALSE);
1189
1190 /*
1191 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1192 * with page bits set for the correct IA-32e operation and so that
1193 * the legacy-mode IdlePDPT is retained for slave processor start-up.
1194 * This is necessary due to the incompatible use of page bits between
1195 * 64-bit and legacy modes.
1196 */
1197 kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePML4); /* setup in start.s for us */
1198 kernel_pmap->pm_pml4 = IdlePML4;
1199 kernel_pmap->pm_pdpt = (pd_entry_t *)
1200 ((unsigned int)IdlePDPT64 | KERNBASE );
1201 #define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1202 pmap_store_pte(kernel_pmap->pm_pml4,
1203 (uint32_t)IdlePDPT64 | PAGE_BITS);
1204 pmap_store_pte((ppdpt64+0), *(ppdpt+0) | PAGE_BITS);
1205 pmap_store_pte((ppdpt64+1), *(ppdpt+1) | PAGE_BITS);
1206 pmap_store_pte((ppdpt64+2), *(ppdpt+2) | PAGE_BITS);
1207 pmap_store_pte((ppdpt64+3), *(ppdpt+3) | PAGE_BITS);
1208
1209 /*
1210  * The kernel is also mapped in the uber-space, the 4GB region starting at
1211  * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1212 */
1213 pmap_store_pte((ppml4+KERNEL_UBER_PML4_INDEX), *(ppml4+0));
1214
1215 kernel64_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1216
1217 /* Re-initialize descriptors and prepare to switch modes */
1218 cpu_desc_init64(&cpu_data_master);
1219 current_cpu_datap()->cpu_is64bit = TRUE;
1220 current_cpu_datap()->cpu_active_cr3 = kernel64_cr3;
1221
1222 pde_mapped_size = 512*4096 ;
1223
1224 ml_set_interrupts_enabled(istate);
1225 }
1226
1227 /* Sets 64-bit mode if required. */
1228 cpu_mode_init(&cpu_data_master);
1229 /* Update in-kernel CPUID information if we're now in 64-bit mode */
1230 if (IA32e)
1231 cpuid_set_info();
1232
1233 kernel_pmap->pm_hold = (vm_offset_t)kernel_pmap->pm_pml4;
1234
1235 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1236 VADDR(KPTDI,0), virtual_end);
1237 printf("PAE enabled\n");
1238 if (cpu_64bit){
1239 printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1240
1241 kprintf("Available physical space from 0x%llx to 0x%llx\n",
1242 avail_start, avail_end);
1243
1244 /*
1245 * By default for 64-bit users loaded at 4GB, share kernel mapping.
1246 * But this may be overridden by the -no_shared_cr3 boot-arg.
1247 */
1248 if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3, sizeof (no_shared_cr3))) {
1249 kprintf("Shared kernel address space disabled\n");
1250 }
1251
1252 #ifdef PMAP_TRACES
1253 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
1254 kprintf("Kernel traces for pmap operations enabled\n");
1255 }
1256 #endif /* PMAP_TRACES */
1257 }
1258
1259 void
1260 pmap_virtual_space(
1261 vm_offset_t *startp,
1262 vm_offset_t *endp)
1263 {
1264 *startp = virtual_avail;
1265 *endp = virtual_end;
1266 }
1267
1268 /*
1269 * Initialize the pmap module.
1270 * Called by vm_init, to initialize any structures that the pmap
1271 * system needs to map virtual memory.
1272 */
1273 void
1274 pmap_init(void)
1275 {
1276 register long npages;
1277 vm_offset_t addr;
1278 register vm_size_t s;
1279 vm_map_offset_t vaddr;
1280 ppnum_t ppn;
1281
1282 /*
1283 * Allocate memory for the pv_head_table and its lock bits,
1284 * the modify bit array, and the pte_page table.
1285 */
1286
1287 /*
1288 * zero bias all these arrays now instead of off avail_start
1289 * so we cover all memory
1290 */
1291
1292 npages = (long)i386_btop(avail_end);
1293 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1294 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
1295 + pv_lock_table_size(npages)
1296 + pv_hash_lock_table_size((npvhash+1))
1297 + npages);
1298
1299 s = round_page(s);
1300 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
1301 KMA_KOBJECT | KMA_PERMANENT)
1302 != KERN_SUCCESS)
1303 panic("pmap_init");
1304
1305 memset((char *)addr, 0, s);
1306
1307 #if PV_DEBUG
1308 if (0 == npvhash) panic("npvhash not initialized");
1309 #endif
1310
1311 /*
1312 * Allocate the structures first to preserve word-alignment.
1313 */
1314 pv_head_table = (pv_rooted_entry_t) addr;
1315 addr = (vm_offset_t) (pv_head_table + npages);
1316
1317 pv_hash_table = (pv_hashed_entry_t *)addr;
1318 addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1319
1320 pv_lock_table = (char *) addr;
1321 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1322
1323 pv_hash_lock_table = (char *) addr;
1324 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1325
1326 pmap_phys_attributes = (char *) addr;
1327 {
1328 unsigned int i;
1329 unsigned int pn;
1330 ppnum_t last_pn;
1331 pmap_memory_region_t *pmptr = pmap_memory_regions;
1332
1333 last_pn = (ppnum_t)i386_btop(avail_end);
1334
1335 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1336 if (pmptr->type == kEfiConventionalMemory) {
1337
1338 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1339 if (pn < last_pn) {
1340 pmap_phys_attributes[pn] |= PHYS_MANAGED;
1341
1342 if (pn > last_managed_page)
1343 last_managed_page = pn;
1344 }
1345 }
1346 }
1347 }
1348 }
1349
1350 /*
1351 * Create the zone of physical maps,
1352 * and of the physical-to-virtual entries.
1353 */
1354 s = (vm_size_t) sizeof(struct pmap);
1355 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
1356 s = (vm_size_t) sizeof(struct pv_hashed_entry);
1357 pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
1358 s = 63;
1359 pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
1360
1361 kptobj = &kptobj_object_store;
1362 _vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG), kptobj);
1363 kernel_pmap->pm_obj = kptobj;
1364
1365 /* create pv entries for kernel pages mapped by low level
1366 startup code. these have to exist so we can pmap_remove()
1367 e.g. kext pages from the middle of our addr space */
1368
1369 vaddr = (vm_map_offset_t)0;
1370 for (ppn = 0; ppn < i386_btop(avail_start) ; ppn++ ) {
1371 pv_rooted_entry_t pv_e;
1372
1373 pv_e = pai_to_pvh(ppn);
1374 pv_e->va = vaddr;
1375 vaddr += PAGE_SIZE;
1376 pv_e->pmap = kernel_pmap;
1377 queue_init(&pv_e->qlink);
1378 }
1379
1380 pmap_initialized = TRUE;
1381
1382 /*
1383 * Initialize pmap cache.
1384 */
1385 pmap_cache_list = PMAP_NULL;
1386 pmap_cache_count = 0;
1387 simple_lock_init(&pmap_cache_lock, 0);
1388
1389 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1390
1391 }
1392
1393
1394 #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1395
1396 /*
1397  * this function is only used for debugging from the vm layer
1398 */
1399 boolean_t
1400 pmap_verify_free(
1401 ppnum_t pn)
1402 {
1403 pv_rooted_entry_t pv_h;
1404 int pai;
1405 boolean_t result;
1406
1407 assert(pn != vm_page_fictitious_addr);
1408
1409 if (!pmap_initialized)
1410 return(TRUE);
1411
1412 if (pn == vm_page_guard_addr)
1413 return TRUE;
1414
1415 pai = ppn_to_pai(pn);
1416 if (!managed_page(pai))
1417 return(FALSE);
1418 pv_h = pai_to_pvh(pn);
1419 result = (pv_h->pmap == PMAP_NULL);
1420 return(result);
1421 }
1422
1423 boolean_t
1424 pmap_is_empty(
1425 pmap_t pmap,
1426 vm_map_offset_t va_start,
1427 vm_map_offset_t va_end)
1428 {
1429 vm_map_offset_t offset;
1430 ppnum_t phys_page;
1431
1432 if (pmap == PMAP_NULL) {
1433 return TRUE;
1434 }
1435
1436 /*
1437 * Check the resident page count
1438 * - if it's zero, the pmap is completely empty.
1439 * This short-circuit test prevents a virtual address scan which is
1440 * painfully slow for 64-bit spaces.
1441  * This assumes the count is correct;
1442  * the debug kernel ought perhaps to verify this with a page table walk.
1443 */
1444 if (pmap->stats.resident_count == 0)
1445 return TRUE;
1446
1447 for (offset = va_start;
1448 offset < va_end;
1449 offset += PAGE_SIZE_64) {
1450 phys_page = pmap_find_phys(pmap, offset);
1451 if (phys_page) {
1452 if (pmap != kernel_pmap &&
1453 pmap->pm_task_map == TASK_MAP_32BIT &&
1454 offset >= HIGH_MEM_BASE) {
1455 /*
1456 * The "high_shared_pde" is used to share
1457 * the entire top-most 2MB of address space
1458 * between the kernel and all 32-bit tasks.
1459 * So none of this can be removed from 32-bit
1460 * tasks.
1461 * Let's pretend there's nothing up
1462 * there...
1463 */
1464 return TRUE;
1465 }
1466 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1467 "page %d at 0x%llx\n",
1468 pmap, va_start, va_end, phys_page, offset);
1469 return FALSE;
1470 }
1471 }
1472
1473 return TRUE;
1474 }
1475
1476
1477 /*
1478 * Create and return a physical map.
1479 *
1480 * If the size specified for the map
1481 * is zero, the map is an actual physical
1482 * map, and may be referenced by the
1483 * hardware.
1484 *
1485 * If the size specified is non-zero,
1486 * the map will be used in software only, and
1487 * is bounded by that size.
1488 */
1489 pmap_t
1490 pmap_create(
1491 vm_map_size_t sz,
1492 boolean_t is_64bit)
1493 {
1494 pmap_t p;
1495 int i;
1496 vm_offset_t va;
1497 vm_size_t size;
1498 pdpt_entry_t *pdpt;
1499 pml4_entry_t *pml4p;
1500 pd_entry_t *pdp;
1501 int template;
1502 spl_t s;
1503
1504 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1505 (int) (sz>>32), (int) sz, (int) is_64bit, 0, 0);
1506
1507 size = (vm_size_t) sz;
1508
1509 /*
1510 * A software use-only map doesn't even need a map.
1511 */
1512
1513 if (size != 0) {
1514 return(PMAP_NULL);
1515 }
1516
1517 p = (pmap_t) zalloc(pmap_zone);
1518 if (PMAP_NULL == p)
1519 panic("pmap_create zalloc");
1520
1521 /* init counts now since we'll be bumping some */
1522 simple_lock_init(&p->lock, 0);
1523 p->stats.resident_count = 0;
1524 p->stats.resident_max = 0;
1525 p->stats.wired_count = 0;
1526 p->ref_count = 1;
1527 p->nx_enabled = 1;
1528 p->pm_shared = FALSE;
1529
1530 assert(!is_64bit || cpu_64bit);
1531 p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;
1532
1533 if (!cpu_64bit) {
1534 /* legacy 32 bit setup */
1535 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1536 * entry covers 1GB of addr space */
1537 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
1538 panic("pmap_create kmem_alloc_kobject");
1539 p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
1540 if ((vm_offset_t)NULL == p->pm_hold) {
1541 panic("pdpt zalloc");
1542 }
1543 pdpt = (pdpt_entry_t *) (( p->pm_hold + 31) & ~31);
1544 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)pdpt);
1545 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG))))
1546 panic("pmap_create vm_object_allocate");
1547
1548 memset((char *)p->dirbase, 0, NBPTD);
1549
1550 va = (vm_offset_t)p->dirbase;
1551 p->pdirbase = kvtophys(va);
1552
1553 template = cpu_64bit ? INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID;
1554 for (i = 0; i< NPGPTD; i++, pdpt++ ) {
1555 pmap_paddr_t pa;
1556 pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
1557 pmap_store_pte(pdpt, pa | template);
1558 }
1559
1560 /* map the high shared pde */
1561 s = splhigh();
1562 pmap_store_pte(pmap_pde(p, HIGH_MEM_BASE), high_shared_pde);
1563 splx(s);
1564
1565 } else {
1566 /* 64 bit setup */
1567
1568 /* alloc the pml4 page in kernel vm */
1569 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_hold), PAGE_SIZE))
1570 panic("pmap_create kmem_alloc_kobject pml4");
1571
1572 memset((char *)p->pm_hold, 0, PAGE_SIZE);
1573 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold);
1574
1575 OSAddAtomic(1, &inuse_ptepages_count);
1576
1577 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1578
1579 if (NULL == (p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS))))
1580 panic("pmap_create pml4 obj");
1581
1582 if (NULL == (p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS))))
1583 panic("pmap_create pdpt obj");
1584
1585 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS))))
1586 panic("pmap_create pte obj");
1587
1588 /* uber space points to uber mapped kernel */
1589 s = splhigh();
1590 pml4p = pmap64_pml4(p, 0ULL);
1591 pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4);
1592
1593
1594 if (!is_64bit) {
1595 while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) {
1596 splx(s);
1597 pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another pde entry */
1598 s = splhigh();
1599 }
1600 pmap_store_pte(pdp, high_shared_pde);
1601 }
1602 splx(s);
1603 }
1604
1605 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1606 (int) p, is_64bit, 0, 0, 0);
1607
1608 return(p);
1609 }
1610
1611 /*
1612  * The following routines implement the shared address optimization for 64-bit
1613 * users with a 4GB page zero.
1614 *
1615 * pmap_set_4GB_pagezero()
1616 * is called in the exec and fork paths to mirror the kernel's
1617 * mapping in the bottom 4G of the user's pmap. The task mapping changes
1618 * from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1619 * without doing anything if the -no_shared_cr3 boot-arg is set.
1620 *
1621 * pmap_clear_4GB_pagezero()
1622 * is called in the exec/exit paths to undo this mirror. The task mapping
1623 * reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1624 * CR3 by calling pmap_load_kernel_cr3().
1625 *
1626 * pmap_load_kernel_cr3()
1627 * loads cr3 with the kernel's page table. In addition to being called
1628 * by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1629 * when we go idle in the context of a shared map.
1630 *
1631 * Further notes on per-cpu data used:
1632 *
1633 * cpu_kernel_cr3 is the cr3 for the kernel's pmap.
1634 * This is loaded in a trampoline on entering the kernel
1635 * from a 32-bit user (or non-shared-cr3 64-bit user).
1636 * cpu_task_cr3 is the cr3 for the current thread.
1637 * This is loaded in a trampoline as we exit the kernel.
1638 * cpu_active_cr3 reflects the cr3 currently loaded.
1639 * However, the low order bit is set when the
1640 * processor is idle or interrupts are disabled
1641 * while the system pmap lock is held. It is used by
1642 * tlb shoot-down.
1643 * cpu_task_map indicates whether the task cr3 belongs to
1644 * a 32-bit, a 64-bit or a 64-bit shared map.
1645 * The latter allows the avoidance of the cr3 load
1646 * on kernel entry and exit.
1647 * cpu_tlb_invalid set TRUE when a tlb flush is requested.
1648 * If the cr3 is "inactive" (the cpu is idle or the
1649  * system-wide pmap lock is held) this is not serviced by
1650  * an IPI but at the time when the cr3 becomes "active".
1651 */
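
/*
 * Illustrative sketch (not part of the build): how a deferred flush noted in
 * cpu_tlb_invalid (see above) is picked up once a cr3 becomes "active" again
 * rather than by IPI; compare pmap_load_kernel_cr3() below, which maintains
 * the same per-cpu fields for the kernel cr3:
 *
 *	if (current_cpu_datap()->cpu_tlb_invalid) {
 *		current_cpu_datap()->cpu_tlb_invalid = FALSE;
 *		flush_tlb();			// reload cr3, flushing the TLB
 *	}
 */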
1652
1653 void
1654 pmap_set_4GB_pagezero(pmap_t p)
1655 {
1656 pdpt_entry_t *user_pdptp;
1657 pdpt_entry_t *kern_pdptp;
1658
1659 assert(p->pm_task_map != TASK_MAP_32BIT);
1660
1661 /* Kernel-shared cr3 may be disabled by boot arg. */
1662 if (no_shared_cr3)
1663 return;
1664
1665 /*
1666 * Set the bottom 4 3rd-level pte's to be the kernel's.
1667 */
1668 PMAP_LOCK(p);
1669 while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
1670 PMAP_UNLOCK(p);
1671 pmap_expand_pml4(p, 0x0);
1672 PMAP_LOCK(p);
1673 }
1674 kern_pdptp = kernel_pmap->pm_pdpt;
1675 pmap_store_pte(user_pdptp+0, *(kern_pdptp+0));
1676 pmap_store_pte(user_pdptp+1, *(kern_pdptp+1));
1677 pmap_store_pte(user_pdptp+2, *(kern_pdptp+2));
1678 pmap_store_pte(user_pdptp+3, *(kern_pdptp+3));
1679 p->pm_task_map = TASK_MAP_64BIT_SHARED;
1680 PMAP_UNLOCK(p);
1681 }
1682
1683 void
1684 pmap_clear_4GB_pagezero(pmap_t p)
1685 {
1686 pdpt_entry_t *user_pdptp;
1687
1688 if (p->pm_task_map != TASK_MAP_64BIT_SHARED)
1689 return;
1690
1691 PMAP_LOCK(p);
1692
1693 p->pm_task_map = TASK_MAP_64BIT;
1694
1695 pmap_load_kernel_cr3();
1696
1697 user_pdptp = pmap64_pdpt(p, 0x0);
1698 pmap_store_pte(user_pdptp+0, 0);
1699 pmap_store_pte(user_pdptp+1, 0);
1700 pmap_store_pte(user_pdptp+2, 0);
1701 pmap_store_pte(user_pdptp+3, 0);
1702
1703 PMAP_UNLOCK(p);
1704 }
1705
1706 void
1707 pmap_load_kernel_cr3(void)
1708 {
1709 uint64_t kernel_cr3;
1710
1711 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1712
1713 /*
1714 * Reload cr3 with the true kernel cr3.
1715 */
1716 kernel_cr3 = current_cpu_datap()->cpu_kernel_cr3;
1717 set64_cr3(kernel_cr3);
1718 current_cpu_datap()->cpu_active_cr3 = kernel_cr3;
1719 current_cpu_datap()->cpu_tlb_invalid = FALSE;
1720 __asm__ volatile("mfence");
1721 }
1722
1723 /*
1724 * Retire the given physical map from service.
1725 * Should only be called if the map contains
1726 * no valid mappings.
1727 */
1728
1729 void
1730 pmap_destroy(
1731 register pmap_t p)
1732 {
1733 register int c;
1734
1735 if (p == PMAP_NULL)
1736 return;
1737
1738 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1739 (int) p, 0, 0, 0, 0);
1740
1741 PMAP_LOCK(p);
1742
1743 c = --p->ref_count;
1744
1745 if (c == 0) {
1746 /*
1747 * If some cpu is not using the physical pmap pointer that it
1748 * is supposed to be (see set_dirbase), we might be using the
1749 * pmap that is being destroyed! Make sure we are
1750 * physically on the right pmap:
1751 */
1752 PMAP_UPDATE_TLBS(p,
1753 0x0ULL,
1754 0xFFFFFFFFFFFFF000ULL);
1755 }
1756
1757 PMAP_UNLOCK(p);
1758
1759 if (c != 0) {
1760 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1761 (int) p, 1, 0, 0, 0);
1762 return; /* still in use */
1763 }
1764
1765 /*
1766 * Free the memory maps, then the
1767 * pmap structure.
1768 */
1769 if (!cpu_64bit) {
1770 OSAddAtomic(-p->pm_obj->resident_page_count, &inuse_ptepages_count);
1771
1772 kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
1773 zfree(pdpt_zone, (void *)p->pm_hold);
1774
1775 vm_object_deallocate(p->pm_obj);
1776 } else {
1777 /* 64 bit */
1778 int inuse_ptepages = 0;
1779
1780 /* free 64 bit mode structs */
1781 inuse_ptepages++;
1782 kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE);
1783
1784 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1785 vm_object_deallocate(p->pm_obj_pml4);
1786
1787 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1788 vm_object_deallocate(p->pm_obj_pdpt);
1789
1790 inuse_ptepages += p->pm_obj->resident_page_count;
1791 vm_object_deallocate(p->pm_obj);
1792
1793 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1794 }
1795 zfree(pmap_zone, p);
1796
1797 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1798 0, 0, 0, 0, 0);
1799
1800 }
1801
1802 /*
1803 * Add a reference to the specified pmap.
1804 */
1805
1806 void
1807 pmap_reference(
1808 register pmap_t p)
1809 {
1810
1811 if (p != PMAP_NULL) {
1812 PMAP_LOCK(p);
1813 p->ref_count++;
1814 PMAP_UNLOCK(p);
1815 }
1816 }
1817
1818 /*
1819 * Remove a range of hardware page-table entries.
1820 * The entries given are the first (inclusive)
1821 * and last (exclusive) entries for the VM pages.
1822 * The virtual address is the va for the first pte.
1823 *
1824 * The pmap must be locked.
1825 * If the pmap is not the kernel pmap, the range must lie
1826 * entirely within one pte-page. This is NOT checked.
1827 * Assumes that the pte-page exists.
1828 */
1829
1830 void
1831 pmap_remove_range(
1832 pmap_t pmap,
1833 vm_map_offset_t start_vaddr,
1834 pt_entry_t *spte,
1835 pt_entry_t *epte)
1836 {
1837 register pt_entry_t *cpte;
1838 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1839 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1840 pv_hashed_entry_t pvh_e;
1841 int pvh_cnt = 0;
1842 int num_removed, num_unwired, num_found;
1843 int pai;
1844 pmap_paddr_t pa;
1845 vm_map_offset_t vaddr;
1846 int pvhash_idx;
1847 uint32_t pv_cnt;
1848
1849 num_removed = 0;
1850 num_unwired = 0;
1851 num_found = 0;
1852
1853 if (pmap != kernel_pmap &&
1854 pmap->pm_task_map == TASK_MAP_32BIT &&
1855 start_vaddr >= HIGH_MEM_BASE) {
1856 /*
1857 * The range is in the "high_shared_pde" which is shared
1858 * between the kernel and all 32-bit tasks. It holds
1859 * the 32-bit commpage but also the trampolines, GDT, etc...
1860 * so we can't let user tasks remove anything from it.
1861 */
1862 return;
1863 }
1864
1865 /* invalidate the PTEs first to "freeze" them */
1866 for (cpte = spte, vaddr = start_vaddr;
1867 cpte < epte;
1868 cpte++, vaddr += PAGE_SIZE_64) {
1869
1870 pa = pte_to_pa(*cpte);
1871 if (pa == 0)
1872 continue;
1873 num_found++;
1874
1875 if (iswired(*cpte))
1876 num_unwired++;
1877
1878 pai = pa_index(pa);
1879
1880 if (!managed_page(pai)) {
1881 /*
1882 * Outside range of managed physical memory.
1883 * Just remove the mappings.
1884 */
1885 pmap_store_pte(cpte, 0);
1886 continue;
1887 }
1888
1889 /* invalidate the PTE */
1890 pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1891 }
1892
1893 if (num_found == 0) {
1894 /* nothing was changed: we're done */
1895 goto update_counts;
1896 }
1897
1898 /* propagate the invalidates to other CPUs */
1899
1900 PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1901
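/*
 * Second pass: for each mapping frozen above, accumulate its ref/mod
 * bits, clear the PTE and unlink its pv entry; removed pv entries are
 * chained locally and returned to the free list in one batch below.
 */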
1902 for (cpte = spte, vaddr = start_vaddr;
1903 cpte < epte;
1904 cpte++, vaddr += PAGE_SIZE_64) {
1905
1906 pa = pte_to_pa(*cpte);
1907 if (pa == 0)
1908 continue;
1909
1910 pai = pa_index(pa);
1911
1912 LOCK_PVH(pai);
1913
1914 pa = pte_to_pa(*cpte);
1915 if (pa == 0) {
1916 UNLOCK_PVH(pai);
1917 continue;
1918 }
1919
1920 num_removed++;
1921
1922 /*
1923 * Get the modify and reference bits, then
1924 * nuke the entry in the page table
1925 */
1926 /* remember reference and change */
1927 pmap_phys_attributes[pai] |=
1928 (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
1929 /* completely invalidate the PTE */
1930 pmap_store_pte(cpte, 0);
1931
1932 /*
1933 * Remove the mapping from the pvlist for
1934 * this physical page.
1935 */
1936 {
1937 pv_rooted_entry_t pv_h;
1938 pv_hashed_entry_t *pprevh;
1939 ppnum_t ppn = (ppnum_t)pai;
1940
1941 pv_h = pai_to_pvh(pai);
1942 pvh_e = PV_HASHED_ENTRY_NULL;
1943 if (pv_h->pmap == PMAP_NULL)
1944 panic("pmap_remove_range: null pv_list!");
1945
1946 if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */
1947 /*
1948 * Header is the pv_rooted_entry; we can't free that. If there is a
1949 * queued entry after this one, we remove it from the ppn queue and
1950 * from the hash chain, copy it to the rooted entry, and then free it
1951 * instead of the header.
1952 */
1953
1954 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
1955 if (pv_h != (pv_rooted_entry_t)pvh_e) { /* any queued after rooted? */
1956 CHK_NPVHASH();
1957 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
1958 LOCK_PV_HASH(pvhash_idx);
1959 remque(&pvh_e->qlink);
1960 {
1961 pprevh = pvhash(pvhash_idx);
1962 if (PV_HASHED_ENTRY_NULL == *pprevh) {
1963 panic("pmap_remove_range empty hash removing rooted pv");
1964 }
1965 }
1966 pmap_pvh_unlink(pvh_e);
1967 UNLOCK_PV_HASH(pvhash_idx);
1968 pv_h->pmap = pvh_e->pmap;
1969 pv_h->va = pvh_e->va; /* dispose of pvh_e */
1970 } else { /* none queued after rooted */
1971 pv_h->pmap = PMAP_NULL;
1972 pvh_e = PV_HASHED_ENTRY_NULL;
1973 } /* any queued after rooted */
1974
1975 } else { /* rooted or not */
1976 /* Not removing the rooted pv. Find it on the hash chain, remove it
1977 * from the ppn queue and the hash chain, and free it. */
1978 CHK_NPVHASH();
1979 pvhash_idx = pvhashidx(pmap,vaddr);
1980 LOCK_PV_HASH(pvhash_idx);
1981 pprevh = pvhash(pvhash_idx);
1982 if (PV_HASHED_ENTRY_NULL == *pprevh) {
1983 panic("pmap_remove_range empty hash removing hashed pv");
1984 }
1985 pvh_e = *pprevh;
1986 pmap_pv_hashlist_walks++;
1987 pv_cnt = 0;
1988 while (PV_HASHED_ENTRY_NULL != pvh_e) {
1989 pv_cnt++;
1990 if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break;
1991 pprevh = &pvh_e->nexth;
1992 pvh_e = pvh_e->nexth;
1993 }
1994 pmap_pv_hashlist_cnts += pv_cnt;
1995 if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
1996 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash");
1997 *pprevh = pvh_e->nexth;
1998 remque(&pvh_e->qlink);
1999 UNLOCK_PV_HASH(pvhash_idx);
2000
2001 } /* rooted or not */
2002
2003 UNLOCK_PVH(pai);
2004
2005 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2006 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2007 pvh_eh = pvh_e;
2008
2009 if (pvh_et == PV_HASHED_ENTRY_NULL) {
2010 pvh_et = pvh_e;
2011 }
2012
2013 pvh_cnt++;
2014 }
2015
2016 } /* removing mappings for this phy page */
2017 } /* for loop */
2018
2019 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2020 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2021 }
2022
2023 update_counts:
2024 /*
2025 * Update the counts
2026 */
2027 #if TESTING
2028 if (pmap->stats.resident_count < num_removed)
2029 panic("pmap_remove_range: resident_count");
2030 #endif
2031 assert(pmap->stats.resident_count >= num_removed);
2032 OSAddAtomic(-num_removed, &pmap->stats.resident_count);
2033
2034 #if TESTING
2035 if (pmap->stats.wired_count < num_unwired)
2036 panic("pmap_remove_range: wired_count");
2037 #endif
2038 assert(pmap->stats.wired_count >= num_unwired);
2039 OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
2040
2041 return;
2042 }
2043
2044 /*
2045 * Remove phys addr if mapped in specified map
2046 *
2047 */
2048 void
2049 pmap_remove_some_phys(
2050 __unused pmap_t map,
2051 __unused ppnum_t pn)
2052 {
2053
2054 /* Implement to support working set code */
2055
2056 }
2057
2058 /*
2059 * Remove the given range of addresses
2060 * from the specified map.
2061 *
2062 * It is assumed that the start and end are properly
2063 * rounded to the hardware page size.
2064 */
2065
2066
2067 void
2068 pmap_remove(
2069 pmap_t map,
2070 addr64_t s64,
2071 addr64_t e64)
2072 {
2073 pt_entry_t *pde;
2074 pt_entry_t *spte, *epte;
2075 addr64_t l64;
2076 addr64_t orig_s64;
2077 uint64_t deadline;
2078
2079 pmap_intr_assert();
2080
2081 if (map == PMAP_NULL || s64 == e64)
2082 return;
2083
2084 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
2085 (int) map,
2086 (int) (s64>>32), (int) s64,
2087 (int) (e64>>32), (int) e64);
2088
2089 PMAP_LOCK(map);
2090
2091 #if 0
2092 /*
2093 * Check that address range in the kernel does not overlap the stacks.
2094 * We initialize local static min/max variables once to avoid making
2095 * 2 function calls for every remove. Note also that these functions
2096 * both return 0 before kernel stacks have been initialized, and hence
2097 * the panic is not triggered in this case.
2098 */
2099 if (map == kernel_pmap) {
2100 static vm_offset_t kernel_stack_min = 0;
2101 static vm_offset_t kernel_stack_max = 0;
2102
2103 if (kernel_stack_min == 0) {
2104 kernel_stack_min = min_valid_stack_address();
2105 kernel_stack_max = max_valid_stack_address();
2106 }
2107 if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
2108 (kernel_stack_min < e64 && e64 <= kernel_stack_max))
2109 panic("pmap_remove() attempted in kernel stack");
2110 }
2111 #else
2112
2113 /*
2114 * The values of kernel_stack_min and kernel_stack_max are no longer
2115 * relevant now that we allocate kernel stacks anywhere in the kernel map,
2116 * so the old code above no longer applies. If we wanted to check that
2117 * we weren't removing a mapping of a page in a kernel stack we'd have to
2118 * mark the PTE with an unused bit and check that here.
2119 */
2120
2121 #endif
2122
2123 deadline = rdtsc64() + max_preemption_latency_tsc;
2124
2125 orig_s64 = s64;
2126
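/*
 * Remove the range one pde's worth at a time; when the TSC deadline
 * passes, drop and retake the pmap lock to bound how long we hold it
 * and keep preemption latency in check.
 */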
2127 while (s64 < e64) {
2128 l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1);
2129 if (l64 > e64)
2130 l64 = e64;
2131 pde = pmap_pde(map, s64);
2132
2133 if (pde && (*pde & INTEL_PTE_VALID)) {
2134 spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1)));
2135 spte = &spte[ptenum(s64)];
2136 epte = &spte[intel_btop(l64-s64)];
2137
2138 pmap_remove_range(map, s64, spte, epte);
2139 }
2140 s64 = l64;
2141 pde++;
2142
2143 if (s64 < e64 && rdtsc64() >= deadline) {
2144 PMAP_UNLOCK(map)
2145 PMAP_LOCK(map)
2146
2147 deadline = rdtsc64() + max_preemption_latency_tsc;
2148 }
2149
2150 }
2151
2152 PMAP_UNLOCK(map);
2153
2154 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
2155 (int) map, 0, 0, 0, 0);
2156
2157 }
2158
2159 /*
2160 * Routine: pmap_page_protect
2161 *
2162 * Function:
2163 * Lower the permission for all mappings to a given
2164 * page.
2165 */
2166 void
2167 pmap_page_protect(
2168 ppnum_t pn,
2169 vm_prot_t prot)
2170 {
2171 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
2172 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
2173 pv_hashed_entry_t nexth;
2174 int pvh_cnt = 0;
2175 pv_rooted_entry_t pv_h;
2176 pv_rooted_entry_t pv_e;
2177 pv_hashed_entry_t pvh_e;
2178 pt_entry_t *pte;
2179 int pai;
2180 register pmap_t pmap;
2181 boolean_t remove;
2182 int pvhash_idx;
2183
2184 pmap_intr_assert();
2185 assert(pn != vm_page_fictitious_addr);
2186 if (pn == vm_page_guard_addr)
2187 return;
2188
2189 pai = ppn_to_pai(pn);
2190
2191 if (!managed_page(pai)) {
2192 /*
2193 * Not a managed page.
2194 */
2195 return;
2196 }
2197
2198 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
2199 (int) pn, (int) prot, 0, 0, 0);
2200
2201 /*
2202 * Determine the new protection.
2203 */
2204 switch (prot) {
2205 case VM_PROT_READ:
2206 case VM_PROT_READ|VM_PROT_EXECUTE:
2207 remove = FALSE;
2208 break;
2209 case VM_PROT_ALL:
2210 return; /* nothing to do */
2211 default:
2212 remove = TRUE;
2213 break;
2214 }
2215
2216 pv_h = pai_to_pvh(pai);
2217
2218 LOCK_PVH(pai);
2219
2220
2221 /*
2222 * Walk down PV list, changing or removing all mappings.
2223 */
2224 if (pv_h->pmap != PMAP_NULL) {
2225
2226 pv_e = pv_h;
2227 pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */
2228
2229 do {
2230 register vm_map_offset_t vaddr;
2231 pmap = pv_e->pmap;
2232
2233 vaddr = pv_e->va;
2234 pte = pmap_pte(pmap, vaddr);
2235
2236 if (0 == pte) {
2237 panic("pmap_page_protect: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx, prot: %d kernel_pmap: %p", pmap, pn, vaddr, prot, kernel_pmap);
2238 }
2239
2240 nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); /* if there is one */
2241
2242 /*
2243 * Remove the mapping if new protection is NONE
2244 * or if write-protecting a kernel mapping.
2245 */
2246 if (remove || pmap == kernel_pmap) {
2247 /*
2248 * Remove the mapping, collecting any modify bits.
2249 */
2250 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2251
2252 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2253
2254 pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
2255
2256 pmap_store_pte(pte, 0);
2257
2258 #if TESTING
2259 if (pmap->stats.resident_count < 1)
2260 panic("pmap_page_protect: resident_count");
2261 #endif
2262 assert(pmap->stats.resident_count >= 1);
2263 OSAddAtomic(-1, &pmap->stats.resident_count);
2264
2265 /*
2266 * Deal with the pv_rooted_entry.
2267 */
2268
2269 if (pv_e == pv_h) {
2270 /*
2271 * Fix up head later.
2272 */
2273 pv_h->pmap = PMAP_NULL;
2274 }
2275 else {
2276 /*
2277 * Delete this entry.
2278 */
2279 CHK_NPVHASH();
2280 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2281 LOCK_PV_HASH(pvhash_idx);
2282 remque(&pvh_e->qlink);
2283 pmap_pvh_unlink(pvh_e);
2284 UNLOCK_PV_HASH(pvhash_idx);
2285
2286 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2287 pvh_eh = pvh_e;
2288
2289 if (pvh_et == PV_HASHED_ENTRY_NULL)
2290 pvh_et = pvh_e;
2291 pvh_cnt++;
2292 }
2293 } else {
2294 /*
2295 * Write-protect.
2296 */
2297 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
2298 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2299 }
2300
2301 pvh_e = nexth;
2302 } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
2303
2304
2305 /*
2306 * If pv_head mapping was removed, fix it up.
2307 */
2308
2309 if (pv_h->pmap == PMAP_NULL) {
2310 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2311
2312 if (pvh_e != (pv_hashed_entry_t)pv_h) {
2313 CHK_NPVHASH();
2314 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2315 LOCK_PV_HASH(pvhash_idx);
2316 remque(&pvh_e->qlink);
2317 pmap_pvh_unlink(pvh_e);
2318 UNLOCK_PV_HASH(pvhash_idx);
2319 pv_h->pmap = pvh_e->pmap;
2320 pv_h->va = pvh_e->va;
2321 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2322 pvh_eh = pvh_e;
2323
2324 if (pvh_et == PV_HASHED_ENTRY_NULL)
2325 pvh_et = pvh_e;
2326 pvh_cnt++;
2327 }
2328 }
2329 }
2330 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2331 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2332 }
2333
2334 UNLOCK_PVH(pai);
2335
2336 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
2337 0, 0, 0, 0, 0);
2338
2339 }
2340
2341
2342 /*
2343 * Routine:
2344 * pmap_disconnect
2345 *
2346 * Function:
2347 * Disconnect all mappings for this page and return reference and change status
2348 * in generic format.
2349 *
2350 */
2351 unsigned int pmap_disconnect(
2352 ppnum_t pa)
2353 {
2354 pmap_page_protect(pa, 0); /* disconnect the page */
2355 return (pmap_get_refmod(pa)); /* return ref/chg status */
2356 }
2357
2358 /*
2359 * Set the physical protection on the
2360 * specified range of this map as requested.
2361 * Will not increase permissions.
2362 */
2363 void
2364 pmap_protect(
2365 pmap_t map,
2366 vm_map_offset_t sva,
2367 vm_map_offset_t eva,
2368 vm_prot_t prot)
2369 {
2370 register pt_entry_t *pde;
2371 register pt_entry_t *spte, *epte;
2372 vm_map_offset_t lva;
2373 vm_map_offset_t orig_sva;
2374 boolean_t set_NX;
2375 int num_found = 0;
2376
2377 pmap_intr_assert();
2378
2379 if (map == PMAP_NULL)
2380 return;
2381
2382 if (prot == VM_PROT_NONE) {
2383 pmap_remove(map, sva, eva);
2384 return;
2385 }
2386
2387 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
2388 (int) map,
2389 (int) (sva>>32), (int) sva,
2390 (int) (eva>>32), (int) eva);
2391
2392 if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled )
2393 set_NX = FALSE;
2394 else
2395 set_NX = TRUE;
2396
2397 PMAP_LOCK(map);
2398
2399 orig_sva = sva;
2400 while (sva < eva) {
2401 lva = (sva + pde_mapped_size) & ~(pde_mapped_size-1);
2402 if (lva > eva)
2403 lva = eva;
2404 pde = pmap_pde(map, sva);
2405 if (pde && (*pde & INTEL_PTE_VALID)) {
2406 spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size-1)));
2407 spte = &spte[ptenum(sva)];
2408 epte = &spte[intel_btop(lva-sva)];
2409
2410 while (spte < epte) {
2411
2412 if (*spte & INTEL_PTE_VALID) {
2413
2414 if (prot & VM_PROT_WRITE)
2415 pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
2416 else
2417 pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));
2418
2419 if (set_NX == TRUE)
2420 pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
2421 else
2422 pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));
2423
2424 num_found++;
2425 }
2426 spte++;
2427 }
2428 }
2429 sva = lva;
2430 }
2431 if (num_found)
2432 PMAP_UPDATE_TLBS(map, orig_sva, eva);
2433
2434 PMAP_UNLOCK(map);
2435
2436 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
2437 0, 0, 0, 0, 0);
2438
2439 }
2440
2441 /* Map a (possibly) autogenned block */
2442 void
2443 pmap_map_block(
2444 pmap_t pmap,
2445 addr64_t va,
2446 ppnum_t pa,
2447 uint32_t size,
2448 vm_prot_t prot,
2449 int attr,
2450 __unused unsigned int flags)
2451 {
2452 uint32_t page;
2453
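/*
 * 'size' is a page count: enter each 4K page individually, wired,
 * with the caller's protection and cache attributes.
 */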
2454 for (page = 0; page < size; page++) {
2455 pmap_enter(pmap, va, pa, prot, attr, TRUE);
2456 va += PAGE_SIZE;
2457 pa++;
2458 }
2459 }
2460
2461
2462 /*
2463 * Insert the given physical page (p) at
2464 * the specified virtual address (v) in the
2465 * target physical map with the protection requested.
2466 *
2467 * If specified, the page will be wired down, meaning
2468 * that the related pte cannot be reclaimed.
2469 *
2470 * NB: This is the only routine which MAY NOT lazy-evaluate
2471 * or lose information. That is, this routine must actually
2472 * insert this page into the given map NOW.
2473 */
2474 void
2475 pmap_enter(
2476 register pmap_t pmap,
2477 vm_map_offset_t vaddr,
2478 ppnum_t pn,
2479 vm_prot_t prot,
2480 unsigned int flags,
2481 boolean_t wired)
2482 {
2483 register pt_entry_t *pte;
2484 register pv_rooted_entry_t pv_h;
2485 register int pai;
2486 pv_hashed_entry_t pvh_e;
2487 pv_hashed_entry_t pvh_new;
2488 pv_hashed_entry_t *hashp;
2489 pt_entry_t template;
2490 pmap_paddr_t old_pa;
2491 pmap_paddr_t pa = (pmap_paddr_t)i386_ptob(pn);
2492 boolean_t need_tlbflush = FALSE;
2493 boolean_t set_NX;
2494 char oattr;
2495 int pvhash_idx;
2496 uint32_t pv_cnt;
2497 boolean_t old_pa_locked;
2498
2499 pmap_intr_assert();
2500 assert(pn != vm_page_fictitious_addr);
2501 if (pmap_debug)
2502 printf("pmap(%qx, %x)\n", vaddr, pn);
2503 if (pmap == PMAP_NULL)
2504 return;
2505 if (pn == vm_page_guard_addr)
2506 return;
2507
2508 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2509 (int) pmap,
2510 (int) (vaddr>>32), (int) vaddr,
2511 (int) pn, prot);
2512
2513 if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled )
2514 set_NX = FALSE;
2515 else
2516 set_NX = TRUE;
2517
2518 /*
2519 * Must allocate a new pvlist entry while we're unlocked;
2520 * zalloc may cause pageout (which will lock the pmap system).
2521 * If we determine we need a pvlist entry, we will unlock
2522 * and allocate one. Then we will retry, throwing away
2523 * the allocated entry later (if we no longer need it).
2524 */
2525
2526 pvh_new = PV_HASHED_ENTRY_NULL;
2527 Retry:
2528 pvh_e = PV_HASHED_ENTRY_NULL;
2529
2530 PMAP_LOCK(pmap);
2531
2532 /*
2533 * Expand pmap to include this pte. Assume that
2534 * pmap is always expanded to include enough hardware
2535 * pages to map one VM page.
2536 */
2537
2538 while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
2539 /*
2540 * Must unlock to expand the pmap.
2541 */
2542 PMAP_UNLOCK(pmap);
2543 pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */
2544 PMAP_LOCK(pmap);
2545 }
2546
2547 old_pa = pte_to_pa(*pte);
2548 pai = pa_index(old_pa);
2549 old_pa_locked = FALSE;
2550
2551 /*
2552 * If we have a previous managed page, lock the pv entry now. After
2553 * we lock it, check whether the mapping was removed while we waited
2554 * for the lock and, if so, drop the lock.
2555 */
2556
2557 if ((0 != old_pa) && managed_page(pai)) {
2558 LOCK_PVH(pai);
2559 old_pa_locked = TRUE;
2560 old_pa = pte_to_pa(*pte);
2561 if (0 == old_pa) {
2562 UNLOCK_PVH(pai); /* some other path beat us to it */
2563 old_pa_locked = FALSE;
2564 }
2565 }
2566
2567
2568 /*
2569 * Special case if the incoming physical page is already mapped
2570 * at this address.
2571 */
2572 if (old_pa == pa) {
2573
2574 /*
2575 * May be changing its wired attribute or protection
2576 */
2577
2578 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2579
2580 if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
2581 if(!(flags & VM_MEM_GUARDED))
2582 template |= INTEL_PTE_PTA;
2583 template |= INTEL_PTE_NCACHE;
2584 }
2585
2586 if (pmap != kernel_pmap)
2587 template |= INTEL_PTE_USER;
2588 if (prot & VM_PROT_WRITE)
2589 template |= INTEL_PTE_WRITE;
2590
2591 if (set_NX == TRUE)
2592 template |= INTEL_PTE_NX;
2593
2594 if (wired) {
2595 template |= INTEL_PTE_WIRED;
2596 if (!iswired(*pte))
2597 OSAddAtomic(+1, &pmap->stats.wired_count);
2598 }
2599 else {
2600 if (iswired(*pte)) {
2601 assert(pmap->stats.wired_count >= 1);
2602 OSAddAtomic(-1, &pmap->stats.wired_count);
2603 }
2604 }
2605
2606 /* store modified PTE and preserve RC bits */
2607 pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2608 if (old_pa_locked) {
2609 UNLOCK_PVH(pai);
2610 old_pa_locked = FALSE;
2611 }
2612 need_tlbflush = TRUE;
2613 goto Done;
2614 }
2615
2616 /*
2617 * Outline of code from here:
2618 * 1) If va was mapped, update TLBs, remove the mapping
2619 * and remove old pvlist entry.
2620 * 2) Add pvlist entry for new mapping
2621 * 3) Enter new mapping.
2622 *
2623 * If the old physical page is not managed step 1) is skipped
2624 * (except for updating the TLBs), and the mapping is
2625 * overwritten at step 3). If the new physical page is not
2626 * managed, step 2) is skipped.
2627 */
2628
2629 if (old_pa != (pmap_paddr_t) 0) {
2630
2631 /*
2632 * Don't do anything to pages outside valid memory here.
2633 * Instead convince the code that enters a new mapping
2634 * to overwrite the old one.
2635 */
2636
2637 /* invalidate the PTE */
2638 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2639 /* propagate invalidate everywhere */
2640 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2641 /* remember reference and change */
2642 oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2643 /* completely invalidate the PTE */
2644 pmap_store_pte(pte, 0);
2645
2646 if (managed_page(pai)) {
2647 #if TESTING
2648 if (pmap->stats.resident_count < 1)
2649 panic("pmap_enter: resident_count");
2650 #endif
2651 assert(pmap->stats.resident_count >= 1);
2652 OSAddAtomic(-1, &pmap->stats.resident_count);
2653
2654 if (iswired(*pte)) {
2655
2656 #if TESTING
2657 if (pmap->stats.wired_count < 1)
2658 panic("pmap_enter: wired_count");
2659 #endif
2660 assert(pmap->stats.wired_count >= 1);
2661 OSAddAtomic(-1, &pmap->stats.wired_count);
2662 }
2663
2664 pmap_phys_attributes[pai] |= oattr;
2665 /*
2666 * Remove the mapping from the pvlist for
2667 * this physical page.
2668 * We'll end up with either a rooted pv or a
2669 * hashed pv
2670 */
2671 {
2672
2673 pv_h = pai_to_pvh(pai);
2674
2675 if (pv_h->pmap == PMAP_NULL) {
2676 panic("pmap_enter: null pv_list!");
2677 }
2678
2679 if (pv_h->va == vaddr && pv_h->pmap == pmap) {
2680 /*
2681 * Header is the pv_rooted_entry.
2682 * If there is a next one, copy it to the
2683 * header and free the next one (we cannot
2684 * free the header)
2685 */
2686 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2687 if (pvh_e != (pv_hashed_entry_t)pv_h) {
2688 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
2689 LOCK_PV_HASH(pvhash_idx);
2690 remque(&pvh_e->qlink);
2691 pmap_pvh_unlink(pvh_e);
2692 UNLOCK_PV_HASH(pvhash_idx);
2693 pv_h->pmap = pvh_e->pmap;
2694 pv_h->va = pvh_e->va;
2695 }
2696 else {
2697 pv_h->pmap = PMAP_NULL;
2698 pvh_e = PV_HASHED_ENTRY_NULL;
2699 }
2700 }
2701 else {
2702 pv_hashed_entry_t *pprevh;
2703 ppnum_t old_ppn;
2704 /* It wasn't the rooted pv: hash it, find it on the chain, and unlink it */
2705 old_ppn = (ppnum_t)pa_index(old_pa);
2706 CHK_NPVHASH();
2707 pvhash_idx = pvhashidx(pmap,vaddr);
2708 LOCK_PV_HASH(pvhash_idx);
2709 pprevh = pvhash(pvhash_idx);
2710 #if PV_DEBUG
2711 if (NULL==pprevh)panic("pmap enter 1");
2712 #endif
2713 pvh_e = *pprevh;
2714 pmap_pv_hashlist_walks++;
2715 pv_cnt = 0;
2716 while (PV_HASHED_ENTRY_NULL != pvh_e) {
2717 pv_cnt++;
2718 if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break;
2719 pprevh = &pvh_e->nexth;
2720 pvh_e = pvh_e->nexth;
2721 }
2722 pmap_pv_hashlist_cnts += pv_cnt;
2723 if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2724 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list");
2725 if(NULL==pprevh)panic("pmap enter 2");
2726 *pprevh = pvh_e->nexth;
2727 remque(&pvh_e->qlink);
2728 UNLOCK_PV_HASH(pvhash_idx);
2729 }
2730 }
2731 }
2732 else {
2733 /*
2734 * old_pa is not managed.
2735 * Do removal part of accounting.
2736 */
2737
2738 if (iswired(*pte)) {
2739 assert(pmap->stats.wired_count >= 1);
2740 OSAddAtomic(-1, &pmap->stats.wired_count);
2741 }
2742 }
2743 }
2744
2745 /*
2746 * If we had a previously managed page locked, unlock it now.
2747 */
2748
2749 if (old_pa_locked) {
2750 UNLOCK_PVH(pai);
2751 old_pa_locked = FALSE;
2752 }
2753
2754 pai = pa_index(pa); /* now working with new incoming phys page */
2755 if (managed_page(pai)) {
2756
2757 /*
2758 * Step 2) Enter the mapping in the PV list for this
2759 * physical page.
2760 */
2761 pv_h = pai_to_pvh(pai);
2762
2763 LOCK_PVH(pai);
2764
2765 if (pv_h->pmap == PMAP_NULL) {
2766 /*
2767 * No mappings yet, use rooted pv
2768 */
2769 pv_h->va = vaddr;
2770 pv_h->pmap = pmap;
2771 queue_init(&pv_h->qlink);
2772 }
2773 else {
2774 /*
2775 * Add new pv_hashed_entry after header.
2776 */
2777 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2778 pvh_e = pvh_new;
2779 pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */
2780 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2781 PV_HASHED_ALLOC(pvh_e);
2782 if (PV_HASHED_ENTRY_NULL == pvh_e) {
2783 /* The pv free list is empty.
2784 * If we are on the kernel pmap we'll use one of the special private
2785 * kernel pv_e's; otherwise, we need to unlock everything, zalloc a
2786 * pv_e, and restart, bringing the new pv_e in with us.
2787 */
2788 if (kernel_pmap == pmap) {
2789 PV_HASHED_KERN_ALLOC(pvh_e);
2790 } else {
2791 UNLOCK_PVH(pai);
2792 PMAP_UNLOCK(pmap);
2793 pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2794 goto Retry;
2795 }
2796 }
2797 }
2798
2799 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion");
2800 pvh_e->va = vaddr;
2801 pvh_e->pmap = pmap;
2802 pvh_e->ppn = pn;
2803 CHK_NPVHASH();
2804 pvhash_idx = pvhashidx(pmap,vaddr);
2805 LOCK_PV_HASH(pvhash_idx);
2806 insque(&pvh_e->qlink, &pv_h->qlink);
2807 hashp = pvhash(pvhash_idx);
2808 #if PV_DEBUG
2809 if(NULL==hashp)panic("pmap_enter 4");
2810 #endif
2811 pvh_e->nexth = *hashp;
2812 *hashp = pvh_e;
2813 UNLOCK_PV_HASH(pvhash_idx);
2814
2815 /*
2816 * Remember that we used the pvlist entry.
2817 */
2818 pvh_e = PV_HASHED_ENTRY_NULL;
2819 }
2820
2821 /*
2822 * only count the mapping
2823 * for 'managed memory'
2824 */
2825 OSAddAtomic(+1, &pmap->stats.resident_count);
2826 if (pmap->stats.resident_count > pmap->stats.resident_max) {
2827 pmap->stats.resident_max = pmap->stats.resident_count;
2828 }
2829 }
2830
2831 /*
2832 * Step 3) Enter the mapping.
2833 *
2834 * Build a template to speed up entering -
2835 * only the pfn changes.
2836 */
2837 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2838
2839 if (flags & VM_MEM_NOT_CACHEABLE) {
2840 if(!(flags & VM_MEM_GUARDED))
2841 template |= INTEL_PTE_PTA;
2842 template |= INTEL_PTE_NCACHE;
2843 }
2844
2845 if (pmap != kernel_pmap)
2846 template |= INTEL_PTE_USER;
2847 if (prot & VM_PROT_WRITE)
2848 template |= INTEL_PTE_WRITE;
2849
2850 if (set_NX == TRUE)
2851 template |= INTEL_PTE_NX;
2852
2853 if (wired) {
2854 template |= INTEL_PTE_WIRED;
2855 OSAddAtomic(+1, &pmap->stats.wired_count);
2856 }
2857 pmap_store_pte(pte, template);
2858
2859 /* If this was a managed page, we delayed unlocking the pv until here
2860 * to prevent pmap_page_protect et al. from finding it until the pte
2861 * has been stored. */
2862
2863 if (managed_page(pai)) {
2864 UNLOCK_PVH(pai);
2865 }
2866
2867 Done:
2868 if (need_tlbflush == TRUE)
2869 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2870
2871 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2872 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
2873 }
2874
2875 if (pvh_new != PV_HASHED_ENTRY_NULL) {
2876 PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2877 }
2878
2879 PMAP_UNLOCK(pmap);
2880 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
2881 }
2882
2883 /*
2884 * Routine: pmap_change_wiring
2885 * Function: Change the wiring attribute for a map/virtual-address
2886 * pair.
2887 * In/out conditions:
2888 * The mapping must already exist in the pmap.
2889 */
2890 void
2891 pmap_change_wiring(
2892 register pmap_t map,
2893 vm_map_offset_t vaddr,
2894 boolean_t wired)
2895 {
2896 register pt_entry_t *pte;
2897
2898 /*
2899 * We must grab the pmap system lock because we may
2900 * change a pte_page queue.
2901 */
2902 PMAP_LOCK(map);
2903
2904 if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
2905 panic("pmap_change_wiring: pte missing");
2906
2907 if (wired && !iswired(*pte)) {
2908 /*
2909 * wiring down mapping
2910 */
2911 OSAddAtomic(+1, &map->stats.wired_count);
2912 pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
2913 }
2914 else if (!wired && iswired(*pte)) {
2915 /*
2916 * unwiring mapping
2917 */
2918 assert(map->stats.wired_count >= 1);
2919 OSAddAtomic(-1, &map->stats.wired_count);
2920 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
2921 }
2922
2923 PMAP_UNLOCK(map);
2924 }
2925
2926
2927 /*
2928 * Routine: pmap_extract
2929 * Function:
2930 * Extract the physical page address associated
2931 * with the given map/virtual_address pair.
2932 * Changed to a shim for backwards compatibility; it will not
2933 * work for 64-bit systems. Some old drivers that we cannot
2934 * change need this.
2935 */
2936
2937 vm_offset_t
2938 pmap_extract(
2939 register pmap_t pmap,
2940 vm_map_offset_t vaddr)
2941 {
2942 ppnum_t ppn;
2943 vm_offset_t paddr;
2944
2945 paddr = (vm_offset_t)0;
2946 ppn = pmap_find_phys(pmap, vaddr);
2947
2948 if (ppn) {
2949 paddr = ((vm_offset_t)i386_ptob(ppn)) | ((vm_offset_t)vaddr & INTEL_OFFMASK);
2950 }
2951 return (paddr);
2952 }
2953
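/*
 * Allocate, zero and wire a new pdpt page and install it in the pml4
 * slot covering 'vaddr'; backs out if another thread expanded this
 * pmap first.
 */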
2954 void
2955 pmap_expand_pml4(
2956 pmap_t map,
2957 vm_map_offset_t vaddr)
2958 {
2959 register vm_page_t m;
2960 register pmap_paddr_t pa;
2961 uint64_t i;
2962 spl_t spl;
2963 ppnum_t pn;
2964 pml4_entry_t *pml4p;
2965
2966 if (kernel_pmap == map) panic("expand kernel pml4");
2967
2968 spl = splhigh();
2969 pml4p = pmap64_pml4(map, vaddr);
2970 splx(spl);
2971 if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");
2972
2973 /*
2974 * Allocate a VM page for the pml4 page
2975 */
2976 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2977 VM_PAGE_WAIT();
2978
2979 /*
2980 * put the page into the pmap's obj list so it
2981 * can be found later.
2982 */
2983 pn = m->phys_page;
2984 pa = i386_ptob(pn);
2985 i = pml4idx(map, vaddr);
2986
2987 /*
2988 * Zero the page.
2989 */
2990 pmap_zero_page(pn);
2991
2992 vm_page_lockspin_queues();
2993 vm_page_wire(m);
2994 vm_page_unlock_queues();
2995
2996 OSAddAtomic(1, &inuse_ptepages_count);
2997
2998 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2999 vm_object_lock(map->pm_obj_pml4);
3000
3001 PMAP_LOCK(map);
3002 /*
3003 * See if someone else expanded us first
3004 */
3005 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
3006 PMAP_UNLOCK(map);
3007 vm_object_unlock(map->pm_obj_pml4);
3008
3009 VM_PAGE_FREE(m);
3010
3011 OSAddAtomic(-1, &inuse_ptepages_count);
3012 return;
3013 }
3014
3015 #if 0 /* DEBUG */
3016 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
3017 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3018 map, map->pm_obj_pml4, vaddr, i);
3019 }
3020 #endif
3021 vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
3022 vm_object_unlock(map->pm_obj_pml4);
3023
3024 /*
3025 * Set the page directory entry for this page table.
3026 */
3027 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
3028
3029 pmap_store_pte(pml4p, pa_to_pte(pa)
3030 | INTEL_PTE_VALID
3031 | INTEL_PTE_USER
3032 | INTEL_PTE_WRITE);
3033
3034 PMAP_UNLOCK(map);
3035
3036 return;
3037
3038 }
3039
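/*
 * Allocate, zero and wire a new page-directory page and install it in
 * the pdpt slot covering 'vaddr', expanding the pml4 level first if
 * necessary; backs out if another thread got there first.
 */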
3040 void
3041 pmap_expand_pdpt(
3042 pmap_t map,
3043 vm_map_offset_t vaddr)
3044 {
3045 register vm_page_t m;
3046 register pmap_paddr_t pa;
3047 uint64_t i;
3048 spl_t spl;
3049 ppnum_t pn;
3050 pdpt_entry_t *pdptp;
3051
3052 if (kernel_pmap == map) panic("expand kernel pdpt");
3053
3054 spl = splhigh();
3055 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
3056 splx(spl);
3057 pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */
3058 spl = splhigh();
3059 }
3060 splx(spl);
3061
3062 /*
3063 * Allocate a VM page for the pdpt page
3064 */
3065 while ((m = vm_page_grab()) == VM_PAGE_NULL)
3066 VM_PAGE_WAIT();
3067
3068 /*
3069 * put the page into the pmap's obj list so it
3070 * can be found later.
3071 */
3072 pn = m->phys_page;
3073 pa = i386_ptob(pn);
3074 i = pdptidx(map, vaddr);
3075
3076 /*
3077 * Zero the page.
3078 */
3079 pmap_zero_page(pn);
3080
3081 vm_page_lockspin_queues();
3082 vm_page_wire(m);
3083 vm_page_unlock_queues();
3084
3085 OSAddAtomic(1, &inuse_ptepages_count);
3086
3087 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3088 vm_object_lock(map->pm_obj_pdpt);
3089
3090 PMAP_LOCK(map);
3091 /*
3092 * See if someone else expanded us first
3093 */
3094 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
3095 PMAP_UNLOCK(map);
3096 vm_object_unlock(map->pm_obj_pdpt);
3097
3098 VM_PAGE_FREE(m);
3099
3100 OSAddAtomic(-1, &inuse_ptepages_count);
3101 return;
3102 }
3103
3104 #if 0 /* DEBUG */
3105 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
3106 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3107 map, map->pm_obj_pdpt, vaddr, i);
3108 }
3109 #endif
3110 vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
3111 vm_object_unlock(map->pm_obj_pdpt);
3112
3113 /*
3114 * Set the page directory entry for this page table.
3115 */
3116 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
3117
3118 pmap_store_pte(pdptp, pa_to_pte(pa)
3119 | INTEL_PTE_VALID
3120 | INTEL_PTE_USER
3121 | INTEL_PTE_WRITE);
3122
3123 PMAP_UNLOCK(map);
3124
3125 return;
3126
3127 }
3128
3129
3130
3131 /*
3132 * Routine: pmap_expand
3133 *
3134 * Expands a pmap to be able to map the specified virtual address.
3135 *
3136 * Allocates new virtual memory for the P0 or P1 portion of the
3137 * pmap, then re-maps the physical pages that were in the old
3138 * pmap to be in the new pmap.
3139 *
3140 * Must be called with the pmap system and the pmap unlocked,
3141 * since these must be unlocked to use vm_allocate or vm_deallocate.
3142 * Thus it must be called in a loop that checks whether the map
3143 * has been expanded enough.
3144 * (We won't loop forever, since page tables aren't shrunk.)
3145 */
3146 void
3147 pmap_expand(
3148 pmap_t map,
3149 vm_map_offset_t vaddr)
3150 {
3151 pt_entry_t *pdp;
3152 register vm_page_t m;
3153 register pmap_paddr_t pa;
3154 uint64_t i;
3155 spl_t spl;
3156 ppnum_t pn;
3157
3158 /*
3159 * If this is not the kernel map (while we are still in compat kernel
3160 * mode) and we are 64-bit, propagate the expansion upwards.
3161 */
3162
3163 if (cpu_64bit && (map != kernel_pmap)) {
3164 spl = splhigh();
3165 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
3166 splx(spl);
3167 pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */
3168 spl = splhigh();
3169 }
3170 splx(spl);
3171 }
3172
3173 /*
3174 * Allocate a VM page for the pde entries.
3175 */
3176 while ((m = vm_page_grab()) == VM_PAGE_NULL)
3177 VM_PAGE_WAIT();
3178
3179 /*
3180 * put the page into the pmap's obj list so it
3181 * can be found later.
3182 */
3183 pn = m->phys_page;
3184 pa = i386_ptob(pn);
3185 i = pdeidx(map, vaddr);
3186
3187 /*
3188 * Zero the page.
3189 */
3190 pmap_zero_page(pn);
3191
3192 vm_page_lockspin_queues();
3193 vm_page_wire(m);
3194 vm_page_unlock_queues();
3195
3196 OSAddAtomic(1, &inuse_ptepages_count);
3197
3198 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3199 vm_object_lock(map->pm_obj);
3200
3201 PMAP_LOCK(map);
3202 /*
3203 * See if someone else expanded us first
3204 */
3205
3206 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
3207 PMAP_UNLOCK(map);
3208 vm_object_unlock(map->pm_obj);
3209
3210 VM_PAGE_FREE(m);
3211
3212 OSAddAtomic(-1, &inuse_ptepages_count);
3213 return;
3214 }
3215
3216 #if 0 /* DEBUG */
3217 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
3218 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
3219 map, map->pm_obj, vaddr, i);
3220 }
3221 #endif
3222 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
3223 vm_object_unlock(map->pm_obj);
3224
3225 /*
3226 * refetch while locked
3227 */
3228
3229 pdp = pmap_pde(map, vaddr);
3230
3231 /*
3232 * Set the page directory entry for this page table.
3233 */
3234 pmap_store_pte(pdp, pa_to_pte(pa)
3235 | INTEL_PTE_VALID
3236 | INTEL_PTE_USER
3237 | INTEL_PTE_WRITE);
3238
3239 PMAP_UNLOCK(map);
3240
3241 return;
3242 }
3243
3244
3245 /*
3246 * pmap_sync_page_data_phys(ppnum_t pa)
3247 *
3248 * Invalidates all of the instruction cache on a physical page and
3249 * pushes any dirty data from the data cache for the same physical page
3250 * Not required in i386.
3251 */
3252 void
3253 pmap_sync_page_data_phys(__unused ppnum_t pa)
3254 {
3255 return;
3256 }
3257
3258 /*
3259 * pmap_sync_page_attributes_phys(ppnum_t pa)
3260 *
3261 * Write back and invalidate all cachelines on a physical page.
3262 */
3263 void
3264 pmap_sync_page_attributes_phys(ppnum_t pa)
3265 {
3266 cache_flush_page_phys(pa);
3267 }
3268
3269
3270
3271 #ifdef CURRENTLY_UNUSED_AND_UNTESTED
3272
3273 int collect_ref;
3274 int collect_unref;
3275
3276 /*
3277 * Routine: pmap_collect
3278 * Function:
3279 * Garbage collects the physical map system for
3280 * pages which are no longer used.
3281 * Success need not be guaranteed -- that is, there
3282 * may well be pages which are not referenced, but
3283 * others may be collected.
3284 * Usage:
3285 * Called by the pageout daemon when pages are scarce.
3286 */
3287 void
3288 pmap_collect(
3289 pmap_t p)
3290 {
3291 register pt_entry_t *pdp, *ptp;
3292 pt_entry_t *eptp;
3293 int wired;
3294
3295 if (p == PMAP_NULL)
3296 return;
3297
3298 if (p == kernel_pmap)
3299 return;
3300
3301 /*
3302 * Garbage collect map.
3303 */
3304 PMAP_LOCK(p);
3305
3306 for (pdp = (pt_entry_t *)p->dirbase;
3307 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
3308 pdp++)
3309 {
3310 if (*pdp & INTEL_PTE_VALID) {
3311 if(*pdp & INTEL_PTE_REF) {
3312 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
3313 collect_ref++;
3314 } else {
3315 collect_unref++;
3316 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
3317 eptp = ptp + NPTEPG;
3318
3319 /*
3320 * If the pte page has any wired mappings, we cannot
3321 * free it.
3322 */
3323 wired = 0;
3324 {
3325 register pt_entry_t *ptep;
3326 for (ptep = ptp; ptep < eptp; ptep++) {
3327 if (iswired(*ptep)) {
3328 wired = 1;
3329 break;
3330 }
3331 }
3332 }
3333 if (!wired) {
3334 /*
3335 * Remove the virtual addresses mapped by this pte page.
3336 */
3337 pmap_remove_range(p,
3338 pdetova(pdp - (pt_entry_t *)p->dirbase),
3339 ptp,
3340 eptp);
3341
3342 /*
3343 * Invalidate the page directory pointer.
3344 */
3345 pmap_store_pte(pdp, 0x0);
3346
3347 PMAP_UNLOCK(p);
3348
3349 /*
3350 * And free the pte page itself.
3351 */
3352 {
3353 register vm_page_t m;
3354
3355 vm_object_lock(p->pm_obj);
3356
3357 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
3358 if (m == VM_PAGE_NULL)
3359 panic("pmap_collect: pte page not in object");
3360
3361 VM_PAGE_FREE(m);
3362
3363 OSAddAtomic(-1, &inuse_ptepages_count);
3364
3365 vm_object_unlock(p->pm_obj);
3366 }
3367
3368 PMAP_LOCK(p);
3369 }
3370 }
3371 }
3372 }
3373
3374 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
3375 PMAP_UNLOCK(p);
3376 return;
3377
3378 }
3379 #endif
3380
3381
3382 void
3383 pmap_copy_page(ppnum_t src, ppnum_t dst)
3384 {
3385 bcopy_phys((addr64_t)i386_ptob(src),
3386 (addr64_t)i386_ptob(dst),
3387 PAGE_SIZE);
3388 }
3389
3390
3391 /*
3392 * Routine: pmap_pageable
3393 * Function:
3394 * Make the specified pages (by pmap, offset)
3395 * pageable (or not) as requested.
3396 *
3397 * A page which is not pageable may not take
3398 * a fault; therefore, its page table entry
3399 * must remain valid for the duration.
3400 *
3401 * This routine is merely advisory; pmap_enter
3402 * will specify that these pages are to be wired
3403 * down (or not) as appropriate.
3404 */
3405 void
3406 pmap_pageable(
3407 __unused pmap_t pmap,
3408 __unused vm_map_offset_t start_addr,
3409 __unused vm_map_offset_t end_addr,
3410 __unused boolean_t pageable)
3411 {
3412 #ifdef lint
3413 pmap++; start_addr++; end_addr++; pageable++;
3414 #endif /* lint */
3415 }
3416
3417 /*
3418 * Clear specified attribute bits.
3419 */
3420 void
3421 phys_attribute_clear(
3422 ppnum_t pn,
3423 int bits)
3424 {
3425 pv_rooted_entry_t pv_h;
3426 register pv_hashed_entry_t pv_e;
3427 register pt_entry_t *pte;
3428 int pai;
3429 register pmap_t pmap;
3430
3431 pmap_intr_assert();
3432 assert(pn != vm_page_fictitious_addr);
3433 if (pn == vm_page_guard_addr)
3434 return;
3435
3436 pai = ppn_to_pai(pn);
3437
3438 if (!managed_page(pai)) {
3439 /*
3440 * Not a managed page.
3441 */
3442 return;
3443 }
3444
3445
3446 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
3447 (int) pn, bits, 0, 0, 0);
3448
3449 pv_h = pai_to_pvh(pai);
3450
3451 LOCK_PVH(pai);
3452
3453 /*
3454 * Walk down PV list, clearing all modify or reference bits.
3455 * We do not have to lock the pv_list because we have
3456 * the entire pmap system locked.
3457 */
3458 if (pv_h->pmap != PMAP_NULL) {
3459 /*
3460 * There are some mappings.
3461 */
3462
3463 pv_e = (pv_hashed_entry_t)pv_h;
3464
3465 do {
3466 pmap = pv_e->pmap;
3467
3468 {
3469 vm_map_offset_t va;
3470
3471 va = pv_e->va;
3472
3473 /*
3474 * Clear modify and/or reference bits.
3475 */
3476
3477 pte = pmap_pte(pmap, va);
3478 pmap_update_pte(pte, *pte, (*pte & ~bits));
3479 /* Ensure all processors using this translation
3480 * invalidate this TLB entry. The invalidation *must* follow
3481 * the PTE update, to ensure that the TLB shadow of the
3482 * 'D' bit (in particular) is synchronized with the
3483 * updated PTE.
3484 */
3485 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3486 }
3487
3488 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3489
3490 } while (pv_e != (pv_hashed_entry_t)pv_h);
3491 }
3492 pmap_phys_attributes[pai] &= ~bits;
3493
3494 UNLOCK_PVH(pai);
3495
3496 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3497 0, 0, 0, 0, 0);
3498
3499 }
3500
3501 /*
3502 * Check specified attribute bits.
3503 */
3504 int
3505 phys_attribute_test(
3506 ppnum_t pn,
3507 int bits)
3508 {
3509 pv_rooted_entry_t pv_h;
3510 register pv_hashed_entry_t pv_e;
3511 register pt_entry_t *pte;
3512 int pai;
3513 register pmap_t pmap;
3514 int attributes = 0;
3515
3516 pmap_intr_assert();
3517 assert(pn != vm_page_fictitious_addr);
3518 if (pn == vm_page_guard_addr)
3519 return 0;
3520
3521 pai = ppn_to_pai(pn);
3522
3523 if (!managed_page(pai)) {
3524 /*
3525 * Not a managed page.
3526 */
3527 return (0);
3528 }
3529
3530 /*
3531 * Super-fast check: if the bits have already been collected,
3532 * there is no need to take any locks.
3533 * If they are not set, we need to recheck after taking
3534 * the lock, in case they got pulled in while
3535 * we were waiting for it.
3536 */
3537 if ( (pmap_phys_attributes[pai] & bits) == bits)
3538 return (bits);
3539
3540 pv_h = pai_to_pvh(pai);
3541
3542 LOCK_PVH(pai);
3543
3544 attributes = pmap_phys_attributes[pai] & bits;
3545
3546
3547 /*
3548 * Walk down PV list, checking the mappings until we
3549 * reach the end or we've found the attributes we've asked for
3550 * We do not have to lock the pv_list because we have
3551 * the entire pmap system locked.
3552 */
3553 if (pv_h->pmap != PMAP_NULL) {
3554 /*
3555 * There are some mappings.
3556 */
3557 pv_e = (pv_hashed_entry_t)pv_h;
3558 if (attributes != bits) do {
3559
3560 pmap = pv_e->pmap;
3561
3562 {
3563 vm_map_offset_t va;
3564
3565 va = pv_e->va;
3566 /*
3567 * first make sure any processor actively
3568 * using this pmap, flushes its TLB state
3569 */
3570 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3571
3572 /*
3573 * pick up modify and/or reference bits from this mapping
3574 */
3575 pte = pmap_pte(pmap, va);
3576 attributes |= (int)(*pte & bits);
3577
3578 }
3579
3580 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3581
3582 } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h));
3583 }
3584
3585 UNLOCK_PVH(pai);
3586 return (attributes);
3587 }
3588
3589 /*
3590 * Set specified attribute bits.
3591 */
3592 void
3593 phys_attribute_set(
3594 ppnum_t pn,
3595 int bits)
3596 {
3597 int pai;
3598
3599 pmap_intr_assert();
3600 assert(pn != vm_page_fictitious_addr);
3601 if (pn == vm_page_guard_addr)
3602 return;
3603
3604 pai = ppn_to_pai(pn);
3605
3606 if (!managed_page(pai)) {
3607 /*
3608 * Not a managed page.
3609 */
3610 return;
3611 }
3612
3613 LOCK_PVH(pai);
3614
3615 pmap_phys_attributes[pai] |= bits;
3616
3617 UNLOCK_PVH(pai);
3618 }
3619
3620 /*
3621 * Set the modify bit on the specified physical page.
3622 */
3623
3624 void pmap_set_modify(
3625 ppnum_t pn)
3626 {
3627 phys_attribute_set(pn, PHYS_MODIFIED);
3628 }
3629
3630 /*
3631 * Clear the modify bits on the specified physical page.
3632 */
3633
3634 void
3635 pmap_clear_modify(
3636 ppnum_t pn)
3637 {
3638 phys_attribute_clear(pn, PHYS_MODIFIED);
3639 }
3640
3641 /*
3642 * pmap_is_modified:
3643 *
3644 * Return whether or not the specified physical page is modified
3645 * by any physical maps.
3646 */
3647
3648 boolean_t
3649 pmap_is_modified(
3650 ppnum_t pn)
3651 {
3652 if (phys_attribute_test(pn, PHYS_MODIFIED))
3653 return TRUE;
3654
3655 return FALSE;
3656 }
3657
3658 /*
3659 * pmap_clear_reference:
3660 *
3661 * Clear the reference bit on the specified physical page.
3662 */
3663
3664 void
3665 pmap_clear_reference(
3666 ppnum_t pn)
3667 {
3668 phys_attribute_clear(pn, PHYS_REFERENCED);
3669 }
3670
3671 void
3672 pmap_set_reference(ppnum_t pn)
3673 {
3674 phys_attribute_set(pn, PHYS_REFERENCED);
3675 }
3676
3677 /*
3678 * pmap_is_referenced:
3679 *
3680 * Return whether or not the specified physical page is referenced
3681 * by any physical maps.
3682 */
3683
3684 boolean_t
3685 pmap_is_referenced(
3686 ppnum_t pn)
3687 {
3688 if (phys_attribute_test(pn, PHYS_REFERENCED))
3689 return TRUE;
3690
3691 return FALSE;
3692 }
3693
3694 /*
3695 * pmap_get_refmod(phys)
3696 * returns the referenced and modified bits of the specified
3697 * physical page.
3698 */
3699 unsigned int
3700 pmap_get_refmod(ppnum_t pa)
3701 {
3702 int refmod;
3703 unsigned int retval = 0;
3704
3705 refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);
3706
3707 if (refmod & PHYS_MODIFIED)
3708 retval |= VM_MEM_MODIFIED;
3709 if (refmod & PHYS_REFERENCED)
3710 retval |= VM_MEM_REFERENCED;
3711
3712 return (retval);
3713 }
3714
3715 /*
3716 * pmap_clear_refmod(phys, mask)
3717 * clears the referenced and modified bits as specified by the mask
3718 * of the specified physical page.
3719 */
3720 void
3721 pmap_clear_refmod(ppnum_t pa, unsigned int mask)
3722 {
3723 unsigned int x86Mask;
3724
3725 x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
3726 | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3727 phys_attribute_clear(pa, x86Mask);
3728 }
3729
3730 void
3731 invalidate_icache(__unused vm_offset_t addr,
3732 __unused unsigned cnt,
3733 __unused int phys)
3734 {
3735 return;
3736 }
3737 void
3738 flush_dcache(__unused vm_offset_t addr,
3739 __unused unsigned count,
3740 __unused int phys)
3741 {
3742 return;
3743 }
3744
3745 #if CONFIG_DTRACE
3746 /*
3747 * Constrain DTrace copyin/copyout actions
3748 */
3749 extern kern_return_t dtrace_copyio_preflight(addr64_t);
3750 extern kern_return_t dtrace_copyio_postflight(addr64_t);
3751
3752 kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3753 {
3754 thread_t thread = current_thread();
3755
3756 if (current_map() == kernel_map)
3757 return KERN_FAILURE;
3758 else if (thread->machine.specFlags & CopyIOActive)
3759 return KERN_FAILURE;
3760 else
3761 return KERN_SUCCESS;
3762 }
3763
3764 kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3765 {
3766 return KERN_SUCCESS;
3767 }
3768 #endif /* CONFIG_DTRACE */
3769
3770 #if MACH_KDB
3771
3772 /* show phys page mappings and attributes */
3773
3774 extern void db_show_page(pmap_paddr_t pa);
3775
3776 #if 0
3777 void
3778 db_show_page(pmap_paddr_t pa)
3779 {
3780 pv_entry_t pv_h;
3781 int pai;
3782 char attr;
3783
3784 pai = pa_index(pa);
3785 pv_h = pai_to_pvh(pai);
3786
3787 attr = pmap_phys_attributes[pai];
3788 printf("phys page %llx ", pa);
3789 if (attr & PHYS_MODIFIED)
3790 printf("modified, ");
3791 if (attr & PHYS_REFERENCED)
3792 printf("referenced, ");
3793 if (pv_h->pmap || pv_h->next)
3794 printf(" mapped at\n");
3795 else
3796 printf(" not mapped\n");
3797 for (; pv_h; pv_h = pv_h->next)
3798 if (pv_h->pmap)
3799 printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
3800 }
3801 #endif
3802
3803 #endif /* MACH_KDB */
3804
3805 #if MACH_KDB
3806 #if 0
3807 void db_kvtophys(vm_offset_t);
3808 void db_show_vaddrs(pt_entry_t *);
3809
3810 /*
3811 * print out the results of kvtophys(arg)
3812 */
3813 void
3814 db_kvtophys(
3815 vm_offset_t vaddr)
3816 {
3817 db_printf("0x%qx", kvtophys(vaddr));
3818 }
3819
3820 /*
3821 * Walk the page tables.
3822 */
3823 void
3824 db_show_vaddrs(
3825 pt_entry_t *dirbase)
3826 {
3827 pt_entry_t *ptep, *pdep, tmp;
3828 unsigned int x, y, pdecnt, ptecnt;
3829
3830 if (dirbase == 0) {
3831 dirbase = kernel_pmap->dirbase;
3832 }
3833 if (dirbase == 0) {
3834 db_printf("need a dirbase...\n");
3835 return;
3836 }
3837 dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);
3838
3839 db_printf("dirbase: 0x%x\n", dirbase);
3840
3841 pdecnt = ptecnt = 0;
3842 pdep = &dirbase[0];
3843 for (y = 0; y < NPDEPG; y++, pdep++) {
3844 if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
3845 continue;
3846 }
3847 pdecnt++;
3848 ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
3849 db_printf("dir[%4d]: 0x%x\n", y, *pdep);
3850 for (x = 0; x < NPTEPG; x++, ptep++) {
3851 if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
3852 continue;
3853 }
3854 ptecnt++;
3855 db_printf(" tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
3856 x,
3857 *ptep,
3858 (y << 22) | (x << 12),
3859 *ptep & ~INTEL_OFFMASK);
3860 }
3861 }
3862
3863 db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
3864
3865 }
3866 #endif
3867 #endif /* MACH_KDB */
3868
3869 #include <mach_vm_debug.h>
3870 #if MACH_VM_DEBUG
3871 #include <vm/vm_debug.h>
3872
3873 int
3874 pmap_list_resident_pages(
3875 __unused pmap_t pmap,
3876 __unused vm_offset_t *listp,
3877 __unused int space)
3878 {
3879 return 0;
3880 }
3881 #endif /* MACH_VM_DEBUG */
3882
3883
3884
3885 /* temporary workaround */
3886 boolean_t
3887 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
3888 {
3889 #if 0
3890 pt_entry_t *ptep;
3891
3892 ptep = pmap_pte(map->pmap, va);
3893 if (0 == ptep)
3894 return FALSE;
3895 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
3896 #else
3897 return TRUE;
3898 #endif
3899 }
3900
3901
3902 boolean_t
3903 phys_page_exists(
3904 ppnum_t pn)
3905 {
3906 assert(pn != vm_page_fictitious_addr);
3907
3908 if (!pmap_initialized)
3909 return (TRUE);
3910
3911 if (pn == vm_page_guard_addr)
3912 return FALSE;
3913
3914 if (!managed_page(ppn_to_pai(pn)))
3915 return (FALSE);
3916
3917 return TRUE;
3918 }
3919
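/*
 * Pre-allocate pv_hashed_entry's at startup: a large chunk for the
 * general free list and a smaller one for the kernel-private list,
 * priming the lists that pmap_enter() draws from.
 */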
3920 void
3921 mapping_free_prime(void)
3922 {
3923 int i;
3924 pv_hashed_entry_t pvh_e;
3925 pv_hashed_entry_t pvh_eh;
3926 pv_hashed_entry_t pvh_et;
3927 int pv_cnt;
3928
3929 pv_cnt = 0;
3930 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3931 for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
3932 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3933
3934 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3935 pvh_eh = pvh_e;
3936
3937 if (pvh_et == PV_HASHED_ENTRY_NULL)
3938 pvh_et = pvh_e;
3939 pv_cnt++;
3940 }
3941 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3942
3943 pv_cnt = 0;
3944 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3945 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3946 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3947
3948 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3949 pvh_eh = pvh_e;
3950
3951 if (pvh_et == PV_HASHED_ENTRY_NULL)
3952 pvh_et = pvh_e;
3953 pv_cnt++;
3954 }
3955 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3956
3957 }
3958
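/*
 * Replenish the kernel and general pv_hashed_entry free lists when
 * they drop below their low-water marks; runs from a thread call,
 * which is set up here on first use.
 */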
3959 void
3960 mapping_adjust(void)
3961 {
3962 pv_hashed_entry_t pvh_e;
3963 pv_hashed_entry_t pvh_eh;
3964 pv_hashed_entry_t pvh_et;
3965 int pv_cnt;
3966 int i;
3967
3968 if (mapping_adjust_call == NULL) {
3969 thread_call_setup(&mapping_adjust_call_data,
3970 (thread_call_func_t) mapping_adjust,
3971 (thread_call_param_t) NULL);
3972 mapping_adjust_call = &mapping_adjust_call_data;
3973 }
3974
3975 pv_cnt = 0;
3976 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3977 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
3978 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3979 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3980
3981 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3982 pvh_eh = pvh_e;
3983
3984 if (pvh_et == PV_HASHED_ENTRY_NULL)
3985 pvh_et = pvh_e;
3986 pv_cnt++;
3987 }
3988 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3989 }
3990
3991 pv_cnt = 0;
3992 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3993 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
3994 for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
3995 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3996
3997 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3998 pvh_eh = pvh_e;
3999
4000 if (pvh_et == PV_HASHED_ENTRY_NULL)
4001 pvh_et = pvh_e;
4002 pv_cnt++;
4003 }
4004 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4005 }
4006 mappingrecurse = 0;
4007 }
4008
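/*
 * Mirror the 32-bit commpage: copy the kernel's commpage PTEs to the
 * user-visible alias, marked user-accessible, global and read-only.
 */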
4009 void
4010 pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
4011 {
4012 int i;
4013 pt_entry_t *opte, *npte;
4014 pt_entry_t pte;
4015 spl_t s;
4016
4017 for (i = 0; i < cnt; i++) {
4018 s = splhigh();
4019 opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
4020 if (0 == opte)
4021 panic("kernel_commpage");
4022 pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
4023 pte &= ~INTEL_PTE_WRITE; // ensure read only
4024 npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
4025 if (0 == npte)
4026 panic("user_commpage");
4027 pmap_store_pte(npte, pte);
4028 splx(s);
4029 kernel_commpage += INTEL_PGBYTES;
4030 user_commpage += INTEL_PGBYTES;
4031 }
4032 }
4033
4034
4035 #define PMAP_COMMPAGE64_CNT (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
4036 pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];
4037
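/*
 * pmap_commpage64_init() records, in pmap_commpage64_ptes[], a read-only,
 * user-accessible copy of each kernel PTE backing the 64-bit commpage,
 * so those template PTEs can be installed when the commpage is mapped
 * into 64-bit user address spaces elsewhere in the pmap code.
 */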
4038 void
4039 pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
4040 {
4041 int i;
4042 pt_entry_t *kptep;
4043
4044 PMAP_LOCK(kernel_pmap);
4045
4046 for (i = 0; i < cnt; i++) {
4047 kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
4048 if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
4049 panic("pmap_commpage64_init pte");
4050 pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
4051 }
4052 PMAP_UNLOCK(kernel_pmap);
4053 }
4054
4055
4056 static cpu_pmap_t cpu_pmap_master;
4057
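/*
 * pmap_cpu_alloc() returns the per-cpu pmap data for a processor: the
 * boot cpu uses the statically allocated cpu_pmap_master, while other
 * cpus get a zeroed cpu_pmap_t from kernel_map plus a PMAP_NWINDOWS-page
 * kernel virtual range whose PTEs back the cpu-private copy/zero
 * map windows.
 */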
4058 struct cpu_pmap *
4059 pmap_cpu_alloc(boolean_t is_boot_cpu)
4060 {
4061 int ret;
4062 int i;
4063 cpu_pmap_t *cp;
4064 vm_offset_t address;
4065 vm_map_address_t mapaddr;
4066 vm_map_entry_t entry;
4067 pt_entry_t *pte;
4068
4069 if (is_boot_cpu) {
4070 cp = &cpu_pmap_master;
4071 } else {
4072 /*
4073 * The per-cpu pmap data structure itself.
4074 */
4075 ret = kmem_alloc(kernel_map,
4076 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
4077 if (ret != KERN_SUCCESS) {
4078 printf("pmap_cpu_alloc() failed ret=%d\n", ret);
4079 return NULL;
4080 }
4081 bzero((void *)cp, sizeof(cpu_pmap_t));
4082
4083 /*
4084 * The temporary windows used for copy/zero - see loose_ends.c
4085 */
4086 ret = vm_map_find_space(kernel_map,
4087 &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
4088 if (ret != KERN_SUCCESS) {
4089 printf("pmap_cpu_alloc() "
4090 "vm_map_find_space ret=%d\n", ret);
4091 pmap_cpu_free(cp);
4092 return NULL;
4093 }
4094 address = (vm_offset_t)mapaddr;
4095
4096 for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
4097 spl_t s;
4098 s = splhigh();
4099 while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
4100 pmap_expand(kernel_pmap, (vm_map_offset_t)address);
4101 *(int *) pte = 0; /* window starts out invalid */
4102 cp->mapwindow[i].prv_CADDR = (caddr_t) address;
4103 cp->mapwindow[i].prv_CMAP = pte;
4104 splx(s);
4105 }
4106 vm_map_unlock(kernel_map);
4107 }
4108
4109 cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
4110 cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
4111 cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;
4112
4113 return cp;
4114 }
4115
4116 void
4117 pmap_cpu_free(struct cpu_pmap *cp)
4118 {
4119 if (cp != NULL && cp != &cpu_pmap_master) {
4120 kfree((void *) cp, sizeof(cpu_pmap_t));
4121 }
4122 }
4123
4124
4125 mapwindow_t *
4126 pmap_get_mapwindow(pt_entry_t pentry)
4127 {
4128 mapwindow_t *mp;
4129 int i;
4130
4131 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4132
4133 /*
4134 * Note: 0th map reserved for pmap_pte()
4135 */
4136 for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
4137 mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];
4138
4139 if (*mp->prv_CMAP == 0) {
4140 pmap_store_pte(mp->prv_CMAP, pentry);
4141
4142 invlpg((uintptr_t)mp->prv_CADDR);
4143
4144 return (mp);
4145 }
4146 }
4147 panic("pmap_get_mapwindow: no windows available");
4148
4149 return NULL;
4150 }
4151
4152
4153 void
4154 pmap_put_mapwindow(mapwindow_t *mp)
4155 {
4156 pmap_store_pte(mp->prv_CMAP, 0);
4157 }
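
/*
 * Typical map-window usage (an illustrative sketch, not code from this
 * file -- see the copy/zero routines in i386/loose_ends.c; pn stands
 * for some ppnum_t of interest): with preemption disabled, install a
 * PTE for the physical page, access it through the window's virtual
 * address, then release the window.
 *
 *	mapwindow_t *map;
 *
 *	mp_disable_preemption();
 *	map = pmap_get_mapwindow((pt_entry_t)(i386_ptob(pn) |
 *			INTEL_PTE_VALID | INTEL_PTE_RW |
 *			INTEL_PTE_REF | INTEL_PTE_MOD));
 *	bzero((void *) map->prv_CADDR, PAGE_SIZE);
 *	pmap_put_mapwindow(map);
 *	mp_enable_preemption();
 */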
4158
4159 void
4160 pmap_switch(pmap_t tpmap)
4161 {
4162 spl_t s;
4163
4164 s = splhigh(); /* Make sure interrupts are disabled */
4165
4166 set_dirbase(tpmap, current_thread());
4167
4168 splx(s);
4169 }
4170
4171
4172 /*
4173 * disable no-execute capability on
4174 * the specified pmap
4175 */
4176 void pmap_disable_NX(pmap_t pmap)
4177 {
4178 pmap->nx_enabled = 0;
4179 }
4180
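/*
 * pt_fake_zone_info() reports page-table page consumption in the same
 * shape as a zone: inuse_ptepages_count elements of PAGE_SIZE each,
 * with the maximum scaled by the pages that could still be drawn from
 * the active, inactive and free queues.
 */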
4181 void
4182 pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
4183 vm_size_t *alloc_size, int *collectable, int *exhaustable)
4184 {
4185 *count = inuse_ptepages_count;
4186 *cur_size = PAGE_SIZE * inuse_ptepages_count;
4187 *max_size = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
4188 *elem_size = PAGE_SIZE;
4189 *alloc_size = PAGE_SIZE;
4190
4191 *collectable = 1;
4192 *exhaustable = 0;
4193 }
4194
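/*
 * The pmap_*high_map* helpers manage the per-cpu slots in the high
 * fixed-address mapping area: pmap_cpu_high_map_vaddr() and
 * pmap_high_map_vaddr() translate a (cpu, high_cpu_types) pair into its
 * reserved virtual address, and pmap_high_map() additionally installs
 * the supplied PTE there and invalidates the TLB entry for that page.
 */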
4195 vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
4196 {
4197 enum high_fixed_addresses a;
4198 a = e + HIGH_CPU_END * cpu;
4199 return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4200 }
4201
4202 vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
4203 {
4204 return pmap_cpu_high_map_vaddr(cpu_number(), e);
4205 }
4206
4207 vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
4208 {
4209 enum high_fixed_addresses a;
4210 vm_offset_t vaddr;
4211
4212 a = e + HIGH_CPU_END * cpu_number();
4213 vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4214 pmap_store_pte(pte_unique_base + a, pte);
4215
4216 /* TLB flush for this page for this cpu */
4217 invlpg((uintptr_t)vaddr);
4218
4219 return vaddr;
4220 }
4221
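/*
 * pmap_cpuset_NMIPI() sends an NMI to every cpu in the given mask (used
 * below when a TLB-flush request times out) and then spins for
 * LockTimeOut so the targets have a chance to react before the caller
 * panics.
 */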
4222 static inline void
4223 pmap_cpuset_NMIPI(cpu_set cpu_mask) {
4224 unsigned int cpu, cpu_bit;
4225 uint64_t deadline;
4226
4227 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4228 if (cpu_mask & cpu_bit)
4229 cpu_NMI_interrupt(cpu);
4230 }
4231 deadline = mach_absolute_time() + (LockTimeOut);
4232 while (mach_absolute_time() < deadline)
4233 cpu_pause();
4234 }
4235
4236 /*
4237 * Called with pmap locked, we:
4238 * - scan through per-cpu data to see which other cpus need to be flushed
4239 * - send an IPI to each non-idle cpu to be flushed
4240 * - wait for each to acknowledge, or until we see that it is inactive,
4241 * in an interrupt handler, or otherwise at a safe point
4242 * - flush the local tlb if it is active for this pmap
4243 * - return ... the caller will unlock the pmap
4244 */
4245 void
4246 pmap_flush_tlbs(pmap_t pmap)
4247 {
4248 unsigned int cpu;
4249 unsigned int cpu_bit;
4250 cpu_set cpus_to_signal;
4251 unsigned int my_cpu = cpu_number();
4252 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
4253 boolean_t flush_self = FALSE;
4254 uint64_t deadline;
4255
4256 assert((processor_avail_count < 2) ||
4257 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
4258
4259 /*
4260 * Scan other cpus for matching active or task CR3.
4261 * For idle cpus (with no active map) we mark them invalid but
4262 * don't signal -- they'll check as they go busy.
4263 * Note: for the kernel pmap we look for 64-bit shared address maps.
4264 */
4265 cpus_to_signal = 0;
4266 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4267 if (!cpu_datap(cpu)->cpu_running)
4268 continue;
4269 if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
4270 (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
4271 (pmap->pm_shared) ||
4272 ((pmap == kernel_pmap) &&
4273 (!CPU_CR3_IS_ACTIVE(cpu) ||
4274 cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
4275 if (cpu == my_cpu) {
4276 flush_self = TRUE;
4277 continue;
4278 }
4279 cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
4280 __asm__ volatile("mfence");
4281
4282 if (CPU_CR3_IS_ACTIVE(cpu)) {
4283 cpus_to_signal |= cpu_bit;
4284 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
4285 }
4286 }
4287 }
4288
4289 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
4290 (int) pmap, cpus_to_signal, flush_self, 0, 0);
4291
4292 if (cpus_to_signal) {
4293 cpu_set cpus_to_respond = cpus_to_signal;
4294
4295 deadline = mach_absolute_time() + LockTimeOut;
4296 /*
4297 * Wait for those other cpus to acknowledge
4298 */
4299 while (cpus_to_respond != 0) {
4300 if (mach_absolute_time() > deadline) {
4301 if (mp_recent_debugger_activity())
4302 continue;
4303 if (!panic_active()) {
4304 pmap_tlb_flush_timeout = TRUE;
4305 pmap_cpuset_NMIPI(cpus_to_respond);
4306 }
4307 panic("pmap_flush_tlbs() timeout: "
4308 "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
4309 pmap, cpus_to_respond);
4310 }
4311
4312 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4313 if ((cpus_to_respond & cpu_bit) != 0) {
4314 if (!cpu_datap(cpu)->cpu_running ||
4315 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
4316 !CPU_CR3_IS_ACTIVE(cpu)) {
4317 cpus_to_respond &= ~cpu_bit;
4318 }
4319 cpu_pause();
4320 }
4321 if (cpus_to_respond == 0)
4322 break;
4323 }
4324 }
4325 }
4326 /*
4327 * Flush local tlb if required.
4328 * We need this flush even if the pmap being changed
4329 * is the user map... in case we do a copyin/out
4330 * before returning to user mode.
4331 */
4332 if (flush_self)
4333 flush_tlb();
4334
4335 if ((pmap == kernel_pmap) && (flush_self != TRUE)) {
4336 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
4337 }
4338
4339 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
4340 (int) pmap, cpus_to_signal, flush_self, 0, 0);
4341 }
4342
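/*
 * process_pmap_updates() is the receiving side of the TLB shootdown: it
 * runs on a cpu that was signalled (or that noticed cpu_tlb_invalid on
 * its own), flushes the local TLB, and clears the flag behind a fence so
 * the initiator in pmap_flush_tlbs() can observe the acknowledgement.
 */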
4343 void
4344 process_pmap_updates(void)
4345 {
4346 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4347
4348 flush_tlb();
4349
4350 current_cpu_datap()->cpu_tlb_invalid = FALSE;
4351 __asm__ volatile("mfence");
4352 }
4353
4354 void
4355 pmap_update_interrupt(void)
4356 {
4357 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
4358 0, 0, 0, 0, 0);
4359
4360 process_pmap_updates();
4361
4362 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
4363 0, 0, 0, 0, 0);
4364 }
4365
4366
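/*
 * pmap_cache_attributes(): pages the pmap does not manage (device or
 * otherwise unmanaged physical addresses) are treated as uncached I/O
 * space; everything else defaults to write-back cacheable.
 */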
4367 unsigned int pmap_cache_attributes(ppnum_t pn)
4368 {
4369 if (!managed_page(ppn_to_pai(pn)))
4370 return (VM_WIMG_IO);
4371
4372 return (VM_WIMG_COPYBACK);
4373 }
4374
4375 #ifdef PMAP_DEBUG
4376 void
4377 pmap_dump(pmap_t p)
4378 {
4379 int i;
4380
4381 kprintf("pmap %p\n", p);
4382
4383 kprintf(" pm_cr3 0x%llx\n", p->pm_cr3);
4384 kprintf(" pm_pml4 %p\n", p->pm_pml4);
4385 kprintf(" pm_pdpt %p\n", p->pm_pdpt);
4386
4387 kprintf(" pml4[0] 0x%llx\n",*p->pm_pml4);
4388 for (i=0;i<8;i++)
4389 kprintf(" pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
4390 }
4391
4392 void pmap_dump_wrap(void)
4393 {
4394 pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
4395 }
4396
4397 void
4398 dump_4GB_pdpt(pmap_t p)
4399 {
4400 spl_t spl;
4401 pdpt_entry_t *user_pdptp;
4402 pdpt_entry_t *kern_pdptp;
4403 pdpt_entry_t *pml4p;
4404
4405 spl = splhigh();
4406 while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
4407 splx(spl);
4408 pmap_expand_pml4(p, 0x0);
4409 spl = splhigh();
4410 }
4411 kern_pdptp = kernel_pmap->pm_pdpt;
4412 if (kern_pdptp == NULL)
4413 panic("kern_pdptp == NULL");
4414 kprintf("dump_4GB_pdpt(%p)\n"
4415 "kern_pdptp=%p (phys=0x%016llx)\n"
4416 "\t 0x%08x: 0x%016llx\n"
4417 "\t 0x%08x: 0x%016llx\n"
4418 "\t 0x%08x: 0x%016llx\n"
4419 "\t 0x%08x: 0x%016llx\n"
4420 "\t 0x%08x: 0x%016llx\n"
4421 "user_pdptp=%p (phys=0x%016llx)\n"
4422 "\t 0x%08x: 0x%016llx\n"
4423 "\t 0x%08x: 0x%016llx\n"
4424 "\t 0x%08x: 0x%016llx\n"
4425 "\t 0x%08x: 0x%016llx\n"
4426 "\t 0x%08x: 0x%016llx\n",
4427 p, kern_pdptp, kvtophys(kern_pdptp),
4428 kern_pdptp+0, *(kern_pdptp+0),
4429 kern_pdptp+1, *(kern_pdptp+1),
4430 kern_pdptp+2, *(kern_pdptp+2),
4431 kern_pdptp+3, *(kern_pdptp+3),
4432 kern_pdptp+4, *(kern_pdptp+4),
4433 user_pdptp, kvtophys(user_pdptp),
4434 user_pdptp+0, *(user_pdptp+0),
4435 user_pdptp+1, *(user_pdptp+1),
4436 user_pdptp+2, *(user_pdptp+2),
4437 user_pdptp+3, *(user_pdptp+3),
4438 user_pdptp+4, *(user_pdptp+4));
4439 kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4440 p->pm_cr3, p->pm_hold, p->pm_pml4);
4441 pml4p = (pdpt_entry_t *)p->pm_hold;
4442 if (pml4p == NULL)
4443 panic("user pml4p == NULL");
4444 kprintf("\t 0x%08x: 0x%016llx\n"
4445 "\t 0x%08x: 0x%016llx\n",
4446 pml4p+0, *(pml4p),
4447 pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
4448 kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4449 kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
4450 pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
4451 if (pml4p == NULL)
4452 panic("kern pml4p == NULL");
4453 kprintf("\t 0x%08x: 0x%016llx\n"
4454 "\t 0x%08x: 0x%016llx\n",
4455 pml4p+0, *(pml4p),
4456 pml4p+511, *(pml4p+511));
4457 splx(spl);
4458 }
4459
4460 void dump_4GB_pdpt_thread(thread_t tp)
4461 {
4462 dump_4GB_pdpt(tp->map->pmap);
4463 }
4464
4465
4466 #endif
4467