1
2 /*
3 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
4 *
5 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 *
7 * This file contains Original Code and/or Modifications of Original Code
8 * as defined in and that are subject to the Apple Public Source License
9 * Version 2.0 (the 'License'). You may not use this file except in
10 * compliance with the License. The rights granted to you under the License
11 * may not be used to create, or enable the creation or redistribution of,
12 * unlawful or unlicensed copies of an Apple operating system, or to
13 * circumvent, violate, or enable the circumvention or violation of, any
14 * terms of an Apple operating system software license agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 *
19 * The Original Code and all software distributed under the License are
20 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
21 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
22 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
24 * Please see the License for the specific language governing rights and
25 * limitations under the License.
26 *
27 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 */
29 /*
30 * @OSF_COPYRIGHT@
31 */
32 /*
33 * Mach Operating System
34 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
35 * All Rights Reserved.
36 *
37 * Permission to use, copy, modify and distribute this software and its
38 * documentation is hereby granted, provided that both the copyright
39 * notice and this permission notice appear in all copies of the
40 * software, derivative works or modified versions, and any portions
41 * thereof, and that both notices appear in supporting documentation.
42 *
43 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
44 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
45 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 *
47 * Carnegie Mellon requests users of this software to return to
48 *
49 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
50 * School of Computer Science
51 * Carnegie Mellon University
52 * Pittsburgh PA 15213-3890
53 *
54 * any improvements or extensions that they make and grant Carnegie Mellon
55 * the rights to redistribute these changes.
56 */
57 /*
58 */
59
60 /*
61 * File: pmap.c
62 * Author: Avadis Tevanian, Jr., Michael Wayne Young
63 * (These guys wrote the Vax version)
64 *
65 * Physical Map management code for Intel i386, i486, and i860.
66 *
67 * Manages physical address maps.
68 *
69 * In addition to hardware address maps, this
70 * module is called upon to provide software-use-only
71 * maps which may or may not be stored in the same
72 * form as hardware maps. These pseudo-maps are
73 * used to store intermediate results from copy
74 * operations to and from address spaces.
75 *
76 * Since the information managed by this module is
77 * also stored by the logical address mapping module,
78 * this module may throw away valid virtual-to-physical
79 * mappings at almost any time. However, invalidations
80 * of virtual-to-physical mappings must be done as
81 * requested.
82 *
83 * In order to cope with hardware architectures which
84 * make virtual-to-physical map invalidates expensive,
85 * this module may delay invalidate or reduced-protection
86 * operations until such time as they are actually
87 * necessary. This module is given full information as
88 * to which processors are currently using which maps,
89 * and to when physical maps must be made correct.
90 */
91
92 #include <string.h>
93 #include <norma_vm.h>
94 #include <mach_kdb.h>
95 #include <mach_ldebug.h>
96
97 #include <libkern/OSAtomic.h>
98
99 #include <mach/machine/vm_types.h>
100
101 #include <mach/boolean.h>
102 #include <kern/thread.h>
103 #include <kern/zalloc.h>
104 #include <kern/queue.h>
105
106 #include <kern/lock.h>
107 #include <kern/kalloc.h>
108 #include <kern/spl.h>
109
110 #include <vm/pmap.h>
111 #include <vm/vm_map.h>
112 #include <vm/vm_kern.h>
113 #include <mach/vm_param.h>
114 #include <mach/vm_prot.h>
115 #include <vm/vm_object.h>
116 #include <vm/vm_page.h>
117
118 #include <mach/machine/vm_param.h>
119 #include <machine/thread.h>
120
121 #include <kern/misc_protos.h> /* prototyping */
122 #include <i386/misc_protos.h>
123 #include <x86_64/lowglobals.h>
124
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136
137 #if MACH_KDB
138 #include <ddb/db_command.h>
139 #include <ddb/db_output.h>
140 #include <ddb/db_sym.h>
141 #include <ddb/db_print.h>
142 #endif /* MACH_KDB */
143
144 #include <vm/vm_protos.h>
145
146 #include <i386/mp.h>
147 #include <i386/mp_desc.h>
148
149
150 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151 #ifdef DEBUGINTERRUPTS
152 #define pmap_intr_assert() { \
153 if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
154 panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
155 }
156 #else
157 #define pmap_intr_assert()
158 #endif
159
160 #ifdef IWANTTODEBUG
161 #undef DEBUG
162 #define DEBUG 1
163 #define POSTCODE_DELAY 1
164 #include <i386/postcode.h>
165 #endif /* IWANTTODEBUG */
166
167 boolean_t pmap_trace = FALSE;
168
169 #if PMAP_DBG
170 #define DBG(x...) kprintf("DBG: " x)
171 #else
172 #define DBG(x...)
173 #endif
174
175 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
176
177 /*
178 * Forward declarations for internal functions.
179 */
180
181 void pmap_remove_range(
182 pmap_t pmap,
183 vm_map_offset_t va,
184 pt_entry_t *spte,
185 pt_entry_t *epte);
186
187 void phys_attribute_clear(
188 ppnum_t phys,
189 int bits);
190
191 int phys_attribute_test(
192 ppnum_t phys,
193 int bits);
194
195 void phys_attribute_set(
196 ppnum_t phys,
197 int bits);
198
199 void pmap_set_reference(
200 ppnum_t pn);
201
202 boolean_t phys_page_exists(
203 ppnum_t pn);
204
205
206 int nx_enabled = 1; /* enable no-execute protection */
207 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
208 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
209
210 const boolean_t cpu_64bit = TRUE; /* Mais oui! */
211
212 /*
213 * when spinning through pmap_remove
214 * ensure that we don't spend too much
215 * time with preemption disabled.
216 * I'm setting the current threshold
217 * to 20us
218 */
219 #define MAX_PREEMPTION_LATENCY_NS 20000
220
221 uint64_t max_preemption_latency_tsc = 0;
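/*
 * Illustrative sketch (not compiled here): once the TSC is calibrated,
 * pmap_init() converts the nanosecond threshold above into TSC ticks:
 *
 *	max_preemption_latency_tsc =
 *	    tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
 *
 * pmap_remove() then computes a deadline of rdtsc64() plus this value and
 * briefly drops and retakes the pmap lock whenever the deadline is exceeded.
 */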
222
223
224 /*
225 * Private data structures.
226 */
227
228 /*
229 * For each vm_page_t, there is a list of all currently
230 * valid virtual mappings of that page. An entry is
231 * a pv_rooted_entry_t; the list is the pv_table.
232 *
233 * N.B. with the new combo rooted/hashed scheme it is
234 * only possible to remove individual non-rooted entries
235 * if they are found via the hashed chains as there is no
236 * way to unlink the singly linked hashed entries if navigated to
237 * via the queue list off the rooted entries. Think of it as
238 * hash/walk/pull, keeping track of the prev pointer while walking
239 * the singly linked hash list. All of this is to save memory and
240 * keep both types of pv_entries as small as possible.
241 */
242
243 /*
244
245 PV HASHING Changes - JK 1/2007
246
247 Pve's establish physical to virtual mappings. These are used for aliasing of a
248 physical page to (potentially many) virtual addresses within pmaps. In the
249 previous implementation the structure of the pv_entries (each 16 bytes in size) was
250
251 typedef struct pv_entry {
252 struct pv_entry *next;
253 pmap_t pmap;
254 vm_map_offset_t va;
255 } *pv_entry_t;
256
257 An initial array of these is created at boot time, one per physical page of
258 memory, indexed by the physical page number. Additionally, a pool of entries
259 is created from a pv_zone to be used as needed by pmap_enter() when it is
260 creating new mappings. Originally, we kept this pool around because the code
261 in pmap_enter() was unable to block if it needed an entry and none were
262 available - we'd panic. Some time ago I restructured the pmap_enter() code
263 so that for user pmaps it can block while zalloc'ing a pv structure and restart,
264 removing a panic from the code (in the case of the kernel pmap we cannot block
265 and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
266 The pool has not been removed since there is a large performance gain keeping
267 freed pv's around for reuse and not suffering the overhead of zalloc for every
268 new pv we need.
269
270 As pmap_enter() created new mappings it linked the new pve's for them off the
271 fixed pv array for that ppn (off the next pointer). These pve's are accessed
272 for several operations, one of them being address space teardown. In that case,
273 we basically do this
274
275 for (every page/pte in the space) {
276 calc pve_ptr from the ppn in the pte
277 for (every pv in the list for the ppn) {
278 if (this pv is for this pmap/vaddr) {
279 do housekeeping
280 unlink/free the pv
281 }
282 }
283 }
284
285 The problem arose when we were running, say 8000 (or even 2000) apache or
286 other processes and one or all terminate. The list hanging off each pv array
287 entry could have thousands of entries. We were continuously linearly searching
288 each of these lists as we stepped through the address space we were tearing
289 down. Because of the locks we hold, likely taking a cache miss for each node,
290 and interrupt disabling for MP issues, the system became completely unresponsive
291 for many seconds while we did this.
292
293 Realizing that pve's are accessed in two distinct ways (linearly running the
294 list by ppn for operations like pmap_page_protect and finding and
295 modifying/removing a single pve as part of pmap_enter processing) has led to
296 modifying the pve structures and databases.
297
298 There are now two types of pve structures. A "rooted" structure which is
299 basically the original structure accessed in an array by ppn, and a "hashed"
300 structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
301 designed with the two goals of minimizing wired memory and making the lookup of
302 a ppn faster. Since a vast majority of pages in the system are not aliased
303 and hence represented by a single pv entry I've kept the rooted entry size as
304 small as possible because there is one of these dedicated for every physical
305 page of memory. The hashed pve's are larger due to the addition of the hash
306 link and the ppn entry needed for matching while running the hash list to find
307 the entry we are looking for. This way, only systems that have lots of
308 aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
309 structures have the same first three fields allowing some simplification in
310 the code.
311
312 They have these shapes
313
314 typedef struct pv_rooted_entry {
315 queue_head_t qlink;
316 vm_map_offset_t va;
317 pmap_t pmap;
318 } *pv_rooted_entry_t;
319
320
321 typedef struct pv_hashed_entry {
322 queue_head_t qlink;
323 vm_map_offset_t va;
324 pmap_t pmap;
325 ppnum_t ppn;
326 struct pv_hashed_entry *nexth;
327 } *pv_hashed_entry_t;
328
329 The main flow difference is that the code is now aware of the rooted entry and
330 the hashed entries. Code that runs the pv list still starts with the rooted
331 entry and then continues down the qlink onto the hashed entries. Code that is
332 looking up a specific pv entry first checks the rooted entry and then hashes
333 and runs the hash list for the match. The hash list lengths are much smaller
334 than the original pv lists that contained all aliases for the specific ppn.
335
336 */
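/*
 * A minimal sketch (not compiled) of the two lookup paths described above,
 * using the helpers defined later in this file:
 *
 *	// by physical page: start at the rooted entry, follow the qlink queue
 *	pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *	for (pv_e = pv_h; ...; pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))
 *		... examine pv_e->pmap and pv_e->va ...
 *
 *	// by (pmap, va): check the rooted entry, else hash and walk nexth
 *	pvh_e = *pvhash(pvhashidx(pmap, va));
 *	while (pvh_e != PV_HASHED_ENTRY_NULL &&
 *	       !(pvh_e->pmap == pmap && pvh_e->va == va && pvh_e->ppn == ppn))
 *		pvh_e = pvh_e->nexth;
 */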
337
338 typedef struct pv_rooted_entry {
339 /* first three entries must match pv_hashed_entry_t */
340 queue_head_t qlink;
341 vm_map_offset_t va; /* virtual address for mapping */
342 pmap_t pmap; /* pmap where mapping lies */
343 } *pv_rooted_entry_t;
344
345 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
346
347 pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
348
349 typedef struct pv_hashed_entry {
350 /* first three entries must match pv_rooted_entry_t */
351 queue_head_t qlink;
352 vm_map_offset_t va;
353 pmap_t pmap;
354 ppnum_t ppn;
355 struct pv_hashed_entry *nexth;
356 } *pv_hashed_entry_t;
357
358 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
359
360 #define NPVHASH 4095 /* MUST BE 2^N - 1 */
361 pv_hashed_entry_t *pv_hash_table; /* hash lists */
362
363 uint32_t npvhash = 0;
364
365 //#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */
366 #ifdef PV_DEBUG
367 #define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
368 #else
369 #define CHK_NPVHASH()
370 #endif
371
372 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
373 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
374 decl_simple_lock_data(,pv_hashed_free_list_lock)
375 decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
376 decl_simple_lock_data(,pv_hash_table_lock)
377
378 int pv_hashed_free_count = 0;
379 int pv_hashed_kern_free_count = 0;
380 #define PV_HASHED_LOW_WATER_MARK 5000
381 #define PV_HASHED_KERN_LOW_WATER_MARK 100
382 #define PV_HASHED_ALLOC_CHUNK 2000
383 #define PV_HASHED_KERN_ALLOC_CHUNK 50
384 thread_call_t mapping_adjust_call;
385 static thread_call_data_t mapping_adjust_call_data;
386 uint32_t mappingrecurse = 0;
387
388 #define PV_HASHED_ALLOC(pvh_e) { \
389 simple_lock(&pv_hashed_free_list_lock); \
390 if ((pvh_e = pv_hashed_free_list) != 0) { \
391 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
392 pv_hashed_free_count--; \
393 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
394 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
395 thread_call_enter(mapping_adjust_call); \
396 } \
397 simple_unlock(&pv_hashed_free_list_lock); \
398 }
399
400 #define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
401 simple_lock(&pv_hashed_free_list_lock); \
402 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
403 pv_hashed_free_list = pvh_eh; \
404 pv_hashed_free_count += pv_cnt; \
405 simple_unlock(&pv_hashed_free_list_lock); \
406 }
407
408 #define PV_HASHED_KERN_ALLOC(pvh_e) { \
409 simple_lock(&pv_hashed_kern_free_list_lock); \
410 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
411 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
412 pv_hashed_kern_free_count--; \
413 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK)\
414 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
415 thread_call_enter(mapping_adjust_call); \
416 } \
417 simple_unlock(&pv_hashed_kern_free_list_lock); \
418 }
419
420 #define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
421 simple_lock(&pv_hashed_kern_free_list_lock); \
422 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
423 pv_hashed_kern_free_list = pvh_eh; \
424 pv_hashed_kern_free_count += pv_cnt; \
425 simple_unlock(&pv_hashed_kern_free_list_lock); \
426 }
427
428 zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
429
430 static zone_t pdpt_zone;
431
432 /*
433 * Each entry in the pv_head_table is locked by a bit in the
434 * pv_lock_table. The lock bits are accessed by the physical
435 * address of the page they lock.
436 */
437
438 char *pv_lock_table; /* pointer to array of bits */
439 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
440
441 char *pv_hash_lock_table;
442 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
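/*
 * Worked reading of the two size macros above (illustrative, not new code):
 * one lock bit per pv_head_table / pv_hash_table entry, packed into bytes,
 * so with BYTE_SIZE == 8 each macro reduces to roughly (n + 7) / 8 bytes
 * for n entries.
 */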
443
444 /*
445 * First and last physical addresses that we maintain any information
446 * for. Initialized to zero so that pmap operations done before
447 * pmap_init won't touch any non-existent structures.
448 */
449 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
450
451 static struct vm_object kptobj_object_store;
452 static struct vm_object kpml4obj_object_store;
453 static struct vm_object kpdptobj_object_store;
454
455 /*
456 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
457 */
458
459 #define pa_index(pa) (i386_btop(pa))
460 #define ppn_to_pai(ppn) ((int)ppn)
461
462 #define pai_to_pvh(pai) (&pv_head_table[pai])
463 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
464 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
465
466 static inline uint32_t
467 pvhashidx(pmap_t pmap, vm_offset_t va)
468 {
469 return ((uint32_t)(uint64_t)pmap ^
470 ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
471 npvhash;
472 }
473 #define pvhash(idx) (&pv_hash_table[idx])
474
475 #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
476 #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
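/*
 * Worked reading of pvhashidx() (illustrative only): with the default
 * npvhash == NPVHASH (4095 == 0xFFF), the bucket index is the low 12 bits
 * of (pmap pointer) XOR (va >> PAGE_SHIFT), i.e.
 *
 *	idx = ((uint32_t)(uint64_t)pmap ^ (uint32_t)(va >> PAGE_SHIFT)) & 0xFFF;
 *	chain = pv_hash_table[idx];
 */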
477
478 /*
479 * Array of physical page attributes for managed pages.
480 * One byte per physical page.
481 */
482 char *pmap_phys_attributes;
483 unsigned int last_managed_page = 0;
484 #define IS_MANAGED_PAGE(x) \
485 ((unsigned int)(x) <= last_managed_page && \
486 (pmap_phys_attributes[x] & PHYS_MANAGED))
487
488 /*
489 * Physical page attributes. Copy bits from PTE definition.
490 */
491 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
492 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
493 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
494
495 /*
496 * Amount of virtual memory mapped by one
497 * page-directory entry.
498 */
499 #define PDE_MAPPED_SIZE (pdetova(1))
500 uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
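/*
 * Worked value (assuming the usual 2-MB page-directory granularity on
 * x86_64, i.e. a PDE shift of 21): pdetova(1) == 1ULL << 21 == 2 MB, so
 * pde_mapped_size is 0x200000.  pmap_remove() and pmap_protect() below use
 * it to clip each pass to the span of a single page-table page.
 */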
501
502 /*
503 * Locking and TLB invalidation
504 */
505
506 /*
507 * Locking Protocols: (changed 2/2007 JK)
508 *
509 * There are two structures in the pmap module that need locking:
510 * the pmaps themselves, and the per-page pv_lists (which are locked
511 * by locking the pv_lock_table entry that corresponds to the pv_head
512 * for the list in question.) Most routines want to lock a pmap and
513 * then do operations in it that require pv_list locking -- however
514 * pmap_remove_all and pmap_copy_on_write operate on a physical page
515 * basis and want to do the locking in the reverse order, i.e. lock
516 * a pv_list and then go through all the pmaps referenced by that list.
517 *
518 * The system wide pmap lock has been removed. Now, paths take a lock
519 * on the pmap before changing its 'shape' and the reverse order lockers
520 * (coming in by phys ppn) take a lock on the corresponding pv and then
521 * retest to be sure nothing changed during the window before they locked
522 * and can then run up/down the pv lists holding the list lock. This also
523 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
524 * previously.
525 */
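/*
 * A minimal sketch (not compiled) of the two orderings described above:
 *
 *	// va-based path (e.g. pmap_remove): pmap lock first, then pv lock
 *	PMAP_LOCK(pmap);
 *	LOCK_PVH(pai);
 *	... modify the pte and the pv list for this page ...
 *	UNLOCK_PVH(pai);
 *	PMAP_UNLOCK(pmap);
 *
 *	// ppn-based path (e.g. pmap_page_protect): pv lock only; retest the
 *	// pte/pv state after locking in case it changed in the window
 *	LOCK_PVH(pai);
 *	... retest, then walk the pv list for this physical page ...
 *	UNLOCK_PVH(pai);
 */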
526
527 /*
528 * PV locking
529 */
530
531 #define LOCK_PVH(index) { \
532 mp_disable_preemption(); \
533 lock_pvh_pai(index); \
534 }
535
536 #define UNLOCK_PVH(index) { \
537 unlock_pvh_pai(index); \
538 mp_enable_preemption(); \
539 }
540 /*
541 * PV hash locking
542 */
543
544 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
545 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
546
547 unsigned pmap_memory_region_count;
548 unsigned pmap_memory_region_current;
549
550 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
551
552 /*
553 * Other useful macros.
554 */
555 #define current_pmap() (vm_map_pmap(current_thread()->map))
556
557 struct pmap kernel_pmap_store;
558 pmap_t kernel_pmap;
559
560 pd_entry_t high_shared_pde;
561 pd_entry_t commpage64_pde;
562
563 struct zone *pmap_zone; /* zone of pmap structures */
564
565 int pmap_debug = 0; /* flag for debugging prints */
566
567 unsigned int inuse_ptepages_count = 0;
568
569 addr64_t kernel64_cr3;
570
571 /*
572 * Pmap cache. Cache is threaded through ref_count field of pmap.
573 * Max will eventually be constant -- variable for experimentation.
574 */
575 int pmap_cache_max = 32;
576 int pmap_alloc_chunk = 8;
577 pmap_t pmap_cache_list;
578 int pmap_cache_count;
579 decl_simple_lock_data(,pmap_cache_lock)
580
581 extern char end;
582
583 static int nkpt;
584
585 pt_entry_t *DMAP1, *DMAP2;
586 caddr_t DADDR1;
587 caddr_t DADDR2;
588
589 /*
590 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
591 * properly deals with the anchor.
592 * must be called with the hash locked, does not unlock it
593 */
594
595 static inline void
596 pmap_pvh_unlink(pv_hashed_entry_t pvh)
597 {
598 pv_hashed_entry_t curh;
599 pv_hashed_entry_t *pprevh;
600 int pvhash_idx;
601
602 CHK_NPVHASH();
603 pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
604
605 pprevh = pvhash(pvhash_idx);
606
607 #if PV_DEBUG
608 if (NULL == *pprevh)
609 panic("pvh_unlink null anchor"); /* JK DEBUG */
610 #endif
611 curh = *pprevh;
612
613 while (PV_HASHED_ENTRY_NULL != curh) {
614 if (pvh == curh)
615 break;
616 pprevh = &curh->nexth;
617 curh = curh->nexth;
618 }
619 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
620 *pprevh = pvh->nexth;
621 return;
622 }
623
624 static inline void
625 pv_hash_add(pv_hashed_entry_t pvh_e,
626 pv_rooted_entry_t pv_h)
627 {
628 pv_hashed_entry_t *hashp;
629 int pvhash_idx;
630
631 CHK_NPVHASH();
632 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
633 LOCK_PV_HASH(pvhash_idx);
634 insque(&pvh_e->qlink, &pv_h->qlink);
635 hashp = pvhash(pvhash_idx);
636 #if PV_DEBUG
637 if (NULL==hashp)
638 panic("pv_hash_add(%p) null hash bucket", pvh_e);
639 #endif
640 pvh_e->nexth = *hashp;
641 *hashp = pvh_e;
642 UNLOCK_PV_HASH(pvhash_idx);
643 }
644
645 static inline void
646 pv_hash_remove(pv_hashed_entry_t pvh_e)
647 {
648 int pvhash_idx;
649
650 CHK_NPVHASH();
651 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
652 LOCK_PV_HASH(pvhash_idx);
653 remque(&pvh_e->qlink);
654 pmap_pvh_unlink(pvh_e);
655 UNLOCK_PV_HASH(pvhash_idx);
656 }
657
658 /*
659 * Remove pv list entry.
660 * Called with pv_head_table entry locked.
661 * Returns pv entry to be freed (or NULL).
662 */
663 static inline pv_hashed_entry_t
664 pmap_pv_remove(pmap_t pmap,
665 vm_map_offset_t vaddr,
666 ppnum_t ppn)
667 {
668 pv_hashed_entry_t pvh_e;
669 pv_rooted_entry_t pv_h;
670 pv_hashed_entry_t *pprevh;
671 int pvhash_idx;
672 uint32_t pv_cnt;
673
674 pvh_e = PV_HASHED_ENTRY_NULL;
675 pv_h = pai_to_pvh(ppn_to_pai(ppn));
676 if (pv_h->pmap == PMAP_NULL)
677 panic("pmap_pv_remove(%p,%llu,%u): null pv_list!",
678 pmap, vaddr, ppn);
679
680 if (pv_h->va == vaddr && pv_h->pmap == pmap) {
681 /*
682 * Header is the pv_rooted_entry.
683 * We can't free that. If there is a queued
684 * entry after this one we remove that
685 * from the ppn queue, we remove it from the hash chain
686 * and copy it to the rooted entry. Then free it instead.
687 */
688 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
689 if (pv_h != (pv_rooted_entry_t) pvh_e) {
690 /*
691 * Entry queued to root, remove this from hash
692 * and install as new root.
693 */
694 CHK_NPVHASH();
695 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
696 LOCK_PV_HASH(pvhash_idx);
697 remque(&pvh_e->qlink);
698 pprevh = pvhash(pvhash_idx);
699 if (PV_HASHED_ENTRY_NULL == *pprevh) {
700 panic("pmap_pv_remove(%p,%llu,%u): "
701 "empty hash, removing rooted",
702 pmap, vaddr, ppn);
703 }
704 pmap_pvh_unlink(pvh_e);
705 UNLOCK_PV_HASH(pvhash_idx);
706 pv_h->pmap = pvh_e->pmap;
707 pv_h->va = pvh_e->va; /* dispose of pvh_e */
708 } else {
709 /* none queued after rooted */
710 pv_h->pmap = PMAP_NULL;
711 pvh_e = PV_HASHED_ENTRY_NULL;
712 }
713 } else {
714 /*
715 * not removing rooted pv. find it on hash chain, remove from
716 * ppn queue and hash chain and free it
717 */
718 CHK_NPVHASH();
719 pvhash_idx = pvhashidx(pmap, vaddr);
720 LOCK_PV_HASH(pvhash_idx);
721 pprevh = pvhash(pvhash_idx);
722 if (PV_HASHED_ENTRY_NULL == *pprevh) {
723 panic("pmap_pv_remove(%p,%llu,%u): empty hash",
724 pmap, vaddr, ppn);
725 }
726 pvh_e = *pprevh;
727 pmap_pv_hashlist_walks++;
728 pv_cnt = 0;
729 while (PV_HASHED_ENTRY_NULL != pvh_e) {
730 pv_cnt++;
731 if (pvh_e->pmap == pmap &&
732 pvh_e->va == vaddr &&
733 pvh_e->ppn == ppn)
734 break;
735 pprevh = &pvh_e->nexth;
736 pvh_e = pvh_e->nexth;
737 }
738 if (PV_HASHED_ENTRY_NULL == pvh_e)
739 panic("pmap_pv_remove(%p,%llu,%u): pv not on hash",
740 pmap, vaddr, ppn);
741 pmap_pv_hashlist_cnts += pv_cnt;
742 if (pmap_pv_hashlist_max < pv_cnt)
743 pmap_pv_hashlist_max = pv_cnt;
744 *pprevh = pvh_e->nexth;
745 remque(&pvh_e->qlink);
746 UNLOCK_PV_HASH(pvhash_idx);
747 }
748
749 return pvh_e;
750 }
751
752 /*
753 * for legacy, returns the address of the pde entry.
754 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
755 * then returns the mapped address of the pde entry in that page
756 */
757 pd_entry_t *
758 pmap_pde(pmap_t m, vm_map_offset_t v)
759 {
760 pd_entry_t *pde;
761
762 assert(m);
763 #if 0
764 if (m == kernel_pmap)
765 pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
766 else
767 #endif
768 pde = pmap64_pde(m, v);
769
770 return pde;
771 }
772
773 /*
774 * the single pml4 page per pmap is allocated at pmap create time and exists
775 * for the duration of the pmap. we allocate this page in kernel vm.
776 * this returns the address of the requested pml4 entry in the top level page.
777 */
778 static inline
779 pml4_entry_t *
780 pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
781 {
782 return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
783 }
784
785 /*
786 * maps in the pml4 page, if any, containing the pdpt entry requested
787 * and returns the address of the pdpt entry in that mapped page
788 */
789 pdpt_entry_t *
790 pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
791 {
792 pml4_entry_t newpf;
793 pml4_entry_t *pml4;
794
795 assert(pmap);
796 if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
797 (vaddr < 0xFFFF800000000000ULL)) {
798 return (0);
799 }
800
801 pml4 = pmap64_pml4(pmap, vaddr);
802 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
803 newpf = *pml4 & PG_FRAME;
804 return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
805 [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
806 }
807 return (NULL);
808 }
809 /*
810 * maps in the pdpt page, if any, containing the pde entry requested
811 * and returns the address of the pde entry in that mapped page
812 */
813 pd_entry_t *
814 pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
815 {
816 pdpt_entry_t newpf;
817 pdpt_entry_t *pdpt;
818
819 assert(pmap);
820 if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
821 (vaddr < 0xFFFF800000000000ULL)) {
822 return (0);
823 }
824
825 pdpt = pmap64_pdpt(pmap, vaddr);
826
827 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
828 newpf = *pdpt & PG_FRAME;
829 return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
830 [(vaddr >> PDSHIFT) & (NPDPG-1)];
831 }
832 return (NULL);
833 }
834
835 /*
836 * return address of mapped pte for vaddr va in pmap pmap.
837 *
838 * physically maps the pde page, if any, containing the pte in and returns
839 * the address of the pte in that mapped page
840 *
841 * In case the pde maps a superpage, return the pde, which, in this case
842 * is the actual page table entry.
843 */
844 pt_entry_t *
845 pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
846 {
847 pd_entry_t *pde;
848 pd_entry_t newpf;
849
850 assert(pmap);
851 pde = pmap_pde(pmap, vaddr);
852
853 if (pde && ((*pde & INTEL_PTE_VALID))) {
854 if (*pde & INTEL_PTE_PS)
855 return pde;
856 newpf = *pde & PG_FRAME;
857 return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
858 [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
859 }
860 return (NULL);
861 }
862
863 /*
864 * Map memory at initialization. The physical addresses being
865 * mapped are not managed and are never unmapped.
866 *
867 * For now, VM is already on, we only need to map the
868 * specified memory.
869 */
870 vm_offset_t
871 pmap_map(
872 vm_offset_t virt,
873 vm_map_offset_t start_addr,
874 vm_map_offset_t end_addr,
875 vm_prot_t prot,
876 unsigned int flags)
877 {
878 int ps;
879
880 ps = PAGE_SIZE;
881 while (start_addr < end_addr) {
882 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
883 (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
884 virt += ps;
885 start_addr += ps;
886 }
887 return(virt);
888 }
889
890 /*
891 * Back-door routine for mapping kernel VM at initialization.
892 * Useful for mapping memory outside the range
893 * Sets no-cache, A, D.
894 * Otherwise like pmap_map.
895 */
896 vm_offset_t
897 pmap_map_bd(
898 vm_offset_t virt,
899 vm_map_offset_t start_addr,
900 vm_map_offset_t end_addr,
901 vm_prot_t prot,
902 unsigned int flags)
903 {
904 pt_entry_t template;
905 pt_entry_t *pte;
906 spl_t spl;
907
908 template = pa_to_pte(start_addr)
909 | INTEL_PTE_REF
910 | INTEL_PTE_MOD
911 | INTEL_PTE_WIRED
912 | INTEL_PTE_VALID;
913
914 if (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
915 template |= INTEL_PTE_NCACHE;
916 if (!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
917 template |= INTEL_PTE_PTA;
918 }
919 if (prot & VM_PROT_WRITE)
920 template |= INTEL_PTE_WRITE;
921
922
923 while (start_addr < end_addr) {
924 spl = splhigh();
925 pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
926 if (pte == PT_ENTRY_NULL) {
927 panic("pmap_map_bd: Invalid kernel address\n");
928 }
929 pmap_store_pte(pte, template);
930 splx(spl);
931 pte_increment_pa(template);
932 virt += PAGE_SIZE;
933 start_addr += PAGE_SIZE;
934 }
935
936
937 flush_tlb();
938 return(virt);
939 }
940
941 extern char *first_avail;
942 extern vm_offset_t virtual_avail, virtual_end;
943 extern pmap_paddr_t avail_start, avail_end;
944 extern vm_offset_t sHIB;
945 extern vm_offset_t eHIB;
946 extern vm_offset_t stext;
947 extern vm_offset_t etext;
948 extern vm_offset_t sdata;
949
950 void
951 pmap_cpu_init(void)
952 {
953 /*
954 * Here early in the life of a processor (from cpu_mode_init()).
955 * Ensure global page feature is disabled.
956 */
957 set_cr4(get_cr4() &~ CR4_PGE);
958
959 /*
960 * Initialize the per-cpu, TLB-related fields.
961 */
962 current_cpu_datap()->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
963 current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
964 current_cpu_datap()->cpu_tlb_invalid = FALSE;
965 }
966
967
968
969 /*
970 * Bootstrap the system enough to run with virtual memory.
971 * Map the kernel's code and data, and allocate the system page table.
972 * Called with mapping OFF. Page_size must already be set.
973 */
974
975 void
976 pmap_bootstrap(
977 __unused vm_offset_t load_start,
978 __unused boolean_t IA32e)
979 {
980 #if NCOPY_WINDOWS > 0
981 vm_offset_t va;
982 int i;
983 #endif
984
985 assert(IA32e);
986
987 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
988 * known to VM */
989 /*
990 * The kernel's pmap is statically allocated so we don't
991 * have to use pmap_create, which is unlikely to work
992 * correctly at this part of the boot sequence.
993 */
994
995 kernel_pmap = &kernel_pmap_store;
996 kernel_pmap->ref_count = 1;
997 kernel_pmap->nx_enabled = FALSE;
998 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
999 kernel_pmap->pm_obj = (vm_object_t) NULL;
1000 kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD);
1001 kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT);
1002 kernel_pmap->pm_pml4 = IdlePML4;
1003 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
1004
1005
1006 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1007
1008 nkpt = NKPT;
1009 OSAddAtomic(NKPT, &inuse_ptepages_count);
1010
1011 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
1012 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1013
1014 #if NCOPY_WINDOWS > 0
1015 /*
1016 * Reserve some special page table entries/VA space for temporary
1017 * mapping of pages.
1018 */
1019 #define SYSMAP(c, p, v, n) \
1020 v = (c)va; va += ((n)*INTEL_PGBYTES);
1021
1022 va = virtual_avail;
1023
1024 for (i=0; i<PMAP_NWINDOWS; i++) {
1025 #if 1
1026 kprintf("trying to do SYSMAP idx %d %p\n", i,
1027 current_cpu_datap());
1028 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
1029 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
1030 kprintf("two stuff %p %p\n",
1031 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1032 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
1033 #endif
1034 SYSMAP(caddr_t,
1035 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1036 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1037 1);
1038 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
1039 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
1040 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1041 }
1042
1043 /* DMAP use for debugger */
1044 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1045 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
1046
1047 virtual_avail = va;
1048 #endif
1049
1050 if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
1051 if (0 != ((npvhash + 1) & npvhash)) {
1052 kprintf("invalid hash %d, must be ((2^N)-1), "
1053 "using default %d\n", npvhash, NPVHASH);
1054 npvhash = NPVHASH;
1055 }
1056 } else {
1057 npvhash = NPVHASH;
1058 }
1059
1060 printf("npvhash=%d\n", npvhash);
1061
1062 simple_lock_init(&kernel_pmap->lock, 0);
1063 simple_lock_init(&pv_hashed_free_list_lock, 0);
1064 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1065 simple_lock_init(&pv_hash_table_lock,0);
1066
1067 pmap_cpu_init();
1068
1069 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
1070 (long)KERNEL_BASE, (long)virtual_end);
1071 kprintf("Available physical space from 0x%llx to 0x%llx\n",
1072 avail_start, avail_end);
1073
1074 /*
1075 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
1076 * in the DEBUG kernel) to force the kernel to switch to its own map
1077 * (and cr3) when control is in kernelspace. The kernel's map does not
1078 * include (i.e. share) userspace so wild references will cause
1079 * a panic. Only copyin and copyout are exempt from this.
1080 */
1081 (void) PE_parse_boot_argn("-no_shared_cr3",
1082 &no_shared_cr3, sizeof (no_shared_cr3));
1083 if (no_shared_cr3)
1084 kprintf("Kernel not sharing user map\n");
1085
1086 #ifdef PMAP_TRACES
1087 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
1088 kprintf("Kernel traces for pmap operations enabled\n");
1089 }
1090 #endif /* PMAP_TRACES */
1091 }
1092
1093 void
1094 pmap_virtual_space(
1095 vm_offset_t *startp,
1096 vm_offset_t *endp)
1097 {
1098 *startp = virtual_avail;
1099 *endp = virtual_end;
1100 }
1101
1102 /*
1103 * Initialize the pmap module.
1104 * Called by vm_init, to initialize any structures that the pmap
1105 * system needs to map virtual memory.
1106 */
1107 void
1108 pmap_init(void)
1109 {
1110 long npages;
1111 vm_offset_t addr;
1112 vm_size_t s;
1113 vm_map_offset_t vaddr;
1114 ppnum_t ppn;
1115
1116
1117 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
1118 _vm_object_allocate((vm_object_size_t)NPML4PGS, &kpml4obj_object_store);
1119
1120 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
1121 _vm_object_allocate((vm_object_size_t)NPDPTPGS, &kpdptobj_object_store);
1122
1123 kernel_pmap->pm_obj = &kptobj_object_store;
1124 _vm_object_allocate((vm_object_size_t)NPDEPGS, &kptobj_object_store);
1125
1126 /*
1127 * Allocate memory for the pv_head_table and its lock bits,
1128 * the modify bit array, and the pte_page table.
1129 */
1130
1131 /*
1132 * index all these arrays from physical page 0 rather than from avail_start,
1133 * so we cover all of memory
1134 */
1135
1136 npages = i386_btop(avail_end);
1137 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1138 + (sizeof (struct pv_hashed_entry *) * (npvhash+1))
1139 + pv_lock_table_size(npages)
1140 + pv_hash_lock_table_size((npvhash+1))
1141 + npages);
1142
1143 s = round_page(s);
1144 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
1145 KMA_KOBJECT | KMA_PERMANENT)
1146 != KERN_SUCCESS)
1147 panic("pmap_init");
1148
1149 memset((char *)addr, 0, s);
1150
1151 #if PV_DEBUG
1152 if (0 == npvhash) panic("npvhash not initialized");
1153 #endif
1154
1155 /*
1156 * Allocate the structures first to preserve word-alignment.
1157 */
1158 pv_head_table = (pv_rooted_entry_t) addr;
1159 addr = (vm_offset_t) (pv_head_table + npages);
1160
1161 pv_hash_table = (pv_hashed_entry_t *)addr;
1162 addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1163
1164 pv_lock_table = (char *) addr;
1165 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1166
1167 pv_hash_lock_table = (char *) addr;
1168 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1169
1170 pmap_phys_attributes = (char *) addr;
1171
1172 ppnum_t last_pn = i386_btop(avail_end);
1173 unsigned int i;
1174 pmap_memory_region_t *pmptr = pmap_memory_regions;
1175 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1176 if (pmptr->type != kEfiConventionalMemory)
1177 continue;
1178 unsigned int pn;
1179 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1180 if (pn < last_pn) {
1181 pmap_phys_attributes[pn] |= PHYS_MANAGED;
1182 if (pn > last_managed_page)
1183 last_managed_page = pn;
1184 }
1185 }
1186 }
1187
1188 /*
1189 * Create the zone of physical maps,
1190 * and of the physical-to-virtual entries.
1191 */
1192 s = (vm_size_t) sizeof(struct pmap);
1193 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
1194 s = (vm_size_t) sizeof(struct pv_hashed_entry);
1195 pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
1196 s = 63;
1197 pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
1198
1199
1200 /* create pv entries for kernel pages mapped by low level
1201 startup code. these have to exist so we can pmap_remove()
1202 e.g. kext pages from the middle of our addr space */
1203
1204 vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
1205 for (ppn = 0; ppn < i386_btop(avail_start); ppn++) {
1206 pv_rooted_entry_t pv_e;
1207
1208 pv_e = pai_to_pvh(ppn);
1209 pv_e->va = vaddr;
1210 vaddr += PAGE_SIZE;
1211 pv_e->pmap = kernel_pmap;
1212 queue_init(&pv_e->qlink);
1213 }
1214 pmap_initialized = TRUE;
1215
1216 /*
1217 * Initialize pmap cache.
1218 */
1219 pmap_cache_list = PMAP_NULL;
1220 pmap_cache_count = 0;
1221 simple_lock_init(&pmap_cache_lock, 0);
1222
1223 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1224
1225 /*
1226 * Ensure the kernel's PML4 entry exists for the basement
1227 * before this is shared with any user.
1228 */
1229 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT);
1230 }
1231
1232
1233 /*
1234 * this function is only used for debugging from the vm layer
1235 */
1236 boolean_t
1237 pmap_verify_free(
1238 ppnum_t pn)
1239 {
1240 pv_rooted_entry_t pv_h;
1241 int pai;
1242 boolean_t result;
1243
1244 assert(pn != vm_page_fictitious_addr);
1245
1246 if (!pmap_initialized)
1247 return(TRUE);
1248
1249 if (pn == vm_page_guard_addr)
1250 return TRUE;
1251
1252 pai = ppn_to_pai(pn);
1253 if (!IS_MANAGED_PAGE(pai))
1254 return(FALSE);
1255 pv_h = pai_to_pvh(pn);
1256 result = (pv_h->pmap == PMAP_NULL);
1257 return(result);
1258 }
1259
1260 boolean_t
1261 pmap_is_empty(
1262 pmap_t pmap,
1263 vm_map_offset_t va_start,
1264 vm_map_offset_t va_end)
1265 {
1266 vm_map_offset_t offset;
1267 ppnum_t phys_page;
1268
1269 if (pmap == PMAP_NULL) {
1270 return TRUE;
1271 }
1272
1273 /*
1274 * Check the resident page count
1275 * - if it's zero, the pmap is completely empty.
1276 * This short-circuit test prevents a virtual address scan which is
1277 * painfully slow for 64-bit spaces.
1278 * This assumes the count is correct;
1279 * the debug kernel ought to verify it, perhaps by a page-table walk.
1280 */
1281 if (pmap->stats.resident_count == 0)
1282 return TRUE;
1283
1284 for (offset = va_start;
1285 offset < va_end;
1286 offset += PAGE_SIZE_64) {
1287 phys_page = pmap_find_phys(pmap, offset);
1288 if (phys_page) {
1289 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1290 "page %d at 0x%llx\n",
1291 pmap, va_start, va_end, phys_page, offset);
1292 return FALSE;
1293 }
1294 }
1295
1296 return TRUE;
1297 }
1298
1299
1300 /*
1301 * Create and return a physical map.
1302 *
1303 * If the size specified for the map
1304 * is zero, the map is an actual physical
1305 * map, and may be referenced by the
1306 * hardware.
1307 *
1308 * If the size specified is non-zero,
1309 * the map will be used in software only, and
1310 * is bounded by that size.
1311 */
1312 pmap_t
1313 pmap_create(
1314 vm_map_size_t sz,
1315 boolean_t is_64bit)
1316 {
1317 pmap_t p;
1318 vm_size_t size;
1319 pml4_entry_t *pml4;
1320 pml4_entry_t *kpml4;
1321
1322 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1323 (uint32_t) (sz>>32), (uint32_t) sz, is_64bit, 0, 0);
1324
1325 size = (vm_size_t) sz;
1326
1327 /*
1328 * A software use-only map doesn't even need a pmap.
1329 */
1330
1331 if (size != 0) {
1332 return(PMAP_NULL);
1333 }
1334
1335 p = (pmap_t) zalloc(pmap_zone);
1336 if (PMAP_NULL == p)
1337 panic("pmap_create zalloc");
1338
1339 /* init counts now since we'll be bumping some */
1340 simple_lock_init(&p->lock, 0);
1341 p->stats.resident_count = 0;
1342 p->stats.resident_max = 0;
1343 p->stats.wired_count = 0;
1344 p->ref_count = 1;
1345 p->nx_enabled = 1;
1346 p->pm_shared = FALSE;
1347
1348 p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;
1349
1350 /* alloc the pml4 page in kernel vm */
1351 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_pml4), PAGE_SIZE))
1352 panic("pmap_create kmem_alloc_kobject pml4");
1353
1354 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1355 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1356
1357 OSAddAtomic(1, &inuse_ptepages_count);
1358
1359 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1360
1361 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS));
1362 if (NULL == p->pm_obj_pml4)
1363 panic("pmap_create pml4 obj");
1364
1365 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS));
1366 if (NULL == p->pm_obj_pdpt)
1367 panic("pmap_create pdpt obj");
1368
1369 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS));
1370 if (NULL == p->pm_obj)
1371 panic("pmap_create pte obj");
1372
1373 /* All pmaps share the kernel's pml4 */
1374 pml4 = pmap64_pml4(p, 0ULL);
1375 kpml4 = kernel_pmap->pm_pml4;
1376 pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX];
1377 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1378 pml4[KERNEL_PHYSMAP_INDEX] = kpml4[KERNEL_PHYSMAP_INDEX];
1379
1380 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1381 p, is_64bit, 0, 0, 0);
1382
1383 return(p);
1384 }
1385
1386 /*
1387 * Retire the given physical map from service.
1388 * Should only be called if the map contains
1389 * no valid mappings.
1390 */
1391
1392 void
1393 pmap_destroy(
1394 register pmap_t p)
1395 {
1396 register int c;
1397
1398 if (p == PMAP_NULL)
1399 return;
1400
1401 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1402 p, 0, 0, 0, 0);
1403
1404 PMAP_LOCK(p);
1405
1406 c = --p->ref_count;
1407
1408 if (c == 0) {
1409 /*
1410 * If some cpu is not using the physical pmap pointer that it
1411 * is supposed to be (see set_dirbase), we might be using the
1412 * pmap that is being destroyed! Make sure we are
1413 * physically on the right pmap:
1414 */
1415 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1416 }
1417
1418 PMAP_UNLOCK(p);
1419
1420 if (c != 0) {
1421 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1422 p, 1, 0, 0, 0);
1423 return; /* still in use */
1424 }
1425
1426 /*
1427 * Free the memory maps, then the
1428 * pmap structure.
1429 */
1430 int inuse_ptepages = 0;
1431
1432 inuse_ptepages++;
1433 kmem_free(kernel_map, (vm_offset_t)p->pm_pml4, PAGE_SIZE);
1434
1435 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1436 vm_object_deallocate(p->pm_obj_pml4);
1437
1438 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1439 vm_object_deallocate(p->pm_obj_pdpt);
1440
1441 inuse_ptepages += p->pm_obj->resident_page_count;
1442 vm_object_deallocate(p->pm_obj);
1443
1444 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1445
1446 zfree(pmap_zone, p);
1447
1448 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1449 0, 0, 0, 0, 0);
1450 }
1451
1452 /*
1453 * Add a reference to the specified pmap.
1454 */
1455
1456 void
1457 pmap_reference(pmap_t p)
1458 {
1459 if (p != PMAP_NULL) {
1460 PMAP_LOCK(p);
1461 p->ref_count++;
1462 PMAP_UNLOCK(p);
1463 }
1464 }
1465
1466 /*
1467 * Remove a range of hardware page-table entries.
1468 * The entries given are the first (inclusive)
1469 * and last (exclusive) entries for the VM pages.
1470 * The virtual address is the va for the first pte.
1471 *
1472 * The pmap must be locked.
1473 * If the pmap is not the kernel pmap, the range must lie
1474 * entirely within one pte-page. This is NOT checked.
1475 * Assumes that the pte-page exists.
1476 */
1477
1478 void
1479 pmap_remove_range(
1480 pmap_t pmap,
1481 vm_map_offset_t start_vaddr,
1482 pt_entry_t *spte,
1483 pt_entry_t *epte)
1484 {
1485 pt_entry_t *cpte;
1486 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1487 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1488 pv_hashed_entry_t pvh_e;
1489 int pvh_cnt = 0;
1490 int num_removed, num_unwired, num_found;
1491 int pai;
1492 pmap_paddr_t pa;
1493 vm_map_offset_t vaddr;
1494
1495 num_removed = 0;
1496 num_unwired = 0;
1497 num_found = 0;
1498
1499 /* invalidate the PTEs first to "freeze" them */
1500 for (cpte = spte, vaddr = start_vaddr;
1501 cpte < epte;
1502 cpte++, vaddr += PAGE_SIZE_64) {
1503
1504 pa = pte_to_pa(*cpte);
1505 if (pa == 0)
1506 continue;
1507 num_found++;
1508
1509 if (iswired(*cpte))
1510 num_unwired++;
1511
1512 pai = pa_index(pa);
1513
1514 if (!IS_MANAGED_PAGE(pai)) {
1515 /*
1516 * Outside range of managed physical memory.
1517 * Just remove the mappings.
1518 */
1519 pmap_store_pte(cpte, 0);
1520 continue;
1521 }
1522
1523 /* invalidate the PTE */
1524 pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1525 }
1526
1527 if (num_found == 0) {
1528 /* nothing was changed: we're done */
1529 goto update_counts;
1530 }
1531
1532 /* propagate the invalidates to other CPUs */
1533
1534 PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1535
1536 for (cpte = spte, vaddr = start_vaddr;
1537 cpte < epte;
1538 cpte++, vaddr += PAGE_SIZE_64) {
1539
1540 pa = pte_to_pa(*cpte);
1541 if (pa == 0)
1542 continue;
1543
1544 pai = pa_index(pa);
1545
1546 LOCK_PVH(pai);
1547
1548 pa = pte_to_pa(*cpte);
1549 if (pa == 0) {
1550 UNLOCK_PVH(pai);
1551 continue;
1552 }
1553 num_removed++;
1554
1555 /*
1556 * Get the modify and reference bits, then
1557 * nuke the entry in the page table
1558 */
1559 /* remember reference and change */
1560 pmap_phys_attributes[pai] |=
1561 (char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
1562 /* completely invalidate the PTE */
1563 pmap_store_pte(cpte, 0);
1564
1565 /*
1566 * Remove the mapping from the pvlist for this physical page.
1567 */
1568 pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai);
1569
1570 UNLOCK_PVH(pai);
1571
1572 if (pvh_e != PV_HASHED_ENTRY_NULL) {
1573 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1574 pvh_eh = pvh_e;
1575
1576 if (pvh_et == PV_HASHED_ENTRY_NULL) {
1577 pvh_et = pvh_e;
1578 }
1579 pvh_cnt++;
1580 }
1581 } /* for loop */
1582
1583 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1584 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1585 }
1586 update_counts:
1587 /*
1588 * Update the counts
1589 */
1590 #if TESTING
1591 if (pmap->stats.resident_count < num_removed)
1592 panic("pmap_remove_range: resident_count");
1593 #endif
1594 assert(pmap->stats.resident_count >= num_removed);
1595 OSAddAtomic(-num_removed, &pmap->stats.resident_count);
1596
1597 #if TESTING
1598 if (pmap->stats.wired_count < num_unwired)
1599 panic("pmap_remove_range: wired_count");
1600 #endif
1601 assert(pmap->stats.wired_count >= num_unwired);
1602 OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
1603
1604 return;
1605 }
1606
1607 /*
1608 * Remove phys addr if mapped in specified map
1609 *
1610 */
1611 void
1612 pmap_remove_some_phys(
1613 __unused pmap_t map,
1614 __unused ppnum_t pn)
1615 {
1616
1617 /* Implement to support working set code */
1618
1619 }
1620
1621 /*
1622 * Remove the given range of addresses
1623 * from the specified map.
1624 *
1625 * It is assumed that the start and end are properly
1626 * rounded to the hardware page size.
1627 */
1628 void
1629 pmap_remove(
1630 pmap_t map,
1631 addr64_t s64,
1632 addr64_t e64)
1633 {
1634 pt_entry_t *pde;
1635 pt_entry_t *spte, *epte;
1636 addr64_t l64;
1637 uint64_t deadline;
1638
1639 pmap_intr_assert();
1640
1641 if (map == PMAP_NULL || s64 == e64)
1642 return;
1643
1644 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
1645 map,
1646 (uint32_t) (s64 >> 32), s64,
1647 (uint32_t) (e64 >> 32), e64);
1648
1649
1650 PMAP_LOCK(map);
1651
1652 #if 0
1653 /*
1654 * Check that address range in the kernel does not overlap the stacks.
1655 * We initialize local static min/max variables once to avoid making
1656 * 2 function calls for every remove. Note also that these functions
1657 * both return 0 before kernel stacks have been initialized, and hence
1658 * the panic is not triggered in this case.
1659 */
1660 if (map == kernel_pmap) {
1661 static vm_offset_t kernel_stack_min = 0;
1662 static vm_offset_t kernel_stack_max = 0;
1663
1664 if (kernel_stack_min == 0) {
1665 kernel_stack_min = min_valid_stack_address();
1666 kernel_stack_max = max_valid_stack_address();
1667 }
1668 if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
1669 (kernel_stack_min < e64 && e64 <= kernel_stack_max))
1670 panic("pmap_remove() attempted in kernel stack");
1671 }
1672 #else
1673
1674 /*
1675 * The values of kernel_stack_min and kernel_stack_max are no longer
1676 * relevant now that we allocate kernel stacks in the kernel map,
1677 * so the old code above no longer applies. If we wanted to check that
1678 * we weren't removing a mapping of a page in a kernel stack we'd
1679 * mark the PTE with an unused bit and check that here.
1680 */
1681
1682 #endif
1683
1684 deadline = rdtsc64() + max_preemption_latency_tsc;
1685
1686 while (s64 < e64) {
1687 l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1688 if (l64 > e64)
1689 l64 = e64;
1690 pde = pmap_pde(map, s64);
1691
1692 if (pde && (*pde & INTEL_PTE_VALID)) {
1693 if (*pde & INTEL_PTE_PS) {
1694 /*
1695 * If we're removing a superpage, pmap_remove_range()
1696 * must work on level 2 instead of level 1; and we're
1697 * only passing a single level 2 entry instead of a
1698 * level 1 range.
1699 */
1700 spte = pde;
1701 epte = spte+1; /* excluded */
1702 } else {
1703 spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
1704 spte = &spte[ptenum(s64)];
1705 epte = &spte[intel_btop(l64 - s64)];
1706 }
1707 pmap_remove_range(map, s64, spte, epte);
1708 }
1709 s64 = l64;
1710 pde++;
1711
1712 if (s64 < e64 && rdtsc64() >= deadline) {
1713 PMAP_UNLOCK(map)
1714 PMAP_LOCK(map)
1715 deadline = rdtsc64() + max_preemption_latency_tsc;
1716 }
1717 }
1718
1719 PMAP_UNLOCK(map);
1720
1721 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
1722 map, 0, 0, 0, 0);
1723
1724 }
1725
1726 /*
1727 * Routine: pmap_page_protect
1728 *
1729 * Function:
1730 * Lower the permission for all mappings to a given
1731 * page.
1732 */
1733 void
1734 pmap_page_protect(
1735 ppnum_t pn,
1736 vm_prot_t prot)
1737 {
1738 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1739 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1740 pv_hashed_entry_t nexth;
1741 int pvh_cnt = 0;
1742 pv_rooted_entry_t pv_h;
1743 pv_rooted_entry_t pv_e;
1744 pv_hashed_entry_t pvh_e;
1745 pt_entry_t *pte;
1746 int pai;
1747 pmap_t pmap;
1748 boolean_t remove;
1749
1750 pmap_intr_assert();
1751 assert(pn != vm_page_fictitious_addr);
1752 if (pn == vm_page_guard_addr)
1753 return;
1754
1755 pai = ppn_to_pai(pn);
1756
1757 if (!IS_MANAGED_PAGE(pai)) {
1758 /*
1759 * Not a managed page.
1760 */
1761 return;
1762 }
1763 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
1764 pn, prot, 0, 0, 0);
1765
1766 /*
1767 * Determine the new protection.
1768 */
1769 switch (prot) {
1770 case VM_PROT_READ:
1771 case VM_PROT_READ | VM_PROT_EXECUTE:
1772 remove = FALSE;
1773 break;
1774 case VM_PROT_ALL:
1775 return; /* nothing to do */
1776 default:
1777 remove = TRUE;
1778 break;
1779 }
1780
1781 pv_h = pai_to_pvh(pai);
1782
1783 LOCK_PVH(pai);
1784
1785
1786 /*
1787 * Walk down PV list, if any, changing or removing all mappings.
1788 */
1789 if (pv_h->pmap == PMAP_NULL)
1790 goto done;
1791
1792 pv_e = pv_h;
1793 pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */
1794
1795 do {
1796 vm_map_offset_t vaddr;
1797
1798 pmap = pv_e->pmap;
1799 vaddr = pv_e->va;
1800 pte = pmap_pte(pmap, vaddr);
1801 if (0 == pte) {
1802 panic("pmap_page_protect() "
1803 "pmap=%p pn=0x%x vaddr=0x%llx\n",
1804 pmap, pn, vaddr);
1805 }
1806 nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1807
1808 /*
1809 * Remove the mapping if new protection is NONE
1810 * or if write-protecting a kernel mapping.
1811 */
1812 if (remove || pmap == kernel_pmap) {
1813 /*
1814 * Remove the mapping, collecting dirty bits.
1815 */
1816 pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID);
1817 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1818 pmap_phys_attributes[pai] |=
1819 *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1820 pmap_store_pte(pte, 0);
1821
1822 #if TESTING
1823 if (pmap->stats.resident_count < 1)
1824 panic("pmap_page_protect: resident_count");
1825 #endif
1826 assert(pmap->stats.resident_count >= 1);
1827 OSAddAtomic(-1, &pmap->stats.resident_count);
1828
1829 /*
1830 * Deal with the pv_rooted_entry.
1831 */
1832
1833 if (pv_e == pv_h) {
1834 /*
1835 * Fix up head later.
1836 */
1837 pv_h->pmap = PMAP_NULL;
1838 } else {
1839 /*
1840 * Delete this entry.
1841 */
1842 pv_hash_remove(pvh_e);
1843 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1844 pvh_eh = pvh_e;
1845
1846 if (pvh_et == PV_HASHED_ENTRY_NULL)
1847 pvh_et = pvh_e;
1848 pvh_cnt++;
1849 }
1850 } else {
1851 /*
1852 * Write-protect.
1853 */
1854 pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE);
1855 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1856 }
1857 pvh_e = nexth;
1858 } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1859
1860
1861 /*
1862 * If pv_head mapping was removed, fix it up.
1863 */
1864 if (pv_h->pmap == PMAP_NULL) {
1865 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1866
1867 if (pvh_e != (pv_hashed_entry_t) pv_h) {
1868 pv_hash_remove(pvh_e);
1869 pv_h->pmap = pvh_e->pmap;
1870 pv_h->va = pvh_e->va;
1871 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1872 pvh_eh = pvh_e;
1873
1874 if (pvh_et == PV_HASHED_ENTRY_NULL)
1875 pvh_et = pvh_e;
1876 pvh_cnt++;
1877 }
1878 }
1879 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1880 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1881 }
1882 done:
1883 UNLOCK_PVH(pai);
1884
1885 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
1886 0, 0, 0, 0, 0);
1887 }
1888
1889
1890 /*
1891 * Routine:
1892 * pmap_disconnect
1893 *
1894 * Function:
1895 * Disconnect all mappings for this page and return reference and change status
1896 * in generic format.
1897 *
1898 */
1899 unsigned int pmap_disconnect(
1900 ppnum_t pa)
1901 {
1902 pmap_page_protect(pa, 0); /* disconnect the page */
1903 return (pmap_get_refmod(pa)); /* return ref/chg status */
1904 }
1905
1906 /*
1907 * Set the physical protection on the
1908 * specified range of this map as requested.
1909 * Will not increase permissions.
1910 */
1911 void
1912 pmap_protect(
1913 pmap_t map,
1914 vm_map_offset_t sva,
1915 vm_map_offset_t eva,
1916 vm_prot_t prot)
1917 {
1918 pt_entry_t *pde;
1919 pt_entry_t *spte, *epte;
1920 vm_map_offset_t lva;
1921 vm_map_offset_t orig_sva;
1922 boolean_t set_NX;
1923 int num_found = 0;
1924
1925 pmap_intr_assert();
1926
1927 if (map == PMAP_NULL)
1928 return;
1929
1930 if (prot == VM_PROT_NONE) {
1931 pmap_remove(map, sva, eva);
1932 return;
1933 }
1934 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1935 map,
1936 (uint32_t) (sva >> 32), (uint32_t) sva,
1937 (uint32_t) (eva >> 32), (uint32_t) eva);
1938
1939 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
1940 set_NX = FALSE;
1941 else
1942 set_NX = TRUE;
1943
1944 PMAP_LOCK(map);
1945
1946 orig_sva = sva;
1947 while (sva < eva) {
1948 lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
1949 if (lva > eva)
1950 lva = eva;
1951 pde = pmap_pde(map, sva);
1952 if (pde && (*pde & INTEL_PTE_VALID)) {
1953 if (*pde & INTEL_PTE_PS) {
1954 /* superpage */
1955 spte = pde;
1956 epte = spte+1; /* excluded */
1957 } else {
1958 spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
1959 spte = &spte[ptenum(sva)];
1960 epte = &spte[intel_btop(lva - sva)];
1961 }
1962
1963 for (; spte < epte; spte++) {
1964 if (!(*spte & INTEL_PTE_VALID))
1965 continue;
1966
1967 if (prot & VM_PROT_WRITE)
1968 pmap_update_pte(spte, *spte,
1969 *spte | INTEL_PTE_WRITE);
1970 else
1971 pmap_update_pte(spte, *spte,
1972 *spte & ~INTEL_PTE_WRITE);
1973
1974 if (set_NX)
1975 pmap_update_pte(spte, *spte,
1976 *spte | INTEL_PTE_NX);
1977 else
1978 pmap_update_pte(spte, *spte,
1979 *spte & ~INTEL_PTE_NX);
1980
1981 num_found++;
1982 }
1983 }
1984 sva = lva;
1985 }
1986 if (num_found)
1987 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1988
1989 PMAP_UNLOCK(map);
1990
1991 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
1992 0, 0, 0, 0, 0);
1993
1994 }
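/*
 * Illustrative sketch: a hypothetical use of pmap_protect() to make a
 * range read-only.  The function name and arguments are placeholders.
 * Note that pmap_protect() never raises permissions, and VM_PROT_NONE
 * is handled above by removing the mappings outright.
 */
#if 0	/* illustrative only; not compiled */
static void
example_write_protect_range(
	pmap_t		pmap,
	vm_map_offset_t	start,
	vm_map_offset_t	end)
{
	/* drop write permission; execute permission is left alone */
	pmap_protect(pmap, start, end, VM_PROT_READ | VM_PROT_EXECUTE);
}
#endif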
1995
1996 /* Map a (possibly) autogenned block */
1997 void
1998 pmap_map_block(
1999 pmap_t pmap,
2000 addr64_t va,
2001 ppnum_t pa,
2002 uint32_t size,
2003 vm_prot_t prot,
2004 int attr,
2005 __unused unsigned int flags)
2006 {
2007 uint32_t page;
2008 int cur_page_size;
2009
2010 if (attr & VM_MEM_SUPERPAGE)
2011 cur_page_size = SUPERPAGE_SIZE;
2012 else
2013 cur_page_size = PAGE_SIZE;
2014
2015 for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
2016 pmap_enter(pmap, va, pa, prot, attr, TRUE);
2017 va += cur_page_size;
2018 pa += cur_page_size / PAGE_SIZE;
2019 }
2020 }
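/*
 * Illustrative sketch: a hypothetical call mapping a physically
 * contiguous block.  The 'size' argument to pmap_map_block() counts
 * PAGE_SIZE pages; the loop above converts that into a superpage
 * stride when VM_MEM_SUPERPAGE is requested.  The values below are
 * placeholders.
 */
#if 0	/* illustrative only; not compiled */
static void
example_map_block(
	pmap_t		pmap,
	addr64_t	va,
	ppnum_t		first_page)
{
	/* map 16 base pages, read/write, default attributes */
	pmap_map_block(pmap, va, first_page, 16,
		       VM_PROT_READ | VM_PROT_WRITE, 0, 0);
}
#endif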
2021
2022
2023 /*
2024 * Insert the given physical page (p) at
2025 * the specified virtual address (v) in the
2026 * target physical map with the protection requested.
2027 *
2028 * If specified, the page will be wired down, meaning
2029 * that the related pte cannot be reclaimed.
2030 *
2031 * NB: This is the only routine which MAY NOT lazy-evaluate
2032 * or lose information. That is, this routine must actually
2033 * insert this page into the given map NOW.
2034 */
2035 void
2036 pmap_enter(
2037 register pmap_t pmap,
2038 vm_map_offset_t vaddr,
2039 ppnum_t pn,
2040 vm_prot_t prot,
2041 unsigned int flags,
2042 boolean_t wired)
2043 {
2044 pt_entry_t *pte;
2045 pv_rooted_entry_t pv_h;
2046 int pai;
2047 pv_hashed_entry_t pvh_e;
2048 pv_hashed_entry_t pvh_new;
2049 pt_entry_t template;
2050 pmap_paddr_t old_pa;
2051 pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn);
2052 boolean_t need_tlbflush = FALSE;
2053 boolean_t set_NX;
2054 char oattr;
2055 boolean_t old_pa_locked;
2056 boolean_t superpage = flags & VM_MEM_SUPERPAGE;
2057 vm_object_t delpage_pm_obj = NULL;
2058 int delpage_pde_index = 0;
2059
2060
2061 pmap_intr_assert();
2062 assert(pn != vm_page_fictitious_addr);
2063 if (pmap_debug)
2064 kprintf("pmap_enter(%p,%llu,%u)\n", pmap, vaddr, pn);
2065 if (pmap == PMAP_NULL)
2066 return;
2067 if (pn == vm_page_guard_addr)
2068 return;
2069
2070 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2071 pmap,
2072 (uint32_t) (vaddr >> 32), (uint32_t) vaddr,
2073 pn, prot);
2074
2075 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
2076 set_NX = FALSE;
2077 else
2078 set_NX = TRUE;
2079
2080 /*
2081 * Must allocate a new pvlist entry while we're unlocked;
2082 * zalloc may cause pageout (which will lock the pmap system).
2083 * If we determine we need a pvlist entry, we will unlock
2084 * and allocate one. Then we will retry, throwing away
2085 * the allocated entry later (if we no longer need it).
2086 */
2087
2088 pvh_new = PV_HASHED_ENTRY_NULL;
2089 Retry:
2090 pvh_e = PV_HASHED_ENTRY_NULL;
2091
2092 PMAP_LOCK(pmap);
2093
2094 /*
2095 * Expand pmap to include this pte. Assume that
2096 * pmap is always expanded to include enough hardware
2097 * pages to map one VM page.
2098 */
2099 if (superpage) {
2100 while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
2101 /* need room for another pde entry */
2102 PMAP_UNLOCK(pmap);
2103 pmap_expand_pdpt(pmap, vaddr);
2104 PMAP_LOCK(pmap);
2105 }
2106 } else {
2107 while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
2108 /*
2109 * Must unlock to expand the pmap
2110 * going to grow pde level page(s)
2111 */
2112 PMAP_UNLOCK(pmap);
2113 pmap_expand(pmap, vaddr);
2114 PMAP_LOCK(pmap);
2115 }
2116 }
2117
2118 if (superpage && *pte && !(*pte & INTEL_PTE_PS)) {
2119 /*
2120 * There is still an empty page table mapped that
2121 * was used for a previous base page mapping.
2122 * Remember the PDE and the PDE index, so that we
2123 * can free the page at the end of this function.
2124 */
2125 delpage_pde_index = (int)pdeidx(pmap, vaddr);
2126 delpage_pm_obj = pmap->pm_obj;
2127 *pte = 0;
2128 }
2129
2130 old_pa = pte_to_pa(*pte);
2131 pai = pa_index(old_pa);
2132 old_pa_locked = FALSE;
2133
2134 /*
2135 * If we have a previous managed page, lock the pv entry now. After
2136 * we lock it, check to see if someone beat us to the lock and, if so,
2137 * drop the lock.
2138 */
2139 if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
2140 LOCK_PVH(pai);
2141 old_pa_locked = TRUE;
2142 old_pa = pte_to_pa(*pte);
2143 if (0 == old_pa) {
2144 UNLOCK_PVH(pai); /* another path beat us to it */
2145 old_pa_locked = FALSE;
2146 }
2147 }
2148
2149 /*
2150 * Special case if the incoming physical page is already mapped
2151 * at this address.
2152 */
2153 if (old_pa == pa) {
2154
2155 /*
2156 * May be changing its wired attribute or protection
2157 */
2158
2159 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2160
2161 if (VM_MEM_NOT_CACHEABLE ==
2162 (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
2163 if (!(flags & VM_MEM_GUARDED))
2164 template |= INTEL_PTE_PTA;
2165 template |= INTEL_PTE_NCACHE;
2166 }
2167 if (pmap != kernel_pmap)
2168 template |= INTEL_PTE_USER;
2169 if (prot & VM_PROT_WRITE)
2170 template |= INTEL_PTE_WRITE;
2171
2172 if (set_NX)
2173 template |= INTEL_PTE_NX;
2174
2175 if (wired) {
2176 template |= INTEL_PTE_WIRED;
2177 if (!iswired(*pte))
2178 OSAddAtomic(+1,
2179 &pmap->stats.wired_count);
2180 } else {
2181 if (iswired(*pte)) {
2182 assert(pmap->stats.wired_count >= 1);
2183 OSAddAtomic(-1,
2184 &pmap->stats.wired_count);
2185 }
2186 }
2187 if (superpage) /* this path cannot be used */
2188 template |= INTEL_PTE_PS; /* to change the page size! */
2189
2190 /* store modified PTE and preserve RC bits */
2191 pmap_update_pte(pte, *pte,
2192 template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2193 if (old_pa_locked) {
2194 UNLOCK_PVH(pai);
2195 old_pa_locked = FALSE;
2196 }
2197 need_tlbflush = TRUE;
2198 goto Done;
2199 }
2200
2201 /*
2202 * Outline of code from here:
2203 * 1) If va was mapped, update TLBs, remove the mapping
2204 * and remove old pvlist entry.
2205 * 2) Add pvlist entry for new mapping
2206 * 3) Enter new mapping.
2207 *
2208 * If the old physical page is not managed, step 1) is skipped
2209 * (except for updating the TLBs), and the mapping is
2210 * overwritten at step 3). If the new physical page is not
2211 * managed, step 2) is skipped.
2212 */
2213
2214 if (old_pa != (pmap_paddr_t) 0) {
2215
2216 /*
2217 * Don't do anything to pages outside valid memory here.
2218 * Instead convince the code that enters a new mapping
2219 * to overwrite the old one.
2220 */
2221
2222 /* invalidate the PTE */
2223 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2224 /* propagate invalidate everywhere */
2225 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2226 /* remember reference and change */
2227 oattr = (char) (*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2228 /* completely invalidate the PTE */
2229 pmap_store_pte(pte, 0);
2230
2231 if (IS_MANAGED_PAGE(pai)) {
2232 #if TESTING
2233 if (pmap->stats.resident_count < 1)
2234 panic("pmap_enter: resident_count");
2235 #endif
2236 assert(pmap->stats.resident_count >= 1);
2237 OSAddAtomic(-1,
2238 &pmap->stats.resident_count);
2239
2240 if (iswired(*pte)) {
2241 #if TESTING
2242 if (pmap->stats.wired_count < 1)
2243 panic("pmap_enter: wired_count");
2244 #endif
2245 assert(pmap->stats.wired_count >= 1);
2246 OSAddAtomic(-1,
2247 &pmap->stats.wired_count);
2248 }
2249 pmap_phys_attributes[pai] |= oattr;
2250
2251 /*
2252 * Remove the mapping from the pvlist for
2253 * this physical page.
2254 * We'll end up with either a rooted pv or a
2255 * hashed pv
2256 */
2257 pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai);
2258
2259 } else {
2260
2261 /*
2262 * old_pa is not managed.
2263 * Do removal part of accounting.
2264 */
2265
2266 if (iswired(*pte)) {
2267 assert(pmap->stats.wired_count >= 1);
2268 OSAddAtomic(-1,
2269 &pmap->stats.wired_count);
2270 }
2271 }
2272 }
2273
2274 /*
2275 * If we had a previously managed page locked, unlock it now.
2276 */
2277 if (old_pa_locked) {
2278 UNLOCK_PVH(pai);
2279 old_pa_locked = FALSE;
2280 }
2281
2282 pai = pa_index(pa); /* now working with new incoming phys page */
2283 if (IS_MANAGED_PAGE(pai)) {
2284
2285 /*
2286 * Step 2) Enter the mapping in the PV list for this
2287 * physical page.
2288 */
2289 pv_h = pai_to_pvh(pai);
2290
2291 LOCK_PVH(pai);
2292
2293 if (pv_h->pmap == PMAP_NULL) {
2294 /*
2295 * No mappings yet, use rooted pv
2296 */
2297 pv_h->va = vaddr;
2298 pv_h->pmap = pmap;
2299 queue_init(&pv_h->qlink);
2300 } else {
2301 /*
2302 * Add new pv_hashed_entry after header.
2303 */
2304 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2305 pvh_e = pvh_new;
2306 pvh_new = PV_HASHED_ENTRY_NULL;
2307 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2308 PV_HASHED_ALLOC(pvh_e);
2309 if (PV_HASHED_ENTRY_NULL == pvh_e) {
2310 /*
2311 * The PV hashed free list is empty. If we
2312 * are on the kernel pmap we'll use one of
2313 * the special private kernel pv_e's;
2314 * otherwise, we need to unlock
2315 * everything, zalloc a pv_e, and
2316 * restart, bringing the new pv_e in
2317 * with us.
2318 */
2319 if (kernel_pmap == pmap) {
2320 PV_HASHED_KERN_ALLOC(pvh_e);
2321 } else {
2322 UNLOCK_PVH(pai);
2323 PMAP_UNLOCK(pmap);
2324 pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2325 goto Retry;
2326 }
2327 }
2328 }
2329 if (PV_HASHED_ENTRY_NULL == pvh_e)
2330 panic("pvh_e exhaustion");
2331
2332 pvh_e->va = vaddr;
2333 pvh_e->pmap = pmap;
2334 pvh_e->ppn = pn;
2335 pv_hash_add(pvh_e, pv_h);
2336
2337 /*
2338 * Remember that we used the pvlist entry.
2339 */
2340 pvh_e = PV_HASHED_ENTRY_NULL;
2341 }
2342
2343 /*
2344 * only count the mapping
2345 * for 'managed memory'
2346 */
2347 OSAddAtomic(+1, & pmap->stats.resident_count);
2348 if (pmap->stats.resident_count > pmap->stats.resident_max) {
2349 pmap->stats.resident_max = pmap->stats.resident_count;
2350 }
2351 }
2352 /*
2353 * Step 3) Enter the mapping.
2354 *
2355 * Build a template to speed up entering -
2356 * only the pfn changes.
2357 */
2358 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2359
2360 if (flags & VM_MEM_NOT_CACHEABLE) {
2361 if (!(flags & VM_MEM_GUARDED))
2362 template |= INTEL_PTE_PTA;
2363 template |= INTEL_PTE_NCACHE;
2364 }
2365 if (pmap != kernel_pmap)
2366 template |= INTEL_PTE_USER;
2367 if (prot & VM_PROT_WRITE)
2368 template |= INTEL_PTE_WRITE;
2369 if (set_NX)
2370 template |= INTEL_PTE_NX;
2371 if (wired) {
2372 template |= INTEL_PTE_WIRED;
2373 OSAddAtomic(+1, & pmap->stats.wired_count);
2374 }
2375 if (superpage)
2376 template |= INTEL_PTE_PS;
2377 pmap_store_pte(pte, template);
2378
2379 /*
2380 * If this was a managed page, we delayed unlocking the pv until here
2381 * to prevent pmap_page_protect et al. from finding it until the pte
2382 * has been stored.
2383 */
2384 if (IS_MANAGED_PAGE(pai)) {
2385 UNLOCK_PVH(pai);
2386 }
2387 Done:
2388 if (need_tlbflush == TRUE)
2389 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2390
2391 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2392 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
2393 }
2394 if (pvh_new != PV_HASHED_ENTRY_NULL) {
2395 PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2396 }
2397 PMAP_UNLOCK(pmap);
2398
2399 if (delpage_pm_obj) {
2400 vm_page_t m;
2401
2402 vm_object_lock(delpage_pm_obj);
2403 m = vm_page_lookup(delpage_pm_obj, delpage_pde_index);
2404 if (m == VM_PAGE_NULL)
2405 panic("pmap_enter: pte page not in object");
2406 VM_PAGE_FREE(m);
2407 OSAddAtomic(-1, &inuse_ptepages_count);
2408 vm_object_unlock(delpage_pm_obj);
2409 }
2410
2411 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
2412 }
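/*
 * Illustrative sketch: a hypothetical wired, non-cacheable kernel
 * mapping entered with pmap_enter().  The virtual address and page
 * number are placeholders; the flags follow the VM_MEM_* attribute
 * bits interpreted above.
 */
#if 0	/* illustrative only; not compiled */
static void
example_enter_wired(vm_map_offset_t vaddr, ppnum_t pn)
{
	pmap_enter(kernel_pmap,
		   vaddr,
		   pn,
		   VM_PROT_READ | VM_PROT_WRITE,
		   VM_MEM_NOT_CACHEABLE,
		   TRUE);		/* wired: the pte cannot be reclaimed */
}
#endif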
2413
2414 /*
2415 * Routine: pmap_change_wiring
2416 * Function: Change the wiring attribute for a map/virtual-address
2417 * pair.
2418 * In/out conditions:
2419 * The mapping must already exist in the pmap.
2420 */
2421 void
2422 pmap_change_wiring(
2423 pmap_t map,
2424 vm_map_offset_t vaddr,
2425 boolean_t wired)
2426 {
2427 pt_entry_t *pte;
2428
2429 PMAP_LOCK(map);
2430
2431 if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
2432 panic("pmap_change_wiring: pte missing");
2433
2434 if (wired && !iswired(*pte)) {
2435 /*
2436 * wiring down mapping
2437 */
2438 OSAddAtomic(+1, &map->stats.wired_count);
2439 pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
2440 }
2441 else if (!wired && iswired(*pte)) {
2442 /*
2443 * unwiring mapping
2444 */
2445 assert(map->stats.wired_count >= 1);
2446 OSAddAtomic(-1, &map->stats.wired_count);
2447 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
2448 }
2449
2450 PMAP_UNLOCK(map);
2451 }
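/*
 * Illustrative sketch: wiring an existing mapping after the fact.
 * As noted above, the mapping must already be present, so a
 * pmap_enter() call precedes the pmap_change_wiring() call.  The
 * arguments are placeholders.
 */
#if 0	/* illustrative only; not compiled */
static void
example_wire_existing(
	pmap_t		pmap,
	vm_map_offset_t	vaddr,
	ppnum_t		pn)
{
	pmap_enter(pmap, vaddr, pn, VM_PROT_READ, 0, FALSE);	/* unwired */
	pmap_change_wiring(pmap, vaddr, TRUE);			/* now wired */
}
#endif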
2452
2453 void
2454 pmap_expand_pml4(
2455 pmap_t map,
2456 vm_map_offset_t vaddr)
2457 {
2458 vm_page_t m;
2459 pmap_paddr_t pa;
2460 uint64_t i;
2461 ppnum_t pn;
2462 pml4_entry_t *pml4p;
2463
2464 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
2465
2466 /*
2467 * Allocate a VM page for the pml4 page
2468 */
2469 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2470 VM_PAGE_WAIT();
2471
2472 /*
2473 * put the page into the pmap's obj list so it
2474 * can be found later.
2475 */
2476 pn = m->phys_page;
2477 pa = i386_ptob(pn);
2478 i = pml4idx(map, vaddr);
2479
2480 /*
2481 * Zero the page.
2482 */
2483 pmap_zero_page(pn);
2484
2485 vm_page_lockspin_queues();
2486 vm_page_wire(m);
2487 vm_page_unlock_queues();
2488
2489 OSAddAtomic(1, &inuse_ptepages_count);
2490
2491 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2492 vm_object_lock(map->pm_obj_pml4);
2493
2494 PMAP_LOCK(map);
2495 /*
2496 * See if someone else expanded us first
2497 */
2498 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
2499 PMAP_UNLOCK(map);
2500 vm_object_unlock(map->pm_obj_pml4);
2501
2502 VM_PAGE_FREE(m);
2503
2504 OSAddAtomic(-1, &inuse_ptepages_count);
2505 return;
2506 }
2507
2508 #if 0 /* DEBUG */
2509 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
2510 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2511 map, map->pm_obj_pml4, vaddr, i);
2512 }
2513 #endif
2514 vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
2515 vm_object_unlock(map->pm_obj_pml4);
2516
2517 /*
2518 * Set the page directory entry for this page table.
2519 */
2520 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
2521
2522 pmap_store_pte(pml4p, pa_to_pte(pa)
2523 | INTEL_PTE_VALID
2524 | INTEL_PTE_USER
2525 | INTEL_PTE_WRITE);
2526
2527 PMAP_UNLOCK(map);
2528
2529 return;
2530 }
2531
2532 void
2533 pmap_expand_pdpt(
2534 pmap_t map,
2535 vm_map_offset_t vaddr)
2536 {
2537 vm_page_t m;
2538 pmap_paddr_t pa;
2539 uint64_t i;
2540 ppnum_t pn;
2541 pdpt_entry_t *pdptp;
2542
2543 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
2544
2545 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
2546 pmap_expand_pml4(map, vaddr);
2547 }
2548
2549 /*
2550 * Allocate a VM page for the pdpt page
2551 */
2552 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2553 VM_PAGE_WAIT();
2554
2555 /*
2556 * put the page into the pmap's obj list so it
2557 * can be found later.
2558 */
2559 pn = m->phys_page;
2560 pa = i386_ptob(pn);
2561 i = pdptidx(map, vaddr);
2562
2563 /*
2564 * Zero the page.
2565 */
2566 pmap_zero_page(pn);
2567
2568 vm_page_lockspin_queues();
2569 vm_page_wire(m);
2570 vm_page_unlock_queues();
2571
2572 OSAddAtomic(1, &inuse_ptepages_count);
2573
2574 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2575 vm_object_lock(map->pm_obj_pdpt);
2576
2577 PMAP_LOCK(map);
2578 /*
2579 * See if someone else expanded us first
2580 */
2581 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
2582 PMAP_UNLOCK(map);
2583 vm_object_unlock(map->pm_obj_pdpt);
2584
2585 VM_PAGE_FREE(m);
2586
2587 OSAddAtomic(-1, &inuse_ptepages_count);
2588 return;
2589 }
2590
2591 #if 0 /* DEBUG */
2592 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
2593 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2594 map, map->pm_obj_pdpt, vaddr, i);
2595 }
2596 #endif
2597 vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
2598 vm_object_unlock(map->pm_obj_pdpt);
2599
2600 /*
2601 * Set the page directory entry for this page table.
2602 */
2603 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
2604
2605 pmap_store_pte(pdptp, pa_to_pte(pa)
2606 | INTEL_PTE_VALID
2607 | INTEL_PTE_USER
2608 | INTEL_PTE_WRITE);
2609
2610 PMAP_UNLOCK(map);
2611
2612 return;
2613
2614 }
2615
2616
2617
2618 /*
2619 * Routine: pmap_expand
2620 *
2621 * Expands a pmap to be able to map the specified virtual address.
2622 *
2623 * Allocates new virtual memory for the P0 or P1 portion of the
2624 * pmap, then re-maps the physical pages that were in the old
2625 * pmap to be in the new pmap.
2626 *
2627 * Must be called with the pmap system and the pmap unlocked,
2628 * since these must be unlocked to use vm_allocate or vm_deallocate.
2629 * Thus it must be called in a loop that checks whether the map
2630 * has been expanded enough.
2631 * (We won't loop forever, since page tables aren't shrunk.)
2632 */
2633 void
2634 pmap_expand(
2635 pmap_t map,
2636 vm_map_offset_t vaddr)
2637 {
2638 pt_entry_t *pdp;
2639 register vm_page_t m;
2640 register pmap_paddr_t pa;
2641 uint64_t i;
2642 ppnum_t pn;
2643
2644
2645 /*
2646 * For the kernel, the virtual address must be in or above the basement,
2647 * which is for kexts and is in the 512GB immediately below the kernel.
2648 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
2649 */
2650 if (map == kernel_pmap &&
2651 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))
2652 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
2653
2654
2655 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
2656 /* need room for another pde entry */
2657 pmap_expand_pdpt(map, vaddr);
2658 }
2659
2660 /*
2661 * Allocate a VM page for the pde entries.
2662 */
2663 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2664 VM_PAGE_WAIT();
2665
2666 /*
2667 * put the page into the pmap's obj list so it
2668 * can be found later.
2669 */
2670 pn = m->phys_page;
2671 pa = i386_ptob(pn);
2672 i = pdeidx(map, vaddr);
2673
2674 /*
2675 * Zero the page.
2676 */
2677 pmap_zero_page(pn);
2678
2679 vm_page_lockspin_queues();
2680 vm_page_wire(m);
2681 vm_page_unlock_queues();
2682
2683 OSAddAtomic(1, &inuse_ptepages_count);
2684
2685 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2686 vm_object_lock(map->pm_obj);
2687
2688 PMAP_LOCK(map);
2689
2690 /*
2691 * See if someone else expanded us first
2692 */
2693 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2694 PMAP_UNLOCK(map);
2695 vm_object_unlock(map->pm_obj);
2696
2697 VM_PAGE_FREE(m);
2698
2699 OSAddAtomic(-1, &inuse_ptepages_count);
2700 return;
2701 }
2702
2703 #if 0 /* DEBUG */
2704 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
2705 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2706 map, map->pm_obj, vaddr, i);
2707 }
2708 #endif
2709 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
2710 vm_object_unlock(map->pm_obj);
2711
2712 /*
2713 * Set the page directory entry for this page table.
2714 */
2715 pdp = pmap_pde(map, vaddr);
2716 pmap_store_pte(pdp, pa_to_pte(pa)
2717 | INTEL_PTE_VALID
2718 | INTEL_PTE_USER
2719 | INTEL_PTE_WRITE);
2720
2721 PMAP_UNLOCK(map);
2722
2723 return;
2724 }
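/*
 * Illustrative sketch: the architectural 4-level index split that the
 * expansion routines above walk, shown as plain bit arithmetic on an
 * x86_64 virtual address.  The pml4idx()/pdptidx()/pdeidx() macros used
 * above compute VM-object page indices spanning the whole address
 * space, so this shows only the per-level 9-bit hardware index.
 */
#if 0	/* illustrative only; not compiled */
static void
example_split_vaddr(vm_map_offset_t vaddr)
{
	unsigned int pml4_index = (vaddr >> 39) & 0x1FF;	/* bits 47..39 */
	unsigned int pdpt_index = (vaddr >> 30) & 0x1FF;	/* bits 38..30 */
	unsigned int pde_index  = (vaddr >> 21) & 0x1FF;	/* bits 29..21 */
	unsigned int pte_index  = (vaddr >> 12) & 0x1FF;	/* bits 20..12 */

	(void)pml4_index; (void)pdpt_index; (void)pde_index; (void)pte_index;
}
#endif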
2725
2726 /* On K64 machines with more than 32GB of memory, pmap_steal_memory
2727 * will allocate past the 1GB of pre-expanded virtual kernel area. This
2728 * function allocates all the page tables using memory from the same pool
2729 * that pmap_steal_memory uses, rather than calling vm_page_grab (which
2730 * isn't available yet). */
2731 void
2732 pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr) {
2733 ppnum_t pn;
2734 pt_entry_t *pte;
2735
2736 PMAP_LOCK(pmap);
2737
2738 if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2739 if (!pmap_next_page_k64(&pn))
2740 panic("pmap_pre_expand");
2741
2742 pmap_zero_page(pn);
2743
2744 pte = pmap64_pml4(pmap, vaddr);
2745
2746 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2747 | INTEL_PTE_VALID
2748 | INTEL_PTE_USER
2749 | INTEL_PTE_WRITE);
2750 }
2751
2752 if (pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2753 if (!pmap_next_page_k64(&pn))
2754 panic("pmap_pre_expand");
2755
2756 pmap_zero_page(pn);
2757
2758 pte = pmap64_pdpt(pmap, vaddr);
2759
2760 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2761 | INTEL_PTE_VALID
2762 | INTEL_PTE_USER
2763 | INTEL_PTE_WRITE);
2764 }
2765
2766 if (pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
2767 if (!pmap_next_page_k64(&pn))
2768 panic("pmap_pre_expand");
2769
2770 pmap_zero_page(pn);
2771
2772 pte = pmap64_pde(pmap, vaddr);
2773
2774 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2775 | INTEL_PTE_VALID
2776 | INTEL_PTE_USER
2777 | INTEL_PTE_WRITE);
2778 }
2779
2780 PMAP_UNLOCK(pmap);
2781 }
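/*
 * Illustrative sketch: pre-expanding the kernel pmap for a virtual
 * range during early bootstrap, stepping by pde_mapped_size so that
 * one call covers each page-table page.  The range bounds are
 * placeholders.
 */
#if 0	/* illustrative only; not compiled */
static void
example_pre_expand_range(vm_map_offset_t start, vm_map_offset_t end)
{
	vm_map_offset_t va;

	for (va = start; va < end; va += pde_mapped_size)
		pmap_pre_expand(kernel_pmap, va);
}
#endif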
2782
2783 /*
2784 * pmap_sync_page_data_phys(ppnum_t pa)
2785 *
2786 * Invalidates all of the instruction cache on a physical page and
2787 * pushes any dirty data from the data cache for the same physical page
2788 * Not required in i386.
2789 */
2790 void
2791 pmap_sync_page_data_phys(__unused ppnum_t pa)
2792 {
2793 return;
2794 }
2795
2796 /*
2797 * pmap_sync_page_attributes_phys(ppnum_t pa)
2798 *
2799 * Write back and invalidate all cachelines on a physical page.
2800 */
2801 void
2802 pmap_sync_page_attributes_phys(ppnum_t pa)
2803 {
2804 cache_flush_page_phys(pa);
2805 }
2806
2807
2808
2809 #ifdef CURRENTLY_UNUSED_AND_UNTESTED
2810
2811 int collect_ref;
2812 int collect_unref;
2813
2814 /*
2815 * Routine: pmap_collect
2816 * Function:
2817 * Garbage collects the physical map system for
2818 * pages which are no longer used.
2819 * Success need not be guaranteed -- that is, there
2820 * may well be pages which are not referenced, but
2821 * others may be collected.
2822 * Usage:
2823 * Called by the pageout daemon when pages are scarce.
2824 */
2825 void
2826 pmap_collect(
2827 pmap_t p)
2828 {
2829 register pt_entry_t *pdp, *ptp;
2830 pt_entry_t *eptp;
2831 int wired;
2832
2833 if (p == PMAP_NULL)
2834 return;
2835
2836 if (p == kernel_pmap)
2837 return;
2838
2839 /*
2840 * Garbage collect map.
2841 */
2842 PMAP_LOCK(p);
2843
2844 for (pdp = (pt_entry_t *)p->dirbase;
2845 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
2846 pdp++)
2847 {
2848 if (*pdp & INTEL_PTE_VALID) {
2849 if (*pdp & INTEL_PTE_REF) {
2850 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
2851 collect_ref++;
2852 } else {
2853 collect_unref++;
2854 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
2855 eptp = ptp + NPTEPG;
2856
2857 /*
2858 * If the pte page has any wired mappings, we cannot
2859 * free it.
2860 */
2861 wired = 0;
2862 {
2863 register pt_entry_t *ptep;
2864 for (ptep = ptp; ptep < eptp; ptep++) {
2865 if (iswired(*ptep)) {
2866 wired = 1;
2867 break;
2868 }
2869 }
2870 }
2871 if (!wired) {
2872 /*
2873 * Remove the virtual addresses mapped by this pte page.
2874 */
2875 pmap_remove_range(p,
2876 pdetova(pdp - (pt_entry_t *)p->dirbase),
2877 ptp,
2878 eptp);
2879
2880 /*
2881 * Invalidate the page directory pointer.
2882 */
2883 pmap_store_pte(pdp, 0x0);
2884
2885 PMAP_UNLOCK(p);
2886
2887 /*
2888 * And free the pte page itself.
2889 */
2890 {
2891 register vm_page_t m;
2892
2893 vm_object_lock(p->pm_obj);
2894
2895 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
2896 if (m == VM_PAGE_NULL)
2897 panic("pmap_collect: pte page not in object");
2898
2899 VM_PAGE_FREE(m);
2900
2901 OSAddAtomic(-1, &inuse_ptepages_count);
2902
2903 vm_object_unlock(p->pm_obj);
2904 }
2905
2906 PMAP_LOCK(p);
2907 }
2908 }
2909 }
2910 }
2911
2912 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
2913 PMAP_UNLOCK(p);
2914 return;
2915
2916 }
2917 #endif
2918
2919
2920 void
2921 pmap_copy_page(ppnum_t src, ppnum_t dst)
2922 {
2923 bcopy_phys((addr64_t)i386_ptob(src),
2924 (addr64_t)i386_ptob(dst),
2925 PAGE_SIZE);
2926 }
2927
2928
2929 /*
2930 * Routine: pmap_pageable
2931 * Function:
2932 * Make the specified pages (by pmap, offset)
2933 * pageable (or not) as requested.
2934 *
2935 * A page which is not pageable may not take
2936 * a fault; therefore, its page table entry
2937 * must remain valid for the duration.
2938 *
2939 * This routine is merely advisory; pmap_enter
2940 * will specify that these pages are to be wired
2941 * down (or not) as appropriate.
2942 */
2943 void
2944 pmap_pageable(
2945 __unused pmap_t pmap,
2946 __unused vm_map_offset_t start_addr,
2947 __unused vm_map_offset_t end_addr,
2948 __unused boolean_t pageable)
2949 {
2950 #ifdef lint
2951 pmap++; start_addr++; end_addr++; pageable++;
2952 #endif /* lint */
2953 }
2954
2955 /*
2956 * Clear specified attribute bits.
2957 */
2958 void
2959 phys_attribute_clear(
2960 ppnum_t pn,
2961 int bits)
2962 {
2963 pv_rooted_entry_t pv_h;
2964 pv_hashed_entry_t pv_e;
2965 pt_entry_t *pte;
2966 int pai;
2967 pmap_t pmap;
2968
2969 pmap_intr_assert();
2970 assert(pn != vm_page_fictitious_addr);
2971 if (pn == vm_page_guard_addr)
2972 return;
2973
2974 pai = ppn_to_pai(pn);
2975
2976 if (!IS_MANAGED_PAGE(pai)) {
2977 /*
2978 * Not a managed page.
2979 */
2980 return;
2981 }
2982
2983
2984 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
2985 pn, bits, 0, 0, 0);
2986
2987 pv_h = pai_to_pvh(pai);
2988
2989 LOCK_PVH(pai);
2990
2991 /*
2992 * Walk down PV list, clearing all modify or reference bits.
2993 * We do not have to lock the pv_list because we have
2994 * the entire pmap system locked.
2995 */
2996 if (pv_h->pmap != PMAP_NULL) {
2997 /*
2998 * There are some mappings.
2999 */
3000
3001 pv_e = (pv_hashed_entry_t)pv_h;
3002
3003 do {
3004 vm_map_offset_t va;
3005
3006 pmap = pv_e->pmap;
3007 va = pv_e->va;
3008
3009 /*
3010 * Clear modify and/or reference bits.
3011 */
3012 pte = pmap_pte(pmap, va);
3013 pmap_update_pte(pte, *pte, (*pte & ~bits));
3014 /* Ensure all processors using this translation
3015 * invalidate this TLB entry. The invalidation *must*
3016 * follow the PTE update, to ensure that the TLB
3017 * shadow of the 'D' bit (in particular) is
3018 * synchronized with the updated PTE.
3019 */
3020 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3021
3022 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3023
3024 } while (pv_e != (pv_hashed_entry_t)pv_h);
3025 }
3026 pmap_phys_attributes[pai] &= ~bits;
3027
3028 UNLOCK_PVH(pai);
3029
3030 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3031 0, 0, 0, 0, 0);
3032 }
3033
3034 /*
3035 * Check specified attribute bits.
3036 */
3037 int
3038 phys_attribute_test(
3039 ppnum_t pn,
3040 int bits)
3041 {
3042 pv_rooted_entry_t pv_h;
3043 pv_hashed_entry_t pv_e;
3044 pt_entry_t *pte;
3045 int pai;
3046 pmap_t pmap;
3047 int attributes = 0;
3048
3049 pmap_intr_assert();
3050 assert(pn != vm_page_fictitious_addr);
3051 if (pn == vm_page_guard_addr)
3052 return 0;
3053
3054 pai = ppn_to_pai(pn);
3055
3056 if (!IS_MANAGED_PAGE(pai)) {
3057 /*
3058 * Not a managed page.
3059 */
3060 return 0;
3061 }
3062
3063 /*
3064 * Quick check: if the bits have already been
3065 * collected there is no need to take any locks.
3066 * If they are not set, we must recheck after taking
3067 * the lock, in case they were pulled in while
3068 * we were waiting for the lock.
3069 */
3070 if ((pmap_phys_attributes[pai] & bits) == bits)
3071 return bits;
3072
3073 pv_h = pai_to_pvh(pai);
3074
3075 LOCK_PVH(pai);
3076
3077 attributes = pmap_phys_attributes[pai] & bits;
3078
3079
3080 /*
3081 * Walk down PV list, checking the mappings until we
3082 * reach the end or we've found the attributes we've asked for.
3083 * We do not have to lock the pv_list because we have
3084 * the entire pmap system locked.
3085 */
3086 if (attributes != bits &&
3087 pv_h->pmap != PMAP_NULL) {
3088 /*
3089 * There are some mappings.
3090 */
3091 pv_e = (pv_hashed_entry_t)pv_h;
3092 do {
3093 vm_map_offset_t va;
3094
3095 pmap = pv_e->pmap;
3096 va = pv_e->va;
3097 /*
3098 * first make sure any processor actively
3099 * using this pmap, flushes its TLB state
3100 */
3101 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3102
3103 /*
3104 * pick up modify and/or reference bits from mapping
3105 */
3106
3107 pte = pmap_pte(pmap, va);
3108 attributes |= (int)(*pte & bits);
3109
3110 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3111
3112 } while ((attributes != bits) &&
3113 (pv_e != (pv_hashed_entry_t)pv_h));
3114 }
3115
3116 UNLOCK_PVH(pai);
3117 return (attributes);
3118 }
3119
3120 /*
3121 * Set specified attribute bits.
3122 */
3123 void
3124 phys_attribute_set(
3125 ppnum_t pn,
3126 int bits)
3127 {
3128 int pai;
3129
3130 pmap_intr_assert();
3131 assert(pn != vm_page_fictitious_addr);
3132 if (pn == vm_page_guard_addr)
3133 return;
3134
3135 pai = ppn_to_pai(pn);
3136
3137 if (!IS_MANAGED_PAGE(pai)) {
3138 /* Not a managed page. */
3139 return;
3140 }
3141
3142 LOCK_PVH(pai);
3143 pmap_phys_attributes[pai] |= bits;
3144 UNLOCK_PVH(pai);
3145 }
3146
3147 /*
3148 * Set the modify bit on the specified physical page.
3149 */
3150
3151 void
3152 pmap_set_modify(ppnum_t pn)
3153 {
3154 phys_attribute_set(pn, PHYS_MODIFIED);
3155 }
3156
3157 /*
3158 * Clear the modify bits on the specified physical page.
3159 */
3160
3161 void
3162 pmap_clear_modify(ppnum_t pn)
3163 {
3164 phys_attribute_clear(pn, PHYS_MODIFIED);
3165 }
3166
3167 /*
3168 * pmap_is_modified:
3169 *
3170 * Return whether or not the specified physical page is modified
3171 * by any physical maps.
3172 */
3173
3174 boolean_t
3175 pmap_is_modified(ppnum_t pn)
3176 {
3177 if (phys_attribute_test(pn, PHYS_MODIFIED))
3178 return TRUE;
3179 return FALSE;
3180 }
3181
3182 /*
3183 * pmap_clear_reference:
3184 *
3185 * Clear the reference bit on the specified physical page.
3186 */
3187
3188 void
3189 pmap_clear_reference(ppnum_t pn)
3190 {
3191 phys_attribute_clear(pn, PHYS_REFERENCED);
3192 }
3193
3194 void
3195 pmap_set_reference(ppnum_t pn)
3196 {
3197 phys_attribute_set(pn, PHYS_REFERENCED);
3198 }
3199
3200 /*
3201 * pmap_is_referenced:
3202 *
3203 * Return whether or not the specified physical page is referenced
3204 * by any physical maps.
3205 */
3206
3207 boolean_t
3208 pmap_is_referenced(ppnum_t pn)
3209 {
3210 if (phys_attribute_test(pn, PHYS_REFERENCED))
3211 return TRUE;
3212 return FALSE;
3213 }
3214
3215 /*
3216 * pmap_get_refmod(phys)
3217 * returns the referenced and modified bits of the specified
3218 * physical page.
3219 */
3220 unsigned int
3221 pmap_get_refmod(ppnum_t pn)
3222 {
3223 int refmod;
3224 unsigned int retval = 0;
3225
3226 refmod = phys_attribute_test(pn, PHYS_MODIFIED | PHYS_REFERENCED);
3227
3228 if (refmod & PHYS_MODIFIED)
3229 retval |= VM_MEM_MODIFIED;
3230 if (refmod & PHYS_REFERENCED)
3231 retval |= VM_MEM_REFERENCED;
3232
3233 return (retval);
3234 }
3235
3236 /*
3237 * pmap_clear_refmod(phys, mask)
3238 * clears the referenced and modified bits as specified by the mask
3239 * of the specified physical page.
3240 */
3241 void
3242 pmap_clear_refmod(ppnum_t pn, unsigned int mask)
3243 {
3244 unsigned int x86Mask;
3245
3246 x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
3247 | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3248 phys_attribute_clear(pn, x86Mask);
3249 }
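/*
 * Illustrative sketch: reading the generic ref/mod state of a page and
 * then clearing only the reference bit, leaving the modify bit intact,
 * using the two routines above.  The function name is a placeholder.
 */
#if 0	/* illustrative only; not compiled */
static boolean_t
example_test_and_clear_ref(ppnum_t pn)
{
	boolean_t was_referenced;

	was_referenced = (pmap_get_refmod(pn) & VM_MEM_REFERENCED)
				? TRUE : FALSE;
	pmap_clear_refmod(pn, VM_MEM_REFERENCED);

	return (was_referenced);
}
#endif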
3250
3251 void
3252 invalidate_icache(__unused vm_offset_t addr,
3253 __unused unsigned cnt,
3254 __unused int phys)
3255 {
3256 return;
3257 }
3258
3259 void
3260 flush_dcache(__unused vm_offset_t addr,
3261 __unused unsigned count,
3262 __unused int phys)
3263 {
3264 return;
3265 }
3266
3267 #if CONFIG_DTRACE
3268 /*
3269 * Constrain DTrace copyin/copyout actions
3270 */
3271 extern kern_return_t dtrace_copyio_preflight(addr64_t);
3272 extern kern_return_t dtrace_copyio_postflight(addr64_t);
3273
3274 kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3275 {
3276 thread_t thread = current_thread();
3277
3278 if (current_map() == kernel_map)
3279 return KERN_FAILURE;
3280 else if (get_cr3() != thread->map->pmap->pm_cr3)
3281 return KERN_FAILURE;
3282 else if (thread->machine.specFlags & CopyIOActive)
3283 return KERN_FAILURE;
3284 else
3285 return KERN_SUCCESS;
3286 }
3287
3288 kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3289 {
3290 return KERN_SUCCESS;
3291 }
3292 #endif /* CONFIG_DTRACE */
3293
3294 #include <mach_vm_debug.h>
3295 #if MACH_VM_DEBUG
3296 #include <vm/vm_debug.h>
3297
3298 int
3299 pmap_list_resident_pages(
3300 __unused pmap_t pmap,
3301 __unused vm_offset_t *listp,
3302 __unused int space)
3303 {
3304 return 0;
3305 }
3306 #endif /* MACH_VM_DEBUG */
3307
3308
3309
3310 /* temporary workaround */
3311 boolean_t
3312 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
3313 {
3314 #if 0
3315 pt_entry_t *ptep;
3316
3317 ptep = pmap_pte(map->pmap, va);
3318 if (0 == ptep)
3319 return FALSE;
3320 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
3321 #else
3322 return TRUE;
3323 #endif
3324 }
3325
3326
3327 boolean_t
3328 phys_page_exists(ppnum_t pn)
3329 {
3330 assert(pn != vm_page_fictitious_addr);
3331
3332 if (!pmap_initialized)
3333 return TRUE;
3334
3335 if (pn == vm_page_guard_addr)
3336 return FALSE;
3337
3338 if (!IS_MANAGED_PAGE(ppn_to_pai(pn)))
3339 return FALSE;
3340
3341 return TRUE;
3342 }
3343
3344 void
3345 mapping_free_prime(void)
3346 {
3347 int i;
3348 pv_hashed_entry_t pvh_e;
3349 pv_hashed_entry_t pvh_eh;
3350 pv_hashed_entry_t pvh_et;
3351 int pv_cnt;
3352
3353 pv_cnt = 0;
3354 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3355 for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
3356 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3357
3358 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3359 pvh_eh = pvh_e;
3360
3361 if (pvh_et == PV_HASHED_ENTRY_NULL)
3362 pvh_et = pvh_e;
3363 pv_cnt++;
3364 }
3365 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3366
3367 pv_cnt = 0;
3368 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3369 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3370 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3371
3372 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3373 pvh_eh = pvh_e;
3374
3375 if (pvh_et == PV_HASHED_ENTRY_NULL)
3376 pvh_et = pvh_e;
3377 pv_cnt++;
3378 }
3379 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3380
3381 }
3382
3383 void
3384 mapping_adjust(void)
3385 {
3386 pv_hashed_entry_t pvh_e;
3387 pv_hashed_entry_t pvh_eh;
3388 pv_hashed_entry_t pvh_et;
3389 int pv_cnt;
3390 int i;
3391
3392 if (mapping_adjust_call == NULL) {
3393 thread_call_setup(&mapping_adjust_call_data,
3394 (thread_call_func_t) mapping_adjust,
3395 (thread_call_param_t) NULL);
3396 mapping_adjust_call = &mapping_adjust_call_data;
3397 }
3398
3399 pv_cnt = 0;
3400 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3401 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
3402 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3403 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3404
3405 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3406 pvh_eh = pvh_e;
3407
3408 if (pvh_et == PV_HASHED_ENTRY_NULL)
3409 pvh_et = pvh_e;
3410 pv_cnt++;
3411 }
3412 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3413 }
3414
3415 pv_cnt = 0;
3416 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3417 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
3418 for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
3419 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3420
3421 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3422 pvh_eh = pvh_e;
3423
3424 if (pvh_et == PV_HASHED_ENTRY_NULL)
3425 pvh_et = pvh_e;
3426 pv_cnt++;
3427 }
3428 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3429 }
3430 mappingrecurse = 0;
3431 }
3432
3433
3434 void
3435 pmap_switch(pmap_t tpmap)
3436 {
3437 spl_t s;
3438
3439 s = splhigh(); /* Make sure interruptions are disabled */
3440 set_dirbase(tpmap, current_thread());
3441 splx(s);
3442 }
3443
3444
3445 /*
3446 * disable no-execute capability on
3447 * the specified pmap
3448 */
3449 void
3450 pmap_disable_NX(pmap_t pmap)
3451 {
3452 pmap->nx_enabled = 0;
3453 }
3454
3455 void
3456 pt_fake_zone_info(
3457 int *count,
3458 vm_size_t *cur_size,
3459 vm_size_t *max_size,
3460 vm_size_t *elem_size,
3461 vm_size_t *alloc_size,
3462 int *collectable,
3463 int *exhaustable)
3464 {
3465 *count = inuse_ptepages_count;
3466 *cur_size = PAGE_SIZE * inuse_ptepages_count;
3467 *max_size = PAGE_SIZE * (inuse_ptepages_count +
3468 vm_page_inactive_count +
3469 vm_page_active_count +
3470 vm_page_free_count);
3471 *elem_size = PAGE_SIZE;
3472 *alloc_size = PAGE_SIZE;
3473
3474 *collectable = 1;
3475 *exhaustable = 0;
3476 }
3477
3478 static inline void
3479 pmap_cpuset_NMIPI(cpu_set cpu_mask) {
3480 unsigned int cpu, cpu_bit;
3481 uint64_t deadline;
3482
3483 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3484 if (cpu_mask & cpu_bit)
3485 cpu_NMI_interrupt(cpu);
3486 }
3487 deadline = mach_absolute_time() + (LockTimeOut);
3488 while (mach_absolute_time() < deadline)
3489 cpu_pause();
3490 }
3491
3492 /*
3493 * Called with pmap locked, we:
3494 * - scan through per-cpu data to see which other cpus need to flush
3495 * - send an IPI to each non-idle cpu to be flushed
3496 * - wait for all to signal back that they are inactive or we see that
3497 * they are at a safe point (idle).
3498 * - flush the local tlb if active for this pmap
3499 * - return ... the caller will unlock the pmap
3500 */
3501 void
3502 pmap_flush_tlbs(pmap_t pmap)
3503 {
3504 unsigned int cpu;
3505 unsigned int cpu_bit;
3506 cpu_set cpus_to_signal;
3507 unsigned int my_cpu = cpu_number();
3508 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
3509 boolean_t flush_self = FALSE;
3510 uint64_t deadline;
3511
3512 assert((processor_avail_count < 2) ||
3513 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
3514
3515 /*
3516 * Scan other cpus for matching active or task CR3.
3517 * For idle cpus (with no active map) we mark them invalid but
3518 * don't signal -- they'll check as they go busy.
3519 */
3520 cpus_to_signal = 0;
3521 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3522 if (!cpu_datap(cpu)->cpu_running)
3523 continue;
3524 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
3525 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
3526
3527 if ((pmap_cr3 == cpu_task_cr3) ||
3528 (pmap_cr3 == cpu_active_cr3) ||
3529 (pmap->pm_shared) ||
3530 (pmap == kernel_pmap)) {
3531 if (cpu == my_cpu) {
3532 flush_self = TRUE;
3533 continue;
3534 }
3535 cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
3536 __asm__ volatile("mfence");
3537
3538 /*
3539 * We don't need to signal processors which will flush
3540 * lazily at the idle state or kernel boundary.
3541 * For example, if we're invalidating the kernel pmap,
3542 * processors currently in userspace don't need to flush
3543 * their TLBs until the next time they enter the kernel.
3544 * Alterations to the address space of a task active
3545 * on a remote processor result in a signal, to
3546 * account for copy operations. (There may be room
3547 * for optimization in such cases).
3548 * The order of the loads below with respect
3549 * to the store to the "cpu_tlb_invalid" field above
3550 * is important--hence the barrier.
3551 */
3552 if (CPU_CR3_IS_ACTIVE(cpu) &&
3553 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
3554 pmap->pm_shared ||
3555 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
3556 cpus_to_signal |= cpu_bit;
3557 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
3558 }
3559 }
3560 }
3561
3562 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
3563 pmap, cpus_to_signal, flush_self, 0, 0);
3564
3565 /*
3566 * Flush local tlb if required.
3567 * Do this now to overlap with other processors responding.
3568 */
3569 if (flush_self)
3570 flush_tlb();
3571
3572 if (cpus_to_signal) {
3573 cpu_set cpus_to_respond = cpus_to_signal;
3574
3575 deadline = mach_absolute_time() + LockTimeOut;
3576 /*
3577 * Wait for those other cpus to acknowledge
3578 */
3579 while (cpus_to_respond != 0) {
3580 if (mach_absolute_time() > deadline) {
3581 if (mp_recent_debugger_activity())
3582 continue;
3583 if (!panic_active()) {
3584 pmap_tlb_flush_timeout = TRUE;
3585 pmap_cpuset_NMIPI(cpus_to_respond);
3586 }
3587 panic("pmap_flush_tlbs() timeout: "
3588 "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
3589 pmap, cpus_to_respond);
3590 }
3591
3592 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3593 if ((cpus_to_respond & cpu_bit) != 0) {
3594 if (!cpu_datap(cpu)->cpu_running ||
3595 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
3596 !CPU_CR3_IS_ACTIVE(cpu)) {
3597 cpus_to_respond &= ~cpu_bit;
3598 }
3599 cpu_pause();
3600 }
3601 if (cpus_to_respond == 0)
3602 break;
3603 }
3604 }
3605 }
3606
3607 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
3608 pmap, cpus_to_signal, flush_self, 0, 0);
3609 }
3610
3611 void
3612 process_pmap_updates(void)
3613 {
3614 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
3615
3616 flush_tlb();
3617
3618 current_cpu_datap()->cpu_tlb_invalid = FALSE;
3619 __asm__ volatile("mfence");
3620 }
3621
3622 void
3623 pmap_update_interrupt(void)
3624 {
3625 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
3626 0, 0, 0, 0, 0);
3627
3628 process_pmap_updates();
3629
3630 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
3631 0, 0, 0, 0, 0);
3632 }
3633
3634
3635 unsigned int
3636 pmap_cache_attributes(ppnum_t pn)
3637 {
3638 return IS_MANAGED_PAGE(ppn_to_pai(pn)) ? VM_WIMG_COPYBACK
3639 : VM_WIMG_IO;
3640 }
3641
3642