1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58 /*
59 */
60
61 /*
62 * File: pmap.c
63 * Author: Avadis Tevanian, Jr., Michael Wayne Young
64 * (These guys wrote the Vax version)
65 *
66 * Physical Map management code for Intel i386, i486, and i860.
67 *
68 * Manages physical address maps.
69 *
70 * In addition to hardware address maps, this
71 * module is called upon to provide software-use-only
72 * maps which may or may not be stored in the same
73 * form as hardware maps. These pseudo-maps are
74 * used to store intermediate results from copy
75 * operations to and from address spaces.
76 *
77 * Since the information managed by this module is
78 * also stored by the logical address mapping module,
79 * this module may throw away valid virtual-to-physical
80 * mappings at almost any time. However, invalidations
81 * of virtual-to-physical mappings must be done as
82 * requested.
83 *
84 * In order to cope with hardware architectures which
85 * make virtual-to-physical map invalidates expensive,
86 * this module may delay invalidation or reduced-protection
87 * operations until such time as they are actually
88 * necessary. This module is given full information as
89 * to which processors are currently using which maps,
90 * and to when physical maps must be made correct.
91 */
92
93 #include <string.h>
94 #include <norma_vm.h>
95 #include <mach_kdb.h>
96 #include <mach_ldebug.h>
97
98 #include <mach/machine/vm_types.h>
99
100 #include <mach/boolean.h>
101 #include <kern/thread.h>
102 #include <kern/zalloc.h>
103
104 #include <kern/lock.h>
105 #include <kern/kalloc.h>
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_kern.h>
111 #include <mach/vm_param.h>
112 #include <mach/vm_prot.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115
116 #include <mach/machine/vm_param.h>
117 #include <machine/thread.h>
118
119 #include <kern/misc_protos.h> /* prototyping */
120 #include <i386/misc_protos.h>
121
122 #include <i386/cpuid.h>
123 #include <i386/cpu_data.h>
124 #include <i386/cpu_number.h>
125 #include <i386/machine_cpu.h>
126 #include <i386/mp_slave_boot.h>
127
128 #if MACH_KDB
129 #include <ddb/db_command.h>
130 #include <ddb/db_output.h>
131 #include <ddb/db_sym.h>
132 #include <ddb/db_print.h>
133 #endif /* MACH_KDB */
134
135 #include <kern/xpr.h>
136
137 #include <vm/vm_protos.h>
138
139 #include <i386/mp.h>
140
141 /*
142 * Forward declarations for internal functions.
143 */
144 void pmap_expand(
145 pmap_t map,
146 vm_offset_t v);
147
148 extern void pmap_remove_range(
149 pmap_t pmap,
150 vm_offset_t va,
151 pt_entry_t *spte,
152 pt_entry_t *epte);
153
154 void phys_attribute_clear(
155 ppnum_t phys,
156 int bits);
157
158 boolean_t phys_attribute_test(
159 ppnum_t phys,
160 int bits);
161
162 void phys_attribute_set(
163 ppnum_t phys,
164 int bits);
165
166 void pmap_growkernel(
167 vm_offset_t addr);
168
169 void pmap_set_reference(
170 ppnum_t pn);
171
172 void pmap_movepage(
173 unsigned long from,
174 unsigned long to,
175 vm_size_t size);
176
177 pt_entry_t * pmap_mapgetpte(
178 vm_map_t map,
179 vm_offset_t v);
180
181 boolean_t phys_page_exists(
182 ppnum_t pn);
183
184 #ifndef set_dirbase
185 void set_dirbase(vm_offset_t dirbase);
186 #endif /* set_dirbase */
187
188 #define iswired(pte) ((pte) & INTEL_PTE_WIRED)
189
190 #define WRITE_PTE(pte_p, pte_entry) *(pte_p) = (pte_entry);
191 #define WRITE_PTE_FAST(pte_p, pte_entry) *(pte_p) = (pte_entry);
192
193 #define value_64bit(value) ((value) & 0xFFFFFFFF00000000LL)
194 #define low32(x) ((unsigned int)((x) & 0x00000000ffffffffLL))
195
196 /*
197 * Private data structures.
198 */
199
200 /*
201 * For each vm_page_t, there is a list of all currently
202 * valid virtual mappings of that page. An entry is
203 * a pv_entry_t; the list is the pv_table.
204 */
205
206 typedef struct pv_entry {
207 struct pv_entry *next; /* next pv_entry */
208 pmap_t pmap; /* pmap where mapping lies */
209 vm_offset_t va; /* virtual address for mapping */
210 } *pv_entry_t;
211
212 #define PV_ENTRY_NULL ((pv_entry_t) 0)
213
214 pv_entry_t pv_head_table; /* array of entries, one per page */
215
216 /*
217 * pv_list entries are kept on a list that can only be accessed
218 * with the pmap system locked (at SPLVM, not in the cpus_active set).
219 * The list is refilled from the pv_list_zone if it becomes empty.
220 */
221 pv_entry_t pv_free_list; /* free list at SPLVM */
222 decl_simple_lock_data(,pv_free_list_lock)
223 int pv_free_count = 0;
224 #define PV_LOW_WATER_MARK 5000
225 #define PV_ALLOC_CHUNK 2000
226 thread_call_t mapping_adjust_call;
227 static thread_call_data_t mapping_adjust_call_data;
228 int mappingrecurse = 0;
229
230 #define PV_ALLOC(pv_e) { \
231 simple_lock(&pv_free_list_lock); \
232 if ((pv_e = pv_free_list) != 0) { \
233 pv_free_list = pv_e->next; \
234 pv_free_count--; \
235 if (pv_free_count < PV_LOW_WATER_MARK) \
236 if (hw_compare_and_store(0,1,&mappingrecurse)) \
237 thread_call_enter(mapping_adjust_call); \
238 } \
239 simple_unlock(&pv_free_list_lock); \
240 }
241
242 #define PV_FREE(pv_e) { \
243 simple_lock(&pv_free_list_lock); \
244 pv_e->next = pv_free_list; \
245 pv_free_list = pv_e; \
246 pv_free_count++; \
247 simple_unlock(&pv_free_list_lock); \
248 }
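/*
 * Illustrative sketch (not compiled): the allocate / use / free pattern
 * pmap_enter() follows with PV_ALLOC and PV_FREE.  The free list itself
 * is refilled from pv_list_zone via the mapping_adjust thread call once
 * it drops below PV_LOW_WATER_MARK.
 */
#if 0
	pv_entry_t	pv_e;

	PV_ALLOC(pv_e);			/* pop an entry from the SPLVM free list */
	if (pv_e == PV_ENTRY_NULL)
		panic("no pv_entry available");
	/* ... link pv_e into the pv_head_table chain for the page ... */
	PV_FREE(pv_e);			/* return it if it turned out to be unneeded */
#endif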
249
250 zone_t pv_list_zone; /* zone of pv_entry structures */
251
252 #ifdef PAE
253 static zone_t pdpt_zone;
254 #endif
255
256
257 /*
258 * Each entry in the pv_head_table is locked by a bit in the
259 * pv_lock_table. The lock bits are accessed by the physical
260 * address of the page they lock.
261 */
262
263 char *pv_lock_table; /* pointer to array of bits */
264 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
265
266 /*
267 * First and last physical addresses that we maintain any information
268 * for. Initialized to zero so that pmap operations done before
269 * pmap_init won't touch any non-existent structures.
270 */
271 pmap_paddr_t vm_first_phys = (pmap_paddr_t) 0;
272 pmap_paddr_t vm_last_phys = (pmap_paddr_t) 0;
273 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
274
275 pmap_paddr_t kernel_vm_end = (pmap_paddr_t)0;
276
277 #define GROW_KERNEL_FUNCTION_IMPLEMENTED 1
278 #if GROW_KERNEL_FUNCTION_IMPLEMENTED /* not needed until growing kernel pmap */
279 static struct vm_object kptobj_object_store;
280 static vm_object_t kptobj;
281 #endif
282
283
284 /*
285 * Index into pv_head table, its lock bits, and the modify/reference
286 * bits starting at vm_first_phys.
287 */
288
289 #define pa_index(pa) (i386_btop(pa - vm_first_phys))
290
291 #define pai_to_pvh(pai) (&pv_head_table[pai])
292 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
293 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
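/*
 * Illustrative sketch (not compiled): how pv_head_table, pa_index() and
 * the per-page lock bits fit together.  The helper name and the printf
 * are hypothetical; a head entry with a null pmap means the page is
 * currently unmapped.
 */
#if 0
static void
pv_walk_sketch(pmap_paddr_t pa)
{
	int		pai  = pa_index(pa);	/* index relative to vm_first_phys */
	pv_entry_t	pv_e = pai_to_pvh(pai);

	lock_pvh_pai(pai);			/* per-page lock bit */
	if (pv_e->pmap != PMAP_NULL) {
		for (; pv_e != PV_ENTRY_NULL; pv_e = pv_e->next)
			printf("mapped at va 0x%x in pmap 0x%x\n",
			       pv_e->va, (unsigned int) pv_e->pmap);
	}
	unlock_pvh_pai(pai);
}
#endif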
294
295 /*
296 * Array of physical page attributes for managed pages.
297 * One byte per physical page.
298 */
299 char *pmap_phys_attributes;
300
301 /*
302 * Physical page attributes. Copy bits from PTE definition.
303 */
304 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
305 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
306 #define PHYS_NCACHE INTEL_PTE_NCACHE
307
308 /*
309 * Amount of virtual memory mapped by one
310 * page-directory entry.
311 */
312 #define PDE_MAPPED_SIZE (pdetova(1))
313
314 /*
315 * Locking and TLB invalidation
316 */
317
318 /*
319 * Locking Protocols:
320 *
321 * There are two structures in the pmap module that need locking:
322 * the pmaps themselves, and the per-page pv_lists (which are locked
323 * by locking the pv_lock_table entry that corresponds to the pv_head
324 * for the list in question.) Most routines want to lock a pmap and
325 * then do operations in it that require pv_list locking -- however
326 * pmap_remove_all and pmap_copy_on_write operate on a physical page
327 * basis and want to do the locking in the reverse order, i.e. lock
328 * a pv_list and then go through all the pmaps referenced by that list.
329 * To protect against deadlock between these two cases, the pmap_lock
330 * is used. There are three different locking protocols as a result:
331 *
332 * 1. pmap operations only (pmap_extract, pmap_access, ...) Lock only
333 * the pmap.
334 *
335 * 2. pmap-based operations (pmap_enter, pmap_remove, ...) Get a read
336 * lock on the pmap_lock (shared read), then lock the pmap
337 * and finally the pv_lists as needed [i.e. pmap lock before
338 * pv_list lock.]
339 *
340 * 3. pv_list-based operations (pmap_remove_all, pmap_copy_on_write, ...)
341 * Get a write lock on the pmap_lock (exclusive write); this
342 * also guarantees exclusive access to the pv_lists. Lock the
343 * pmaps as needed.
344 *
345 * At no time may any routine hold more than one pmap lock or more than
346 * one pv_list lock. Because interrupt level routines can allocate
347 * mbufs and cause pmap_enter's, the pmap_lock and the lock on the
348 * kernel_pmap can only be held at splhigh.
349 */
350
351 /*
352 * We raise the interrupt level to splvm, to block interprocessor
353 * interrupts during pmap operations. We must take the CPU out of
354 * the cpus_active set while interrupts are blocked.
355 */
356 #define SPLVM(spl) { \
357 spl = splhigh(); \
358 mp_disable_preemption(); \
359 i_bit_clear(cpu_number(), &cpus_active); \
360 mp_enable_preemption(); \
361 }
362
363 #define SPLX(spl) { \
364 mp_disable_preemption(); \
365 i_bit_set(cpu_number(), &cpus_active); \
366 mp_enable_preemption(); \
367 splx(spl); \
368 }
369
370 /*
371 * Lock on pmap system
372 */
373 lock_t pmap_system_lock;
374
375 #define PMAP_READ_LOCK(pmap, spl) { \
376 SPLVM(spl); \
377 lock_read(&pmap_system_lock); \
378 simple_lock(&(pmap)->lock); \
379 }
380
381 #define PMAP_WRITE_LOCK(spl) { \
382 SPLVM(spl); \
383 lock_write(&pmap_system_lock); \
384 }
385
386 #define PMAP_READ_UNLOCK(pmap, spl) { \
387 simple_unlock(&(pmap)->lock); \
388 lock_read_done(&pmap_system_lock); \
389 SPLX(spl); \
390 }
391
392 #define PMAP_WRITE_UNLOCK(spl) { \
393 lock_write_done(&pmap_system_lock); \
394 SPLX(spl); \
395 }
396
397 #define PMAP_WRITE_TO_READ_LOCK(pmap) { \
398 simple_lock(&(pmap)->lock); \
399 lock_write_to_read(&pmap_system_lock); \
400 }
401
402 #define LOCK_PVH(index) lock_pvh_pai(index)
403
404 #define UNLOCK_PVH(index) unlock_pvh_pai(index)
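/*
 * Illustrative sketch (not compiled) of locking protocol 2) above: a
 * pmap-based operation takes the system lock shared, then the pmap's
 * own lock, then per-page pv locks as needed, releasing in reverse
 * order.  pmap and pai are assumed to be in scope.
 */
#if 0
	spl_t	spl;

	PMAP_READ_LOCK(pmap, spl);	/* SPLVM + shared pmap_system_lock + pmap lock */
	/* ... modify ptes belonging to pmap ... */
	LOCK_PVH(pai);			/* pv list lock for one physical page */
	/* ... edit that page's pv list ... */
	UNLOCK_PVH(pai);
	PMAP_READ_UNLOCK(pmap, spl);
#endif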
405
406 #if USLOCK_DEBUG
407 extern int max_lock_loops;
408 extern int disableSerialOuput;
409 #define LOOP_VAR \
410 unsigned int loop_count; \
411 loop_count = disableSerialOuput ? max_lock_loops \
412 : max_lock_loops*100
413 #define LOOP_CHECK(msg, pmap) \
414 if (--loop_count == 0) { \
415 mp_disable_preemption(); \
416 kprintf("%s: cpu %d pmap %x, cpus_active 0x%x\n", \
417 msg, cpu_number(), pmap, cpus_active); \
418 Debugger("deadlock detection"); \
419 mp_enable_preemption(); \
420 loop_count = max_lock_loops; \
421 }
422 #else /* USLOCK_DEBUG */
423 #define LOOP_VAR
424 #define LOOP_CHECK(msg, pmap)
425 #endif /* USLOCK_DEBUG */
426
427 #define PMAP_UPDATE_TLBS(pmap, s, e) \
428 { \
429 cpu_set cpu_mask; \
430 cpu_set users; \
431 \
432 mp_disable_preemption(); \
433 cpu_mask = 1 << cpu_number(); \
434 \
435 /* Since the pmap is locked, other updates are locked */ \
436 /* out, and any pmap_activate has finished. */ \
437 \
438 /* find other cpus using the pmap */ \
439 users = (pmap)->cpus_using & ~cpu_mask; \
440 if (users) { \
441 LOOP_VAR; \
442 /* signal them, and wait for them to finish */ \
443 /* using the pmap */ \
444 signal_cpus(users, (pmap), (s), (e)); \
445 while (((pmap)->cpus_using & cpus_active & ~cpu_mask)) { \
446 LOOP_CHECK("PMAP_UPDATE_TLBS", pmap); \
447 cpu_pause(); \
448 } \
449 } \
450 /* invalidate our own TLB if pmap is in use */ \
451 \
452 if ((pmap)->cpus_using & cpu_mask) { \
453 INVALIDATE_TLB((pmap), (s), (e)); \
454 } \
455 \
456 mp_enable_preemption(); \
457 }
458
459 #define MAX_TBIS_SIZE 32 /* > this -> TBIA */ /* XXX */
460
461 #define INVALIDATE_TLB(m, s, e) { \
462 flush_tlb(); \
463 }
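/*
 * Illustrative sketch (not compiled): PTEs are changed first, while the
 * pmap is locked, and then PMAP_UPDATE_TLBS() covers the touched range
 * so other cpus using the pmap flush before the caller proceeds.  pmap
 * and va are assumed to be in scope.
 */
#if 0
	pt_entry_t *pte = pmap_pte(pmap, va);

	*pte &= ~INTEL_PTE_WRITE;		/* e.g. write-protect one page */
	PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
#endif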
464
465 /*
466 * Structures to keep track of pending TLB invalidations
467 */
468 cpu_set cpus_active;
469 cpu_set cpus_idle;
470
471 #define UPDATE_LIST_SIZE 4
472
473 struct pmap_update_item {
474 pmap_t pmap; /* pmap to invalidate */
475 vm_offset_t start; /* start address to invalidate */
476 vm_offset_t end; /* end address to invalidate */
477 };
478
479 typedef struct pmap_update_item *pmap_update_item_t;
480
481 /*
482 * List of pmap updates. If the list overflows,
483 * the last entry is changed to invalidate all.
484 */
485 struct pmap_update_list {
486 decl_simple_lock_data(,lock)
487 int count;
488 struct pmap_update_item item[UPDATE_LIST_SIZE];
489 } ;
490 typedef struct pmap_update_list *pmap_update_list_t;
491
492 extern void signal_cpus(
493 cpu_set use_list,
494 pmap_t pmap,
495 vm_offset_t start,
496 vm_offset_t end);
497
498 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
499
500 /*
501 * Other useful macros.
502 */
503 #define current_pmap() (vm_map_pmap(current_thread()->map))
504 #define pmap_in_use(pmap, cpu) (((pmap)->cpus_using & (1 << (cpu))) != 0)
505
506 struct pmap kernel_pmap_store;
507 pmap_t kernel_pmap;
508
509 #ifdef PMAP_QUEUE
510 decl_simple_lock_data(,free_pmap_lock)
511 #endif
512
513 struct zone *pmap_zone; /* zone of pmap structures */
514
515 int pmap_debug = 0; /* flag for debugging prints */
516
517 unsigned int inuse_ptepages_count = 0; /* debugging */
518
519 /*
520 * Pmap cache. Cache is threaded through ref_count field of pmap.
521 * Max will eventually be constant -- variable for experimentation.
522 */
523 int pmap_cache_max = 32;
524 int pmap_alloc_chunk = 8;
525 pmap_t pmap_cache_list;
526 int pmap_cache_count;
527 decl_simple_lock_data(,pmap_cache_lock)
528
529 extern vm_offset_t hole_start, hole_end;
530
531 extern char end;
532
533 static int nkpt;
534
535 pt_entry_t *DMAP1, *DMAP2;
536 caddr_t DADDR1;
537 caddr_t DADDR2;
538
539 #if DEBUG_ALIAS
540 #define PMAP_ALIAS_MAX 32
541 struct pmap_alias {
542 vm_offset_t rpc;
543 pmap_t pmap;
544 vm_offset_t va;
545 int cookie;
546 #define PMAP_ALIAS_COOKIE 0xdeadbeef
547 } pmap_aliasbuf[PMAP_ALIAS_MAX];
548 int pmap_alias_index = 0;
549 extern vm_offset_t get_rpc();
550
551 #endif /* DEBUG_ALIAS */
552
553 #define pmap_pde(m, v) (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]))
554 #define pdir_pde(d, v) (d[(vm_offset_t)(v) >> PDESHIFT])
555
556 static __inline int
557 pmap_is_current(pmap_t pmap)
558 {
559 return (pmap == kernel_pmap ||
560 (pmap->dirbase[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME));
561 }
562
563
564 /*
565 * return address of mapped pte for vaddr va in pmap pmap.
566 */
567 pt_entry_t *
568 pmap_pte(pmap_t pmap, vm_offset_t va)
569 {
570 pd_entry_t *pde;
571 pd_entry_t newpf;
572
573 pde = pmap_pde(pmap, va);
574 if (*pde != 0) {
575 if (pmap_is_current(pmap))
576 return( vtopte(va));
577 newpf = *pde & PG_FRAME;
578 if (((*CM4) & PG_FRAME) != newpf) {
579 *CM4 = newpf | INTEL_PTE_RW | INTEL_PTE_VALID;
580 invlpg((u_int)CA4);
581 }
582 return (pt_entry_t *)CA4 + (i386_btop(va) & (NPTEPG-1));
583 }
584 return(0);
585 }
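/*
 * Illustrative caller-side sketch (not compiled): a null return from
 * pmap_pte() means no page table page covers va yet, which is why
 * pmap_enter() loops over pmap_expand().  map and va are assumed to be
 * in scope.
 */
#if 0
	pt_entry_t	*pte;
	ppnum_t		ppn = 0;

	while ((pte = pmap_pte(map, va)) == PT_ENTRY_NULL)
		pmap_expand(map, va);		/* allocate the missing pte page */
	if (*pte & INTEL_PTE_VALID)
		ppn = (ppnum_t) i386_btop(pte_to_pa(*pte));
#endif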
586
587 #define DEBUG_PTE_PAGE 0
588
589 #if DEBUG_PTE_PAGE
590 void
591 ptep_check(
592 ptep_t ptep)
593 {
594 register pt_entry_t *pte, *epte;
595 int ctu, ctw;
596
597 /* check the use and wired counts */
598 if (ptep == PTE_PAGE_NULL)
599 return;
600 pte = pmap_pte(ptep->pmap, ptep->va);
601 epte = pte + INTEL_PGBYTES/sizeof(pt_entry_t);
602 ctu = 0;
603 ctw = 0;
604 while (pte < epte) {
605 if (pte->pfn != 0) {
606 ctu++;
607 if (pte->wired)
608 ctw++;
609 }
610 pte++;
611 }
612
613 if (ctu != ptep->use_count || ctw != ptep->wired_count) {
614 printf("use %d wired %d - actual use %d wired %d\n",
615 ptep->use_count, ptep->wired_count, ctu, ctw);
616 panic("pte count");
617 }
618 }
619 #endif /* DEBUG_PTE_PAGE */
620
621 /*
622 * Map memory at initialization. The physical addresses being
623 * mapped are not managed and are never unmapped.
624 *
625 * For now, VM is already on, we only need to map the
626 * specified memory.
627 */
628 vm_offset_t
629 pmap_map(
630 register vm_offset_t virt,
631 register vm_offset_t start_addr,
632 register vm_offset_t end_addr,
633 register vm_prot_t prot)
634 {
635 register int ps;
636
637 ps = PAGE_SIZE;
638 while (start_addr < end_addr) {
639 pmap_enter(kernel_pmap,
640 virt, (ppnum_t) i386_btop(start_addr), prot, 0, FALSE);
641 virt += ps;
642 start_addr += ps;
643 }
644 return(virt);
645 }
646
647 /*
648 * Back-door routine for mapping kernel VM at initialization.
649 * Useful for mapping memory outside the range
650 * [vm_first_phys, vm_last_phys) (i.e., devices).
651 * Sets no-cache, A, D.
652 * Otherwise like pmap_map.
653 */
654 vm_offset_t
655 pmap_map_bd(
656 register vm_offset_t virt,
657 register vm_offset_t start_addr,
658 register vm_offset_t end_addr,
659 vm_prot_t prot)
660 {
661 register pt_entry_t template;
662 register pt_entry_t *pte;
663
664 template = pa_to_pte(start_addr)
665 | INTEL_PTE_NCACHE
666 | INTEL_PTE_REF
667 | INTEL_PTE_MOD
668 | INTEL_PTE_WIRED
669 | INTEL_PTE_VALID;
670 if (prot & VM_PROT_WRITE)
671 template |= INTEL_PTE_WRITE;
672
673 /* XXX move pmap_pte out of loop, once one pte mapped, all are */
674 while (start_addr < end_addr) {
675 pte = pmap_pte(kernel_pmap, virt);
676 if (pte == PT_ENTRY_NULL) {
677 panic("pmap_map_bd: Invalid kernel address\n");
678 }
679 WRITE_PTE_FAST(pte, template)
680 pte_increment_pa(template);
681 virt += PAGE_SIZE;
682 start_addr += PAGE_SIZE;
683 }
684
685 flush_tlb();
686 return(virt);
687 }
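/*
 * Illustrative sketch (not compiled) of a boot-time device mapping via
 * pmap_map_bd().  The physical base is a made-up example and kern_va is
 * a hypothetical, already-reserved kernel virtual address.
 */
#if 0
	vm_offset_t kern_va;			/* hypothetical reserved kernel VA */
	vm_offset_t va_next;

	va_next = pmap_map_bd(kern_va,
			      (vm_offset_t) 0xfee00000,		/* hypothetical device base */
			      (vm_offset_t) 0xfee00000 + PAGE_SIZE,
			      VM_PROT_READ | VM_PROT_WRITE);
#endif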
688
689 extern char *first_avail;
690 extern vm_offset_t virtual_avail, virtual_end;
691 extern pmap_paddr_t avail_start, avail_end;
692 extern vm_offset_t etext;
693 extern void *sectHIBB;
694 extern int sectSizeHIB;
695
696 /*
697 * Bootstrap the system enough to run with virtual memory.
698 * Map the kernel's code and data, and allocate the system page table.
699 * Called with mapping OFF. Page_size must already be set.
700 *
701 * Parameters:
702 * load_start: PA where kernel was loaded
703 * avail_start PA of first available physical page -
704 * after kernel page tables
705 * avail_end PA of last available physical page
706 * virtual_avail VA of first available page -
707 * after kernel page tables
708 * virtual_end VA of last available page -
709 * end of kernel address space
710 *
711 * &start_text start of kernel text
712 * &etext end of kernel text
713 */
714
715 void
716 pmap_bootstrap(
717 __unused vm_offset_t load_start)
718 {
719 vm_offset_t va;
720 pt_entry_t *pte;
721 int i;
722 int wpkernel, boot_arg;
723
724 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
725 * known to VM */
726
727 /*
728 * The kernel's pmap is statically allocated so we don't
729 * have to use pmap_create, which is unlikely to work
730 * correctly at this part of the boot sequence.
731 */
732
733 kernel_pmap = &kernel_pmap_store;
734 #ifdef PMAP_QUEUE
735 kernel_pmap->pmap_link.next = (queue_t)kernel_pmap; /* Set up anchor forward */
736 kernel_pmap->pmap_link.prev = (queue_t)kernel_pmap; /* Set up anchor reverse */
737 #endif
738 kernel_pmap->ref_count = 1;
739 kernel_pmap->pm_obj = (vm_object_t) NULL;
740 kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
741 kernel_pmap->pdirbase = (pd_entry_t *)IdlePTD;
742 #ifdef PAE
743 kernel_pmap->pm_pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE );
744 kernel_pmap->pm_ppdpt = (vm_offset_t)IdlePDPT;
745 #endif
746
747 va = (vm_offset_t)kernel_pmap->dirbase;
748 /* setup self referential mapping(s) */
749 for (i = 0; i< NPGPTD; i++ ) {
750 pmap_paddr_t pa;
751 pa = (pmap_paddr_t) kvtophys(va + i386_ptob(i));
752 * (pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i) =
753 (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
754 INTEL_PTE_MOD | INTEL_PTE_WIRED ;
755 #ifdef PAE
756 kernel_pmap->pm_pdpt[i] = pa | INTEL_PTE_VALID;
757 #endif
758 }
759
760 nkpt = NKPT;
761
762 virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
763 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
764
765 /*
766 * Reserve some special page table entries/VA space for temporary
767 * mapping of pages.
768 */
769 #define SYSMAP(c, p, v, n) \
770 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n);
771
772 va = virtual_avail;
773 pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
774
775 /*
776 * CMAP1/CMAP2 are used for zeroing and copying pages.
777 * CMAP3 is used for ml_phys_read/write.
778 */
779 SYSMAP(caddr_t, CM1, CA1, 1)
780 * (pt_entry_t *) CM1 = 0;
781 SYSMAP(caddr_t, CM2, CA2, 1)
782 * (pt_entry_t *) CM2 = 0;
783 SYSMAP(caddr_t, CM3, CA3, 1)
784 * (pt_entry_t *) CM3 = 0;
785
786 /* used by pmap_pte */
787 SYSMAP(caddr_t, CM4, CA4, 1)
788 * (pt_entry_t *) CM4 = 0;
789
790 /* DMAP used by the debugger */
791 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
792 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
793
794
795 lock_init(&pmap_system_lock,
796 FALSE, /* NOT a sleep lock */
797 0, 0);
798
799 virtual_avail = va;
800
801 wpkernel = 1;
802 if (PE_parse_boot_arg("debug", &boot_arg)) {
803 if (boot_arg & DB_PRT) wpkernel = 0;
804 if (boot_arg & DB_NMI) wpkernel = 0;
805 }
806
807 /* remap kernel text readonly if not debugging or kprintfing */
808 if (wpkernel)
809 {
810 vm_offset_t myva;
811 pt_entry_t *ptep;
812
813 for (myva = i386_round_page(VM_MIN_KERNEL_ADDRESS + MP_BOOT + MP_BOOTSTACK); myva < etext; myva += PAGE_SIZE) {
814 if (myva >= (vm_offset_t)sectHIBB && myva < ((vm_offset_t)sectHIBB + sectSizeHIB))
815 continue;
816 ptep = pmap_pte(kernel_pmap, myva);
817 if (ptep)
818 *ptep &= ~INTEL_PTE_RW;
819 }
820 flush_tlb();
821 }
822
823 simple_lock_init(&kernel_pmap->lock, 0);
824 simple_lock_init(&pv_free_list_lock, 0);
825
826 /* invalidate user virtual addresses */
827 memset((char *)kernel_pmap->dirbase,
828 0,
829 (KPTDI) * sizeof(pd_entry_t));
830
831 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
832 VADDR(KPTDI,0), virtual_end);
833 #ifdef PAE
834 kprintf("Available physical space from 0x%llx to 0x%llx\n",
835 avail_start, avail_end);
836 printf("PAE enabled\n");
837 #else
838 kprintf("Available physical space from 0x%x to 0x%x\n",
839 avail_start, avail_end);
840 #endif
841 }
842
843 void
844 pmap_virtual_space(
845 vm_offset_t *startp,
846 vm_offset_t *endp)
847 {
848 *startp = virtual_avail;
849 *endp = virtual_end;
850 }
851
852 /*
853 * Initialize the pmap module.
854 * Called by vm_init, to initialize any structures that the pmap
855 * system needs to map virtual memory.
856 */
857 void
858 pmap_init(void)
859 {
860 register long npages;
861 vm_offset_t addr;
862 register vm_size_t s;
863 vm_offset_t vaddr;
864 ppnum_t ppn;
865
866 /*
867 * Allocate memory for the pv_head_table and its lock bits,
868 * the modify bit array, and the pte_page table.
869 */
870
871 /* zero bias all these arrays now instead of off avail_start
872 so we cover all memory */
873 npages = i386_btop(avail_end);
874 s = (vm_size_t) (sizeof(struct pv_entry) * npages
875 + pv_lock_table_size(npages)
876 + npages);
877
878 s = round_page(s);
879 if (kmem_alloc_wired(kernel_map, &addr, s) != KERN_SUCCESS)
880 panic("pmap_init");
881
882 memset((char *)addr, 0, s);
883
884 /*
885 * Allocate the structures first to preserve word-alignment.
886 */
887 pv_head_table = (pv_entry_t) addr;
888 addr = (vm_offset_t) (pv_head_table + npages);
889
890 pv_lock_table = (char *) addr;
891 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
892
893 pmap_phys_attributes = (char *) addr;
894
895 /*
896 * Create the zone of physical maps,
897 * and of the physical-to-virtual entries.
898 */
899 s = (vm_size_t) sizeof(struct pmap);
900 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
901 s = (vm_size_t) sizeof(struct pv_entry);
902 pv_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
903 #ifdef PAE
904 // s = (vm_size_t) (sizeof(pdpt_entry_t) * NPGPTD);
905 s = 63; /* 32-byte pdpt (4 * 8 bytes) plus up to 31 bytes of alignment slack; see pmap_create() */
906 pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
907 #endif
908
909 /*
910 * Only now, when all of the data structures are allocated,
911 * can we set vm_first_phys and vm_last_phys. If we set them
912 * too soon, the kmem_alloc_wired above will try to use these
913 * data structures and blow up.
914 */
915
916 /* zero bias this now so we cover all memory */
917 vm_first_phys = 0;
918 vm_last_phys = avail_end;
919
920 #if GROW_KERNEL_FUNCTION_IMPLEMENTED
921 kptobj = &kptobj_object_store;
922 _vm_object_allocate((vm_object_size_t)NKPDE, kptobj);
923 kernel_pmap->pm_obj = kptobj;
924 #endif
925
926 /* create pv entries for kernel pages mapped by low level
927 startup code. these have to exist so we can pmap_remove()
928 e.g. kext pages from the middle of our addr space */
929
930 vaddr = (vm_offset_t)VM_MIN_KERNEL_ADDRESS;
931 for (ppn = 0; ppn < i386_btop(avail_start) ; ppn++ ) {
932 pv_entry_t pv_e;
933
934 pv_e = pai_to_pvh(ppn);
935 pv_e->va = vaddr;
936 vaddr += PAGE_SIZE;
937 pv_e->pmap = kernel_pmap;
938 pv_e->next = PV_ENTRY_NULL;
939 }
940
941 pmap_initialized = TRUE;
942
943 /*
944 * Initialize pmap cache.
945 */
946 pmap_cache_list = PMAP_NULL;
947 pmap_cache_count = 0;
948 simple_lock_init(&pmap_cache_lock, 0);
949 #ifdef PMAP_QUEUE
950 simple_lock_init(&free_pmap_lock, 0);
951 #endif
952
953 }
954
955 void
956 x86_lowmem_free(void)
957 {
958 /* free lowmem pages back to the vm system. we had to defer doing this
959 until the vm system was fully up.
960 the actual pages that are released are determined by which
961 pages the memory sizing code puts into the region table */
962
963 ml_static_mfree((vm_offset_t) i386_ptob(pmap_memory_regions[0].base)|VM_MIN_KERNEL_ADDRESS,
964 (vm_size_t) i386_ptob(pmap_memory_regions[0].end - pmap_memory_regions[0].base));
965 }
966
967
968 #define valid_page(x) (pmap_initialized && pmap_valid_page(x))
969
970 boolean_t
971 pmap_verify_free(
972 ppnum_t pn)
973 {
974 pmap_paddr_t phys;
975 pv_entry_t pv_h;
976 int pai;
977 spl_t spl;
978 boolean_t result;
979
980 assert(pn != vm_page_fictitious_addr);
981 phys = (pmap_paddr_t)i386_ptob(pn);
982 if (!pmap_initialized)
983 return(TRUE);
984
985 if (!pmap_valid_page(pn))
986 return(FALSE);
987
988 PMAP_WRITE_LOCK(spl);
989
990 pai = pa_index(phys);
991 pv_h = pai_to_pvh(pai);
992
993 result = (pv_h->pmap == PMAP_NULL);
994 PMAP_WRITE_UNLOCK(spl);
995
996 return(result);
997 }
998
999 /*
1000 * Create and return a physical map.
1001 *
1002 * If the size specified for the map
1003 * is zero, the map is an actual physical
1004 * map, and may be referenced by the
1005 * hardware.
1006 *
1007 * If the size specified is non-zero,
1008 * the map will be used in software only, and
1009 * is bounded by that size.
1010 */
1011 pmap_t
1012 pmap_create(
1013 vm_size_t size)
1014 {
1015 register pmap_t p;
1016 #ifdef PMAP_QUEUE
1017 register pmap_t pro;
1018 spl_t s;
1019 #endif
1020 register int i;
1021 register vm_offset_t va;
1022
1023 /*
1024 * A software use-only map doesn't even need a map.
1025 */
1026
1027 if (size != 0) {
1028 return(PMAP_NULL);
1029 }
1030
1031 p = (pmap_t) zalloc(pmap_zone);
1032 if (PMAP_NULL == p)
1033 panic("pmap_create zalloc");
1034 if (KERN_SUCCESS != kmem_alloc_wired(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
1035 panic("pmap_create kmem_alloc_wired");
1036 #ifdef PAE
1037 p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
1038 if ((vm_offset_t)NULL == p->pm_hold) {
1039 panic("pdpt zalloc");
1040 }
1041 p->pm_pdpt = (pdpt_entry_t *) (( p->pm_hold + 31) & ~31);
1042 p->pm_ppdpt = kvtophys((vm_offset_t)p->pm_pdpt); /* XXX */
1043 #endif
1044 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPDEPG))))
1045 panic("pmap_create vm_object_allocate");
1046 memcpy(p->dirbase,
1047 (void *)((unsigned int)IdlePTD | KERNBASE),
1048 NBPTD);
1049 va = (vm_offset_t)p->dirbase;
1050 p->pdirbase = (pd_entry_t *)(kvtophys(va));
1051 simple_lock_init(&p->lock, 0);
1052
1053 /* setup self referential mapping(s) */
1054 for (i = 0; i< NPGPTD; i++ ) {
1055 pmap_paddr_t pa;
1056 pa = (pmap_paddr_t) kvtophys(va + i386_ptob(i));
1057 * (pd_entry_t *) (p->dirbase + PTDPTDI + i) =
1058 (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
1059 INTEL_PTE_MOD | INTEL_PTE_WIRED ;
1060 #ifdef PAE
1061 p->pm_pdpt[i] = pa | INTEL_PTE_VALID;
1062 #endif
1063 }
1064
1065 p->cpus_using = 0;
1066 p->stats.resident_count = 0;
1067 p->stats.wired_count = 0;
1068 p->ref_count = 1;
1069
1070 #ifdef PMAP_QUEUE
1071 /* insert new pmap at head of queue hanging off kernel_pmap */
1072 SPLVM(s);
1073 simple_lock(&free_pmap_lock);
1074 p->pmap_link.next = (queue_t)kernel_pmap->pmap_link.next;
1075 kernel_pmap->pmap_link.next = (queue_t)p;
1076
1077 pro = (pmap_t) p->pmap_link.next;
1078 p->pmap_link.prev = (queue_t)pro->pmap_link.prev;
1079 pro->pmap_link.prev = (queue_t)p;
1080
1081
1082 simple_unlock(&free_pmap_lock);
1083 SPLX(s);
1084 #endif
1085
1086 return(p);
1087 }
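/*
 * Illustrative sketch (not compiled) of the reference-counted lifecycle
 * these routines expect: create with size 0, add references as needed,
 * and call pmap_destroy() once per reference.
 */
#if 0
	pmap_t p;

	p = pmap_create((vm_size_t) 0);	/* non-zero size returns PMAP_NULL */
	pmap_reference(p);		/* ref_count 1 -> 2 */
	pmap_destroy(p);		/* ref_count 2 -> 1, pmap stays alive */
	pmap_destroy(p);		/* ref_count 0: page tables freed */
#endif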
1088
1089 /*
1090 * Retire the given physical map from service.
1091 * Should only be called if the map contains
1092 * no valid mappings.
1093 */
1094
1095 void
1096 pmap_destroy(
1097 register pmap_t p)
1098 {
1099 register pt_entry_t *pdep;
1100 register int c;
1101 spl_t s;
1102 register vm_page_t m;
1103 #ifdef PMAP_QUEUE
1104 register pmap_t pre,pro;
1105 #endif
1106
1107 if (p == PMAP_NULL)
1108 return;
1109
1110 SPLVM(s);
1111 simple_lock(&p->lock);
1112 c = --p->ref_count;
1113 if (c == 0) {
1114 register int my_cpu;
1115
1116 mp_disable_preemption();
1117 my_cpu = cpu_number();
1118
1119 /*
1120 * If some cpu is not using the physical pmap pointer that it
1121 * is supposed to be (see set_dirbase), we might be using the
1122 * pmap that is being destroyed! Make sure we are
1123 * physically on the right pmap:
1124 */
1125 /* force pmap/cr3 update */
1126 PMAP_UPDATE_TLBS(p,
1127 VM_MIN_ADDRESS,
1128 VM_MAX_KERNEL_ADDRESS);
1129
1130 if (PMAP_REAL(my_cpu) == p) {
1131 PMAP_CPU_CLR(p, my_cpu);
1132 PMAP_REAL(my_cpu) = kernel_pmap;
1133 #ifdef PAE
1134 set_cr3((unsigned int)kernel_pmap->pm_ppdpt);
1135 #else
1136 set_cr3((unsigned int)kernel_pmap->pdirbase);
1137 #endif
1138 }
1139 mp_enable_preemption();
1140 }
1141 simple_unlock(&p->lock);
1142 SPLX(s);
1143
1144 if (c != 0) {
1145 return; /* still in use */
1146 }
1147
1148 #ifdef PMAP_QUEUE
1149 /* remove from pmap queue */
1150 SPLVM(s);
1151 simple_lock(&free_pmap_lock);
1152
1153 pre = (pmap_t)p->pmap_link.prev;
1154 pre->pmap_link.next = (queue_t)p->pmap_link.next;
1155 pro = (pmap_t)p->pmap_link.next;
1156 pro->pmap_link.prev = (queue_t)p->pmap_link.prev;
1157
1158 simple_unlock(&free_pmap_lock);
1159 SPLX(s);
1160 #endif
1161
1162 /*
1163 * Free the memory maps, then the
1164 * pmap structure.
1165 */
1166
1167 pdep = (pt_entry_t *)p->dirbase;
1168
1169 while (pdep < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)]) {
1170 int ind;
1171 if (*pdep & INTEL_PTE_VALID) {
1172 ind = pdep - (pt_entry_t *)&p->dirbase[0];
1173 vm_object_lock(p->pm_obj);
1174 m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)ind);
1175 if (m == VM_PAGE_NULL) {
1176 panic("pmap_destroy: pte page not in object");
1177 }
1178 vm_page_lock_queues();
1179 vm_page_free(m);
1180 inuse_ptepages_count--;
1181 vm_object_unlock(p->pm_obj);
1182 vm_page_unlock_queues();
1183
1184 /*
1185 * Clear pdes, this might be headed for the cache.
1186 */
1187 *pdep++ = 0;
1188 }
1189 else {
1190 *pdep++ = 0;
1191 }
1192
1193 }
1194
1195 vm_object_deallocate(p->pm_obj);
1196 kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
1197 #ifdef PAE
1198 zfree(pdpt_zone, (void *)p->pm_hold);
1199 #endif
1200 zfree(pmap_zone, p);
1201 }
1202
1203 /*
1204 * Add a reference to the specified pmap.
1205 */
1206
1207 void
1208 pmap_reference(
1209 register pmap_t p)
1210 {
1211 spl_t s;
1212
1213 if (p != PMAP_NULL) {
1214 SPLVM(s);
1215 simple_lock(&p->lock);
1216 p->ref_count++;
1217 simple_unlock(&p->lock);
1218 SPLX(s);
1219 }
1220 }
1221
1222 /*
1223 * Remove a range of hardware page-table entries.
1224 * The entries given are the first (inclusive)
1225 * and last (exclusive) entries for the VM pages.
1226 * The virtual address is the va for the first pte.
1227 *
1228 * The pmap must be locked.
1229 * If the pmap is not the kernel pmap, the range must lie
1230 * entirely within one pte-page. This is NOT checked.
1231 * Assumes that the pte-page exists.
1232 */
1233
1234 /* static */
1235 void
1236 pmap_remove_range(
1237 pmap_t pmap,
1238 vm_offset_t va,
1239 pt_entry_t *spte,
1240 pt_entry_t *epte)
1241 {
1242 register pt_entry_t *cpte;
1243 int num_removed, num_unwired;
1244 int pai;
1245 pmap_paddr_t pa;
1246
1247 #if DEBUG_PTE_PAGE
1248 if (pmap != kernel_pmap)
1249 ptep_check(get_pte_page(spte));
1250 #endif /* DEBUG_PTE_PAGE */
1251 num_removed = 0;
1252 num_unwired = 0;
1253
1254 for (cpte = spte; cpte < epte;
1255 cpte++, va += PAGE_SIZE) {
1256
1257 pa = pte_to_pa(*cpte);
1258 if (pa == 0)
1259 continue;
1260
1261 num_removed++;
1262 if (iswired(*cpte))
1263 num_unwired++;
1264
1265 if (!valid_page(i386_btop(pa))) {
1266
1267 /*
1268 * Outside range of managed physical memory.
1269 * Just remove the mappings.
1270 */
1271 register pt_entry_t *lpte = cpte;
1272
1273 *lpte = 0;
1274 continue;
1275 }
1276
1277 pai = pa_index(pa);
1278 LOCK_PVH(pai);
1279
1280 /*
1281 * Get the modify and reference bits.
1282 */
1283 {
1284 register pt_entry_t *lpte;
1285
1286 lpte = cpte;
1287 pmap_phys_attributes[pai] |=
1288 *lpte & (PHYS_MODIFIED|PHYS_REFERENCED);
1289 *lpte = 0;
1290
1291 }
1292
1293 /*
1294 * Remove the mapping from the pvlist for
1295 * this physical page.
1296 */
1297 {
1298 register pv_entry_t pv_h, prev, cur;
1299
1300 pv_h = pai_to_pvh(pai);
1301 if (pv_h->pmap == PMAP_NULL) {
1302 panic("pmap_remove: null pv_list!");
1303 }
1304 if (pv_h->va == va && pv_h->pmap == pmap) {
1305 /*
1306 * Header is the pv_entry. Copy the next one
1307 * to header and free the next one (we cannot
1308 * free the header)
1309 */
1310 cur = pv_h->next;
1311 if (cur != PV_ENTRY_NULL) {
1312 *pv_h = *cur;
1313 PV_FREE(cur);
1314 }
1315 else {
1316 pv_h->pmap = PMAP_NULL;
1317 }
1318 }
1319 else {
1320 cur = pv_h;
1321 do {
1322 prev = cur;
1323 if ((cur = prev->next) == PV_ENTRY_NULL) {
1324 panic("pmap-remove: mapping not in pv_list!");
1325 }
1326 } while (cur->va != va || cur->pmap != pmap);
1327 prev->next = cur->next;
1328 PV_FREE(cur);
1329 }
1330 UNLOCK_PVH(pai);
1331 }
1332 }
1333
1334 /*
1335 * Update the counts
1336 */
1337 assert(pmap->stats.resident_count >= num_removed);
1338 pmap->stats.resident_count -= num_removed;
1339 assert(pmap->stats.wired_count >= num_unwired);
1340 pmap->stats.wired_count -= num_unwired;
1341 }
1342
1343 /*
1344 * Remove phys addr if mapped in specified map
1345 *
1346 */
1347 void
1348 pmap_remove_some_phys(
1349 __unused pmap_t map,
1350 __unused ppnum_t pn)
1351 {
1352
1353 /* Implement to support working set code */
1354
1355 }
1356
1357 /*
1358 * Remove the given range of addresses
1359 * from the specified map.
1360 *
1361 * It is assumed that the start and end are properly
1362 * rounded to the hardware page size.
1363 */
1364
1365
1366 void
1367 pmap_remove(
1368 pmap_t map,
1369 addr64_t s64,
1370 addr64_t e64)
1371 {
1372 spl_t spl;
1373 register pt_entry_t *pde;
1374 register pt_entry_t *spte, *epte;
1375 vm_offset_t l;
1376 vm_offset_t s, e;
1377 vm_offset_t orig_s;
1378
1379 if (map == PMAP_NULL)
1380 return;
1381
1382 PMAP_READ_LOCK(map, spl);
1383
1384 if (value_64bit(s64) || value_64bit(e64)) {
1385 panic("pmap_remove addr overflow");
1386 }
1387
1388 orig_s = s = (vm_offset_t)low32(s64);
1389 e = (vm_offset_t)low32(e64);
1390
1391 pde = pmap_pde(map, s);
1392
1393 while (s < e) {
1394 l = (s + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE-1);
1395 if (l > e)
1396 l = e;
1397 if (*pde & INTEL_PTE_VALID) {
1398 spte = (pt_entry_t *)pmap_pte(map, (s & ~(PDE_MAPPED_SIZE-1)));
1399 spte = &spte[ptenum(s)];
1400 epte = &spte[intel_btop(l-s)];
1401 pmap_remove_range(map, s, spte, epte);
1402 }
1403 s = l;
1404 pde++;
1405 }
1406
1407 PMAP_UPDATE_TLBS(map, orig_s, e);
1408
1409 PMAP_READ_UNLOCK(map, spl);
1410 }
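/*
 * Illustrative sketch (not compiled): callers pass page-rounded, 32-bit
 * representable bounds, per the comment above; 64-bit values panic.
 * map, start and end are hypothetical names.
 */
#if 0
	addr64_t s = (addr64_t) trunc_page(start);
	addr64_t e = (addr64_t) round_page(end);

	pmap_remove(map, s, e);
#endif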
1411
1412 /*
1413 * Routine: pmap_page_protect
1414 *
1415 * Function:
1416 * Lower the permission for all mappings to a given
1417 * page.
1418 */
1419 void
1420 pmap_page_protect(
1421 ppnum_t pn,
1422 vm_prot_t prot)
1423 {
1424 pv_entry_t pv_h, prev;
1425 register pv_entry_t pv_e;
1426 register pt_entry_t *pte;
1427 int pai;
1428 register pmap_t pmap;
1429 spl_t spl;
1430 boolean_t remove;
1431 pmap_paddr_t phys;
1432
1433 assert(pn != vm_page_fictitious_addr);
1434 phys = (pmap_paddr_t)i386_ptob(pn);
1435 if (!valid_page(pn)) {
1436 /*
1437 * Not a managed page.
1438 */
1439 return;
1440 }
1441
1442 /*
1443 * Determine the new protection.
1444 */
1445 switch (prot) {
1446 case VM_PROT_READ:
1447 case VM_PROT_READ|VM_PROT_EXECUTE:
1448 remove = FALSE;
1449 break;
1450 case VM_PROT_ALL:
1451 return; /* nothing to do */
1452 default:
1453 remove = TRUE;
1454 break;
1455 }
1456
1457 /*
1458 * Lock the pmap system first, since we will be changing
1459 * several pmaps.
1460 */
1461
1462 PMAP_WRITE_LOCK(spl);
1463
1464 pai = pa_index(phys);
1465 pv_h = pai_to_pvh(pai);
1466
1467 /*
1468 * Walk down PV list, changing or removing all mappings.
1469 * We do not have to lock the pv_list because we have
1470 * the entire pmap system locked.
1471 */
1472 if (pv_h->pmap != PMAP_NULL) {
1473
1474 prev = pv_e = pv_h;
1475 do {
1476 register vm_offset_t va;
1477 pmap = pv_e->pmap;
1478 /*
1479 * Lock the pmap to block pmap_extract and similar routines.
1480 */
1481 simple_lock(&pmap->lock);
1482
1483 {
1484
1485 va = pv_e->va;
1486 pte = pmap_pte(pmap, va);
1487
1488 /*
1489 * Consistency checks.
1490 */
1491 /* assert(*pte & INTEL_PTE_VALID); XXX */
1492 /* assert(pte_to_phys(*pte) == phys); */
1493
1494 }
1495
1496 /*
1497 * Remove the mapping if new protection is NONE
1498 * or if write-protecting a kernel mapping.
1499 */
1500 if (remove || pmap == kernel_pmap) {
1501 /*
1502 * Remove the mapping, collecting any modify bits.
1503 */
1504 {
1505 pmap_phys_attributes[pai] |=
1506 *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1507 *pte++ = 0;
1508 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
1509 }
1510
1511 assert(pmap->stats.resident_count >= 1);
1512 pmap->stats.resident_count--;
1513
1514 /*
1515 * Remove the pv_entry.
1516 */
1517 if (pv_e == pv_h) {
1518 /*
1519 * Fix up head later.
1520 */
1521 pv_h->pmap = PMAP_NULL;
1522 }
1523 else {
1524 /*
1525 * Delete this entry.
1526 */
1527 prev->next = pv_e->next;
1528 PV_FREE(pv_e);
1529 }
1530 }
1531 else {
1532 /*
1533 * Write-protect.
1534 */
1535
1536 *pte &= ~INTEL_PTE_WRITE;
1537 pte++;
1538 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
1539 /*
1540 * Advance prev.
1541 */
1542 prev = pv_e;
1543 }
1544
1545 simple_unlock(&pmap->lock);
1546
1547 } while ((pv_e = prev->next) != PV_ENTRY_NULL);
1548
1549 /*
1550 * If pv_head mapping was removed, fix it up.
1551 */
1552 if (pv_h->pmap == PMAP_NULL) {
1553 pv_e = pv_h->next;
1554 if (pv_e != PV_ENTRY_NULL) {
1555 *pv_h = *pv_e;
1556 PV_FREE(pv_e);
1557 }
1558 }
1559 }
1560
1561 PMAP_WRITE_UNLOCK(spl);
1562 }
1563
1564 /*
1565 * Routine:
1566 * pmap_disconnect
1567 *
1568 * Function:
1569 * Disconnect all mappings for this page and return reference and change status
1570 * in generic format.
1571 *
1572 */
1573 unsigned int pmap_disconnect(
1574 ppnum_t pa)
1575 {
1576 pmap_page_protect(pa, 0); /* disconnect the page */
1577 return (pmap_get_refmod(pa)); /* return ref/chg status */
1578 }
1579
1580 /*
1581 * Set the physical protection on the
1582 * specified range of this map as requested.
1583 * Will not increase permissions.
1584 */
1585 void
1586 pmap_protect(
1587 pmap_t map,
1588 vm_offset_t s,
1589 vm_offset_t e,
1590 vm_prot_t prot)
1591 {
1592 register pt_entry_t *pde;
1593 register pt_entry_t *spte, *epte;
1594 vm_offset_t l;
1595 spl_t spl;
1596 vm_offset_t orig_s = s;
1597
1598
1599 if (map == PMAP_NULL)
1600 return;
1601
1602 /*
1603 * Determine the new protection.
1604 */
1605 switch (prot) {
1606 case VM_PROT_READ:
1607 case VM_PROT_READ|VM_PROT_EXECUTE:
1608 break;
1609 case VM_PROT_READ|VM_PROT_WRITE:
1610 case VM_PROT_ALL:
1611 return; /* nothing to do */
1612 default:
1613 pmap_remove(map, (addr64_t)s, (addr64_t)e);
1614 return;
1615 }
1616
1617 SPLVM(spl);
1618 simple_lock(&map->lock);
1619
1620 pde = pmap_pde(map, s);
1621 while (s < e) {
1622 l = (s + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE-1);
1623 if (l > e)
1624 l = e;
1625 if (*pde & INTEL_PTE_VALID) {
1626 spte = (pt_entry_t *)pmap_pte(map, (s & ~(PDE_MAPPED_SIZE-1)));
1627 spte = &spte[ptenum(s)];
1628 epte = &spte[intel_btop(l-s)];
1629
1630 while (spte < epte) {
1631 if (*spte & INTEL_PTE_VALID)
1632 *spte &= ~INTEL_PTE_WRITE;
1633 spte++;
1634 }
1635 }
1636 s = l;
1637 pde++;
1638 }
1639
1640 PMAP_UPDATE_TLBS(map, orig_s, e);
1641
1642 simple_unlock(&map->lock);
1643 SPLX(spl);
1644 }
1645
1646
1647
1648 /*
1649 * Insert the given physical page (p) at
1650 * the specified virtual address (v) in the
1651 * target physical map with the protection requested.
1652 *
1653 * If specified, the page will be wired down, meaning
1654 * that the related pte cannot be reclaimed.
1655 *
1656 * NB: This is the only routine which MAY NOT lazy-evaluate
1657 * or lose information. That is, this routine must actually
1658 * insert this page into the given map NOW.
1659 */
1660 void
1661 pmap_enter(
1662 register pmap_t pmap,
1663 vm_offset_t v,
1664 ppnum_t pn,
1665 vm_prot_t prot,
1666 unsigned int flags,
1667 boolean_t wired)
1668 {
1669 register pt_entry_t *pte;
1670 register pv_entry_t pv_h;
1671 register int pai;
1672 pv_entry_t pv_e;
1673 pt_entry_t template;
1674 spl_t spl;
1675 pmap_paddr_t old_pa;
1676 pmap_paddr_t pa = (pmap_paddr_t)i386_ptob(pn);
1677
1678 XPR(0x80000000, "%x/%x: pmap_enter %x/%x/%x\n",
1679 current_thread(),
1680 current_thread(),
1681 pmap, v, pn);
1682
1683 assert(pn != vm_page_fictitious_addr);
1684 if (pmap_debug)
1685 printf("pmap(%x, %x)\n", v, pn);
1686 if (pmap == PMAP_NULL)
1687 return;
1688
1689 /*
1690 * Must allocate a new pvlist entry while we're unlocked;
1691 * zalloc may cause pageout (which will lock the pmap system).
1692 * If we determine we need a pvlist entry, we will unlock
1693 * and allocate one. Then we will retry, throwing away
1694 * the allocated entry later (if we no longer need it).
1695 */
1696 pv_e = PV_ENTRY_NULL;
1697
1698 PMAP_READ_LOCK(pmap, spl);
1699
1700 /*
1701 * Expand pmap to include this pte. Assume that
1702 * pmap is always expanded to include enough hardware
1703 * pages to map one VM page.
1704 */
1705
1706 while ((pte = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
1707 /*
1708 * Must unlock to expand the pmap.
1709 */
1710 PMAP_READ_UNLOCK(pmap, spl);
1711
1712 pmap_expand(pmap, v);
1713
1714 PMAP_READ_LOCK(pmap, spl);
1715 }
1716 /*
1717 * Special case if the physical page is already mapped
1718 * at this address.
1719 */
1720 old_pa = pte_to_pa(*pte);
1721 if (old_pa == pa) {
1722 /*
1723 * May be changing its wired attribute or protection
1724 */
1725
1726 template = pa_to_pte(pa) | INTEL_PTE_VALID;
1727
1728 if(flags & VM_MEM_NOT_CACHEABLE) {
1729 if(!(flags & VM_MEM_GUARDED))
1730 template |= INTEL_PTE_PTA;
1731 template |= INTEL_PTE_NCACHE;
1732 }
1733
1734 if (pmap != kernel_pmap)
1735 template |= INTEL_PTE_USER;
1736 if (prot & VM_PROT_WRITE)
1737 template |= INTEL_PTE_WRITE;
1738 if (wired) {
1739 template |= INTEL_PTE_WIRED;
1740 if (!iswired(*pte))
1741 pmap->stats.wired_count++;
1742 }
1743 else {
1744 if (iswired(*pte)) {
1745 assert(pmap->stats.wired_count >= 1);
1746 pmap->stats.wired_count--;
1747 }
1748 }
1749
1750 if (*pte & INTEL_PTE_MOD)
1751 template |= INTEL_PTE_MOD;
1752 WRITE_PTE(pte, template)
1753 pte++;
1754
1755 goto Done;
1756 }
1757
1758 /*
1759 * Outline of code from here:
1760 * 1) If va was mapped, update TLBs, remove the mapping
1761 * and remove old pvlist entry.
1762 * 2) Add pvlist entry for new mapping
1763 * 3) Enter new mapping.
1764 *
1765 * SHARING_FAULTS complicates this slightly in that it cannot
1766 * replace the mapping, but must remove it (because adding the
1767 * pvlist entry for the new mapping may remove others), and
1768 * hence always enters the new mapping at step 3)
1769 *
1770 * If the old physical page is not managed step 1) is skipped
1771 * (except for updating the TLBs), and the mapping is
1772 * overwritten at step 3). If the new physical page is not
1773 * managed, step 2) is skipped.
1774 */
1775
1776 if (old_pa != (pmap_paddr_t) 0) {
1777
1778
1779 #if DEBUG_PTE_PAGE
1780 if (pmap != kernel_pmap)
1781 ptep_check(get_pte_page(pte));
1782 #endif /* DEBUG_PTE_PAGE */
1783
1784 /*
1785 * Don't do anything to pages outside valid memory here.
1786 * Instead convince the code that enters a new mapping
1787 * to overwrite the old one.
1788 */
1789
1790 if (valid_page(i386_btop(old_pa))) {
1791
1792 pai = pa_index(old_pa);
1793 LOCK_PVH(pai);
1794
1795 assert(pmap->stats.resident_count >= 1);
1796 pmap->stats.resident_count--;
1797 if (iswired(*pte)) {
1798 assert(pmap->stats.wired_count >= 1);
1799 pmap->stats.wired_count--;
1800 }
1801
1802 pmap_phys_attributes[pai] |=
1803 *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1804 WRITE_PTE(pte, 0)
1805
1806 /*
1807 * Remove the mapping from the pvlist for
1808 * this physical page.
1809 */
1810 {
1811 register pv_entry_t prev, cur;
1812
1813 pv_h = pai_to_pvh(pai);
1814 if (pv_h->pmap == PMAP_NULL) {
1815 panic("pmap_enter: null pv_list!");
1816 }
1817 if (pv_h->va == v && pv_h->pmap == pmap) {
1818 /*
1819 * Header is the pv_entry. Copy the next one
1820 * to header and free the next one (we cannot
1821 * free the header)
1822 */
1823 cur = pv_h->next;
1824 if (cur != PV_ENTRY_NULL) {
1825 *pv_h = *cur;
1826 pv_e = cur;
1827 }
1828 else {
1829 pv_h->pmap = PMAP_NULL;
1830 }
1831 }
1832 else {
1833 cur = pv_h;
1834 do {
1835 prev = cur;
1836 if ((cur = prev->next) == PV_ENTRY_NULL) {
1837 panic("pmap_enter: mapping not in pv_list!");
1838 }
1839 } while (cur->va != v || cur->pmap != pmap);
1840 prev->next = cur->next;
1841 pv_e = cur;
1842 }
1843 }
1844 UNLOCK_PVH(pai);
1845 }
1846 else {
1847
1848 /*
1849 * old_pa is not managed. Pretend it's zero so code
1850 * at Step 3) will enter new mapping (overwriting old
1851 * one). Do removal part of accounting.
1852 */
1853 old_pa = (pmap_paddr_t) 0;
1854 assert(pmap->stats.resident_count >= 1);
1855 pmap->stats.resident_count--;
1856 if (iswired(*pte)) {
1857 assert(pmap->stats.wired_count >= 1);
1858 pmap->stats.wired_count--;
1859 }
1860 }
1861
1862 }
1863
1864 if (valid_page(i386_btop(pa))) {
1865
1866 /*
1867 * Step 2) Enter the mapping in the PV list for this
1868 * physical page.
1869 */
1870
1871 pai = pa_index(pa);
1872
1873
1874 #if SHARING_FAULTS
1875 RetryPvList:
1876 /*
1877 * We can return here from the sharing fault code below
1878 * in case we removed the only entry on the pv list and thus
1879 * must enter the new one in the list header.
1880 */
1881 #endif /* SHARING_FAULTS */
1882 LOCK_PVH(pai);
1883 pv_h = pai_to_pvh(pai);
1884
1885 if (pv_h->pmap == PMAP_NULL) {
1886 /*
1887 * No mappings yet
1888 */
1889 pv_h->va = v;
1890 pv_h->pmap = pmap;
1891 pv_h->next = PV_ENTRY_NULL;
1892 }
1893 else {
1894 #if DEBUG
1895 {
1896 /*
1897 * check that this mapping is not already there
1898 * or there is no alias for this mapping in the same map
1899 */
1900 pv_entry_t e = pv_h;
1901 while (e != PV_ENTRY_NULL) {
1902 if (e->pmap == pmap && e->va == v)
1903 panic("pmap_enter: already in pv_list");
1904 e = e->next;
1905 }
1906 }
1907 #endif /* DEBUG */
1908 #if SHARING_FAULTS
1909 {
1910 /*
1911 * do sharing faults.
1912 * if we find an entry on this pv list in the same address
1913 * space, remove it. we know there will not be more
1914 * than one.
1915 */
1916 pv_entry_t e = pv_h;
1917 pt_entry_t *opte;
1918
1919 while (e != PV_ENTRY_NULL) {
1920 if (e->pmap == pmap) {
1921 /*
1922 * Remove it, drop pv list lock first.
1923 */
1924 UNLOCK_PVH(pai);
1925
1926 opte = pmap_pte(pmap, e->va);
1927 assert(opte != PT_ENTRY_NULL);
1928 /*
1929 * Invalidate the translation buffer,
1930 * then remove the mapping.
1931 */
1932 pmap_remove_range(pmap, e->va, opte,
1933 opte + 1);
1934 PMAP_UPDATE_TLBS(pmap, e->va, e->va + PAGE_SIZE);
1935
1936 /*
1937 * We could have removed the head entry,
1938 * so there could be no more entries
1939 * and so we have to use the pv head entry;
1940 * go back to the top and try the entry
1941 * again.
1942 */
1943 goto RetryPvList;
1944 }
1945 e = e->next;
1946 }
1947
1948 /*
1949 * check that this mapping is not already there
1950 */
1951 e = pv_h;
1952 while (e != PV_ENTRY_NULL) {
1953 if (e->pmap == pmap)
1954 panic("pmap_enter: alias in pv_list");
1955 e = e->next;
1956 }
1957 }
1958 #endif /* SHARING_FAULTS */
1959 #if DEBUG_ALIAS
1960 {
1961 /*
1962 * check for aliases within the same address space.
1963 */
1964 pv_entry_t e = pv_h;
1965 vm_offset_t rpc = get_rpc();
1966
1967 while (e != PV_ENTRY_NULL) {
1968 if (e->pmap == pmap) {
1969 /*
1970 * log this entry in the alias ring buffer
1971 * if it's not there already.
1972 */
1973 struct pmap_alias *pma;
1974 int ii, logit;
1975
1976 logit = TRUE;
1977 for (ii = 0; ii < pmap_alias_index; ii++) {
1978 if (pmap_aliasbuf[ii].rpc == rpc) {
1979 /* found it in the log already */
1980 logit = FALSE;
1981 break;
1982 }
1983 }
1984 if (logit) {
1985 pma = &pmap_aliasbuf[pmap_alias_index];
1986 pma->pmap = pmap;
1987 pma->va = v;
1988 pma->rpc = rpc;
1989 pma->cookie = PMAP_ALIAS_COOKIE;
1990 if (++pmap_alias_index >= PMAP_ALIAS_MAX)
1991 panic("pmap_enter: exhausted alias log");
1992 }
1993 }
1994 e = e->next;
1995 }
1996 }
1997 #endif /* DEBUG_ALIAS */
1998 /*
1999 * Add new pv_entry after header.
2000 */
2001 if (pv_e == PV_ENTRY_NULL) {
2002 PV_ALLOC(pv_e);
2003 if (pv_e == PV_ENTRY_NULL) {
2004 panic("pmap no pv_e's");
2005 }
2006 }
2007 pv_e->va = v;
2008 pv_e->pmap = pmap;
2009 pv_e->next = pv_h->next;
2010 pv_h->next = pv_e;
2011 /*
2012 * Remember that we used the pvlist entry.
2013 */
2014 pv_e = PV_ENTRY_NULL;
2015 }
2016 UNLOCK_PVH(pai);
2017 }
2018
2019 /*
2020 * Step 3) Enter and count the mapping.
2021 */
2022
2023 pmap->stats.resident_count++;
2024
2025 /*
2026 * Build a template to speed up entering -
2027 * only the pfn changes.
2028 */
2029 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2030
2031 if(flags & VM_MEM_NOT_CACHEABLE) {
2032 if(!(flags & VM_MEM_GUARDED))
2033 template |= INTEL_PTE_PTA;
2034 template |= INTEL_PTE_NCACHE;
2035 }
2036
2037 if (pmap != kernel_pmap)
2038 template |= INTEL_PTE_USER;
2039 if (prot & VM_PROT_WRITE)
2040 template |= INTEL_PTE_WRITE;
2041 if (wired) {
2042 template |= INTEL_PTE_WIRED;
2043 pmap->stats.wired_count++;
2044 }
2045
2046 WRITE_PTE(pte, template)
2047
2048 Done:
2049 PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE);
2050
2051 if (pv_e != PV_ENTRY_NULL) {
2052 PV_FREE(pv_e);
2053 }
2054
2055 PMAP_READ_UNLOCK(pmap, spl);
2056 }
2057
2058 /*
2059 * Routine: pmap_change_wiring
2060 * Function: Change the wiring attribute for a map/virtual-address
2061 * pair.
2062 * In/out conditions:
2063 * The mapping must already exist in the pmap.
2064 */
2065 void
2066 pmap_change_wiring(
2067 register pmap_t map,
2068 vm_offset_t v,
2069 boolean_t wired)
2070 {
2071 register pt_entry_t *pte;
2072 spl_t spl;
2073
2074 #if 1
2075 /*
2076 * We must grab the pmap system lock because we may
2077 * change a pte_page queue.
2078 */
2079 PMAP_READ_LOCK(map, spl);
2080
2081 if ((pte = pmap_pte(map, v)) == PT_ENTRY_NULL)
2082 panic("pmap_change_wiring: pte missing");
2083
2084 if (wired && !iswired(*pte)) {
2085 /*
2086 * wiring down mapping
2087 */
2088 map->stats.wired_count++;
2089 *pte++ |= INTEL_PTE_WIRED;
2090 }
2091 else if (!wired && iswired(*pte)) {
2092 /*
2093 * unwiring mapping
2094 */
2095 assert(map->stats.wired_count >= 1);
2096 map->stats.wired_count--;
2097 *pte++ &= ~INTEL_PTE_WIRED;
2098 }
2099
2100 PMAP_READ_UNLOCK(map, spl);
2101
2102 #else
2103 return;
2104 #endif
2105
2106 }
2107
2108 ppnum_t
2109 pmap_find_phys(pmap_t pmap, addr64_t va)
2110 {
2111 pt_entry_t *ptp;
2112 vm_offset_t a32;
2113 ppnum_t ppn;
2114
2115 if (value_64bit(va))
2116 panic("pmap_find_phys 64 bit value");
2117 a32 = (vm_offset_t) low32(va);
2118 ptp = pmap_pte(pmap, a32);
2119 if (PT_ENTRY_NULL == ptp) {
2120 ppn = 0;
2121 } else {
2122 ppn = (ppnum_t) i386_btop(pte_to_pa(*ptp));
2123 }
2124 return ppn;
2125 }
2126
2127 /*
2128 * Routine: pmap_extract
2129 * Function:
2130 * Extract the physical page address associated
2131 * with the given map/virtual_address pair.
2132 * Changed to a shim for backwards compatibility, but it will not
2133 * work for 64-bit systems. Some old drivers that we cannot
2134 * change need this.
2135 */
2136
2137 vm_offset_t
2138 pmap_extract(
2139 register pmap_t pmap,
2140 vm_offset_t va)
2141 {
2142 ppnum_t ppn;
2143 vm_offset_t vaddr;
2144
2145 vaddr = (vm_offset_t)0;
2146 ppn = pmap_find_phys(pmap, (addr64_t)va);
2147 if (ppn) {
2148 vaddr = ((vm_offset_t)i386_ptob(ppn)) | (va & INTEL_OFFMASK);
2149 }
2150 return (vaddr);
2151 }
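/*
 * Illustrative sketch (not compiled): pmap_find_phys() is the 64-bit
 * clean interface; pmap_extract() is the 32-bit shim kept for old
 * drivers and simply rebuilds a physical address from the page number.
 * va is a hypothetical kernel virtual address.
 */
#if 0
	vm_offset_t	va;
	ppnum_t		ppn;
	vm_offset_t	pa32;

	ppn  = pmap_find_phys(kernel_pmap, (addr64_t) va);	/* 0 if unmapped */
	pa32 = pmap_extract(kernel_pmap, va);			/* phys | page offset */
#endif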
2152
2153
2154 /*
2155 * Routine: pmap_expand
2156 *
2157 * Expands a pmap to be able to map the specified virtual address.
2158 *
2159 * Allocates new virtual memory for the P0 or P1 portion of the
2160 * pmap, then re-maps the physical pages that were in the old
2161 * pmap to be in the new pmap.
2162 *
2163 * Must be called with the pmap system and the pmap unlocked,
2164 * since these must be unlocked to use vm_allocate or vm_deallocate.
2165 * Thus it must be called in a loop that checks whether the map
2166 * has been expanded enough.
2167 * (We won't loop forever, since page tables aren't shrunk.)
2168 */
2169 void
2170 pmap_expand(
2171 register pmap_t map,
2172 register vm_offset_t v)
2173 {
2174 pt_entry_t *pdp;
2175 register vm_page_t m;
2176 register pmap_paddr_t pa;
2177 register int i;
2178 spl_t spl;
2179 ppnum_t pn;
2180
2181 if (map == kernel_pmap) {
2182 pmap_growkernel(v);
2183 return;
2184 }
2185
2186 /*
2187 * Allocate a VM page for the level 2 page table entries.
2188 */
2189 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2190 VM_PAGE_WAIT();
2191
2192 /*
2193 * put the page into the pmap's obj list so it
2194 * can be found later.
2195 */
2196 pn = m->phys_page;
2197 pa = i386_ptob(pn);
2198 i = pdenum(map, v);
2199 vm_object_lock(map->pm_obj);
2200 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
2201 vm_page_lock_queues();
2202 vm_page_wire(m);
2203 inuse_ptepages_count++;
2204 vm_object_unlock(map->pm_obj);
2205 vm_page_unlock_queues();
2206
2207 /*
2208 * Zero the page.
2209 */
2210 pmap_zero_page(pn);
2211
2212 PMAP_READ_LOCK(map, spl);
2213 /*
2214 * See if someone else expanded us first
2215 */
2216 if (pmap_pte(map, v) != PT_ENTRY_NULL) {
2217 PMAP_READ_UNLOCK(map, spl);
2218 vm_object_lock(map->pm_obj);
2219 vm_page_lock_queues();
2220 vm_page_free(m);
2221 inuse_ptepages_count--;
2222 vm_page_unlock_queues();
2223 vm_object_unlock(map->pm_obj);
2224 return;
2225 }
2226
2227 /*
2228 * Set the page directory entry for this page table.
2229 * If we have allocated more than one hardware page,
2230 * set several page directory entries.
2231 */
2232
2233 pdp = &map->dirbase[pdenum(map, v)];
2234 *pdp = pa_to_pte(pa)
2235 | INTEL_PTE_VALID
2236 | INTEL_PTE_USER
2237 | INTEL_PTE_WRITE;
2238
2239 PMAP_READ_UNLOCK(map, spl);
2240 return;
2241 }
2242
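/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): the retry loop a caller of pmap_expand() is expected to use,
 * per the locking comment above. expand_until_mapped_example() is a
 * hypothetical helper; pmap_enter() contains the real instance of this
 * pattern.
 */
static void
expand_until_mapped_example(
	pmap_t		map,
	vm_offset_t	v)
{
	/*
	 * Expand with the pmap unlocked, then re-check. Page tables are
	 * never shrunk, so the loop terminates once a pte for v exists.
	 */
	while (pmap_pte(map, v) == PT_ENTRY_NULL)
		pmap_expand(map, v);
}
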
2243 /*
2244 * Copy the range specified by src_addr/len
2245 * from the source map to the range dst_addr/len
2246 * in the destination map.
2247 *
2248 * This routine is only advisory and need not do anything.
2249 */
2250 #if 0
2251 void
2252 pmap_copy(
2253 pmap_t dst_pmap,
2254 pmap_t src_pmap,
2255 vm_offset_t dst_addr,
2256 vm_size_t len,
2257 vm_offset_t src_addr)
2258 {
2259 #ifdef lint
2260 dst_pmap++; src_pmap++; dst_addr++; len++; src_addr++;
2261 #endif /* lint */
2262 }
2263 #endif/* 0 */
2264
2265 /*
2266 * pmap_sync_page_data_phys(ppnum_t pa)
2267 *
2268 * Invalidates all of the instruction cache on a physical page and
2269 * pushes any dirty data from the data cache for the same physical page
2270 * Not required in i386.
2271 */
2272 void
2273 pmap_sync_page_data_phys(__unused ppnum_t pa)
2274 {
2275 return;
2276 }
2277
2278 /*
2279 * pmap_sync_page_attributes_phys(ppnum_t pa)
2280 *
2281 * Write back and invalidate all cachelines on a physical page.
2282 */
2283 void
2284 pmap_sync_page_attributes_phys(ppnum_t pa)
2285 {
2286 cache_flush_page_phys(pa);
2287 }
2288
2289 int collect_ref;
2290 int collect_unref;
2291
2292 /*
2293 * Routine: pmap_collect
2294 * Function:
2295 * Garbage collects the physical map system for
2296 * pages which are no longer used.
2297 * Success need not be guaranteed -- that is, some
2298 * unreferenced pages may be left in place while
2299 * others are collected.
2300 * Usage:
2301 * Called by the pageout daemon when pages are scarce.
2302 */
2303 void
2304 pmap_collect(
2305 pmap_t p)
2306 {
2307 register pt_entry_t *pdp, *ptp;
2308 pt_entry_t *eptp;
2309 int wired;
2310 spl_t spl;
2311
2312 if (p == PMAP_NULL)
2313 return;
2314
2315 if (p == kernel_pmap)
2316 return;
2317
2318 /*
2319 * Garbage collect map.
2320 */
2321 PMAP_READ_LOCK(p, spl);
2322
2323 for (pdp = (pt_entry_t *)p->dirbase;
2324 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
2325 pdp++)
2326 {
2327 if (*pdp & INTEL_PTE_VALID) {
2328 if(*pdp & INTEL_PTE_REF) {
2329 *pdp &= ~INTEL_PTE_REF;
2330 collect_ref++;
2331 } else {
2332 collect_unref++;
2333 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
2334 eptp = ptp + NPTEPG;
2335
2336 /*
2337 * If the pte page has any wired mappings, we cannot
2338 * free it.
2339 */
2340 wired = 0;
2341 {
2342 register pt_entry_t *ptep;
2343 for (ptep = ptp; ptep < eptp; ptep++) {
2344 if (iswired(*ptep)) {
2345 wired = 1;
2346 break;
2347 }
2348 }
2349 }
2350 if (!wired) {
2351 /*
2352 * Remove the virtual addresses mapped by this pte page.
2353 */
2354 pmap_remove_range(p,
2355 pdetova(pdp - (pt_entry_t *)p->dirbase),
2356 ptp,
2357 eptp);
2358
2359 /*
2360 * Invalidate the page directory pointer.
2361 */
2362 *pdp = 0x0;
2363
2364 PMAP_READ_UNLOCK(p, spl);
2365
2366 /*
2367 * And free the pte page itself.
2368 */
2369 {
2370 register vm_page_t m;
2371
2372 vm_object_lock(p->pm_obj);
2373 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
2374 if (m == VM_PAGE_NULL)
2375 panic("pmap_collect: pte page not in object");
2376 vm_page_lock_queues();
2377 vm_page_free(m);
2378 inuse_ptepages_count--;
2379 vm_page_unlock_queues();
2380 vm_object_unlock(p->pm_obj);
2381 }
2382
2383 PMAP_READ_LOCK(p, spl);
2384 }
2385 }
2386 }
2387 }
2388 PMAP_UPDATE_TLBS(p, VM_MIN_ADDRESS, VM_MAX_ADDRESS);
2389 PMAP_READ_UNLOCK(p, spl);
2390 return;
2391
2392 }
2393
2394 /*
2395 * Routine: pmap_kernel
2396 * Function:
2397 * Returns the physical map handle for the kernel.
2398 */
2399 #if 0
2400 pmap_t
2401 pmap_kernel(void)
2402 {
2403 return (kernel_pmap);
2404 }
2405 #endif/* 0 */
2406
2407 void
2408 pmap_copy_page(
2409 ppnum_t src,
2410 ppnum_t dst)
2411 {
2412 bcopy_phys((addr64_t)i386_ptob(src),
2413 (addr64_t)i386_ptob(dst),
2414 PAGE_SIZE);
2415 }
2416
2417
2418 /*
2419 * Routine: pmap_pageable
2420 * Function:
2421 * Make the specified pages (by pmap, offset)
2422 * pageable (or not) as requested.
2423 *
2424 * A page which is not pageable may not take
2425 * a fault; therefore, its page table entry
2426 * must remain valid for the duration.
2427 *
2428 * This routine is merely advisory; pmap_enter
2429 * will specify that these pages are to be wired
2430 * down (or not) as appropriate.
2431 */
2432 void
2433 pmap_pageable(
2434 __unused pmap_t pmap,
2435 __unused vm_offset_t start_addr,
2436 __unused vm_offset_t end_addr,
2437 __unused boolean_t pageable)
2438 {
2439 #ifdef lint
2440 pmap++; start_addr++; end_addr++; pageable++;
2441 #endif /* lint */
2442 }
2443
2444 /*
2445 * Clear specified attribute bits.
2446 */
2447 void
2448 phys_attribute_clear(
2449 ppnum_t pn,
2450 int bits)
2451 {
2452 pv_entry_t pv_h;
2453 register pv_entry_t pv_e;
2454 register pt_entry_t *pte;
2455 int pai;
2456 register pmap_t pmap;
2457 spl_t spl;
2458 pmap_paddr_t phys;
2459
2460 assert(pn != vm_page_fictitious_addr);
2461 if (!valid_page(pn)) {
2462 /*
2463 * Not a managed page.
2464 */
2465 return;
2466 }
2467
2468 /*
2469 * Lock the pmap system first, since we will be changing
2470 * several pmaps.
2471 */
2472
2473 PMAP_WRITE_LOCK(spl);
2474 phys = i386_ptob(pn);
2475 pai = pa_index(phys);
2476 pv_h = pai_to_pvh(pai);
2477
2478 /*
2479 * Walk down PV list, clearing all modify or reference bits.
2480 * We do not have to lock the pv_list because we have
2481 * the entire pmap system locked.
2482 */
2483 if (pv_h->pmap != PMAP_NULL) {
2484 /*
2485 * There are some mappings.
2486 */
2487 for (pv_e = pv_h; pv_e != PV_ENTRY_NULL; pv_e = pv_e->next) {
2488
2489 pmap = pv_e->pmap;
2490 /*
2491 * Lock the pmap to block pmap_extract and similar routines.
2492 */
2493 simple_lock(&pmap->lock);
2494
2495 {
2496 register vm_offset_t va;
2497
2498 va = pv_e->va;
2499 pte = pmap_pte(pmap, va);
2500
2501 #if 0
2502 /*
2503 * Consistency checks.
2504 */
2505 assert(*pte & INTEL_PTE_VALID);
2506 /* assert(pte_to_phys(*pte) == phys); */
2507 #endif
2508
2509 /*
2510 * Clear modify or reference bits.
2511 */
2512
2513 *pte++ &= ~bits;
2514 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
2515 }
2516 simple_unlock(&pmap->lock);
2517
2518 }
2519 }
2520
2521 pmap_phys_attributes[pai] &= ~bits;
2522
2523 PMAP_WRITE_UNLOCK(spl);
2524 }
2525
2526 /*
2527 * Check specified attribute bits.
2528 */
2529 boolean_t
2530 phys_attribute_test(
2531 ppnum_t pn,
2532 int bits)
2533 {
2534 pv_entry_t pv_h;
2535 register pv_entry_t pv_e;
2536 register pt_entry_t *pte;
2537 int pai;
2538 register pmap_t pmap;
2539 spl_t spl;
2540 pmap_paddr_t phys;
2541
2542 assert(pn != vm_page_fictitious_addr);
2543 if (!valid_page(pn)) {
2544 /*
2545 * Not a managed page.
2546 */
2547 return (FALSE);
2548 }
2549
2550 /*
2551 * Lock the pmap system first, since we will be checking
2552 * several pmaps.
2553 */
2554
2555 PMAP_WRITE_LOCK(spl);
2556 phys = i386_ptob(pn);
2557 pai = pa_index(phys);
2558 pv_h = pai_to_pvh(pai);
2559
2560 if (pmap_phys_attributes[pai] & bits) {
2561 PMAP_WRITE_UNLOCK(spl);
2562 return (TRUE);
2563 }
2564
2565 /*
2566 * Walk down PV list, checking all mappings.
2567 * We do not have to lock the pv_list because we have
2568 * the entire pmap system locked.
2569 */
2570 if (pv_h->pmap != PMAP_NULL) {
2571 /*
2572 * There are some mappings.
2573 */
2574 for (pv_e = pv_h; pv_e != PV_ENTRY_NULL; pv_e = pv_e->next) {
2575
2576 pmap = pv_e->pmap;
2577 /*
2578 * Lock the pmap to block pmap_extract and similar routines.
2579 */
2580 simple_lock(&pmap->lock);
2581
2582 {
2583 register vm_offset_t va;
2584
2585 va = pv_e->va;
2586 pte = pmap_pte(pmap, va);
2587
2588 #if 0
2589 /*
2590 * Consistency checks.
2591 */
2592 assert(*pte & INTEL_PTE_VALID);
2593 /* assert(pte_to_phys(*pte) == phys); */
2594 #endif
2595 }
2596
2597 /*
2598 * Check modify or reference bits.
2599 */
2600 {
2601 if (*pte++ & bits) {
2602 simple_unlock(&pmap->lock);
2603 PMAP_WRITE_UNLOCK(spl);
2604 return (TRUE);
2605 }
2606 }
2607 simple_unlock(&pmap->lock);
2608 }
2609 }
2610 PMAP_WRITE_UNLOCK(spl);
2611 return (FALSE);
2612 }
2613
2614 /*
2615 * Set specified attribute bits.
2616 */
2617 void
2618 phys_attribute_set(
2619 ppnum_t pn,
2620 int bits)
2621 {
2622 int spl;
2623 pmap_paddr_t phys;
2624
2625 assert(pn != vm_page_fictitious_addr);
2626 if (!valid_page(pn)) {
2627 /*
2628 * Not a managed page.
2629 */
2630 return;
2631 }
2632
2633 /*
2634 * Lock the pmap system and set the requested bits in
2635 * the phys attributes array. Don't need to bother with
2636 * ptes because the test routine looks here first.
2637 */
2638 phys = i386_ptob(pn);
2639 PMAP_WRITE_LOCK(spl);
2640 pmap_phys_attributes[pa_index(phys)] |= bits;
2641 PMAP_WRITE_UNLOCK(spl);
2642 }
2643
2644 /*
2645 * Set the modify bit on the specified physical page.
2646 */
2647
2648 void pmap_set_modify(
2649 ppnum_t pn)
2650 {
2651 phys_attribute_set(pn, PHYS_MODIFIED);
2652 }
2653
2654 /*
2655 * Clear the modify bits on the specified physical page.
2656 */
2657
2658 void
2659 pmap_clear_modify(
2660 ppnum_t pn)
2661 {
2662 phys_attribute_clear(pn, PHYS_MODIFIED);
2663 }
2664
2665 /*
2666 * pmap_is_modified:
2667 *
2668 * Return whether or not the specified physical page is modified
2669 * by any physical maps.
2670 */
2671
2672 boolean_t
2673 pmap_is_modified(
2674 ppnum_t pn)
2675 {
2676 return (phys_attribute_test(pn, PHYS_MODIFIED));
2677 }
2678
2679 /*
2680 * pmap_clear_reference:
2681 *
2682 * Clear the reference bit on the specified physical page.
2683 */
2684
2685 void
2686 pmap_clear_reference(
2687 ppnum_t pn)
2688 {
2689 phys_attribute_clear(pn, PHYS_REFERENCED);
2690 }
2691
2692 void
2693 pmap_set_reference(ppnum_t pn)
2694 {
2695 phys_attribute_set(pn, PHYS_REFERENCED);
2696 }
2697
2698 /*
2699 * pmap_is_referenced:
2700 *
2701 * Return whether or not the specified physical page is referenced
2702 * by any physical maps.
2703 */
2704
2705 boolean_t
2706 pmap_is_referenced(
2707 ppnum_t pn)
2708 {
2709 return (phys_attribute_test(pn, PHYS_REFERENCED));
2710 }
2711
2712 /*
2713 * pmap_get_refmod(phys)
2714 * returns the referenced and modified bits of the specified
2715 * physical page.
2716 */
2717 unsigned int
2718 pmap_get_refmod(ppnum_t pa)
2719 {
2720 return ( ((phys_attribute_test(pa, PHYS_MODIFIED))? VM_MEM_MODIFIED : 0)
2721 | ((phys_attribute_test(pa, PHYS_REFERENCED))? VM_MEM_REFERENCED : 0));
2722 }
2723
2724 /*
2725 * pmap_clear_refmod(phys, mask)
2726 * clears the referenced and modified bits as specified by the mask
2727 * of the specified physical page.
2728 */
2729 void
2730 pmap_clear_refmod(ppnum_t pa, unsigned int mask)
2731 {
2732 unsigned int x86Mask;
2733
2734 x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
2735 | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
2736 phys_attribute_clear(pa, x86Mask);
2737 }
2738
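/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a typical use of the refmod interface above.
 * page_was_dirty_example() is a hypothetical helper that tests and then
 * clears the hardware modify state of a physical page in terms of the
 * machine-independent VM_MEM_* bits.
 */
static boolean_t
page_was_dirty_example(
	ppnum_t		pn)
{
	unsigned int	refmod;

	refmod = pmap_get_refmod(pn);	/* VM_MEM_MODIFIED and/or VM_MEM_REFERENCED */

	if (refmod & VM_MEM_MODIFIED) {
		/* clear only the modify bit; leave the reference bit intact */
		pmap_clear_refmod(pn, VM_MEM_MODIFIED);
		return (TRUE);
	}
	return (FALSE);
}
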
2739 /*
2740 * Set the modify bit on the specified range
2741 * of this map as requested.
2742 *
2743 * This optimization stands only if each time the dirty bit
2744 * in vm_page_t is tested, it is also tested in the pmap.
2745 */
2746 void
2747 pmap_modify_pages(
2748 pmap_t map,
2749 vm_offset_t s,
2750 vm_offset_t e)
2751 {
2752 spl_t spl;
2753 register pt_entry_t *pde;
2754 register pt_entry_t *spte, *epte;
2755 vm_offset_t l;
2756 vm_offset_t orig_s = s;
2757
2758 if (map == PMAP_NULL)
2759 return;
2760
2761 PMAP_READ_LOCK(map, spl);
2762
2763 pde = pmap_pde(map, s);
2764 while (s && s < e) {
2765 l = (s + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE-1);
2766 if (l > e)
2767 l = e;
2768 if (*pde & INTEL_PTE_VALID) {
2769 spte = (pt_entry_t *)pmap_pte(map, (s & ~(PDE_MAPPED_SIZE-1)));
2770 if (l) {
2771 spte = &spte[ptenum(s)];
2772 epte = &spte[intel_btop(l-s)];
2773 } else {
2774 epte = &spte[intel_btop(PDE_MAPPED_SIZE)];
2775 spte = &spte[ptenum(s)];
2776 }
2777 while (spte < epte) {
2778 if (*spte & INTEL_PTE_VALID) {
2779 *spte |= (INTEL_PTE_MOD | INTEL_PTE_WRITE);
2780 }
2781 spte++;
2782 }
2783 }
2784 s = l;
2785 pde++;
2786 }
2787 PMAP_UPDATE_TLBS(map, orig_s, e);
2788 PMAP_READ_UNLOCK(map, spl);
2789 }
2790
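/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): the invariant the comment above depends on. Whenever the
 * machine-independent dirty bit is consulted, the pmap must be consulted
 * as well; page_is_dirty_example() is a hypothetical helper showing the
 * paired test (m is assumed to be a vm_page_t with a dirty bit).
 */
static boolean_t
page_is_dirty_example(
	vm_page_t	m)
{
	/* software dirty bit OR hardware modify bit gathered by the pmap */
	return (m->dirty || pmap_is_modified(m->phys_page));
}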
2791
2792 void
2793 invalidate_icache(__unused vm_offset_t addr,
2794 __unused unsigned cnt,
2795 __unused int phys)
2796 {
2797 return;
2798 }
2799 void
2800 flush_dcache(__unused vm_offset_t addr,
2801 __unused unsigned count,
2802 __unused int phys)
2803 {
2804 return;
2805 }
2806
2807 /*
2808 * TLB Coherence Code (TLB "shootdown" code)
2809 *
2810 * Threads that belong to the same task share the same address space and
2811 * hence share a pmap. However, they may run on distinct cpus and thus
2812 * have distinct TLBs that cache page table entries. In order to guarantee
2813 * the TLBs are consistent, whenever a pmap is changed, all threads that
2814 * are active in that pmap must have their TLB updated. To keep track of
2815 * this information, the set of cpus that are currently using a pmap is
2816 * maintained within each pmap structure (cpus_using). pmap_activate() and
2817 * pmap_deactivate() add and remove, respectively, a cpu from this set.
2818 * Since the TLBs are not addressable over the bus, each processor must
2819 * flush its own TLB; a processor that needs to invalidate another TLB
2820 * needs to interrupt the processor that owns that TLB to signal the
2821 * update.
2822 *
2823 * Whenever a pmap is updated, the lock on that pmap is locked, and all
2824 * cpus using the pmap are signaled to invalidate. All threads that need
2825 * to activate a pmap must wait for the lock to clear to await any updates
2826 * in progress before using the pmap. They must ACQUIRE the lock to add
2827 * their cpu to the cpus_using set. An implicit assumption made
2828 * throughout the TLB code is that all kernel code that runs at or higher
2829 * than splvm blocks out update interrupts, and that such code does not
2830 * touch pageable pages.
2831 *
2832 * A shootdown interrupt serves another function besides signaling a
2833 * processor to invalidate. The interrupt routine (pmap_update_interrupt)
2834 * waits for both the pmap lock and the kernel pmap lock to clear,
2835 * preventing user code from making implicit pmap updates while the
2836 * sending processor is performing its update. (This could happen via a
2837 * user data write reference that turns on the modify bit in the page
2838 * table). It must wait for any kernel updates that may have started
2839 * concurrently with a user pmap update because the IPC code
2840 * changes mappings.
2841 * Spinning on the VALUES of the locks is sufficient (rather than
2842 * having to acquire the locks) because any updates that occur subsequent
2843 * to finding the lock unlocked will be signaled via another interrupt.
2844 * (This assumes the interrupt is cleared before the low level interrupt code
2845 * calls pmap_update_interrupt()).
2846 *
2847 * The signaling processor must wait for any implicit updates in progress
2848 * to terminate before continuing with its update. Thus it must wait for an
2849 * acknowledgement of the interrupt from each processor for which such
2850 * references could be made. For maintaining this information, a set
2851 * cpus_active is used. A cpu is in this set if and only if it can
2852 * use a pmap. When pmap_update_interrupt() is entered, a cpu is removed from
2853 * this set; when all such cpus are removed, it is safe to update.
2854 *
2855 * Before attempting to acquire the update lock on a pmap, a cpu (A) must
2856 * be at least at the priority of the interprocessor interrupt
2857 * (splip<=splvm). Otherwise, A could grab a lock and be interrupted by a
2858 * kernel update; it would spin forever in pmap_update_interrupt() trying
2859 * to acquire the user pmap lock it had already acquired. Furthermore A
2860 * must remove itself from cpus_active. Otherwise, another cpu holding
2861 * the lock (B) could be in the process of sending an update signal to A,
2862 * and thus be waiting for A to remove itself from cpus_active. If A is
2863 * spinning on the lock at raised priority, this will never happen and a
2864 * deadlock will result.
2865 */
2866
2867 /*
2868 * Signal another CPU that it must flush its TLB
2869 */
2870 void
2871 signal_cpus(
2872 cpu_set use_list,
2873 pmap_t pmap,
2874 vm_offset_t start_addr,
2875 vm_offset_t end_addr)
2876 {
2877 register int which_cpu, j;
2878 register pmap_update_list_t update_list_p;
2879
2880 while ((which_cpu = ffs((unsigned long)use_list)) != 0) {
2881 which_cpu -= 1; /* convert to 0 origin */
2882
2883 update_list_p = cpu_update_list(which_cpu);
2884 simple_lock(&update_list_p->lock);
2885
2886 j = update_list_p->count;
2887 if (j >= UPDATE_LIST_SIZE) {
2888 /*
2889 * list overflowed. Change last item to
2890 * indicate overflow.
2891 */
2892 update_list_p->item[UPDATE_LIST_SIZE-1].pmap = kernel_pmap;
2893 update_list_p->item[UPDATE_LIST_SIZE-1].start = VM_MIN_ADDRESS;
2894 update_list_p->item[UPDATE_LIST_SIZE-1].end = VM_MAX_KERNEL_ADDRESS;
2895 }
2896 else {
2897 update_list_p->item[j].pmap = pmap;
2898 update_list_p->item[j].start = start_addr;
2899 update_list_p->item[j].end = end_addr;
2900 update_list_p->count = j+1;
2901 }
2902 cpu_update_needed(which_cpu) = TRUE;
2903 simple_unlock(&update_list_p->lock);
2904
2905 /* if it's the kernel pmap, ignore cpus_idle */
2906 if (((cpus_idle & (1 << which_cpu)) == 0) ||
2907 (pmap == kernel_pmap) || PMAP_REAL(which_cpu) == pmap)
2908 {
2909 i386_signal_cpu(which_cpu, MP_TLB_FLUSH, ASYNC);
2910 }
2911 use_list &= ~(1 << which_cpu);
2912 }
2913 }
2914
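/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): the shape of a shootdown from the sending side, under the
 * protocol described in the comment block above. flush_range_example()
 * is hypothetical and deliberately simplified -- the real work, including
 * the lock and cpus_active handshaking, is done by the PMAP_UPDATE_TLBS()
 * macro used throughout this file; cpus_using is the per-pmap cpu set
 * mentioned in that comment.
 */
static void
flush_range_example(
	pmap_t		pmap,
	vm_offset_t	start,
	vm_offset_t	end)
{
	/* ask every cpu currently using this pmap to invalidate the range */
	signal_cpus(pmap->cpus_using, pmap, start, end);

	/* and flush this cpu's own TLB for the same range */
	INVALIDATE_TLB(pmap, start, end);
}
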
2915 void
2916 process_pmap_updates(
2917 register pmap_t my_pmap)
2918 {
2919 register int my_cpu;
2920 register pmap_update_list_t update_list_p;
2921 register int j;
2922 register pmap_t pmap;
2923
2924 mp_disable_preemption();
2925 my_cpu = cpu_number();
2926 update_list_p = cpu_update_list(my_cpu);
2927 simple_lock(&update_list_p->lock);
2928
2929 for (j = 0; j < update_list_p->count; j++) {
2930 pmap = update_list_p->item[j].pmap;
2931 if (pmap == my_pmap ||
2932 pmap == kernel_pmap) {
2933
2934 if (pmap->ref_count <= 0) {
2935 PMAP_CPU_CLR(pmap, my_cpu);
2936 PMAP_REAL(my_cpu) = kernel_pmap;
2937 #ifdef PAE
2938 set_cr3((unsigned int)kernel_pmap->pm_ppdpt);
2939 #else
2940 set_cr3((unsigned int)kernel_pmap->pdirbase);
2941 #endif
2942 } else
2943 INVALIDATE_TLB(pmap,
2944 update_list_p->item[j].start,
2945 update_list_p->item[j].end);
2946 }
2947 }
2948 update_list_p->count = 0;
2949 cpu_update_needed(my_cpu) = FALSE;
2950 simple_unlock(&update_list_p->lock);
2951 mp_enable_preemption();
2952 }
2953
2954 /*
2955 * Interrupt routine for TBIA requested from another processor.
2956 * This routine may also be called at any interrupt time if
2957 * the cpu was idle. Some driver interrupt routines might access
2958 * newly allocated vm (this is the case for hd).
2959 */
2960 void
2961 pmap_update_interrupt(void)
2962 {
2963 register int my_cpu;
2964 spl_t s;
2965 register pmap_t my_pmap;
2966
2967 mp_disable_preemption();
2968 my_cpu = cpu_number();
2969
2970 /*
2971 * Raise spl (to splhigh, above splip) to block out pmap_extract
2972 * from IO code (which would put this cpu back in the active
2973 * set).
2974 */
2975 s = splhigh();
2976
2977 my_pmap = PMAP_REAL(my_cpu);
2978
2979 if (!(my_pmap && pmap_in_use(my_pmap, my_cpu)))
2980 my_pmap = kernel_pmap;
2981
2982 do {
2983 LOOP_VAR;
2984
2985 /*
2986 * Indicate that we're not using either user or kernel
2987 * pmap.
2988 */
2989 i_bit_clear(my_cpu, &cpus_active);
2990
2991 /*
2992 * Wait for any pmap updates in progress, on either user
2993 * or kernel pmap.
2994 */
2995 while (*(volatile int *)(&my_pmap->lock.interlock.lock_data) ||
2996 *(volatile int *)(&kernel_pmap->lock.interlock.lock_data)) {
2997 LOOP_CHECK("pmap_update_interrupt", my_pmap);
2998 cpu_pause();
2999 }
3000
3001 process_pmap_updates(my_pmap);
3002
3003 i_bit_set(my_cpu, &cpus_active);
3004
3005 } while (cpu_update_needed(my_cpu));
3006
3007 splx(s);
3008 mp_enable_preemption();
3009 }
3010
3011 #if MACH_KDB
3012
3013 /* show phys page mappings and attributes */
3014
3015 extern void db_show_page(pmap_paddr_t pa);
3016
3017 void
3018 db_show_page(pmap_paddr_t pa)
3019 {
3020 pv_entry_t pv_h;
3021 int pai;
3022 char attr;
3023
3024 pai = pa_index(pa);
3025 pv_h = pai_to_pvh(pai);
3026
3027 attr = pmap_phys_attributes[pai];
3028 printf("phys page %x ", pa);
3029 if (attr & PHYS_MODIFIED)
3030 printf("modified, ");
3031 if (attr & PHYS_REFERENCED)
3032 printf("referenced, ");
3033 if (pv_h->pmap || pv_h->next)
3034 printf(" mapped at\n");
3035 else
3036 printf(" not mapped\n");
3037 for (; pv_h; pv_h = pv_h->next)
3038 if (pv_h->pmap)
3039 printf("%x in pmap %x\n", pv_h->va, pv_h->pmap);
3040 }
3041
3042 #endif /* MACH_KDB */
3043
3044 #if MACH_KDB
3045 void db_kvtophys(vm_offset_t);
3046 void db_show_vaddrs(pt_entry_t *);
3047
3048 /*
3049 * print out the results of kvtophys(arg)
3050 */
3051 void
3052 db_kvtophys(
3053 vm_offset_t vaddr)
3054 {
3055 db_printf("0x%x", kvtophys(vaddr));
3056 }
3057
3058 /*
3059 * Walk the page tables.
3060 */
3061 void
3062 db_show_vaddrs(
3063 pt_entry_t *dirbase)
3064 {
3065 pt_entry_t *ptep, *pdep, tmp;
3066 int x, y, pdecnt, ptecnt;
3067
3068 if (dirbase == 0) {
3069 dirbase = kernel_pmap->dirbase;
3070 }
3071 if (dirbase == 0) {
3072 db_printf("need a dirbase...\n");
3073 return;
3074 }
3075 dirbase = (pt_entry_t *) ((unsigned long) dirbase & ~INTEL_OFFMASK);
3076
3077 db_printf("dirbase: 0x%x\n", dirbase);
3078
3079 pdecnt = ptecnt = 0;
3080 pdep = &dirbase[0];
3081 for (y = 0; y < NPDEPG; y++, pdep++) {
3082 if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
3083 continue;
3084 }
3085 pdecnt++;
3086 ptep = (pt_entry_t *) ((*pdep) & ~INTEL_OFFMASK);
3087 db_printf("dir[%4d]: 0x%x\n", y, *pdep);
3088 for (x = 0; x < NPTEPG; x++, ptep++) {
3089 if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
3090 continue;
3091 }
3092 ptecnt++;
3093 db_printf(" tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
3094 x,
3095 *ptep,
3096 (y << 22) | (x << 12),
3097 *ptep & ~INTEL_OFFMASK);
3098 }
3099 }
3100
3101 db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
3102
3103 }
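
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): the address arithmetic db_show_vaddrs() relies on, going the
 * other way. For the non-PAE 2-level i386 layout printed above, a 32-bit
 * virtual address splits into a 10-bit directory index, a 10-bit table
 * index and a 12-bit page offset. db_va_split_example() is a hypothetical
 * name.
 */
void db_va_split_example(vm_offset_t, int *, int *, vm_offset_t *);

void
db_va_split_example(
	vm_offset_t	va,
	int		*dir_index,
	int		*tbl_index,
	vm_offset_t	*offset)
{
	*dir_index = (va >> 22) & 0x3ff;	/* selects dir[dir_index]      */
	*tbl_index = (va >> 12) & 0x3ff;	/* selects tab[tbl_index]      */
	*offset    = va & INTEL_OFFMASK;	/* byte offset within the page */
}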
3104 #endif /* MACH_KDB */
3105
3106 #include <mach_vm_debug.h>
3107 #if MACH_VM_DEBUG
3108 #include <vm/vm_debug.h>
3109
3110 int
3111 pmap_list_resident_pages(
3112 __unused pmap_t pmap,
3113 __unused vm_offset_t *listp,
3114 __unused int space)
3115 {
3116 return 0;
3117 }
3118 #endif /* MACH_VM_DEBUG */
3119
3120 #ifdef MACH_BSD
3121 /*
3122 * pmap_movepage (historically pmap_pagemove)
3123 *
3124 * BSD support routine to reassign virtual addresses.
3125 */
3126
3127 void
3128 pmap_movepage(unsigned long from, unsigned long to, vm_size_t size)
3129 {
3130 spl_t spl;
3131 pt_entry_t *pte, saved_pte;
3132 unsigned long orig_from = from, orig_to = to;
3133 /* Lock the kernel map */
3134 PMAP_READ_LOCK(kernel_pmap, spl);
3135
3136
3137 while (size > 0) {
3138 pte = pmap_pte(kernel_pmap, from);
3139 if (pte == NULL)
3140 panic("pmap_pagemove from pte NULL");
3141 saved_pte = *pte;
3142 PMAP_READ_UNLOCK(kernel_pmap, spl);
3143
3144 pmap_enter(kernel_pmap, to, (ppnum_t)i386_btop(i386_trunc_page(*pte)),
3145 VM_PROT_READ|VM_PROT_WRITE, 0, *pte & INTEL_PTE_WIRED);
3146
3147 pmap_remove(kernel_pmap, (addr64_t)from, (addr64_t)(from+PAGE_SIZE));
3148
3149 PMAP_READ_LOCK(kernel_pmap, spl);
3150 pte = pmap_pte(kernel_pmap, to);
3151 if (pte == NULL)
3152 panic("pmap_pagemove 'to' pte NULL");
3153
3154 *pte = saved_pte;
3155
3156 from += PAGE_SIZE;
3157 to += PAGE_SIZE;
3158 size -= PAGE_SIZE;
3159 }
3160
3161 /* Get the processors to update the TLBs (from, to and size were advanced above) */
3162 PMAP_UPDATE_TLBS(kernel_pmap, orig_from, from);
3163 PMAP_UPDATE_TLBS(kernel_pmap, orig_to, to);
3164
3165 PMAP_READ_UNLOCK(kernel_pmap, spl);
3166
3167 }
3168 #endif /* MACH_BSD */
3169
3170 /* temporary workaround */
3171 boolean_t
3172 coredumpok(vm_map_t map, vm_offset_t va)
3173 {
3174 pt_entry_t *ptep;
3175
3176 ptep = pmap_pte(map->pmap, va);
3177 if (0 == ptep)
3178 return FALSE;
3179 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
3180 }
3181
3182 /*
3183 * grow the number of kernel page table entries, if needed
3184 */
3185 void
3186 pmap_growkernel(vm_offset_t addr)
3187 {
3188 #if GROW_KERNEL_FUNCTION_IMPLEMENTED
3189 struct pmap *pmap;
3190 int s;
3191 vm_offset_t ptppaddr;
3192 ppnum_t ppn;
3193 vm_page_t nkpg;
3194 pd_entry_t newpdir = 0;
3195
3196 /*
3197 * Serialize.
3198 * Losers return to try again until the winner completes the work.
3199 */
3200 if (kptobj == 0) panic("growkernel 0");
3201 if (!vm_object_lock_try(kptobj)) {
3202 return;
3203 }
3204
3205 vm_page_lock_queues();
3206
3207 s = splhigh();
3208
3209 /*
3210 * If this is the first time through, locate the end of the
3211 * kernel page table entries and set nkpt to the current
3212 * number of kernel page table pages.
3213 */
3214
3215 if (kernel_vm_end == 0) {
3216 kernel_vm_end = KERNBASE;
3217 nkpt = 0;
3218
3219 while (pdir_pde(kernel_pmap->dirbase, kernel_vm_end)) {
3220 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
3221 nkpt++;
3222 }
3223 }
3224
3225 /*
3226 * Now allocate and map the required number of page tables
3227 */
3228 addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
3229 while (kernel_vm_end < addr) {
3230 if (pdir_pde(kernel_pmap->dirbase, kernel_vm_end)) {
3231 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
3232 continue; /* someone already filled this one */
3233 }
3234
3235 nkpg = vm_page_alloc(kptobj, nkpt);
3236 if (!nkpg)
3237 panic("pmap_growkernel: no memory to grow kernel");
3238
3239 nkpt++;
3240 vm_page_wire(nkpg);
3241 ppn = nkpg->phys_page;
3242 pmap_zero_page(ppn);
3243 ptppaddr = i386_ptob(ppn);
3244 newpdir = (pd_entry_t) (ptppaddr | INTEL_PTE_VALID |
3245 INTEL_PTE_RW | INTEL_PTE_REF | INTEL_PTE_MOD);
3246 pdir_pde(kernel_pmap->dirbase, kernel_vm_end) = newpdir;
3247
3248 simple_lock(&free_pmap_lock);
3249 for (pmap = (struct pmap *)kernel_pmap->pmap_link.next;
3250 pmap != kernel_pmap ;
3251 pmap = (struct pmap *)pmap->pmap_link.next ) {
3252 *pmap_pde(pmap, kernel_vm_end) = newpdir;
3253 }
3254 simple_unlock(&free_pmap_lock);
3255 }
3256 splx(s);
3257 vm_page_unlock_queues();
3258 vm_object_unlock(kptobj);
3259 #endif
3260 }
3261
3262 pt_entry_t *
3263 pmap_mapgetpte(vm_map_t map, vm_offset_t v)
3264 {
3265 return pmap_pte(map->pmap, v);
3266 }
3267
3268 boolean_t
3269 phys_page_exists(
3270 ppnum_t pn)
3271 {
3272 pmap_paddr_t phys;
3273
3274 assert(pn != vm_page_fictitious_addr);
3275
3276 if (!pmap_initialized)
3277 return (TRUE);
3278 phys = (pmap_paddr_t) i386_ptob(pn);
3279 if (!pmap_valid_page(pn))
3280 return (FALSE);
3281
3282 return TRUE;
3283 }
3284
3285 void
3286 mapping_free_prime(void)
3287 {
3288 int i;
3289 pv_entry_t pv_e;
3290
3291 for (i = 0; i < (5 * PV_ALLOC_CHUNK); i++) {
3292 pv_e = (pv_entry_t) zalloc(pv_list_zone);
3293 PV_FREE(pv_e);
3294 }
3295 }
3296
3297 void
3298 mapping_adjust(void)
3299 {
3300 pv_entry_t pv_e;
3301 int i;
3302 int spl;
3303
3304 if (mapping_adjust_call == NULL) {
3305 thread_call_setup(&mapping_adjust_call_data,
3306 (thread_call_func_t) mapping_adjust,
3307 (thread_call_param_t) NULL);
3308 mapping_adjust_call = &mapping_adjust_call_data;
3309 }
3310 /* XXX rethink best way to do locking here */
3311 if (pv_free_count < PV_LOW_WATER_MARK) {
3312 for (i = 0; i < PV_ALLOC_CHUNK; i++) {
3313 pv_e = (pv_entry_t) zalloc(pv_list_zone);
3314 SPLVM(spl);
3315 PV_FREE(pv_e);
3316 SPLX(spl);
3317 }
3318 }
3319 mappingrecurse = 0;
3320 }
3321
3322 void
3323 pmap_commpage_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
3324 {
3325 int i;
3326 pt_entry_t *opte, *npte;
3327 pt_entry_t pte;
3328
3329 for (i = 0; i < cnt; i++) {
3330 opte = pmap_pte(kernel_pmap, kernel_commpage);
3331 if (0 == opte) panic("kernel_commpage");
3332 npte = pmap_pte(kernel_pmap, user_commpage);
3333 if (0 == npte) panic("user_commpage");
3334 pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
3335 pte &= ~INTEL_PTE_WRITE; // ensure read only
3336 WRITE_PTE_FAST(npte, pte);
3337 kernel_commpage += INTEL_PGBYTES;
3338 user_commpage += INTEL_PGBYTES;
3339 }
3340 }
3341
3342 static cpu_pmap_t cpu_pmap_master;
3343 static struct pmap_update_list cpu_update_list_master;
3344
3345 struct cpu_pmap *
3346 pmap_cpu_alloc(boolean_t is_boot_cpu)
3347 {
3348 int ret;
3349 int i;
3350 cpu_pmap_t *cp;
3351 pmap_update_list_t up;
3352 vm_offset_t address;
3353 vm_map_entry_t entry;
3354
3355 if (is_boot_cpu) {
3356 cp = &cpu_pmap_master;
3357 up = &cpu_update_list_master;
3358 } else {
3359 /*
3360 * The per-cpu pmap data structure itself.
3361 */
3362 ret = kmem_alloc(kernel_map,
3363 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
3364 if (ret != KERN_SUCCESS) {
3365 printf("pmap_cpu_alloc() failed ret=%d\n", ret);
3366 return NULL;
3367 }
3368 bzero((void *)cp, sizeof(cpu_pmap_t));
3369
3370 /*
3371 * The tlb flush update list.
3372 */
3373 ret = kmem_alloc(kernel_map,
3374 (vm_offset_t *) &up, sizeof(*up));
3375 if (ret != KERN_SUCCESS) {
3376 printf("pmap_cpu_alloc() failed ret=%d\n", ret);
3377 pmap_cpu_free(cp);
3378 return NULL;
3379 }
3380
3381 /*
3382 * The temporary windows used for copy/zero - see loose_ends.c
3383 */
3384 for (i = 0; i < PMAP_NWINDOWS; i++) {
3385 ret = vm_map_find_space(kernel_map,
3386 &address, PAGE_SIZE, 0, &entry);
3387 if (ret != KERN_SUCCESS) {
3388 printf("pmap_cpu_alloc() "
3389 "vm_map_find_space ret=%d\n", ret);
3390 pmap_cpu_free(cp);
3391 return NULL;
3392 }
3393 vm_map_unlock(kernel_map);
3394
3395 cp->mapwindow[i].prv_CADDR = (caddr_t) address;
3396 cp->mapwindow[i].prv_CMAP = vtopte(address);
3397 * (int *) cp->mapwindow[i].prv_CMAP = 0;
3398
3399 kprintf("pmap_cpu_alloc() "
3400 "window=%d CADDR=0x%x CMAP=0x%x\n",
3401 i, address, vtopte(address));
3402 }
3403 }
3404
3405 /*
3406 * Set up the pmap request list
3407 */
3408 cp->update_list = up;
3409 simple_lock_init(&up->lock, 0);
3410 up->count = 0;
3411
3412 return cp;
3413 }
3414
3415 void
3416 pmap_cpu_free(struct cpu_pmap *cp)
3417 {
3418 if (cp != NULL && cp != &cpu_pmap_master) {
3419 if (cp->update_list != NULL)
3420 kfree((void *) cp->update_list,
3421 sizeof(*cp->update_list));
3422 kfree((void *) cp, sizeof(cpu_pmap_t));
3423 }
3424 }