2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
64 * Physical Map management code for Intel i386, i486, and i860.
66 * Manages physical address maps.
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
94 #include <mach_ldebug.h>
96 #include <libkern/OSAtomic.h>
98 #include <mach/machine/vm_types.h>
100 #include <mach/boolean.h>
101 #include <kern/thread.h>
102 #include <kern/zalloc.h>
103 #include <kern/queue.h>
105 #include <kern/lock.h>
106 #include <kern/kalloc.h>
107 #include <kern/spl.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
123 #include <i386/cpuid.h>
124 #include <i386/cpu_data.h>
125 #include <i386/cpu_number.h>
126 #include <i386/machine_cpu.h>
127 #include <i386/seg.h>
128 #include <i386/serial_io.h>
129 #include <i386/cpu_capabilities.h>
130 #include <i386/machine_routines.h>
131 #include <i386/proc_reg.h>
132 #include <i386/tsc.h>
133 #include <i386/acpi.h>
134 #include <i386/pmap_internal.h>
137 #include <ddb/db_command.h>
138 #include <ddb/db_output.h>
139 #include <ddb/db_sym.h>
140 #include <ddb/db_print.h>
141 #endif /* MACH_KDB */
143 #include <vm/vm_protos.h>
146 #include <i386/mp_desc.h>
147 #include <i386/i386_lowmem.h>
150 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151 #ifdef DEBUGINTERRUPTS
152 #define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
154 #define pmap_intr_assert()
160 #define POSTCODE_DELAY 1
161 #include <i386/postcode.h>
162 #endif /* IWANTTODEBUG */
165 * Forward declarations for internal functions.
168 void pmap_remove_range(
174 void phys_attribute_clear(
178 int phys_attribute_test(
182 void phys_attribute_set(
186 void pmap_set_reference(
189 boolean_t
phys_page_exists(
194 void dump_pmap(pmap_t
);
195 void dump_4GB_pdpt(pmap_t p
);
196 void dump_4GB_pdpt_thread(thread_t tp
);
199 int nx_enabled
= 1; /* enable no-execute protection */
200 #ifdef CONFIG_EMBEDDED
201 int allow_data_exec
= 0; /* no exec from data, embedded is hardcore like that */
203 int allow_data_exec
= VM_ABI_32
; /* 32-bit apps may execute data by default, 64-bit apps may not */
205 int allow_stack_exec
= 0; /* No apps may execute from the stack by default */
207 boolean_t cpu_64bit
= FALSE
;
208 boolean_t pmap_trace
= FALSE
;
211 * when spinning through pmap_remove
212 * ensure that we don't spend too much
213 * time with preemption disabled.
214 * I'm setting the current threshold
217 #define MAX_PREEMPTION_LATENCY_NS 20000
219 uint64_t max_preemption_latency_tsc
= 0;
223 * Private data structures.
227 * For each vm_page_t, there is a list of all currently
228 * valid virtual mappings of that page. An entry is
229 * a pv_rooted_entry_t; the list is the pv_table.
231 * N.B. with the new combo rooted/hashed scheme it is
232 * only possibly to remove individual non-rooted entries
233 * if they are found via the hashed chains as there is no
234 * way to unlink the singly linked hashed entries if navigated to
235 * via the queue list off the rooted entries. Think of it as
236 * hash/walk/pull, keeping track of the prev pointer while walking
237 * the singly linked hash list. All of this is to save memory and
238 * keep both types of pv_entries as small as possible.
243 PV HASHING Changes - JK 1/2007
245 Pve's establish physical to virtual mappings. These are used for aliasing of a
246 physical page to (potentially many) virtual addresses within pmaps. In the previous
247 implementation the structure of the pv_entries (each 16 bytes in size) was
249 typedef struct pv_entry {
250 struct pv_entry_t next;
255 An initial array of these is created at boot time, one per physical page of memory,
256 indexed by the physical page number. Additionally, a pool of entries is created from a
257 pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
258 Originally, we kept this pool around because the code in pmap_enter() was unable to
259 block if it needed an entry and none were available - we'd panic. Some time ago I
260 restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
261 a pv structure and restart, removing a panic from the code (in the case of the kernel
262 pmap we cannot block and still panic, so, we keep a separate hot pool for use only on
263 kernel pmaps). The pool has not been removed since there is a large performance gain
264 keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
266 As pmap_enter() created new mappings it linked the new pve's for them off the fixed
267 pv array for that ppn (off the next pointer). These pve's are accessed for several
268 operations, one of them being address space teardown. In that case, we basically do this
270 for (every page/pte in the space) {
271 calc pve_ptr from the ppn in the pte
272 for (every pv in the list for the ppn) {
273 if (this pv is for this pmap/vaddr) {
280 The problem arose when we were running, say 8000 (or even 2000) apache or other processes
281 and one or all terminate. The list hanging off each pv array entry could have thousands of
282 entries. We were continuously linearly searching each of these lists as we stepped through
283 the address space we were tearing down. Because of the locks we hold, likely taking a cache
284 miss for each node, and interrupt disabling for MP issues the system became completely
285 unresponsive for many seconds while we did this.
287 Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
288 for operations like pmap_page_protect and finding and modifying/removing a single pve as
289 part of pmap_enter processing) has led to modifying the pve structures and databases.
291 There are now two types of pve structures. A "rooted" structure which is basically the
292 original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a
293 hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
294 minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
295 pages in the system are not aliased and hence represented by a single pv entry I've kept
296 the rooted entry size as small as possible because there is one of these dedicated for
297 every physical page of memory. The hashed pve's are larger due to the addition of the hash
298 link and the ppn entry needed for matching while running the hash list to find the entry we
299 are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
300 will pay the extra memory price. Both structures have the same first three fields allowing
301 some simplification in the code.
303 They have these shapes
305 typedef struct pv_rooted_entry {
309 } *pv_rooted_entry_t;
312 typedef struct pv_hashed_entry {
317 struct pv_hashed_entry *nexth;
318 } *pv_hashed_entry_t;
320 The main flow difference is that the code is now aware of the rooted entry and the hashed
321 entries. Code that runs the pv list still starts with the rooted entry and then continues
322 down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
323 checks the rooted entry and then hashes and runs the hash list for the match. The hash list
324 lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
328 typedef struct pv_rooted_entry
{ /* first three entries must match pv_hashed_entry_t */
330 vm_map_offset_t va
; /* virtual address for mapping */
331 pmap_t pmap
; /* pmap where mapping lies */
332 } *pv_rooted_entry_t
;
334 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
336 pv_rooted_entry_t pv_head_table
; /* array of entries, one per page */
338 typedef struct pv_hashed_entry
{ /* first three entries must match pv_rooted_entry_t */
343 struct pv_hashed_entry
*nexth
;
344 } *pv_hashed_entry_t
;
346 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
348 #define NPVHASH 4095 /* MUST BE 2^N - 1 */
349 pv_hashed_entry_t
*pv_hash_table
; /* hash lists */
351 uint32_t npvhash
= 0;
353 /* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
355 #define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
357 #define CHK_NPVHASH()
361 * pv_list entries are kept on a list that can only be accessed
362 * with the pmap system locked (at SPLVM, not in the cpus_active set).
363 * The list is refilled from the pv_hashed_list_zone if it becomes empty.
365 pv_rooted_entry_t pv_free_list
= PV_ROOTED_ENTRY_NULL
; /* free list at SPLVM */
366 pv_hashed_entry_t pv_hashed_free_list
= PV_HASHED_ENTRY_NULL
;
367 pv_hashed_entry_t pv_hashed_kern_free_list
= PV_HASHED_ENTRY_NULL
;
368 decl_simple_lock_data(,pv_hashed_free_list_lock
)
369 decl_simple_lock_data(,pv_hashed_kern_free_list_lock
)
370 decl_simple_lock_data(,pv_hash_table_lock
)
372 int pv_free_count
= 0;
373 int pv_hashed_free_count
= 0;
374 int pv_kern_free_count
= 0;
375 int pv_hashed_kern_free_count
= 0;
376 #define PV_HASHED_LOW_WATER_MARK 5000
377 #define PV_HASHED_KERN_LOW_WATER_MARK 100
378 #define PV_HASHED_ALLOC_CHUNK 2000
379 #define PV_HASHED_KERN_ALLOC_CHUNK 50
380 thread_call_t mapping_adjust_call
;
381 static thread_call_data_t mapping_adjust_call_data
;
382 uint32_t mappingrecurse
= 0;
384 #define PV_HASHED_ALLOC(pvh_e) { \
385 simple_lock(&pv_hashed_free_list_lock); \
386 if ((pvh_e = pv_hashed_free_list) != 0) { \
387 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
388 pv_hashed_free_count--; \
389 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
390 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
391 thread_call_enter(mapping_adjust_call); \
393 simple_unlock(&pv_hashed_free_list_lock); \
396 #define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
397 simple_lock(&pv_hashed_free_list_lock); \
398 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
399 pv_hashed_free_list = pvh_eh; \
400 pv_hashed_free_count += pv_cnt; \
401 simple_unlock(&pv_hashed_free_list_lock); \
404 #define PV_HASHED_KERN_ALLOC(pvh_e) { \
405 simple_lock(&pv_hashed_kern_free_list_lock); \
406 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
407 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
408 pv_hashed_kern_free_count--; \
409 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
410 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
411 thread_call_enter(mapping_adjust_call); \
413 simple_unlock(&pv_hashed_kern_free_list_lock); \
416 #define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
417 simple_lock(&pv_hashed_kern_free_list_lock); \
418 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
419 pv_hashed_kern_free_list = pvh_eh; \
420 pv_hashed_kern_free_count += pv_cnt; \
421 simple_unlock(&pv_hashed_kern_free_list_lock); \
424 zone_t pv_hashed_list_zone
; /* zone of pv_hashed_entry structures */
426 static zone_t pdpt_zone
;
429 * Each entry in the pv_head_table is locked by a bit in the
430 * pv_lock_table. The lock bits are accessed by the physical
431 * address of the page they lock.
434 char *pv_lock_table
; /* pointer to array of bits */
435 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
437 char *pv_hash_lock_table
;
438 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
441 * First and last physical addresses that we maintain any information
442 * for. Initialized to zero so that pmap operations done before
443 * pmap_init won't touch any non-existent structures.
445 boolean_t pmap_initialized
= FALSE
;/* Has pmap_init completed? */
447 static struct vm_object kptobj_object_store
;
448 static vm_object_t kptobj
;
451 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
454 #define pa_index(pa) (i386_btop(pa))
455 #define ppn_to_pai(ppn) ((int)ppn)
457 #define pai_to_pvh(pai) (&pv_head_table[pai])
458 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
459 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
461 #define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
462 #define pvhash(idx) (&pv_hash_table[idx])
464 #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
465 #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
468 * Array of physical page attribites for managed pages.
469 * One byte per physical page.
471 char *pmap_phys_attributes
;
472 unsigned int last_managed_page
= 0;
475 * Physical page attributes. Copy bits from PTE definition.
477 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
478 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
479 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
482 * Amount of virtual memory mapped by one
483 * page-directory entry.
485 #define PDE_MAPPED_SIZE (pdetova(1))
486 uint64_t pde_mapped_size
;
489 * Locking and TLB invalidation
493 * Locking Protocols: (changed 2/2007 JK)
495 * There are two structures in the pmap module that need locking:
496 * the pmaps themselves, and the per-page pv_lists (which are locked
497 * by locking the pv_lock_table entry that corresponds to the pv_head
498 * for the list in question.) Most routines want to lock a pmap and
499 * then do operations in it that require pv_list locking -- however
500 * pmap_remove_all and pmap_copy_on_write operate on a physical page
501 * basis and want to do the locking in the reverse order, i.e. lock
502 * a pv_list and then go through all the pmaps referenced by that list.
504 * The system wide pmap lock has been removed. Now, paths take a lock
505 * on the pmap before changing its 'shape' and the reverse order lockers
506 * (coming in by phys ppn) take a lock on the corresponding pv and then
507 * retest to be sure nothing changed during the window before they locked
508 * and can then run up/down the pv lists holding the list lock. This also
509 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
518 #define LOCK_PVH(index) { \
519 mp_disable_preemption(); \
520 lock_pvh_pai(index); \
523 #define UNLOCK_PVH(index) { \
524 unlock_pvh_pai(index); \
525 mp_enable_preemption(); \
532 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
534 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
537 extern int max_lock_loops
;
539 unsigned int loop_count; \
540 loop_count = disable_serial_output ? max_lock_loops \
542 #define LOOP_CHECK(msg, pmap) \
543 if (--loop_count == 0) { \
544 mp_disable_preemption(); \
545 kprintf("%s: cpu %d pmap %x\n", \
546 msg, cpu_number(), pmap); \
547 Debugger("deadlock detection"); \
548 mp_enable_preemption(); \
549 loop_count = max_lock_loops; \
551 #else /* USLOCK_DEBUG */
553 #define LOOP_CHECK(msg, pmap)
554 #endif /* USLOCK_DEBUG */
556 unsigned pmap_memory_region_count
;
557 unsigned pmap_memory_region_current
;
559 pmap_memory_region_t pmap_memory_regions
[PMAP_MEMORY_REGIONS_SIZE
];
562 * Other useful macros.
564 #define current_pmap() (vm_map_pmap(current_thread()->map))
566 struct pmap kernel_pmap_store
;
569 pd_entry_t high_shared_pde
;
570 pd_entry_t commpage64_pde
;
572 struct zone
*pmap_zone
; /* zone of pmap structures */
574 int pmap_debug
= 0; /* flag for debugging prints */
576 unsigned int inuse_ptepages_count
= 0;
578 addr64_t kernel64_cr3
;
579 boolean_t no_shared_cr3
= FALSE
; /* -no_shared_cr3 boot arg */
583 * Pmap cache. Cache is threaded through ref_count field of pmap.
584 * Max will eventually be constant -- variable for experimentation.
586 int pmap_cache_max
= 32;
587 int pmap_alloc_chunk
= 8;
588 pmap_t pmap_cache_list
;
589 int pmap_cache_count
;
590 decl_simple_lock_data(,pmap_cache_lock
)
596 pt_entry_t
*DMAP1
, *DMAP2
;
601 void pmap_pvh_unlink(pv_hashed_entry_t pv
);
604 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
605 * properly deals with the anchor.
606 * must be called with the hash locked, does not unlock it
610 void pmap_pvh_unlink(pv_hashed_entry_t pvh
)
612 pv_hashed_entry_t curh
;
613 pv_hashed_entry_t
*pprevh
;
617 pvhash_idx
= pvhashidx(pvh
->pmap
, pvh
->va
);
619 pprevh
= pvhash(pvhash_idx
);
622 if (NULL
== *pprevh
) panic("pvh_unlink null anchor"); /* JK DEBUG */
626 while (PV_HASHED_ENTRY_NULL
!= curh
) {
629 pprevh
= &curh
->nexth
;
632 if (PV_HASHED_ENTRY_NULL
== curh
) panic("pmap_pvh_unlink no pvh");
633 *pprevh
= pvh
->nexth
;
638 * for legacy, returns the address of the pde entry.
639 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
640 * then returns the mapped address of the pde entry in that page
643 pmap_pde(pmap_t m
, vm_map_offset_t v
)
646 if (!cpu_64bit
|| (m
== kernel_pmap
)) {
647 pde
= (&((m
)->dirbase
[(vm_offset_t
)(v
) >> PDESHIFT
]));
650 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
651 pde
= pmap64_pde(m
, v
);
658 * the single pml4 page per pmap is allocated at pmap create time and exists
659 * for the duration of the pmap. we allocate this page in kernel vm (to save us one
660 * level of page table dynamic mapping.
661 * this returns the address of the requested pml4 entry in the top level page.
665 pmap64_pml4(pmap_t pmap
, vm_map_offset_t vaddr
)
667 return ((pml4_entry_t
*)pmap
->pm_hold
+ ((vm_offset_t
)((vaddr
>>PML4SHIFT
)&(NPML4PG
-1))));
671 * maps in the pml4 page, if any, containing the pdpt entry requested
672 * and returns the address of the pdpt entry in that mapped page
675 pmap64_pdpt(pmap_t pmap
, vm_map_offset_t vaddr
)
682 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
683 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) && (vaddr
< 0xFFFF800000000000ULL
)) {
687 pml4
= pmap64_pml4(pmap
, vaddr
);
689 if (pml4
&& ((*pml4
& INTEL_PTE_VALID
))) {
691 newpf
= *pml4
& PG_FRAME
;
694 for (i
=PMAP_PDPT_FIRST_WINDOW
; i
< PMAP_PDPT_FIRST_WINDOW
+PMAP_PDPT_NWINDOWS
; i
++) {
695 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
696 return((pdpt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
697 ((vm_offset_t
)((vaddr
>>PDPTSHIFT
)&(NPDPTPG
-1))));
701 current_cpu_datap()->cpu_pmap
->pdpt_window_index
++;
702 if (current_cpu_datap()->cpu_pmap
->pdpt_window_index
> (PMAP_PDPT_FIRST_WINDOW
+PMAP_PDPT_NWINDOWS
-1))
703 current_cpu_datap()->cpu_pmap
->pdpt_window_index
= PMAP_PDPT_FIRST_WINDOW
;
705 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CMAP
),
706 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
707 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CADDR
));
708 return ((pdpt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CADDR
) +
709 ((vm_offset_t
)((vaddr
>>PDPTSHIFT
)&(NPDPTPG
-1))));
716 * maps in the pdpt page, if any, containing the pde entry requested
717 * and returns the address of the pde entry in that mapped page
720 pmap64_pde(pmap_t pmap
, vm_map_offset_t vaddr
)
727 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
728 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) && (vaddr
< 0xFFFF800000000000ULL
)) {
732 /* if (vaddr & (1ULL << 63)) panic("neg addr");*/
733 pdpt
= pmap64_pdpt(pmap
, vaddr
);
735 if (pdpt
&& ((*pdpt
& INTEL_PTE_VALID
))) {
737 newpf
= *pdpt
& PG_FRAME
;
739 for (i
=PMAP_PDE_FIRST_WINDOW
; i
< PMAP_PDE_FIRST_WINDOW
+PMAP_PDE_NWINDOWS
; i
++) {
740 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
741 return((pd_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
742 ((vm_offset_t
)((vaddr
>>PDSHIFT
)&(NPDPG
-1))));
746 current_cpu_datap()->cpu_pmap
->pde_window_index
++;
747 if (current_cpu_datap()->cpu_pmap
->pde_window_index
> (PMAP_PDE_FIRST_WINDOW
+PMAP_PDE_NWINDOWS
-1))
748 current_cpu_datap()->cpu_pmap
->pde_window_index
= PMAP_PDE_FIRST_WINDOW
;
750 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CMAP
),
751 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
752 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CADDR
));
753 return ((pd_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CADDR
) +
754 ((vm_offset_t
)((vaddr
>>PDSHIFT
)&(NPDPG
-1))));
761 * Because the page tables (top 3 levels) are mapped into per cpu windows,
762 * callers must either disable interrupts or disable preemption before calling
763 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
764 * is in one of those mapped windows and that cannot be allowed to change until
765 * the caller is done using the returned pte pointer. When done, the caller
766 * restores interrupts or preemption to its previous state after which point the
767 * vaddr for the returned pte can no longer be used
772 * return address of mapped pte for vaddr va in pmap pmap.
773 * must be called with pre-emption or interrupts disabled
774 * if targeted pmap is not the kernel pmap
775 * since we may be passing back a virtual address that is
776 * associated with this cpu... pre-emption or interrupts
777 * must remain disabled until the caller is done using
778 * the pointer that was passed back .
780 * maps the pde page, if any, containing the pte in and returns
781 * the address of the pte in that mapped page
784 pmap_pte(pmap_t pmap
, vm_map_offset_t vaddr
)
791 pde
= pmap_pde(pmap
,vaddr
);
793 if (pde
&& ((*pde
& INTEL_PTE_VALID
))) {
794 if (*pde
& INTEL_PTE_PS
)
796 if (pmap
== kernel_pmap
)
797 return (vtopte(vaddr
)); /* compat kernel still has pte's mapped */
799 if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
800 panic("pmap_pte: unsafe call");
802 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
804 newpf
= *pde
& PG_FRAME
;
806 for (i
=PMAP_PTE_FIRST_WINDOW
; i
< PMAP_PTE_FIRST_WINDOW
+PMAP_PTE_NWINDOWS
; i
++) {
807 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
808 return((pt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
809 ((vm_offset_t
)i386_btop(vaddr
) & (NPTEPG
-1)));
813 current_cpu_datap()->cpu_pmap
->pte_window_index
++;
814 if (current_cpu_datap()->cpu_pmap
->pte_window_index
> (PMAP_PTE_FIRST_WINDOW
+PMAP_PTE_NWINDOWS
-1))
815 current_cpu_datap()->cpu_pmap
->pte_window_index
= PMAP_PTE_FIRST_WINDOW
;
817 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CMAP
),
818 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
819 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CADDR
));
820 return ((pt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CADDR
) +
821 ((vm_offset_t
)i386_btop(vaddr
) & (NPTEPG
-1)));
829 * Map memory at initialization. The physical addresses being
830 * mapped are not managed and are never unmapped.
832 * For now, VM is already on, we only need to map the
838 vm_map_offset_t start_addr
,
839 vm_map_offset_t end_addr
,
846 while (start_addr
< end_addr
) {
847 pmap_enter(kernel_pmap
, (vm_map_offset_t
)virt
,
848 (ppnum_t
) i386_btop(start_addr
), prot
, flags
, FALSE
);
856 * Back-door routine for mapping kernel VM at initialization.
857 * Useful for mapping memory outside the range
858 * Sets no-cache, A, D.
859 * Otherwise like pmap_map.
864 vm_map_offset_t start_addr
,
865 vm_map_offset_t end_addr
,
873 template = pa_to_pte(start_addr
)
879 if(flags
& (VM_MEM_NOT_CACHEABLE
| VM_WIMG_USE_DEFAULT
)) {
880 template |= INTEL_PTE_NCACHE
;
881 if(!(flags
& (VM_MEM_GUARDED
| VM_WIMG_USE_DEFAULT
)))
882 template |= INTEL_PTE_PTA
;
885 if (prot
& VM_PROT_WRITE
)
886 template |= INTEL_PTE_WRITE
;
889 while (start_addr
< end_addr
) {
891 pte
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)virt
);
892 if (pte
== PT_ENTRY_NULL
) {
893 panic("pmap_map_bd: Invalid kernel address\n");
895 pmap_store_pte(pte
, template);
897 pte_increment_pa(template);
899 start_addr
+= PAGE_SIZE
;
907 extern char *first_avail
;
908 extern vm_offset_t virtual_avail
, virtual_end
;
909 extern pmap_paddr_t avail_start
, avail_end
;
915 * Here early in the life of a processor (from cpu_mode_init()).
916 * If we're not in 64-bit mode, enable the global TLB feature.
917 * Note: regardless of mode we continue to set the global attribute
918 * bit in ptes for all (32-bit) global pages such as the commpage.
921 set_cr4(get_cr4() | CR4_PGE
);
925 * Initialize the per-cpu, TLB-related fields.
927 current_cpu_datap()->cpu_active_cr3
= kernel_pmap
->pm_cr3
;
928 current_cpu_datap()->cpu_tlb_invalid
= FALSE
;
932 pmap_high_shared_remap(enum high_fixed_addresses e
, vm_offset_t va
, int sz
)
934 vm_offset_t ve
= pmap_index_to_virt(e
);
940 assert(0 == (va
& PAGE_MASK
)); /* expecting page aligned */
942 ptep
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)ve
);
944 for (i
=0; i
< sz
; i
++) {
945 pa
= (pmap_paddr_t
) kvtophys(va
);
946 pmap_store_pte(ptep
, (pa
& PG_FRAME
)
960 pmap_cpu_high_shared_remap(int cpu
, enum high_cpu_types e
, vm_offset_t va
, int sz
)
962 enum high_fixed_addresses a
= e
+ HIGH_CPU_END
* cpu
;
963 return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN
+ a
, va
, sz
);
966 void pmap_init_high_shared(void);
968 extern vm_offset_t gdtptr
, idtptr
;
970 extern uint32_t low_intstack
;
972 extern struct fake_descriptor ldt_desc_pattern
;
973 extern struct fake_descriptor tss_desc_pattern
;
975 extern char hi_remap_text
, hi_remap_etext
;
976 extern char t_zero_div
;
978 pt_entry_t
*pte_unique_base
;
981 pmap_init_high_shared(void)
987 struct i386_tss
*ttss
;
990 cpu_desc_index_t
* cdi
= &cpu_data_master
.cpu_desc_index
;
992 kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
993 HIGH_MEM_BASE
,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN
));
995 pte_unique_base
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN
));
998 if (i386_btop(&hi_remap_etext
- &hi_remap_text
+ 1) >
999 HIGH_FIXED_TRAMPS_END
- HIGH_FIXED_TRAMPS
+ 1)
1000 panic("tramps too large");
1001 haddr
= pmap_high_shared_remap(HIGH_FIXED_TRAMPS
,
1002 (vm_offset_t
) &hi_remap_text
, 3);
1003 kprintf("tramp: 0x%x, ",haddr
);
1004 /* map gdt up high and update ptr for reload */
1005 haddr
= pmap_high_shared_remap(HIGH_FIXED_GDT
,
1006 (vm_offset_t
) master_gdt
, 1);
1007 cdi
->cdi_gdt
.ptr
= (void *)haddr
;
1008 kprintf("GDT: 0x%x, ",haddr
);
1009 /* map ldt up high */
1010 haddr
= pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN
,
1011 (vm_offset_t
) master_ldt
,
1012 HIGH_FIXED_LDT_END
- HIGH_FIXED_LDT_BEGIN
+ 1);
1013 cdi
->cdi_ldt
= (struct fake_descriptor
*)haddr
;
1014 kprintf("LDT: 0x%x, ",haddr
);
1015 /* put new ldt addr into gdt */
1016 struct fake_descriptor temp_fake_desc
;
1017 temp_fake_desc
= ldt_desc_pattern
;
1018 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
1019 fix_desc(&temp_fake_desc
, 1);
1021 *(struct fake_descriptor
*) &master_gdt
[sel_idx(KERNEL_LDT
)] = temp_fake_desc
;
1022 *(struct fake_descriptor
*) &master_gdt
[sel_idx(USER_LDT
)] = temp_fake_desc
;
1024 /* map idt up high */
1025 haddr
= pmap_high_shared_remap(HIGH_FIXED_IDT
,
1026 (vm_offset_t
) master_idt
, 1);
1027 cdi
->cdi_idt
.ptr
= (void *)haddr
;
1028 kprintf("IDT: 0x%x, ", haddr
);
1029 /* remap ktss up high and put new high addr into gdt */
1030 haddr
= pmap_high_shared_remap(HIGH_FIXED_KTSS
,
1031 (vm_offset_t
) &master_ktss
, 1);
1033 temp_fake_desc
= tss_desc_pattern
;
1034 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
1035 fix_desc(&temp_fake_desc
, 1);
1036 *(struct fake_descriptor
*) &master_gdt
[sel_idx(KERNEL_TSS
)] = temp_fake_desc
;
1037 kprintf("KTSS: 0x%x, ",haddr
);
1039 /* remap dbtss up high and put new high addr into gdt */
1040 haddr
= pmap_high_shared_remap(HIGH_FIXED_DBTSS
,
1041 (vm_offset_t
) &master_dbtss
, 1);
1042 temp_fake_desc
= tss_desc_pattern
;
1043 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
1044 fix_desc(&temp_fake_desc
, 1);
1045 *(struct fake_descriptor
*)&master_gdt
[sel_idx(DEBUG_TSS
)] = temp_fake_desc
;
1046 ttss
= (struct i386_tss
*)haddr
;
1047 kprintf("DBTSS: 0x%x, ",haddr
);
1048 #endif /* MACH_KDB */
1050 /* remap dftss up high and put new high addr into gdt */
1051 haddr
= pmap_high_shared_remap(HIGH_FIXED_DFTSS
,
1052 (vm_offset_t
) &master_dftss
, 1);
1053 temp_fake_desc
= tss_desc_pattern
;
1054 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
1055 fix_desc(&temp_fake_desc
, 1);
1056 *(struct fake_descriptor
*) &master_gdt
[sel_idx(DF_TSS
)] = temp_fake_desc
;
1057 kprintf("DFTSS: 0x%x\n",haddr
);
1059 /* remap mctss up high and put new high addr into gdt */
1060 haddr
= pmap_high_shared_remap(HIGH_FIXED_DFTSS
,
1061 (vm_offset_t
) &master_mctss
, 1);
1062 temp_fake_desc
= tss_desc_pattern
;
1063 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
1064 fix_desc(&temp_fake_desc
, 1);
1065 *(struct fake_descriptor
*) &master_gdt
[sel_idx(MC_TSS
)] = temp_fake_desc
;
1066 kprintf("MCTSS: 0x%x\n",haddr
);
1068 cpu_desc_load(&cpu_data_master
);
1073 * Bootstrap the system enough to run with virtual memory.
1074 * Map the kernel's code and data, and allocate the system page table.
1075 * Called with mapping OFF. Page_size must already be set.
1080 __unused vm_offset_t load_start
,
1089 vm_last_addr
= VM_MAX_KERNEL_ADDRESS
; /* Set the highest address
1092 * The kernel's pmap is statically allocated so we don't
1093 * have to use pmap_create, which is unlikely to work
1094 * correctly at this part of the boot sequence.
1098 kernel_pmap
= &kernel_pmap_store
;
1099 kernel_pmap
->ref_count
= 1;
1100 kernel_pmap
->nx_enabled
= FALSE
;
1101 kernel_pmap
->pm_task_map
= TASK_MAP_32BIT
;
1102 kernel_pmap
->pm_obj
= (vm_object_t
) NULL
;
1103 kernel_pmap
->dirbase
= (pd_entry_t
*)((unsigned int)IdlePTD
| KERNBASE
);
1104 kernel_pmap
->pdirbase
= (pmap_paddr_t
)((int)IdlePTD
);
1105 pdpt
= (pd_entry_t
*)((unsigned int)IdlePDPT
| KERNBASE
);
1106 kernel_pmap
->pm_pdpt
= pdpt
;
1107 kernel_pmap
->pm_cr3
= (pmap_paddr_t
)((int)IdlePDPT
);
1110 va
= (vm_offset_t
)kernel_pmap
->dirbase
;
1111 /* setup self referential mapping(s) */
1112 for (i
= 0; i
< NPGPTD
; i
++, pdpt
++) {
1114 pa
= (pmap_paddr_t
) kvtophys((vm_offset_t
)(va
+ i386_ptob(i
)));
1116 (pd_entry_t
*) (kernel_pmap
->dirbase
+ PTDPTDI
+ i
),
1117 (pa
& PG_FRAME
) | INTEL_PTE_VALID
| INTEL_PTE_RW
| INTEL_PTE_REF
|
1118 INTEL_PTE_MOD
| INTEL_PTE_WIRED
) ;
1119 pmap_store_pte(pdpt
, pa
| INTEL_PTE_VALID
);
1124 lo_kernel_cr3
= kernel_pmap
->pm_cr3
;
1125 current_cpu_datap()->cpu_kernel_cr3
= (addr64_t
) kernel_pmap
->pm_cr3
;
1127 /* save the value we stuff into created pmaps to share the gdts etc */
1128 high_shared_pde
= *pmap_pde(kernel_pmap
, HIGH_MEM_BASE
);
1129 /* make sure G bit is on for high shared pde entry */
1130 high_shared_pde
|= INTEL_PTE_GLOBAL
;
1132 pmap_store_pte(pmap_pde(kernel_pmap
, HIGH_MEM_BASE
), high_shared_pde
);
1136 OSAddAtomic(NKPT
, &inuse_ptepages_count
);
1138 virtual_avail
= (vm_offset_t
)VADDR(KPTDI
,0) + (vm_offset_t
)first_avail
;
1139 virtual_end
= (vm_offset_t
)(VM_MAX_KERNEL_ADDRESS
);
1142 * Reserve some special page table entries/VA space for temporary
1145 #define SYSMAP(c, p, v, n) \
1146 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
1151 for (i
=0; i
<PMAP_NWINDOWS
; i
++) {
1153 (current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
),
1154 (current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
),
1156 *current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
= 0;
1159 /* DMAP user for debugger */
1160 SYSMAP(caddr_t
, DMAP1
, DADDR1
, 1);
1161 SYSMAP(caddr_t
, DMAP2
, DADDR2
, 1); /* XXX temporary - can remove */
1165 if (PE_parse_boot_argn("npvhash", &npvhash
, sizeof (npvhash
))) {
1166 if (0 != ((npvhash
+1) & npvhash
)) {
1167 kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash
,NPVHASH
);
1173 printf("npvhash=%d\n",npvhash
);
1175 simple_lock_init(&kernel_pmap
->lock
, 0);
1176 simple_lock_init(&pv_hashed_free_list_lock
, 0);
1177 simple_lock_init(&pv_hashed_kern_free_list_lock
, 0);
1178 simple_lock_init(&pv_hash_table_lock
,0);
1180 pmap_init_high_shared();
1182 pde_mapped_size
= PDE_MAPPED_SIZE
;
1185 pdpt_entry_t
*ppdpt
= IdlePDPT
;
1186 pdpt_entry_t
*ppdpt64
= (pdpt_entry_t
*)IdlePDPT64
;
1187 pdpt_entry_t
*ppml4
= (pdpt_entry_t
*)IdlePML4
;
1188 int istate
= ml_set_interrupts_enabled(FALSE
);
1191 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1192 * with page bits set for the correct IA-32e operation and so that
1193 * the legacy-mode IdlePDPT is retained for slave processor start-up.
1194 * This is necessary due to the incompatible use of page bits between
1195 * 64-bit and legacy modes.
1197 kernel_pmap
->pm_cr3
= (pmap_paddr_t
)((int)IdlePML4
); /* setup in start.s for us */
1198 kernel_pmap
->pm_pml4
= IdlePML4
;
1199 kernel_pmap
->pm_pdpt
= (pd_entry_t
*)
1200 ((unsigned int)IdlePDPT64
| KERNBASE
);
1201 #define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1202 pmap_store_pte(kernel_pmap
->pm_pml4
,
1203 (uint32_t)IdlePDPT64
| PAGE_BITS
);
1204 pmap_store_pte((ppdpt64
+0), *(ppdpt
+0) | PAGE_BITS
);
1205 pmap_store_pte((ppdpt64
+1), *(ppdpt
+1) | PAGE_BITS
);
1206 pmap_store_pte((ppdpt64
+2), *(ppdpt
+2) | PAGE_BITS
);
1207 pmap_store_pte((ppdpt64
+3), *(ppdpt
+3) | PAGE_BITS
);
1210 * The kernel is also mapped in the uber-sapce at the 4GB starting
1211 * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1213 pmap_store_pte((ppml4
+KERNEL_UBER_PML4_INDEX
), *(ppml4
+0));
1215 kernel64_cr3
= (addr64_t
) kernel_pmap
->pm_cr3
;
1217 /* Re-initialize descriptors and prepare to switch modes */
1218 cpu_desc_init64(&cpu_data_master
);
1219 current_cpu_datap()->cpu_is64bit
= TRUE
;
1220 current_cpu_datap()->cpu_active_cr3
= kernel64_cr3
;
1222 pde_mapped_size
= 512*4096 ;
1224 ml_set_interrupts_enabled(istate
);
1227 /* Sets 64-bit mode if required. */
1228 cpu_mode_init(&cpu_data_master
);
1229 /* Update in-kernel CPUID information if we're now in 64-bit mode */
1233 kernel_pmap
->pm_hold
= (vm_offset_t
)kernel_pmap
->pm_pml4
;
1235 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1236 VADDR(KPTDI
,0), virtual_end
);
1237 printf("PAE enabled\n");
1239 printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1241 kprintf("Available physical space from 0x%llx to 0x%llx\n",
1242 avail_start
, avail_end
);
1245 * By default for 64-bit users loaded at 4GB, share kernel mapping.
1246 * But this may be overridden by the -no_shared_cr3 boot-arg.
1248 if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3
, sizeof (no_shared_cr3
))) {
1249 kprintf("Shared kernel address space disabled\n");
1253 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace
, sizeof (pmap_trace
))) {
1254 kprintf("Kernel traces for pmap operations enabled\n");
1256 #endif /* PMAP_TRACES */
1261 vm_offset_t
*startp
,
1264 *startp
= virtual_avail
;
1265 *endp
= virtual_end
;
1269 * Initialize the pmap module.
1270 * Called by vm_init, to initialize any structures that the pmap
1271 * system needs to map virtual memory.
1276 register long npages
;
1278 register vm_size_t s
;
1279 vm_map_offset_t vaddr
;
1283 * Allocate memory for the pv_head_table and its lock bits,
1284 * the modify bit array, and the pte_page table.
1288 * zero bias all these arrays now instead of off avail_start
1289 * so we cover all memory
1292 npages
= (long)i386_btop(avail_end
);
1293 s
= (vm_size_t
) (sizeof(struct pv_rooted_entry
) * npages
1294 + (sizeof (struct pv_hashed_entry_t
*) * (npvhash
+1))
1295 + pv_lock_table_size(npages
)
1296 + pv_hash_lock_table_size((npvhash
+1))
1300 if (kernel_memory_allocate(kernel_map
, &addr
, s
, 0,
1301 KMA_KOBJECT
| KMA_PERMANENT
)
1305 memset((char *)addr
, 0, s
);
1308 if (0 == npvhash
) panic("npvhash not initialized");
1312 * Allocate the structures first to preserve word-alignment.
1314 pv_head_table
= (pv_rooted_entry_t
) addr
;
1315 addr
= (vm_offset_t
) (pv_head_table
+ npages
);
1317 pv_hash_table
= (pv_hashed_entry_t
*)addr
;
1318 addr
= (vm_offset_t
) (pv_hash_table
+ (npvhash
+ 1));
1320 pv_lock_table
= (char *) addr
;
1321 addr
= (vm_offset_t
) (pv_lock_table
+ pv_lock_table_size(npages
));
1323 pv_hash_lock_table
= (char *) addr
;
1324 addr
= (vm_offset_t
) (pv_hash_lock_table
+ pv_hash_lock_table_size((npvhash
+1)));
1326 pmap_phys_attributes
= (char *) addr
;
1331 pmap_memory_region_t
*pmptr
= pmap_memory_regions
;
1333 last_pn
= (ppnum_t
)i386_btop(avail_end
);
1335 for (i
= 0; i
< pmap_memory_region_count
; i
++, pmptr
++) {
1336 if (pmptr
->type
== kEfiConventionalMemory
) {
1338 for (pn
= pmptr
->base
; pn
<= pmptr
->end
; pn
++) {
1340 pmap_phys_attributes
[pn
] |= PHYS_MANAGED
;
1342 if (pn
> last_managed_page
)
1343 last_managed_page
= pn
;
1351 * Create the zone of physical maps,
1352 * and of the physical-to-virtual entries.
1354 s
= (vm_size_t
) sizeof(struct pmap
);
1355 pmap_zone
= zinit(s
, 400*s
, 4096, "pmap"); /* XXX */
1356 s
= (vm_size_t
) sizeof(struct pv_hashed_entry
);
1357 pv_hashed_list_zone
= zinit(s
, 10000*s
, 4096, "pv_list"); /* XXX */
1359 pdpt_zone
= zinit(s
, 400*s
, 4096, "pdpt"); /* XXX */
1361 kptobj
= &kptobj_object_store
;
1362 _vm_object_allocate((vm_object_size_t
)(NPGPTD
*NPTDPG
), kptobj
);
1363 kernel_pmap
->pm_obj
= kptobj
;
1365 /* create pv entries for kernel pages mapped by low level
1366 startup code. these have to exist so we can pmap_remove()
1367 e.g. kext pages from the middle of our addr space */
1369 vaddr
= (vm_map_offset_t
)0;
1370 for (ppn
= 0; ppn
< i386_btop(avail_start
) ; ppn
++ ) {
1371 pv_rooted_entry_t pv_e
;
1373 pv_e
= pai_to_pvh(ppn
);
1376 pv_e
->pmap
= kernel_pmap
;
1377 queue_init(&pv_e
->qlink
);
1380 pmap_initialized
= TRUE
;
1383 * Initialize pmap cache.
1385 pmap_cache_list
= PMAP_NULL
;
1386 pmap_cache_count
= 0;
1387 simple_lock_init(&pmap_cache_lock
, 0);
1389 max_preemption_latency_tsc
= tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS
, tscFCvtn2t
);
1394 #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1397 * this function is only used for debugging fron the vm layer
1403 pv_rooted_entry_t pv_h
;
1407 assert(pn
!= vm_page_fictitious_addr
);
1409 if (!pmap_initialized
)
1412 if (pn
== vm_page_guard_addr
)
1415 pai
= ppn_to_pai(pn
);
1416 if (!managed_page(pai
))
1418 pv_h
= pai_to_pvh(pn
);
1419 result
= (pv_h
->pmap
== PMAP_NULL
);
1426 vm_map_offset_t va_start
,
1427 vm_map_offset_t va_end
)
1429 vm_map_offset_t offset
;
1432 if (pmap
== PMAP_NULL
) {
1437 * Check the resident page count
1438 * - if it's zero, the pmap is completely empty.
1439 * This short-circuit test prevents a virtual address scan which is
1440 * painfully slow for 64-bit spaces.
1441 * This assumes the count is correct
1442 * .. the debug kernel ought to be checking perhaps by page table walk.
1444 if (pmap
->stats
.resident_count
== 0)
1447 for (offset
= va_start
;
1449 offset
+= PAGE_SIZE_64
) {
1450 phys_page
= pmap_find_phys(pmap
, offset
);
1452 if (pmap
!= kernel_pmap
&&
1453 pmap
->pm_task_map
== TASK_MAP_32BIT
&&
1454 offset
>= HIGH_MEM_BASE
) {
1456 * The "high_shared_pde" is used to share
1457 * the entire top-most 2MB of address space
1458 * between the kernel and all 32-bit tasks.
1459 * So none of this can be removed from 32-bit
1461 * Let's pretend there's nothing up
1466 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1467 "page %d at 0x%llx\n",
1468 pmap
, va_start
, va_end
, phys_page
, offset
);
1478 * Create and return a physical map.
1480 * If the size specified for the map
1481 * is zero, the map is an actual physical
1482 * map, and may be referenced by the
1485 * If the size specified is non-zero,
1486 * the map will be used in software only, and
1487 * is bounded by that size.
1499 pml4_entry_t
*pml4p
;
1504 PMAP_TRACE(PMAP_CODE(PMAP__CREATE
) | DBG_FUNC_START
,
1505 (int) (sz
>>32), (int) sz
, (int) is_64bit
, 0, 0);
1507 size
= (vm_size_t
) sz
;
1510 * A software use-only map doesn't even need a map.
1517 p
= (pmap_t
) zalloc(pmap_zone
);
1519 panic("pmap_create zalloc");
1521 /* init counts now since we'll be bumping some */
1522 simple_lock_init(&p
->lock
, 0);
1523 p
->stats
.resident_count
= 0;
1524 p
->stats
.resident_max
= 0;
1525 p
->stats
.wired_count
= 0;
1528 p
->pm_shared
= FALSE
;
1530 assert(!is_64bit
|| cpu_64bit
);
1531 p
->pm_task_map
= is_64bit
? TASK_MAP_64BIT
: TASK_MAP_32BIT
;;
1534 /* legacy 32 bit setup */
1535 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1536 * entry covers 1GB of addr space */
1537 if (KERN_SUCCESS
!= kmem_alloc_kobject(kernel_map
, (vm_offset_t
*)(&p
->dirbase
), NBPTD
))
1538 panic("pmap_create kmem_alloc_kobject");
1539 p
->pm_hold
= (vm_offset_t
)zalloc(pdpt_zone
);
1540 if ((vm_offset_t
)NULL
== p
->pm_hold
) {
1541 panic("pdpt zalloc");
1543 pdpt
= (pdpt_entry_t
*) (( p
->pm_hold
+ 31) & ~31);
1544 p
->pm_cr3
= (pmap_paddr_t
)kvtophys((vm_offset_t
)pdpt
);
1545 if (NULL
== (p
->pm_obj
= vm_object_allocate((vm_object_size_t
)(NPGPTD
*NPTDPG
))))
1546 panic("pmap_create vm_object_allocate");
1548 memset((char *)p
->dirbase
, 0, NBPTD
);
1550 va
= (vm_offset_t
)p
->dirbase
;
1551 p
->pdirbase
= kvtophys(va
);
1553 template = cpu_64bit
? INTEL_PTE_VALID
|INTEL_PTE_RW
|INTEL_PTE_USER
|INTEL_PTE_REF
: INTEL_PTE_VALID
;
1554 for (i
= 0; i
< NPGPTD
; i
++, pdpt
++ ) {
1556 pa
= (pmap_paddr_t
) kvtophys((vm_offset_t
)(va
+ i386_ptob(i
)));
1557 pmap_store_pte(pdpt
, pa
| template);
1560 /* map the high shared pde */
1562 pmap_store_pte(pmap_pde(p
, HIGH_MEM_BASE
), high_shared_pde
);
1568 /* alloc the pml4 page in kernel vm */
1569 if (KERN_SUCCESS
!= kmem_alloc_kobject(kernel_map
, (vm_offset_t
*)(&p
->pm_hold
), PAGE_SIZE
))
1570 panic("pmap_create kmem_alloc_kobject pml4");
1572 memset((char *)p
->pm_hold
, 0, PAGE_SIZE
);
1573 p
->pm_cr3
= (pmap_paddr_t
)kvtophys((vm_offset_t
)p
->pm_hold
);
1575 OSAddAtomic(1, &inuse_ptepages_count
);
1577 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1579 if (NULL
== (p
->pm_obj_pml4
= vm_object_allocate((vm_object_size_t
)(NPML4PGS
))))
1580 panic("pmap_create pdpt obj");
1582 if (NULL
== (p
->pm_obj_pdpt
= vm_object_allocate((vm_object_size_t
)(NPDPTPGS
))))
1583 panic("pmap_create pdpt obj");
1585 if (NULL
== (p
->pm_obj
= vm_object_allocate((vm_object_size_t
)(NPDEPGS
))))
1586 panic("pmap_create pte obj");
1588 /* uber space points to uber mapped kernel */
1590 pml4p
= pmap64_pml4(p
, 0ULL);
1591 pmap_store_pte((pml4p
+KERNEL_UBER_PML4_INDEX
),*kernel_pmap
->pm_pml4
);
1595 while ((pdp
= pmap64_pde(p
, (uint64_t)HIGH_MEM_BASE
)) == PD_ENTRY_NULL
) {
1597 pmap_expand_pdpt(p
, (uint64_t)HIGH_MEM_BASE
); /* need room for another pde entry */
1600 pmap_store_pte(pdp
, high_shared_pde
);
1605 PMAP_TRACE(PMAP_CODE(PMAP__CREATE
) | DBG_FUNC_START
,
1606 (int) p
, is_64bit
, 0, 0, 0);
1612 * The following routines implement the shared address optmization for 64-bit
1613 * users with a 4GB page zero.
1615 * pmap_set_4GB_pagezero()
1616 * is called in the exec and fork paths to mirror the kernel's
1617 * mapping in the bottom 4G of the user's pmap. The task mapping changes
1618 * from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1619 * without doing anything if the -no_shared_cr3 boot-arg is set.
1621 * pmap_clear_4GB_pagezero()
1622 * is called in the exec/exit paths to undo this mirror. The task mapping
1623 * reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1624 * CR3 by calling pmap_load_kernel_cr3().
1626 * pmap_load_kernel_cr3()
1627 * loads cr3 with the kernel's page table. In addition to being called
1628 * by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1629 * when we go idle in the context of a shared map.
1631 * Further notes on per-cpu data used:
1633 * cpu_kernel_cr3 is the cr3 for the kernel's pmap.
1634 * This is loaded in a trampoline on entering the kernel
1635 * from a 32-bit user (or non-shared-cr3 64-bit user).
1636 * cpu_task_cr3 is the cr3 for the current thread.
1637 * This is loaded in a trampoline as we exit the kernel.
1638 * cpu_active_cr3 reflects the cr3 currently loaded.
1639 * However, the low order bit is set when the
1640 * processor is idle or interrupts are disabled
1641 * while the system pmap lock is held. It is used by
1643 * cpu_task_map indicates whether the task cr3 belongs to
1644 * a 32-bit, a 64-bit or a 64-bit shared map.
1645 * The latter allows the avoidance of the cr3 load
1646 * on kernel entry and exit.
1647 * cpu_tlb_invalid set TRUE when a tlb flush is requested.
1648 * If the cr3 is "inactive" (the cpu is idle or the
1649 * system-wide pmap lock is held) this not serviced by
1650 * an IPI but at time when the cr3 becomes "active".
1654 pmap_set_4GB_pagezero(pmap_t p
)
1656 pdpt_entry_t
*user_pdptp
;
1657 pdpt_entry_t
*kern_pdptp
;
1659 assert(p
->pm_task_map
!= TASK_MAP_32BIT
);
1661 /* Kernel-shared cr3 may be disabled by boot arg. */
1666 * Set the bottom 4 3rd-level pte's to be the kernel's.
1669 while ((user_pdptp
= pmap64_pdpt(p
, 0x0)) == PDPT_ENTRY_NULL
) {
1671 pmap_expand_pml4(p
, 0x0);
1674 kern_pdptp
= kernel_pmap
->pm_pdpt
;
1675 pmap_store_pte(user_pdptp
+0, *(kern_pdptp
+0));
1676 pmap_store_pte(user_pdptp
+1, *(kern_pdptp
+1));
1677 pmap_store_pte(user_pdptp
+2, *(kern_pdptp
+2));
1678 pmap_store_pte(user_pdptp
+3, *(kern_pdptp
+3));
1679 p
->pm_task_map
= TASK_MAP_64BIT_SHARED
;
1684 pmap_clear_4GB_pagezero(pmap_t p
)
1686 pdpt_entry_t
*user_pdptp
;
1688 if (p
->pm_task_map
!= TASK_MAP_64BIT_SHARED
)
1693 p
->pm_task_map
= TASK_MAP_64BIT
;
1695 pmap_load_kernel_cr3();
1697 user_pdptp
= pmap64_pdpt(p
, 0x0);
1698 pmap_store_pte(user_pdptp
+0, 0);
1699 pmap_store_pte(user_pdptp
+1, 0);
1700 pmap_store_pte(user_pdptp
+2, 0);
1701 pmap_store_pte(user_pdptp
+3, 0);
1707 pmap_load_kernel_cr3(void)
1709 uint64_t kernel_cr3
;
1711 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1714 * Reload cr3 with the true kernel cr3.
1716 kernel_cr3
= current_cpu_datap()->cpu_kernel_cr3
;
1717 set64_cr3(kernel_cr3
);
1718 current_cpu_datap()->cpu_active_cr3
= kernel_cr3
;
1719 current_cpu_datap()->cpu_tlb_invalid
= FALSE
;
1720 __asm__
volatile("mfence");
1724 * Retire the given physical map from service.
1725 * Should only be called if the map contains
1726 * no valid mappings.
1738 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_START
,
1739 (int) p
, 0, 0, 0, 0);
1747 * If some cpu is not using the physical pmap pointer that it
1748 * is supposed to be (see set_dirbase), we might be using the
1749 * pmap that is being destroyed! Make sure we are
1750 * physically on the right pmap:
1754 0xFFFFFFFFFFFFF000ULL
);
1760 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_END
,
1761 (int) p
, 1, 0, 0, 0);
1762 return; /* still in use */
1766 * Free the memory maps, then the
1770 OSAddAtomic(-p
->pm_obj
->resident_page_count
, &inuse_ptepages_count
);
1772 kmem_free(kernel_map
, (vm_offset_t
)p
->dirbase
, NBPTD
);
1773 zfree(pdpt_zone
, (void *)p
->pm_hold
);
1775 vm_object_deallocate(p
->pm_obj
);
1778 int inuse_ptepages
= 0;
1780 /* free 64 bit mode structs */
1782 kmem_free(kernel_map
, (vm_offset_t
)p
->pm_hold
, PAGE_SIZE
);
1784 inuse_ptepages
+= p
->pm_obj_pml4
->resident_page_count
;
1785 vm_object_deallocate(p
->pm_obj_pml4
);
1787 inuse_ptepages
+= p
->pm_obj_pdpt
->resident_page_count
;
1788 vm_object_deallocate(p
->pm_obj_pdpt
);
1790 inuse_ptepages
+= p
->pm_obj
->resident_page_count
;
1791 vm_object_deallocate(p
->pm_obj
);
1793 OSAddAtomic(-inuse_ptepages
, &inuse_ptepages_count
);
1795 zfree(pmap_zone
, p
);
1797 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_END
,
1803 * Add a reference to the specified pmap.
1811 if (p
!= PMAP_NULL
) {
1819 * Remove a range of hardware page-table entries.
1820 * The entries given are the first (inclusive)
1821 * and last (exclusive) entries for the VM pages.
1822 * The virtual address is the va for the first pte.
1824 * The pmap must be locked.
1825 * If the pmap is not the kernel pmap, the range must lie
1826 * entirely within one pte-page. This is NOT checked.
1827 * Assumes that the pte-page exists.
1833 vm_map_offset_t start_vaddr
,
1837 register pt_entry_t
*cpte
;
1838 pv_hashed_entry_t pvh_et
= PV_HASHED_ENTRY_NULL
;
1839 pv_hashed_entry_t pvh_eh
= PV_HASHED_ENTRY_NULL
;
1840 pv_hashed_entry_t pvh_e
;
1842 int num_removed
, num_unwired
, num_found
;
1845 vm_map_offset_t vaddr
;
1853 if (pmap
!= kernel_pmap
&&
1854 pmap
->pm_task_map
== TASK_MAP_32BIT
&&
1855 start_vaddr
>= HIGH_MEM_BASE
) {
1857 * The range is in the "high_shared_pde" which is shared
1858 * between the kernel and all 32-bit tasks. It holds
1859 * the 32-bit commpage but also the trampolines, GDT, etc...
1860 * so we can't let user tasks remove anything from it.
1865 /* invalidate the PTEs first to "freeze" them */
1866 for (cpte
= spte
, vaddr
= start_vaddr
;
1868 cpte
++, vaddr
+= PAGE_SIZE_64
) {
1870 pa
= pte_to_pa(*cpte
);
1880 if (!managed_page(pai
)) {
1882 * Outside range of managed physical memory.
1883 * Just remove the mappings.
1885 pmap_store_pte(cpte
, 0);
1889 /* invalidate the PTE */
1890 pmap_update_pte(cpte
, *cpte
, (*cpte
& ~INTEL_PTE_VALID
));
1893 if (num_found
== 0) {
1894 /* nothing was changed: we're done */
1898 /* propagate the invalidates to other CPUs */
1900 PMAP_UPDATE_TLBS(pmap
, start_vaddr
, vaddr
);
1902 for (cpte
= spte
, vaddr
= start_vaddr
;
1904 cpte
++, vaddr
+= PAGE_SIZE_64
) {
1906 pa
= pte_to_pa(*cpte
);
1914 pa
= pte_to_pa(*cpte
);
1923 * Get the modify and reference bits, then
1924 * nuke the entry in the page table
1926 /* remember reference and change */
1927 pmap_phys_attributes
[pai
] |=
1928 (char)(*cpte
& (PHYS_MODIFIED
| PHYS_REFERENCED
));
1929 /* completely invalidate the PTE */
1930 pmap_store_pte(cpte
, 0);
1933 * Remove the mapping from the pvlist for
1934 * this physical page.
1937 pv_rooted_entry_t pv_h
;
1938 pv_hashed_entry_t
*pprevh
;
1939 ppnum_t ppn
= (ppnum_t
)pai
;
1941 pv_h
= pai_to_pvh(pai
);
1942 pvh_e
= PV_HASHED_ENTRY_NULL
;
1943 if (pv_h
->pmap
== PMAP_NULL
)
1944 panic("pmap_remove_range: null pv_list!");
1946 if (pv_h
->va
== vaddr
&& pv_h
->pmap
== pmap
) { /* rooted or not */
1948 * Header is the pv_rooted_entry. We can't free that. If there is a queued
1949 * entry after this one we remove that
1950 * from the ppn queue, we remove it from the hash chain
1951 * and copy it to the rooted entry. Then free it instead.
1954 pvh_e
= (pv_hashed_entry_t
)queue_next(&pv_h
->qlink
);
1955 if (pv_h
!= (pv_rooted_entry_t
)pvh_e
) { /* any queued after rooted? */
1957 pvhash_idx
= pvhashidx(pvh_e
->pmap
,pvh_e
->va
);
1958 LOCK_PV_HASH(pvhash_idx
);
1959 remque(&pvh_e
->qlink
);
1961 pprevh
= pvhash(pvhash_idx
);
1962 if (PV_HASHED_ENTRY_NULL
== *pprevh
) {
1963 panic("pmap_remove_range empty hash removing rooted pv");
1966 pmap_pvh_unlink(pvh_e
);
1967 UNLOCK_PV_HASH(pvhash_idx
);
1968 pv_h
->pmap
= pvh_e
->pmap
;
1969 pv_h
->va
= pvh_e
->va
; /* dispose of pvh_e */
1970 } else { /* none queued after rooted */
1971 pv_h
->pmap
= PMAP_NULL
;
1972 pvh_e
= PV_HASHED_ENTRY_NULL
;
1973 } /* any queued after rooted */
1975 } else { /* rooted or not */
1976 /* not removing rooted pv. find it on hash chain, remove from ppn queue and
1977 * hash chain and free it */
1979 pvhash_idx
= pvhashidx(pmap
,vaddr
);
1980 LOCK_PV_HASH(pvhash_idx
);
1981 pprevh
= pvhash(pvhash_idx
);
1982 if (PV_HASHED_ENTRY_NULL
== *pprevh
) {
1983 panic("pmap_remove_range empty hash removing hashed pv");
1986 pmap_pv_hashlist_walks
++;
1988 while (PV_HASHED_ENTRY_NULL
!= pvh_e
) {
1990 if (pvh_e
->pmap
== pmap
&& pvh_e
->va
== vaddr
&& pvh_e
->ppn
== ppn
) break;
1991 pprevh
= &pvh_e
->nexth
;
1992 pvh_e
= pvh_e
->nexth
;
1994 pmap_pv_hashlist_cnts
+= pv_cnt
;
1995 if (pmap_pv_hashlist_max
< pv_cnt
) pmap_pv_hashlist_max
= pv_cnt
;
1996 if (PV_HASHED_ENTRY_NULL
== pvh_e
) panic("pmap_remove_range pv not on hash");
1997 *pprevh
= pvh_e
->nexth
;
1998 remque(&pvh_e
->qlink
);
1999 UNLOCK_PV_HASH(pvhash_idx
);
2001 } /* rooted or not */
2005 if (pvh_e
!= PV_HASHED_ENTRY_NULL
) {
2006 pvh_e
->qlink
.next
= (queue_entry_t
)pvh_eh
;
2009 if (pvh_et
== PV_HASHED_ENTRY_NULL
) {
2016 } /* removing mappings for this phy page */
2019 if (pvh_eh
!= PV_HASHED_ENTRY_NULL
) {
2020 PV_HASHED_FREE_LIST(pvh_eh
, pvh_et
, pvh_cnt
);
2028 if (pmap
->stats
.resident_count
< num_removed
)
2029 panic("pmap_remove_range: resident_count");
2031 assert(pmap
->stats
.resident_count
>= num_removed
);
2032 OSAddAtomic(-num_removed
, &pmap
->stats
.resident_count
);
2035 if (pmap
->stats
.wired_count
< num_unwired
)
2036 panic("pmap_remove_range: wired_count");
2038 assert(pmap
->stats
.wired_count
>= num_unwired
);
2039 OSAddAtomic(-num_unwired
, &pmap
->stats
.wired_count
);
2045 * Remove phys addr if mapped in specified map
2049 pmap_remove_some_phys(
2050 __unused pmap_t map
,
2051 __unused ppnum_t pn
)
2054 /* Implement to support working set code */
 * Remove the given range of addresses
 * from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the hardware page size.
 */
void
pmap_remove(
    pmap_t      map,
    addr64_t    s64,
    addr64_t    e64)
{
    pt_entry_t  *pde;
    pt_entry_t  *spte, *epte;
    addr64_t    l64;
    uint64_t    deadline;

    if (map == PMAP_NULL || s64 == e64)
        return;

    PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
               (int) (s64 >> 32), (int) s64,
               (int) (e64 >> 32), (int) e64);

#if 0
    /*
     * Check that address range in the kernel does not overlap the stacks.
     * We initialize local static min/max variables once to avoid making
     * 2 function calls for every remove. Note also that these functions
     * both return 0 before kernel stacks have been initialized, and hence
     * the panic is not triggered in this case.
     */
    if (map == kernel_pmap) {
        static vm_offset_t kernel_stack_min = 0;
        static vm_offset_t kernel_stack_max = 0;

        if (kernel_stack_min == 0) {
            kernel_stack_min = min_valid_stack_address();
            kernel_stack_max = max_valid_stack_address();
        }
        if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
            (kernel_stack_min < e64 && e64 <= kernel_stack_max))
            panic("pmap_remove() attempted in kernel stack");
    }
#else
    /*
     * The values of kernel_stack_min and kernel_stack_max are no longer
     * relevant now that we allocate kernel stacks anywhere in the kernel map,
     * so the old code above no longer applies.  If we wanted to check that
     * we weren't removing a mapping of a page in a kernel stack we'd have to
     * mark the PTE with an unused bit and check that here.
     */
#endif

    PMAP_LOCK(map);

    deadline = rdtsc64() + max_preemption_latency_tsc;

    while (s64 < e64) {
        l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
        if (l64 > e64)
            l64 = e64;
        pde = pmap_pde(map, s64);

        if (pde && (*pde & INTEL_PTE_VALID)) {
            spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
            spte = &spte[ptenum(s64)];
            epte = &spte[intel_btop(l64 - s64)];

            pmap_remove_range(map, s64, spte, epte);
        }
        s64 = l64;

        if (s64 < e64 && rdtsc64() >= deadline) {
            PMAP_UNLOCK(map);
            PMAP_LOCK(map);
            deadline = rdtsc64() + max_preemption_latency_tsc;
        }
    }

    PMAP_UNLOCK(map);

    PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
               (int) map, 0, 0, 0, 0);
}
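
/*
 * Illustrative sketch (not compiled): how the per-iteration chunk limit in
 * pmap_remove() above is derived.  With a 2MB pde_mapped_size, an unaligned
 * start of 0x301000 is rounded up to the next pde boundary, so one pass
 * covers [0x301000, 0x400000) and the following pass starts pde-aligned.
 * The helper name below is hypothetical, used only for illustration.
 */
#if 0
static addr64_t
example_next_pde_boundary(addr64_t s64)
{
    /* same arithmetic as the l64 computation in pmap_remove() */
    return (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
    /* e.g. pde_mapped_size == 0x200000: 0x301000 -> 0x400000 */
}
#endif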
 * Routine:    pmap_page_protect
 *
 * Function:
 *     Lower the permission for all mappings to a given
 *     page.
 */
void
pmap_page_protect(
    ppnum_t     pn,
    vm_prot_t   prot)
{
    pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
    pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
    pv_hashed_entry_t       nexth;
    int                     pvh_cnt = 0;
    pv_rooted_entry_t       pv_h;
    pv_rooted_entry_t       pv_e;
    pv_hashed_entry_t       pvh_e;
    pt_entry_t              *pte;
    int                     pai;
    register pmap_t         pmap;
    boolean_t               remove;
    int                     pvhash_idx;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)
        return;

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */
        return;
    }

    PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
               (int) pn, (int) prot, 0, 0, 0);

    /*
     * Determine the new protection.
     */
    switch (prot) {
    case VM_PROT_READ:
    case VM_PROT_READ | VM_PROT_EXECUTE:
        remove = FALSE;
        break;
    case VM_PROT_ALL:
        return;     /* nothing to do */
    default:
        remove = TRUE;
        break;
    }

    pv_h = pai_to_pvh(pai);

    LOCK_PVH(pai);

    /*
     * Walk down PV list, changing or removing all mappings.
     */
    if (pv_h->pmap != PMAP_NULL) {

        pv_e = pv_h;
        pvh_e = (pv_hashed_entry_t)pv_e;    /* cheat */

        do {
            register vm_map_offset_t vaddr;

            pmap = pv_e->pmap;
            vaddr = pv_e->va;

            pte = pmap_pte(pmap, vaddr);
            if (pte == 0)
                panic("pmap_page_protect: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx, prot: %d kernel_pmap: %p", pmap, pn, vaddr, prot, kernel_pmap);

            nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);   /* if there is one */

            /*
             * Remove the mapping if new protection is NONE
             * or if write-protecting a kernel mapping.
             */
            if (remove || pmap == kernel_pmap) {
                /*
                 * Remove the mapping, collecting any modify bits.
                 */
                pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));

                PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);

                pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED | PHYS_REFERENCED);

                pmap_store_pte(pte, 0);

                if (pmap->stats.resident_count < 1)
                    panic("pmap_page_protect: resident_count");
                assert(pmap->stats.resident_count >= 1);
                OSAddAtomic(-1, &pmap->stats.resident_count);

                /*
                 * Deal with the pv_rooted_entry.
                 */
                if (pv_e == pv_h) {
                    /*
                     * Fix up head later.
                     */
                    pv_h->pmap = PMAP_NULL;
                } else {
                    /*
                     * Delete this entry.
                     */
                    pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                    LOCK_PV_HASH(pvhash_idx);
                    remque(&pvh_e->qlink);
                    pmap_pvh_unlink(pvh_e);
                    UNLOCK_PV_HASH(pvhash_idx);

                    pvh_e->qlink.next = (queue_entry_t)pvh_eh;
                    pvh_eh = pvh_e;

                    if (pvh_et == PV_HASHED_ENTRY_NULL)
                        pvh_et = pvh_e;
                    pvh_cnt++;
                }
            } else {
                /*
                 * Write-protect.
                 */
                pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
                PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
            }

        } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);

        /*
         * If pv_head mapping was removed, fix it up.
         */
        if (pv_h->pmap == PMAP_NULL) {
            pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);

            if (pvh_e != (pv_hashed_entry_t)pv_h) {
                pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                LOCK_PV_HASH(pvhash_idx);
                remque(&pvh_e->qlink);
                pmap_pvh_unlink(pvh_e);
                UNLOCK_PV_HASH(pvhash_idx);
                pv_h->pmap = pvh_e->pmap;
                pv_h->va = pvh_e->va;
                pvh_e->qlink.next = (queue_entry_t)pvh_eh;
                pvh_eh = pvh_e;

                if (pvh_et == PV_HASHED_ENTRY_NULL)
                    pvh_et = pvh_e;
                pvh_cnt++;
            }
        }
    }
    if (pvh_eh != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
    }

    UNLOCK_PVH(pai);

    PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
               0, 0, 0, 0, 0);
}

/*
 * Disconnect all mappings for this page and return reference and change status
 * in generic format.
 */
unsigned int pmap_disconnect(
    ppnum_t pa)
{
    pmap_page_protect(pa, 0);           /* disconnect the page */
    return (pmap_get_refmod(pa));       /* return ref/chg status */
}
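
/*
 * Illustrative sketch (not compiled): a typical caller-side pattern for the
 * routine above -- sever every mapping of a page and capture its ref/mod
 * state in the generic VM_MEM_* format before the page is reused.  The
 * helper name is hypothetical, used only for illustration.
 */
#if 0
static void
example_reclaim_page(ppnum_t pn)
{
    unsigned int refmod;

    refmod = pmap_disconnect(pn);       /* no mappings remain afterwards */
    if (refmod & VM_MEM_MODIFIED) {
        /* page contents must be cleaned before reuse */
    }
}
#endif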
 * Set the physical protection on the
 * specified range of this map as requested.
 * Will not increase permissions.
 */
void
pmap_protect(
    pmap_t          map,
    vm_map_offset_t sva,
    vm_map_offset_t eva,
    vm_prot_t       prot)
{
    register pt_entry_t *pde;
    register pt_entry_t *spte, *epte;
    vm_map_offset_t     lva;
    vm_map_offset_t     orig_sva;
    boolean_t           set_NX;
    int                 num_found = 0;

    if (map == PMAP_NULL)
        return;

    if (prot == VM_PROT_NONE) {
        pmap_remove(map, sva, eva);
        return;
    }

    PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
               (int) (sva >> 32), (int) sva,
               (int) (eva >> 32), (int) eva);

    if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
        set_NX = FALSE;
    else
        set_NX = TRUE;

    PMAP_LOCK(map);

    orig_sva = sva;
    while (sva < eva) {
        lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
        if (lva > eva)
            lva = eva;
        pde = pmap_pde(map, sva);
        if (pde && (*pde & INTEL_PTE_VALID)) {
            spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
            spte = &spte[ptenum(sva)];
            epte = &spte[intel_btop(lva - sva)];

            while (spte < epte) {

                if (*spte & INTEL_PTE_VALID) {

                    if (prot & VM_PROT_WRITE)
                        pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
                    else
                        pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));

                    if (set_NX)
                        pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
                    else
                        pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));

                    num_found++;
                }
                spte++;
            }
        }
        sva = lva;
    }
    if (num_found)
        PMAP_UPDATE_TLBS(map, orig_sva, eva);

    PMAP_UNLOCK(map);

    PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
               0, 0, 0, 0, 0);
}

/* Map a (possibly) autogenned block */
void
pmap_map_block(
    pmap_t      pmap,
    addr64_t    va,
    ppnum_t     pa,
    uint32_t    size,
    vm_prot_t   prot,
    int         attr,
    __unused unsigned int   flags)
{
    uint32_t    page;

    for (page = 0; page < size; page++) {
        pmap_enter(pmap, va, pa, prot, attr, TRUE);
        va += PAGE_SIZE;
        pa++;
    }
}
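
/*
 * Illustrative sketch (not compiled): pmap_map_block() above simply walks
 * `size' consecutive pages, so mapping a 4-page window is equivalent to four
 * pmap_enter() calls with va and pa advancing in lock step.  The helper name
 * is hypothetical, used only for illustration.
 */
#if 0
static void
example_map_window(pmap_t pmap, addr64_t va, ppnum_t first_page)
{
    pmap_map_block(pmap, va, first_page, 4 /* pages */,
                   VM_PROT_READ | VM_PROT_WRITE, 0, 0);
}
#endif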
 * Insert the given physical page (p) at
 * the specified virtual address (v) in the
 * target physical map with the protection requested.
 *
 * If specified, the page will be wired down, meaning
 * that the related pte cannot be reclaimed.
 *
 * NB: This is the only routine which MAY NOT lazy-evaluate
 * or lose information.  That is, this routine must actually
 * insert this page into the given map NOW.
 */
void
pmap_enter(
    register pmap_t     pmap,
    vm_map_offset_t     vaddr,
    ppnum_t             pn,
    vm_prot_t           prot,
    unsigned int        flags,
    boolean_t           wired)
{
    register pt_entry_t         *pte;
    register pv_rooted_entry_t  pv_h;
    int                         pai;
    pv_hashed_entry_t           pvh_e;
    pv_hashed_entry_t           pvh_new;
    pv_hashed_entry_t           *hashp;
    pt_entry_t                  template;
    pmap_paddr_t                old_pa;
    pmap_paddr_t                pa = (pmap_paddr_t)i386_ptob(pn);
    boolean_t                   need_tlbflush = FALSE;
    boolean_t                   set_NX;
    char                        oattr;
    boolean_t                   old_pa_locked;
    int                         pvhash_idx;
    int                         pv_cnt;

    assert(pn != vm_page_fictitious_addr);
    if (pmap_debug)
        printf("pmap(%qx, %x)\n", vaddr, pn);
    if (pmap == PMAP_NULL)
        return;
    if (pn == vm_page_guard_addr)
        return;

    PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
               (int) pmap,
               (int) (vaddr >> 32), (int) vaddr,
               (int) pn, prot);

    if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
        set_NX = FALSE;
    else
        set_NX = TRUE;

    /*
     * Must allocate a new pvlist entry while we're unlocked;
     * zalloc may cause pageout (which will lock the pmap system).
     * If we determine we need a pvlist entry, we will unlock
     * and allocate one.  Then we will retry, throwing away
     * the allocated entry later (if we no longer need it).
     */
    pvh_new = PV_HASHED_ENTRY_NULL;
Retry:
    pvh_e = PV_HASHED_ENTRY_NULL;

    PMAP_LOCK(pmap);

    /*
     * Expand pmap to include this pte.  Assume that
     * pmap is always expanded to include enough hardware
     * pages to map one VM page.
     */
    while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
        /*
         * Must unlock to expand the pmap.
         */
        PMAP_UNLOCK(pmap);
        pmap_expand(pmap, vaddr);   /* going to grow pde level page(s) */
        PMAP_LOCK(pmap);
    }

    old_pa = pte_to_pa(*pte);
    pai = pa_index(old_pa);
    old_pa_locked = FALSE;

    /*
     * if we have a previous managed page, lock the pv entry now. after
     * we lock it, check to see if someone beat us to the lock and if so
     * drop the lock
     */
    if ((0 != old_pa) && managed_page(pai)) {
        LOCK_PVH(pai);
        old_pa_locked = TRUE;
        old_pa = pte_to_pa(*pte);
        if (0 == old_pa) {
            UNLOCK_PVH(pai);    /* some other path beat us to it */
            old_pa_locked = FALSE;
        }
    }

    /*
     * Special case if the incoming physical page is already mapped
     * at this address.
     */
    if (old_pa == pa) {
        /*
         * May be changing its wired attribute or protection
         */
        template = pa_to_pte(pa) | INTEL_PTE_VALID;

        if (VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
            if (!(flags & VM_MEM_GUARDED))
                template |= INTEL_PTE_PTA;
            template |= INTEL_PTE_NCACHE;
        }

        if (pmap != kernel_pmap)
            template |= INTEL_PTE_USER;
        if (prot & VM_PROT_WRITE)
            template |= INTEL_PTE_WRITE;

        if (set_NX)
            template |= INTEL_PTE_NX;

        if (wired) {
            template |= INTEL_PTE_WIRED;
            if (!iswired(*pte))
                OSAddAtomic(+1, &pmap->stats.wired_count);
        } else {
            if (iswired(*pte)) {
                assert(pmap->stats.wired_count >= 1);
                OSAddAtomic(-1, &pmap->stats.wired_count);
            }
        }

        /* store modified PTE and preserve RC bits */
        pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
        if (old_pa_locked) {
            UNLOCK_PVH(pai);
            old_pa_locked = FALSE;
        }
        need_tlbflush = TRUE;
        goto Done;
    }

    /*
     * Outline of code from here:
     *   1) If va was mapped, update TLBs, remove the mapping
     *      and remove old pvlist entry.
     *   2) Add pvlist entry for new mapping
     *   3) Enter new mapping.
     *
     * If the old physical page is not managed step 1) is skipped
     * (except for updating the TLBs), and the mapping is
     * overwritten at step 3).  If the new physical page is not
     * managed, step 2) is skipped.
     */

    if (old_pa != (pmap_paddr_t) 0) {

        /*
         * Don't do anything to pages outside valid memory here.
         * Instead convince the code that enters a new mapping
         * to overwrite the old one.
         */

        /* invalidate the PTE */
        pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
        /* propagate invalidate everywhere */
        PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
        /* remember reference and change */
        oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
        /* completely invalidate the PTE */
        pmap_store_pte(pte, 0);

        if (managed_page(pai)) {
            if (pmap->stats.resident_count < 1)
                panic("pmap_enter: resident_count");
            assert(pmap->stats.resident_count >= 1);
            OSAddAtomic(-1, &pmap->stats.resident_count);

            if (iswired(*pte)) {
                if (pmap->stats.wired_count < 1)
                    panic("pmap_enter: wired_count");
                assert(pmap->stats.wired_count >= 1);
                OSAddAtomic(-1, &pmap->stats.wired_count);
            }

            pmap_phys_attributes[pai] |= oattr;
            /*
             * Remove the mapping from the pvlist for
             * this physical page.
             * We'll end up with either a rooted pv or a
             * hashed pv
             */
            {
                pv_h = pai_to_pvh(pai);

                if (pv_h->pmap == PMAP_NULL) {
                    panic("pmap_enter: null pv_list!");
                }

                if (pv_h->va == vaddr && pv_h->pmap == pmap) {
                    /*
                     * Header is the pv_rooted_entry.
                     * If there is a next one, copy it to the
                     * header and free the next one (we cannot
                     * free the header)
                     */
                    pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
                    if (pvh_e != (pv_hashed_entry_t)pv_h) {
                        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                        LOCK_PV_HASH(pvhash_idx);
                        remque(&pvh_e->qlink);
                        pmap_pvh_unlink(pvh_e);
                        UNLOCK_PV_HASH(pvhash_idx);
                        pv_h->pmap = pvh_e->pmap;
                        pv_h->va = pvh_e->va;
                    } else {
                        pv_h->pmap = PMAP_NULL;
                        pvh_e = PV_HASHED_ENTRY_NULL;
                    }
                } else {
                    pv_hashed_entry_t   *pprevh;
                    ppnum_t             old_ppn;

                    /* wasn't the rooted pv - hash, find it, and unlink it */
                    old_ppn = (ppnum_t)pa_index(old_pa);
                    pvhash_idx = pvhashidx(pmap, vaddr);
                    LOCK_PV_HASH(pvhash_idx);
                    pprevh = pvhash(pvhash_idx);
                    if (NULL == pprevh)
                        panic("pmap enter 1");
                    pvh_e = *pprevh;
                    pmap_pv_hashlist_walks++;
                    pv_cnt = 0;
                    while (PV_HASHED_ENTRY_NULL != pvh_e) {
                        pv_cnt++;
                        if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn)
                            break;
                        pprevh = &pvh_e->nexth;
                        pvh_e = pvh_e->nexth;
                    }
                    pmap_pv_hashlist_cnts += pv_cnt;
                    if (pmap_pv_hashlist_max < pv_cnt)
                        pmap_pv_hashlist_max = pv_cnt;
                    if (PV_HASHED_ENTRY_NULL == pvh_e)
                        panic("pmap_enter: pv not in hash list");
                    if (NULL == pprevh)
                        panic("pmap enter 2");
                    *pprevh = pvh_e->nexth;
                    remque(&pvh_e->qlink);
                    UNLOCK_PV_HASH(pvhash_idx);
                }
            }
        } else {

            /*
             * old_pa is not managed.
             * Do removal part of accounting.
             */

            if (iswired(*pte)) {
                assert(pmap->stats.wired_count >= 1);
                OSAddAtomic(-1, &pmap->stats.wired_count);
            }
        }
    }

    /*
     * if we had a previously managed paged locked, unlock it now
     */
    if (old_pa_locked) {
        UNLOCK_PVH(pai);
        old_pa_locked = FALSE;
    }

    pai = pa_index(pa);     /* now working with new incoming phys page */
    if (managed_page(pai)) {

        /*
         * Step 2) Enter the mapping in the PV list for this
         * physical page.
         */
        pv_h = pai_to_pvh(pai);

        LOCK_PVH(pai);

        if (pv_h->pmap == PMAP_NULL) {
            /*
             * No mappings yet, use rooted pv
             */
            pv_h->va = vaddr;
            pv_h->pmap = pmap;
            queue_init(&pv_h->qlink);
        } else {
            /*
             * Add new pv_hashed_entry after header.
             */
            if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
                pvh_e = pvh_new;
                pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */
            } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
                PV_HASHED_ALLOC(pvh_e);
                if (PV_HASHED_ENTRY_NULL == pvh_e) {
                    /* the pv list is empty.
                     * if we are on the kernel pmap we'll use one of the special private
                     * kernel pv_e's, else, we need to unlock everything, zalloc a pv_e,
                     * and restart bringing in the pv_e with us.
                     */
                    if (kernel_pmap == pmap) {
                        PV_HASHED_KERN_ALLOC(pvh_e);
                    } else {
                        UNLOCK_PVH(pai);
                        PMAP_UNLOCK(pmap);
                        pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
                        goto Retry;
                    }
                }
            }

            if (PV_HASHED_ENTRY_NULL == pvh_e)
                panic("pvh_e exhaustion");
            pvh_e->va = vaddr;
            pvh_e->pmap = pmap;
            pvh_e->ppn = pn;
            pvhash_idx = pvhashidx(pmap, vaddr);
            LOCK_PV_HASH(pvhash_idx);
            insque(&pvh_e->qlink, &pv_h->qlink);
            hashp = pvhash(pvhash_idx);
            if (NULL == hashp)
                panic("pmap_enter 4");
            pvh_e->nexth = *hashp;
            *hashp = pvh_e;
            UNLOCK_PV_HASH(pvhash_idx);

            /*
             * Remember that we used the pvlist entry.
             */
            pvh_e = PV_HASHED_ENTRY_NULL;
        }

        /*
         * only count the mapping
         * for 'managed memory'
         */
        OSAddAtomic(+1, &pmap->stats.resident_count);
        if (pmap->stats.resident_count > pmap->stats.resident_max) {
            pmap->stats.resident_max = pmap->stats.resident_count;
        }
    }

    /*
     * Step 3) Enter the mapping.
     *
     * Build a template to speed up entering -
     * only the pfn changes.
     */
    template = pa_to_pte(pa) | INTEL_PTE_VALID;

    if (flags & VM_MEM_NOT_CACHEABLE) {
        if (!(flags & VM_MEM_GUARDED))
            template |= INTEL_PTE_PTA;
        template |= INTEL_PTE_NCACHE;
    }

    if (pmap != kernel_pmap)
        template |= INTEL_PTE_USER;
    if (prot & VM_PROT_WRITE)
        template |= INTEL_PTE_WRITE;

    if (set_NX)
        template |= INTEL_PTE_NX;

    if (wired) {
        template |= INTEL_PTE_WIRED;
        OSAddAtomic(+1, &pmap->stats.wired_count);
    }
    pmap_store_pte(pte, template);

    /* if this was a managed page we delayed unlocking the pv until here
     * to prevent pmap_page_protect et al from finding it until the pte
     * has been stored */
    if (managed_page(pai)) {
        UNLOCK_PVH(pai);
    }

Done:
    if (need_tlbflush == TRUE)
        PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);

    if (pvh_e != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
    }
    if (pvh_new != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
    }

    PMAP_UNLOCK(pmap);
    PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
 * Routine:    pmap_change_wiring
 * Function:   Change the wiring attribute for a map/virtual-address
 *             pair.
 * In/out conditions:
 *             The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(
    register pmap_t     map,
    vm_map_offset_t     vaddr,
    boolean_t           wired)
{
    register pt_entry_t *pte;

    /*
     * We must grab the pmap system lock because we may
     * change a pte_page queue.
     */
    PMAP_LOCK(map);

    if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
        panic("pmap_change_wiring: pte missing");

    if (wired && !iswired(*pte)) {
        /*
         * wiring down mapping
         */
        OSAddAtomic(+1, &map->stats.wired_count);
        pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
    }
    else if (!wired && iswired(*pte)) {
        /*
         * unwiring mapping
         */
        assert(map->stats.wired_count >= 1);
        OSAddAtomic(-1, &map->stats.wired_count);
        pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
    }

    PMAP_UNLOCK(map);
}

/*
 * Routine:    pmap_extract
 * Function:
 *     Extract the physical page address associated
 *     with the given map/virtual_address pair.
 *     Change to shim for backwards compatibility but will not
 *     work for 64 bit systems.  Some old drivers that we cannot
 *     change need this.
 */
vm_offset_t
pmap_extract(
    register pmap_t     pmap,
    vm_map_offset_t     vaddr)
{
    ppnum_t     ppn;
    vm_offset_t paddr;

    paddr = (vm_offset_t)0;
    ppn = pmap_find_phys(pmap, vaddr);

    if (ppn) {
        paddr = ((vm_offset_t)i386_ptob(ppn)) | ((vm_offset_t)vaddr & INTEL_OFFMASK);
    }
    return (paddr);
}
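
/*
 * Illustrative sketch (not compiled): the address composition performed by
 * pmap_extract() above.  For a translation whose physical page number is
 * 0x1234 and a vaddr whose low bits are 0xabc, the result is
 * i386_ptob(0x1234) | 0xabc == 0x1234abc (4K pages assumed).  The helper
 * name is hypothetical, used only for illustration.
 */
#if 0
static vm_offset_t
example_compose_paddr(ppnum_t ppn, vm_map_offset_t vaddr)
{
    return ((vm_offset_t)i386_ptob(ppn)) | ((vm_offset_t)vaddr & INTEL_OFFMASK);
}
#endif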
void
pmap_expand_pml4(
    pmap_t          map,
    vm_map_offset_t vaddr)
{
    register vm_page_t      m;
    register pmap_paddr_t   pa;
    uint64_t                i;
    ppnum_t                 pn;
    pml4_entry_t            *pml4p;

    if (kernel_pmap == map)
        panic("expand kernel pml4");

    pml4p = pmap64_pml4(map, vaddr);
    if (PML4_ENTRY_NULL == pml4p)
        panic("pmap_expand_pml4 no pml4p");

    /*
     * Allocate a VM page for the pml4 page
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL)
        VM_PAGE_WAIT();

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    pn = m->phys_page;
    pa = i386_ptob(pn);
    i = pml4idx(map, vaddr);

    vm_page_lockspin_queues();
    vm_page_wire(m);
    vm_page_unlock_queues();

    OSAddAtomic(1, &inuse_ptepages_count);

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj_pml4);

    PMAP_LOCK(map);
    /*
     * See if someone else expanded us first
     */
    if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
        PMAP_UNLOCK(map);
        vm_object_unlock(map->pm_obj_pml4);

        VM_PAGE_FREE(m);

        OSAddAtomic(-1, &inuse_ptepages_count);
        return;
    }

    if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
        panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
              map, map->pm_obj_pml4, vaddr, i);
    }
    vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
    vm_object_unlock(map->pm_obj_pml4);

    /*
     * Set the page directory entry for this page table.
     */
    pml4p = pmap64_pml4(map, vaddr);    /* refetch under lock */

    pmap_store_pte(pml4p, pa_to_pte(pa)
                          | INTEL_PTE_VALID
                          | INTEL_PTE_USER
                          | INTEL_PTE_WRITE);

    PMAP_UNLOCK(map);
}

void
pmap_expand_pdpt(
    pmap_t          map,
    vm_map_offset_t vaddr)
{
    register vm_page_t      m;
    register pmap_paddr_t   pa;
    uint64_t                i;
    ppnum_t                 pn;
    pdpt_entry_t            *pdptp;

    if (kernel_pmap == map)
        panic("expand kernel pdpt");

    while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
        pmap_expand_pml4(map, vaddr);   /* need room for another pdpt entry */
    }

    /*
     * Allocate a VM page for the pdpt page
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL)
        VM_PAGE_WAIT();

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    pn = m->phys_page;
    pa = i386_ptob(pn);
    i = pdptidx(map, vaddr);

    vm_page_lockspin_queues();
    vm_page_wire(m);
    vm_page_unlock_queues();

    OSAddAtomic(1, &inuse_ptepages_count);

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj_pdpt);

    PMAP_LOCK(map);
    /*
     * See if someone else expanded us first
     */
    if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
        PMAP_UNLOCK(map);
        vm_object_unlock(map->pm_obj_pdpt);

        VM_PAGE_FREE(m);

        OSAddAtomic(-1, &inuse_ptepages_count);
        return;
    }

    if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
        panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
              map, map->pm_obj_pdpt, vaddr, i);
    }
    vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
    vm_object_unlock(map->pm_obj_pdpt);

    /*
     * Set the page directory entry for this page table.
     */
    pdptp = pmap64_pdpt(map, vaddr);    /* refetch under lock */

    pmap_store_pte(pdptp, pa_to_pte(pa)
                          | INTEL_PTE_VALID
                          | INTEL_PTE_USER
                          | INTEL_PTE_WRITE);

    PMAP_UNLOCK(map);
}
/*
 * Routine:    pmap_expand
 *
 * Expands a pmap to be able to map the specified virtual address.
 *
 * Allocates new virtual memory for the P0 or P1 portion of the
 * pmap, then re-maps the physical pages that were in the old
 * pmap to be in the new pmap.
 *
 * Must be called with the pmap system and the pmap unlocked,
 * since these must be unlocked to use vm_allocate or vm_deallocate.
 * Thus it must be called in a loop that checks whether the map
 * has been expanded enough.
 * (We won't loop forever, since page tables aren't shrunk.)
 */
void
pmap_expand(
    pmap_t          map,
    vm_map_offset_t vaddr)
{
    pt_entry_t              *pdp;
    register vm_page_t      m;
    register pmap_paddr_t   pa;
    uint64_t                i;
    ppnum_t                 pn;

    /*
     * if not the kernel map (while we are still compat kernel mode)
     * and we are 64 bit, propagate expand upwards
     */
    if (cpu_64bit && (map != kernel_pmap)) {
        while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
            pmap_expand_pdpt(map, vaddr);   /* need room for another pde entry */
        }
    }

    /*
     * Allocate a VM page for the pde entries.
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL)
        VM_PAGE_WAIT();

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    pn = m->phys_page;
    pa = i386_ptob(pn);
    i = pdeidx(map, vaddr);

    vm_page_lockspin_queues();
    vm_page_wire(m);
    vm_page_unlock_queues();

    OSAddAtomic(1, &inuse_ptepages_count);

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj);

    PMAP_LOCK(map);
    /*
     * See if someone else expanded us first
     */
    if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
        PMAP_UNLOCK(map);
        vm_object_unlock(map->pm_obj);

        VM_PAGE_FREE(m);

        OSAddAtomic(-1, &inuse_ptepages_count);
        return;
    }

    if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
        panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
              map, map->pm_obj, vaddr, i);
    }
    vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
    vm_object_unlock(map->pm_obj);

    /*
     * refetch while locked
     */
    pdp = pmap_pde(map, vaddr);

    /*
     * Set the page directory entry for this page table.
     */
    pmap_store_pte(pdp, pa_to_pte(pa)
                        | INTEL_PTE_VALID
                        | INTEL_PTE_USER
                        | INTEL_PTE_WRITE);

    PMAP_UNLOCK(map);
}
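
/*
 * Illustrative sketch (not compiled): the caller-side retry loop that the
 * comment above describes.  pmap_enter() uses exactly this pattern -- keep
 * expanding until pmap_pte() can resolve the target virtual address.  The
 * helper name is hypothetical, used only for illustration.
 */
#if 0
static pt_entry_t *
example_pte_with_expand(pmap_t pmap, vm_map_offset_t vaddr)
{
    pt_entry_t *pte;

    while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL)
        pmap_expand(pmap, vaddr);   /* grows pde-level page(s) as needed */
    return pte;
}
#endif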
/*
 * pmap_sync_page_data_phys(ppnum_t pa)
 *
 * Invalidates all of the instruction cache on a physical page and
 * pushes any dirty data from the data cache for the same physical page
 * Not required in i386.
 */
void
pmap_sync_page_data_phys(__unused ppnum_t pa)
{
    return;
}

/*
 * pmap_sync_page_attributes_phys(ppnum_t pa)
 *
 * Write back and invalidate all cachelines on a physical page.
 */
void
pmap_sync_page_attributes_phys(ppnum_t pa)
{
    cache_flush_page_phys(pa);
}
#ifdef CURRENTLY_UNUSED_AND_UNTESTED

/*
 * Routine:    pmap_collect
 * Function:
 *     Garbage collects the physical map system for
 *     pages which are no longer used.
 *     Success need not be guaranteed -- that is, there
 *     may well be pages which are not referenced, but
 *     others may be collected.
 * Usage:
 *     Called by the pageout daemon when pages are scarce.
 */
void
pmap_collect(
    pmap_t      p)
{
    register pt_entry_t *pdp, *ptp;
    pt_entry_t          *eptp;
    int                 wired;

    if (p == PMAP_NULL)
        return;

    if (p == kernel_pmap)
        return;

    /*
     * Garbage collect map.
     */
    PMAP_LOCK(p);

    for (pdp = (pt_entry_t *)p->dirbase;
         pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
         pdp++)
    {
        if (*pdp & INTEL_PTE_VALID) {
            if (*pdp & INTEL_PTE_REF) {
                pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
            } else {
                ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
                eptp = ptp + NPTEPG;

                /*
                 * If the pte page has any wired mappings, we cannot
                 * free it.
                 */
                wired = 0;
                {
                    register pt_entry_t *ptep;
                    for (ptep = ptp; ptep < eptp; ptep++) {
                        if (iswired(*ptep)) {
                            wired = 1;
                            break;
                        }
                    }
                }
                if (!wired) {
                    /*
                     * Remove the virtual addresses mapped by this pte page.
                     */
                    pmap_remove_range(p,
                                      pdetova(pdp - (pt_entry_t *)p->dirbase),
                                      ptp,
                                      eptp);

                    /*
                     * Invalidate the page directory pointer.
                     */
                    pmap_store_pte(pdp, 0x0);

                    /*
                     * And free the pte page itself.
                     */
                    {
                        register vm_page_t m;

                        vm_object_lock(p->pm_obj);

                        m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
                        if (m == VM_PAGE_NULL)
                            panic("pmap_collect: pte page not in object");

                        VM_PAGE_FREE(m);

                        OSAddAtomic(-1, &inuse_ptepages_count);

                        vm_object_unlock(p->pm_obj);
                    }
                }
            }
        }
    }

    PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
    PMAP_UNLOCK(p);
    return;
}
#endif /* CURRENTLY_UNUSED_AND_UNTESTED */

void
pmap_copy_page(ppnum_t src, ppnum_t dst)
{
    bcopy_phys((addr64_t)i386_ptob(src),
               (addr64_t)i386_ptob(dst),
               PAGE_SIZE);
}

/*
 * Routine:    pmap_pageable
 * Function:
 *     Make the specified pages (by pmap, offset)
 *     pageable (or not) as requested.
 *
 *     A page which is not pageable may not take
 *     a fault; therefore, its page table entry
 *     must remain valid for the duration.
 *
 *     This routine is merely advisory; pmap_enter
 *     will specify that these pages are to be wired
 *     down (or not) as appropriate.
 */
void
pmap_pageable(
    __unused pmap_t             pmap,
    __unused vm_map_offset_t    start_addr,
    __unused vm_map_offset_t    end_addr,
    __unused boolean_t          pageable)
{
#ifdef  lint
    pmap++; start_addr++; end_addr++; pageable++;
#endif  /* lint */
}
/*
 * Clear specified attribute bits.
 */
void
phys_attribute_clear(
    ppnum_t     pn,
    int         bits)
{
    pv_rooted_entry_t           pv_h;
    register pv_hashed_entry_t  pv_e;
    register pt_entry_t         *pte;
    int                         pai;
    register pmap_t             pmap;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)
        return;

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */
        return;
    }

    PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
               (int) pn, bits, 0, 0, 0);

    pv_h = pai_to_pvh(pai);

    LOCK_PVH(pai);

    /*
     * Walk down PV list, clearing all modify or reference bits.
     * We do not have to lock the pv_list because we have
     * the entire pmap system locked.
     */
    if (pv_h->pmap != PMAP_NULL) {
        /*
         * There are some mappings.
         */

        pv_e = (pv_hashed_entry_t)pv_h;

        do {
            vm_map_offset_t va;

            pmap = pv_e->pmap;
            va = pv_e->va;

            /*
             * Clear modify and/or reference bits.
             */
            pte = pmap_pte(pmap, va);
            pmap_update_pte(pte, *pte, (*pte & ~bits));
            /* Ensure all processors using this translation
             * invalidate this TLB entry.  The invalidation *must* follow
             * the PTE update, to ensure that the TLB shadow of the
             * 'D' bit (in particular) is synchronized with the
             * updated PTE.
             */
            PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);

            pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);

        } while (pv_e != (pv_hashed_entry_t)pv_h);
    }
    pmap_phys_attributes[pai] &= ~bits;

    UNLOCK_PVH(pai);

    PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
               0, 0, 0, 0, 0);
}

/*
 * Check specified attribute bits.
 */
int
phys_attribute_test(
    ppnum_t     pn,
    int         bits)
{
    pv_rooted_entry_t           pv_h;
    register pv_hashed_entry_t  pv_e;
    register pt_entry_t         *pte;
    int                         pai;
    register pmap_t             pmap;
    int                         attributes = 0;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)
        return 0;

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */
        return (0);
    }

    /*
     * super fast check...  if bits already collected
     * no need to take any locks...
     * if not set, we need to recheck after taking
     * the lock in case they got pulled in while
     * we were waiting for the lock
     */
    if ((pmap_phys_attributes[pai] & bits) == bits)
        return (bits);

    pv_h = pai_to_pvh(pai);

    LOCK_PVH(pai);

    attributes = pmap_phys_attributes[pai] & bits;

    /*
     * Walk down PV list, checking the mappings until we
     * reach the end or we've found the attributes we've asked for
     * We do not have to lock the pv_list because we have
     * the entire pmap system locked.
     */
    if (pv_h->pmap != PMAP_NULL) {
        /*
         * There are some mappings.
         */
        pv_e = (pv_hashed_entry_t)pv_h;
        if (attributes != bits) do {
            vm_map_offset_t va;

            pmap = pv_e->pmap;
            va = pv_e->va;
            /*
             * first make sure any processor actively
             * using this pmap, flushes its TLB state
             */
            PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);

            /*
             * pick up modify and/or reference bits from this mapping
             */
            pte = pmap_pte(pmap, va);
            attributes |= (int)(*pte & bits);

            pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);

        } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h));
    }

    UNLOCK_PVH(pai);
    return (attributes);
}

/*
 * Set specified attribute bits.
 */
void
phys_attribute_set(
    ppnum_t     pn,
    int         bits)
{
    int     pai;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)
        return;

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */
        return;
    }

    LOCK_PVH(pai);
    pmap_phys_attributes[pai] |= bits;
    UNLOCK_PVH(pai);
}
/*
 * Set the modify bit on the specified physical page.
 */
void pmap_set_modify(
    ppnum_t pn)
{
    phys_attribute_set(pn, PHYS_MODIFIED);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(
    ppnum_t pn)
{
    phys_attribute_clear(pn, PHYS_MODIFIED);
}

/*
 * pmap_is_modified:
 *
 * Return whether or not the specified physical page is modified
 * by any physical maps.
 */
boolean_t
pmap_is_modified(
    ppnum_t pn)
{
    if (phys_attribute_test(pn, PHYS_MODIFIED))
        return TRUE;
    return FALSE;
}

/*
 * pmap_clear_reference:
 *
 * Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(
    ppnum_t pn)
{
    phys_attribute_clear(pn, PHYS_REFERENCED);
}

void
pmap_set_reference(ppnum_t pn)
{
    phys_attribute_set(pn, PHYS_REFERENCED);
}

/*
 * pmap_is_referenced:
 *
 * Return whether or not the specified physical page is referenced
 * by any physical maps.
 */
boolean_t
pmap_is_referenced(
    ppnum_t pn)
{
    if (phys_attribute_test(pn, PHYS_REFERENCED))
        return TRUE;
    return FALSE;
}

/*
 * pmap_get_refmod(phys)
 *  returns the referenced and modified bits of the specified
 *  physical page.
 */
unsigned int
pmap_get_refmod(ppnum_t pa)
{
    int             refmod;
    unsigned int    retval = 0;

    refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);

    if (refmod & PHYS_MODIFIED)
        retval |= VM_MEM_MODIFIED;
    if (refmod & PHYS_REFERENCED)
        retval |= VM_MEM_REFERENCED;

    return (retval);
}

/*
 * pmap_clear_refmod(phys, mask)
 *  clears the referenced and modified bits as specified by the mask
 *  of the specified physical page.
 */
void
pmap_clear_refmod(ppnum_t pa, unsigned int mask)
{
    unsigned int x86Mask;

    x86Mask = ( ((mask & VM_MEM_MODIFIED)?   PHYS_MODIFIED : 0)
              | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
    phys_attribute_clear(pa, x86Mask);
}
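
/*
 * Illustrative sketch (not compiled): the generic<->x86 attribute mapping
 * used by the two routines above.  Clearing only the modified state leaves
 * the referenced bit intact.  The helper name is hypothetical, used only
 * for illustration.
 */
#if 0
static void
example_clear_dirty_only(ppnum_t pn)
{
    /* VM_MEM_MODIFIED is translated to PHYS_MODIFIED internally */
    pmap_clear_refmod(pn, VM_MEM_MODIFIED);

    /* referenced state is still reported */
    (void) (pmap_get_refmod(pn) & VM_MEM_REFERENCED);
}
#endif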
void
invalidate_icache(__unused vm_offset_t  addr,
                  __unused unsigned     cnt,
                  __unused int          phys)
{
    return;
}

void
flush_dcache(__unused vm_offset_t   addr,
             __unused unsigned      count,
             __unused int           phys)
{
    return;
}

#if CONFIG_DTRACE
/*
 * Constrain DTrace copyin/copyout actions
 */
extern kern_return_t dtrace_copyio_preflight(addr64_t);
extern kern_return_t dtrace_copyio_postflight(addr64_t);

kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
{
    thread_t thread = current_thread();

    if (current_map() == kernel_map)
        return KERN_FAILURE;
    else if (thread->machine.specFlags & CopyIOActive)
        return KERN_FAILURE;
    else
        return KERN_SUCCESS;
}

kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
{
    return KERN_SUCCESS;
}
#endif /* CONFIG_DTRACE */
#if MACH_KDB

/* show phys page mappings and attributes */

extern void db_show_page(pmap_paddr_t pa);

void
db_show_page(pmap_paddr_t pa)
{
    pv_rooted_entry_t   pv_h;
    int                 pai;
    char                attr;

    pai = pa_index(pa);
    pv_h = pai_to_pvh(pai);

    attr = pmap_phys_attributes[pai];
    printf("phys page %llx ", pa);
    if (attr & PHYS_MODIFIED)
        printf("modified, ");
    if (attr & PHYS_REFERENCED)
        printf("referenced, ");
    if (pv_h->pmap || pv_h->next)
        printf(" mapped at\n");
    else
        printf(" not mapped\n");
    for (; pv_h; pv_h = pv_h->next)
        printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
}

#endif /* MACH_KDB */

#if MACH_KDB

void db_kvtophys(vm_offset_t);
void db_show_vaddrs(pt_entry_t *);

/*
 * print out the results of kvtophys(arg)
 */
void
db_kvtophys(vm_offset_t vaddr)
{
    db_printf("0x%qx", kvtophys(vaddr));
}

/*
 * Walk the pages tables.
 */
void
db_show_vaddrs(pt_entry_t *dirbase)
{
    pt_entry_t      *ptep, *pdep, tmp;
    unsigned int    x, y, pdecnt, ptecnt;

    if (dirbase == 0) {
        dirbase = kernel_pmap->dirbase;
    }
    if (dirbase == 0) {
        db_printf("need a dirbase...\n");
        return;
    }
    dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);

    db_printf("dirbase: 0x%x\n", dirbase);

    pdecnt = ptecnt = 0;
    pdep = &dirbase[0];
    for (y = 0; y < NPDEPG; y++, pdep++) {
        if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
            continue;
        }
        pdecnt++;
        ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
        db_printf("dir[%4d]: 0x%x\n", y, *pdep);
        for (x = 0; x < NPTEPG; x++, ptep++) {
            if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
                continue;
            }
            ptecnt++;
            db_printf("   tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
                      x,
                      *ptep,
                      (y << 22) | (x << 12),
                      *ptep & ~INTEL_OFFMASK);
        }
    }

    db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
}

#endif /* MACH_KDB */

#include <mach_vm_debug.h>
#if MACH_VM_DEBUG
#include <vm/vm_debug.h>

int
pmap_list_resident_pages(
    __unused pmap_t         pmap,
    __unused vm_offset_t    *listp,
    __unused int            space)
{
    return 0;
}
#endif /* MACH_VM_DEBUG */
/* temporary workaround */
boolean_t
coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
{
    pt_entry_t  *ptep;

    ptep = pmap_pte(map->pmap, va);
    if (0 == ptep)
        return FALSE;
    return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
}

boolean_t
phys_page_exists(ppnum_t pn)
{
    assert(pn != vm_page_fictitious_addr);

    if (!pmap_initialized)
        return (TRUE);

    if (pn == vm_page_guard_addr)
        return FALSE;

    if (!managed_page(ppn_to_pai(pn)))
        return FALSE;

    return TRUE;
}
void
mapping_free_prime(void)
{
    int                 i;
    pv_hashed_entry_t   pvh_e;
    pv_hashed_entry_t   pvh_eh;
    pv_hashed_entry_t   pvh_et;
    int                 pv_cnt;

    pv_cnt = 0;
    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
        pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

        pvh_e->qlink.next = (queue_entry_t)pvh_eh;
        pvh_eh = pvh_e;

        if (pvh_et == PV_HASHED_ENTRY_NULL)
            pvh_et = pvh_e;
        pv_cnt++;
    }
    PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);

    pv_cnt = 0;
    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
        pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

        pvh_e->qlink.next = (queue_entry_t)pvh_eh;
        pvh_eh = pvh_e;

        if (pvh_et == PV_HASHED_ENTRY_NULL)
            pvh_et = pvh_e;
        pv_cnt++;
    }
    PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
}

void
mapping_adjust(void)
{
    pv_hashed_entry_t   pvh_e;
    pv_hashed_entry_t   pvh_eh;
    pv_hashed_entry_t   pvh_et;
    int                 pv_cnt;
    int                 i;

    if (mapping_adjust_call == NULL) {
        thread_call_setup(&mapping_adjust_call_data,
                          (thread_call_func_t) mapping_adjust,
                          (thread_call_param_t) NULL);
        mapping_adjust_call = &mapping_adjust_call_data;
    }

    pv_cnt = 0;
    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
        for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
            pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

            pvh_e->qlink.next = (queue_entry_t)pvh_eh;
            pvh_eh = pvh_e;

            if (pvh_et == PV_HASHED_ENTRY_NULL)
                pvh_et = pvh_e;
            pv_cnt++;
        }
        PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
    }

    pv_cnt = 0;
    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
        for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
            pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

            pvh_e->qlink.next = (queue_entry_t)pvh_eh;
            pvh_eh = pvh_e;

            if (pvh_et == PV_HASHED_ENTRY_NULL)
                pvh_et = pvh_e;
            pv_cnt++;
        }
        PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
    }
}
void
pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
{
    int         i;
    pt_entry_t  *opte, *npte;
    pt_entry_t  pte;

    for (i = 0; i < cnt; i++) {
        opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
        if (0 == opte)
            panic("kernel_commpage");
        pte = *opte | INTEL_PTE_USER | INTEL_PTE_GLOBAL;
        pte &= ~INTEL_PTE_WRITE;    // ensure read only
        npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
        if (0 == npte)
            panic("user_commpage");
        pmap_store_pte(npte, pte);

        kernel_commpage += INTEL_PGBYTES;
        user_commpage += INTEL_PGBYTES;
    }
}

#define PMAP_COMMPAGE64_CNT  (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];

void
pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
{
    int         i;
    pt_entry_t  *kptep;

    PMAP_LOCK(kernel_pmap);

    for (i = 0; i < cnt; i++) {
        kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i * PAGE_SIZE));
        if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
            panic("pmap_commpage64_init pte");
        pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
    }
    PMAP_UNLOCK(kernel_pmap);
}
static cpu_pmap_t       cpu_pmap_master;

struct cpu_pmap *
pmap_cpu_alloc(boolean_t is_boot_cpu)
{
    int                 ret;
    int                 i;
    cpu_pmap_t          *cp;
    vm_offset_t         address;
    vm_map_address_t    mapaddr;
    vm_map_entry_t      entry;
    pt_entry_t          *pte;

    if (is_boot_cpu) {
        cp = &cpu_pmap_master;
    } else {
        /*
         * The per-cpu pmap data structure itself.
         */
        ret = kmem_alloc(kernel_map,
                         (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
        if (ret != KERN_SUCCESS) {
            printf("pmap_cpu_alloc() failed ret=%d\n", ret);
            return NULL;
        }
        bzero((void *)cp, sizeof(cpu_pmap_t));

        /*
         * The temporary windows used for copy/zero - see loose_ends.c
         */
        ret = vm_map_find_space(kernel_map,
                                &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
        if (ret != KERN_SUCCESS) {
            printf("pmap_cpu_alloc() "
                   "vm_map_find_space ret=%d\n", ret);
            return NULL;
        }
        address = (vm_offset_t)mapaddr;

        for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
            while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
                pmap_expand(kernel_pmap, (vm_map_offset_t)address);

            cp->mapwindow[i].prv_CADDR = (caddr_t) address;
            cp->mapwindow[i].prv_CMAP = pte;
        }
        vm_map_unlock(kernel_map);
    }

    cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
    cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
    cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;

    return cp;
}

void
pmap_cpu_free(struct cpu_pmap *cp)
{
    if (cp != NULL && cp != &cpu_pmap_master) {
        kfree((void *) cp, sizeof(cpu_pmap_t));
    }
}

mapwindow_t *
pmap_get_mapwindow(pt_entry_t pentry)
{
    mapwindow_t *mp;
    int         i;

    assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

    /*
     * Note: 0th map reserved for pmap_pte()
     */
    for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
        mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];

        if (*mp->prv_CMAP == 0) {
            pmap_store_pte(mp->prv_CMAP, pentry);

            invlpg((uintptr_t)mp->prv_CADDR);

            return (mp);
        }
    }
    panic("pmap_get_mapwindow: no windows available");

    return NULL;
}

void
pmap_put_mapwindow(mapwindow_t *mp)
{
    pmap_store_pte(mp->prv_CMAP, 0);
}
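
/*
 * Illustrative sketch (not compiled): the expected pairing for the window
 * routines above -- build a PTE for the frame to be made visible, use the
 * per-cpu window virtual address, then release the window.  Assumes the
 * caller satisfies the preemption/interrupt assertion in
 * pmap_get_mapwindow().  The helper name is hypothetical.
 */
#if 0
static void
example_peek_phys(pmap_paddr_t pa)
{
    mapwindow_t *mp;

    mp = pmap_get_mapwindow(pa_to_pte(pa) | INTEL_PTE_VALID);
    /* ... access mp->prv_CADDR while the window is held ... */
    pmap_put_mapwindow(mp);
}
#endif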
void
pmap_switch(pmap_t tpmap)
{
    spl_t   s;

    s = splhigh();      /* Make sure interruptions are disabled */

    set_dirbase(tpmap, current_thread());

    splx(s);
}

/*
 * disable no-execute capability on
 * the specified pmap
 */
void pmap_disable_NX(pmap_t pmap) {

    pmap->nx_enabled = 0;
}

void
pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
                  vm_size_t *alloc_size, int *collectable, int *exhaustable)
{
    *count      = inuse_ptepages_count;
    *cur_size   = PAGE_SIZE * inuse_ptepages_count;
    *max_size   = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
    *elem_size  = PAGE_SIZE;
    *alloc_size = PAGE_SIZE;

    *collectable = 1;
    *exhaustable = 0;
}
vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
{
    enum high_fixed_addresses a;

    a = e + HIGH_CPU_END * cpu;
    return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
}

vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
{
    return pmap_cpu_high_map_vaddr(cpu_number(), e);
}

vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
{
    enum high_fixed_addresses   a;
    vm_offset_t                 vaddr;

    a = e + HIGH_CPU_END * cpu_number();
    vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
    pmap_store_pte(pte_unique_base + a, pte);

    /* TLB flush for this page for this cpu */
    invlpg((uintptr_t)vaddr);

    return vaddr;
}

static inline void
pmap_cpuset_NMIPI(cpu_set cpu_mask) {
    unsigned int    cpu, cpu_bit;
    uint64_t        deadline;

    for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
        if (cpu_mask & cpu_bit)
            cpu_NMI_interrupt(cpu);
    }
    deadline = mach_absolute_time() + (LockTimeOut);
    while (mach_absolute_time() < deadline)
        cpu_pause();
}
/*
 * Called with pmap locked, we:
 *  - scan through per-cpu data to see which other cpus need to flush
 *  - send an IPI to each non-idle cpu to be flushed
 *  - wait for all to signal back that they are inactive or we see that
 *    they are in an interrupt handler or at a safe point
 *  - flush the local tlb if active for this pmap
 *  - return ... the caller will unlock the pmap
 */
void
pmap_flush_tlbs(pmap_t pmap)
{
    unsigned int    cpu;
    unsigned int    cpu_bit;
    cpu_set         cpus_to_signal;
    unsigned int    my_cpu = cpu_number();
    pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
    boolean_t       flush_self = FALSE;
    uint64_t        deadline;

    assert((processor_avail_count < 2) ||
           (ml_get_interrupts_enabled() && get_preemption_level() != 0));

    /*
     * Scan other cpus for matching active or task CR3.
     * For idle cpus (with no active map) we mark them invalid but
     * don't signal -- they'll check as they go busy.
     * Note: for the kernel pmap we look for 64-bit shared address maps.
     */
    cpus_to_signal = 0;
    for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
        if (!cpu_datap(cpu)->cpu_running)
            continue;
        if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
            (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
            (pmap->pm_shared) ||
            ((pmap == kernel_pmap) &&
             (!CPU_CR3_IS_ACTIVE(cpu) ||
              cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
            if (cpu == my_cpu) {
                flush_self = TRUE;
                continue;
            }
            cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
            __asm__ volatile("mfence");

            if (CPU_CR3_IS_ACTIVE(cpu)) {
                cpus_to_signal |= cpu_bit;
                i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
            }
        }
    }

    PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
               (int) pmap, cpus_to_signal, flush_self, 0, 0);

    if (cpus_to_signal) {
        cpu_set cpus_to_respond = cpus_to_signal;

        deadline = mach_absolute_time() + LockTimeOut;
        /*
         * Wait for those other cpus to acknowledge
         */
        while (cpus_to_respond != 0) {
            if (mach_absolute_time() > deadline) {
                if (mp_recent_debugger_activity())
                    continue;
                if (!panic_active()) {
                    pmap_tlb_flush_timeout = TRUE;
                    pmap_cpuset_NMIPI(cpus_to_respond);
                }
                panic("pmap_flush_tlbs() timeout: "
                      "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
                      pmap, cpus_to_respond);
            }

            for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
                if ((cpus_to_respond & cpu_bit) != 0) {
                    if (!cpu_datap(cpu)->cpu_running ||
                        cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
                        !CPU_CR3_IS_ACTIVE(cpu)) {
                        cpus_to_respond &= ~cpu_bit;
                    }
                    cpu_pause();
                }
                if (cpus_to_respond == 0)
                    break;
            }
        }
    }
    /*
     * Flush local tlb if required.
     * We need this flush even if the pmap being changed
     * is the user map... in case we do a copyin/out
     * before returning to user mode.
     */
    if (flush_self)
        flush_tlb();

    if ((pmap == kernel_pmap) && (flush_self != TRUE)) {
        panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
    }

    PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
               (int) pmap, cpus_to_signal, flush_self, 0, 0);
}

void
process_pmap_updates(void)
{
    assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

    flush_tlb();

    current_cpu_datap()->cpu_tlb_invalid = FALSE;
    __asm__ volatile("mfence");
}

void
pmap_update_interrupt(void)
{
    PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
               0, 0, 0, 0, 0);

    process_pmap_updates();

    PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
               0, 0, 0, 0, 0);
}
unsigned int pmap_cache_attributes(ppnum_t pn) {

    if (!managed_page(ppn_to_pai(pn)))
        return (VM_WIMG_IO);

    return (VM_WIMG_COPYBACK);
}

void
pmap_dump(pmap_t p)
{
    int i;

    kprintf("pmap 0x%x\n", p);

    kprintf("  pm_cr3 0x%llx\n", p->pm_cr3);
    kprintf("  pm_pml4 0x%x\n", p->pm_pml4);
    kprintf("  pm_pdpt 0x%x\n", p->pm_pdpt);

    kprintf("    pml4[0] 0x%llx\n", *p->pm_pml4);
    for (i = 0; i < 8; i++)
        kprintf("    pdpt[%d] 0x%llx\n", i, p->pm_pdpt[i]);
}

void pmap_dump_wrap(void)
{
    pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
}

void
dump_4GB_pdpt(pmap_t p)
{
    pdpt_entry_t    *user_pdptp;
    pdpt_entry_t    *kern_pdptp;
    pdpt_entry_t    *pml4p;

    while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
        pmap_expand_pml4(p, 0x0);
    }
    kern_pdptp = kernel_pmap->pm_pdpt;
    if (kern_pdptp == NULL)
        panic("kern_pdptp == NULL");
    kprintf("dump_4GB_pdpt(%p)\n"
            "kern_pdptp=%p (phys=0x%016llx)\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "user_pdptp=%p (phys=0x%016llx)\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n",
            p, kern_pdptp, kvtophys(kern_pdptp),
            kern_pdptp+0, *(kern_pdptp+0),
            kern_pdptp+1, *(kern_pdptp+1),
            kern_pdptp+2, *(kern_pdptp+2),
            kern_pdptp+3, *(kern_pdptp+3),
            kern_pdptp+4, *(kern_pdptp+4),
            user_pdptp, kvtophys(user_pdptp),
            user_pdptp+0, *(user_pdptp+0),
            user_pdptp+1, *(user_pdptp+1),
            user_pdptp+2, *(user_pdptp+2),
            user_pdptp+3, *(user_pdptp+3),
            user_pdptp+4, *(user_pdptp+4));
    kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
            p->pm_cr3, p->pm_hold, p->pm_pml4);
    pml4p = (pdpt_entry_t *)p->pm_hold;
    if (pml4p == NULL)
        panic("user pml4p == NULL");
    kprintf("\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n",
            pml4p+0, *(pml4p+0),
            pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
    kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
            kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
    pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
    if (pml4p == NULL)
        panic("kern pml4p == NULL");
    kprintf("\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n",
            pml4p+0, *(pml4p+0),
            pml4p+511, *(pml4p+511));
}

void dump_4GB_pdpt_thread(thread_t tp)
{
    dump_4GB_pdpt(tp->map->pmap);
}