/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	(These guys wrote the Vax version)
 *
 *	Physical Map management code for Intel i386, i486, and i860.
 *
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced-protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */
#include <mach_ldebug.h>

#include <libkern/OSAtomic.h>

#include <mach/machine/vm_types.h>

#include <mach/boolean.h>
#include <kern/thread.h>
#include <kern/zalloc.h>
#include <kern/queue.h>

#include <kern/lock.h>
#include <kern/kalloc.h>
#include <kern/spl.h>

#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#include <mach/machine/vm_param.h>
#include <machine/thread.h>

#include <kern/misc_protos.h>			/* prototyping */
#include <i386/misc_protos.h>

#include <i386/cpuid.h>
#include <i386/cpu_data.h>
#include <i386/cpu_number.h>
#include <i386/machine_cpu.h>
#include <i386/seg.h>
#include <i386/serial_io.h>
#include <i386/cpu_capabilities.h>
#include <i386/machine_routines.h>
#include <i386/proc_reg.h>
#include <i386/tsc.h>
#include <i386/acpi.h>
#include <i386/pmap_internal.h>

#if MACH_KDB
#include <ddb/db_command.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>
#include <ddb/db_print.h>
#endif	/* MACH_KDB */

#include <vm/vm_protos.h>

#include <i386/mp_desc.h>
#include <i386/i386_lowmem.h>
/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d", __FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

#ifdef IWANTTODEBUG
#define POSTCODE_DELAY 1
#include <i386/postcode.h>
#endif /* IWANTTODEBUG */
/*
 * Forward declarations for internal functions.
 */
void		pmap_remove_range(
			pmap_t		pmap,
			vm_map_offset_t	va,
			pt_entry_t	*spte,
			pt_entry_t	*epte);

void		phys_attribute_clear(
			ppnum_t		pn,
			int		bits);

int		phys_attribute_test(
			ppnum_t		pn,
			int		bits);

void		phys_attribute_set(
			ppnum_t		pn,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(
			ppnum_t		pn);

void		dump_pmap(pmap_t);
void		dump_4GB_pdpt(pmap_t p);
void		dump_4GB_pdpt_thread(thread_t tp);
int		nx_enabled = 1;			/* enable no-execute protection */
#ifdef CONFIG_EMBEDDED
int		allow_data_exec  = 0;		/* no exec from data, embedded is hardcore like that */
#else
int		allow_data_exec  = VM_ABI_32;	/* 32-bit apps may execute data by default, 64-bit apps may not */
#endif
int		allow_stack_exec = 0;		/* No apps may execute from the stack by default */

boolean_t	cpu_64bit  = FALSE;
boolean_t	pmap_trace = FALSE;

/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000

uint64_t	max_preemption_latency_tsc = 0;
pv_hashed_entry_t	*pv_hash_table;		/* hash lists */

uint32_t		npvhash = 0;

/*
 *	pv_list entries are kept on a list that can only be accessed
 *	with the pmap system locked (at SPLVM, not in the cpus_active set).
 *	The list is refilled from the pv_hashed_list_zone if it becomes empty.
 */
pv_rooted_entry_t	pv_free_list = PV_ROOTED_ENTRY_NULL;	/* free list at SPLVM */
pv_hashed_entry_t	pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
pv_hashed_entry_t	pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
decl_simple_lock_data(,pv_hashed_free_list_lock)
decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
decl_simple_lock_data(,pv_hash_table_lock)

int			pv_free_count = 0;
int			pv_hashed_free_count = 0;
int			pv_kern_free_count = 0;
int			pv_hashed_kern_free_count = 0;

zone_t			pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

static zone_t		pdpt_zone;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the physical
 *	address of the page they lock.
 */

char		*pv_lock_table;		/* pointer to array of bits */
#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)

char		*pv_hash_lock_table;
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)

/*
 *	First and last physical addresses that we maintain any information
 *	for.  Initialized to zero so that pmap operations done before
 *	pmap_init won't touch any non-existent structures.
 */
boolean_t	pmap_initialized = FALSE;	/* Has pmap_init completed? */

static struct vm_object	kptobj_object_store;
static vm_object_t	kptobj;
/*
 *	Array of physical page attributes for managed pages.
 *	One byte per physical page.
 */
char		*pmap_phys_attributes;
unsigned int	last_managed_page = 0;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))
uint64_t	pde_mapped_size;
/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system wide pmap lock has been removed. Now, paths take a lock
 *	on the pmap before changing its 'shape' and the reverse order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked
 *	and can then run up/down the pv lists holding the list lock. This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	previously.
 */

#define LOCK_PVH(index)		{	\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index)	{	\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
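
/*
 * Illustrative sketch (not part of the original source) of the reverse-order
 * locking pattern described above: a physical-page path takes the pv lock
 * first and then re-checks the mapping under that lock before walking the
 * pv list.  The function below is hypothetical documentation only.
 */
#if 0	/* documentation-only example */
static void
example_phys_page_walk(ppnum_t pn)
{
	int			pai  = ppn_to_pai(pn);
	pv_rooted_entry_t	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);			/* lock this page's pv list */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * Retest under the lock: the pmap may have changed shape in
		 * the window before the pv lock was acquired.  Only then is
		 * it safe to run up/down the pv list.
		 */
	}
	UNLOCK_PVH(pai);
}
#endif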
#if	USLOCK_DEBUG
extern int	max_lock_loops;
#define LOOP_VAR							\
	unsigned int	loop_count;					\
	loop_count = disable_serial_output ? max_lock_loops		\
					   : max_lock_loops * 100
#define LOOP_CHECK(msg, pmap)						\
	if (--loop_count == 0) {					\
		mp_disable_preemption();				\
		kprintf("%s: cpu %d pmap %x\n",				\
			msg, cpu_number(), pmap);			\
		Debugger("deadlock detection");				\
		mp_enable_preemption();					\
		loop_count = max_lock_loops;				\
	}
#else	/* USLOCK_DEBUG */
#define LOOP_VAR
#define LOOP_CHECK(msg, pmap)
#endif	/* USLOCK_DEBUG */
unsigned pmap_memory_region_count;
unsigned pmap_memory_region_current;

pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];

/*
 *	Other useful macros.
 */
#define current_pmap()		(vm_map_pmap(current_thread()->map))

struct pmap	kernel_pmap_store;

pd_entry_t	high_shared_pde;
pd_entry_t	commpage64_pde;

struct zone	*pmap_zone;		/* zone of pmap structures */

int		pmap_debug = 0;		/* flag for debugging prints */

unsigned int	inuse_ptepages_count = 0;

addr64_t	kernel64_cr3;
boolean_t	no_shared_cr3 = FALSE;	/* -no_shared_cr3 boot arg */

/*
 *	Pmap cache.  Cache is threaded through ref_count field of pmap.
 *	Max will eventually be constant -- variable for experimentation.
 */
int		pmap_cache_max = 32;
int		pmap_alloc_chunk = 8;
pmap_t		pmap_cache_list;
int		pmap_cache_count;
decl_simple_lock_data(,pmap_cache_lock)

pt_entry_t	*DMAP1, *DMAP2;
/*
 * for legacy, returns the address of the pde entry.
 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
 * then returns the mapped address of the pde entry in that page
 */
pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	if (!cpu_64bit || (m == kernel_pmap)) {
		pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
	} else {
		assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
		pde = pmap64_pde(m, v);
	}
	return pde;
}
/*
 * the single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap. we allocate this page in kernel vm (to save us one
 * level of page table dynamic mapping).
 * this returns the address of the requested pml4 entry in the top level page.
 */
static inline pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	return ((pml4_entry_t *)pmap->pm_hold +
		((vm_offset_t)((vaddr >> PML4SHIFT) & (NPML4PG - 1))));
}
/*
 * maps in the pml4 page, if any, containing the pdpt entry requested
 * and returns the address of the pdpt entry in that mapped page
 */
pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;
	int		i;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
		return (NULL);	/* address is not canonical */
	}

	pml4 = pmap64_pml4(pmap, vaddr);

	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {

		newpf = *pml4 & PG_FRAME;

		/* look for the pml4 page in the per-cpu mapping windows... */
		for (i = PMAP_PDPT_FIRST_WINDOW; i < PMAP_PDPT_FIRST_WINDOW + PMAP_PDPT_NWINDOWS; i++) {
			if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
				return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
					((vm_offset_t)((vaddr >> PDPTSHIFT) & (NPDPTPG - 1))));
			}
		}

		/* ...otherwise map it into the next window, round robin */
		current_cpu_datap()->cpu_pmap->pdpt_window_index++;
		if (current_cpu_datap()->cpu_pmap->pdpt_window_index > (PMAP_PDPT_FIRST_WINDOW + PMAP_PDPT_NWINDOWS - 1))
			current_cpu_datap()->cpu_pmap->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
		pmap_store_pte(
			(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CMAP),
			newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
		invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR));
		return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR) +
			((vm_offset_t)((vaddr >> PDPTSHIFT) & (NPDPTPG - 1))));
	}

	return (NULL);
}
/*
 * maps in the pdpt page, if any, containing the pde entry requested
 * and returns the address of the pde entry in that mapped page
 */
pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;
	int		i;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
		return (NULL);	/* address is not canonical */
	}

	/* if (vaddr & (1ULL << 63)) panic("neg addr");*/
	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {

		newpf = *pdpt & PG_FRAME;

		/* look for the pdpt page in the per-cpu mapping windows... */
		for (i = PMAP_PDE_FIRST_WINDOW; i < PMAP_PDE_FIRST_WINDOW + PMAP_PDE_NWINDOWS; i++) {
			if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
				return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
					((vm_offset_t)((vaddr >> PDSHIFT) & (NPDPG - 1))));
			}
		}

		/* ...otherwise map it into the next window, round robin */
		current_cpu_datap()->cpu_pmap->pde_window_index++;
		if (current_cpu_datap()->cpu_pmap->pde_window_index > (PMAP_PDE_FIRST_WINDOW + PMAP_PDE_NWINDOWS - 1))
			current_cpu_datap()->cpu_pmap->pde_window_index = PMAP_PDE_FIRST_WINDOW;
		pmap_store_pte(
			(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CMAP),
			newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
		invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR));
		return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR) +
			((vm_offset_t)((vaddr >> PDSHIFT) & (NPDPG - 1))));
	}

	return (NULL);
}
/*
 * Because the page tables (top 3 levels) are mapped into per cpu windows,
 * callers must either disable interrupts or disable preemption before calling
 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
 * is in one of those mapped windows and that cannot be allowed to change until
 * the caller is done using the returned pte pointer. When done, the caller
 * restores interrupts or preemption to its previous state after which point the
 * vaddr for the returned pte can no longer be used.
 */


/*
 * return address of mapped pte for vaddr va in pmap pmap.
 * must be called with pre-emption or interrupts disabled
 * if targeted pmap is not the kernel pmap
 * since we may be passing back a virtual address that is
 * associated with this cpu... pre-emption or interrupts
 * must remain disabled until the caller is done using
 * the pointer that was passed back.
 *
 * maps the pde page, if any, containing the pte in and returns
 * the address of the pte in that mapped page
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;
	int		i;

	pde = pmap_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		if (pmap == kernel_pmap)
			return (vtopte(vaddr)); /* compat kernel still has pte's mapped */
		if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
			panic("pmap_pte: unsafe call");
		assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

		newpf = *pde & PG_FRAME;

		/* look for the pde page in the per-cpu mapping windows... */
		for (i = PMAP_PTE_FIRST_WINDOW; i < PMAP_PTE_FIRST_WINDOW + PMAP_PTE_NWINDOWS; i++) {
			if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
				return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
					((vm_offset_t)i386_btop(vaddr) & (NPTEPG - 1)));
			}
		}

		/* ...otherwise map it into the next window, round robin */
		current_cpu_datap()->cpu_pmap->pte_window_index++;
		if (current_cpu_datap()->cpu_pmap->pte_window_index > (PMAP_PTE_FIRST_WINDOW + PMAP_PTE_NWINDOWS - 1))
			current_cpu_datap()->cpu_pmap->pte_window_index = PMAP_PTE_FIRST_WINDOW;
		pmap_store_pte(
			(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CMAP),
			newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
		invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR));
		return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR) +
			((vm_offset_t)i386_btop(vaddr) & (NPTEPG - 1)));
	}

	return (NULL);
}
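
/*
 * Illustrative sketch (not part of the original source) of how a caller is
 * expected to bracket pmap_pte() per the per-cpu window rules above.  The
 * function and local names are hypothetical.
 */
#if 0	/* documentation-only example */
static pt_entry_t
example_read_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pt_entry_t	*ptep;
	pt_entry_t	pte = 0;

	mp_disable_preemption();	/* keep the per-cpu window stable */
	ptep = pmap_pte(pmap, vaddr);
	if (ptep != PT_ENTRY_NULL)
		pte = *ptep;		/* use the pointer only while preemption is off */
	mp_enable_preemption();		/* the window may be reused after this */
	return pte;
}
#endif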
/*
 * Map memory at initialization.  The physical addresses being
 * mapped are not managed and are never unmapped.
 *
 * For now, VM is already on, we only need to map the
 * specified memory.
 */
vm_offset_t
pmap_map(
	vm_offset_t	virt,
	vm_map_offset_t	start_addr,
	vm_map_offset_t	end_addr,
	vm_prot_t	prot,
	unsigned int	flags)
{
	while (start_addr < end_addr) {
		pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
			   (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
		virt += PAGE_SIZE;
		start_addr += PAGE_SIZE;
	}
	return (virt);
}
/*
 * Back-door routine for mapping kernel VM at initialization.
 * Useful for mapping memory outside the range.
 * Sets no-cache, A, D.
 * Otherwise like pmap_map.
 */
vm_offset_t
pmap_map_bd(
	vm_offset_t	virt,
	vm_map_offset_t	start_addr,
	vm_map_offset_t	end_addr,
	vm_prot_t	prot,
	unsigned int	flags)
{
	pt_entry_t	template;
	pt_entry_t	*pte;

	template = pa_to_pte(start_addr)
		| INTEL_PTE_REF
		| INTEL_PTE_MOD
		| INTEL_PTE_WIRED
		| INTEL_PTE_VALID;

	if (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
		template |= INTEL_PTE_NCACHE;
		if (!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
			template |= INTEL_PTE_PTA;
	}

	if (prot & VM_PROT_WRITE)
		template |= INTEL_PTE_WRITE;

	while (start_addr < end_addr) {
		pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
		if (pte == PT_ENTRY_NULL) {
			panic("pmap_map_bd: Invalid kernel address\n");
		}
		pmap_store_pte(pte, template);
		pte_increment_pa(template);
		virt += PAGE_SIZE;
		start_addr += PAGE_SIZE;
	}

	return (virt);
}

extern	char		*first_avail;
extern	vm_offset_t	virtual_avail, virtual_end;
extern	pmap_paddr_t	avail_start, avail_end;
/*
 * Here early in the life of a processor (from cpu_mode_init()).
 */
void
pmap_cpu_init(void)
{
	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
	current_cpu_datap()->cpu_tlb_invalid = FALSE;
}
vm_offset_t
pmap_high_shared_remap(enum high_fixed_addresses e, vm_offset_t va, int sz)
{
	vm_offset_t	ve = pmap_index_to_virt(e);
	pt_entry_t	*ptep;
	pmap_paddr_t	pa;
	int		i;

	assert(0 == (va & PAGE_MASK));	/* expecting page aligned */

	ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ve);

	for (i = 0; i < sz; i++) {
		pa = (pmap_paddr_t) kvtophys(va);
		pmap_store_pte(ptep, (pa & PG_FRAME)
				| INTEL_PTE_VALID
				| INTEL_PTE_GLOBAL
				| INTEL_PTE_RW
				| INTEL_PTE_REF
				| INTEL_PTE_MOD);
		invlpg(ve);
		ptep++;
		va += PAGE_SIZE;
		ve += PAGE_SIZE;
	}
	return ve;
}

vm_offset_t
pmap_cpu_high_shared_remap(int cpu, enum high_cpu_types e, vm_offset_t va, int sz)
{
	enum high_fixed_addresses a = e + HIGH_CPU_END * cpu;

	return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN + a, va, sz);
}
void pmap_init_high_shared(void);

extern vm_offset_t gdtptr, idtptr;

extern uint32_t low_intstack;

extern struct fake_descriptor ldt_desc_pattern;
extern struct fake_descriptor tss_desc_pattern;

extern char hi_remap_text, hi_remap_etext;
extern char t_zero_div;

pt_entry_t *pte_unique_base;
void
pmap_init_high_shared(void)
{
	vm_offset_t		haddr;
	struct i386_tss		*ttss;

	cpu_desc_index_t	*cdi = &cpu_data_master.cpu_desc_index;

	kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
		HIGH_MEM_BASE, pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));

	pte_unique_base = pmap_pte(kernel_pmap,
		(vm_map_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));

	if (i386_btop(&hi_remap_etext - &hi_remap_text + 1) >
	    HIGH_FIXED_TRAMPS_END - HIGH_FIXED_TRAMPS + 1)
		panic("tramps too large");
	haddr = pmap_high_shared_remap(HIGH_FIXED_TRAMPS,
					(vm_offset_t) &hi_remap_text, 3);
	kprintf("tramp: 0x%x, ", haddr);

	/* map gdt up high and update ptr for reload */
	haddr = pmap_high_shared_remap(HIGH_FIXED_GDT,
					(vm_offset_t) master_gdt, 1);
	cdi->cdi_gdt.ptr = (void *)haddr;
	kprintf("GDT: 0x%x, ", haddr);

	/* map ldt up high */
	haddr = pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN,
					(vm_offset_t) master_ldt,
					HIGH_FIXED_LDT_END - HIGH_FIXED_LDT_BEGIN + 1);
	cdi->cdi_ldt = (struct fake_descriptor *)haddr;
	kprintf("LDT: 0x%x, ", haddr);

	/* put new ldt addr into gdt */
	struct fake_descriptor temp_fake_desc;
	temp_fake_desc = ldt_desc_pattern;
	temp_fake_desc.offset = (vm_offset_t) haddr;
	fix_desc(&temp_fake_desc, 1);

	*(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_LDT)] = temp_fake_desc;
	*(struct fake_descriptor *) &master_gdt[sel_idx(USER_LDT)] = temp_fake_desc;

	/* map idt up high */
	haddr = pmap_high_shared_remap(HIGH_FIXED_IDT,
					(vm_offset_t) master_idt, 1);
	cdi->cdi_idt.ptr = (void *)haddr;
	kprintf("IDT: 0x%x, ", haddr);

	/* remap ktss up high and put new high addr into gdt */
	haddr = pmap_high_shared_remap(HIGH_FIXED_KTSS,
					(vm_offset_t) &master_ktss, 1);
	temp_fake_desc = tss_desc_pattern;
	temp_fake_desc.offset = (vm_offset_t) haddr;
	fix_desc(&temp_fake_desc, 1);
	*(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = temp_fake_desc;
	kprintf("KTSS: 0x%x, ", haddr);

#if MACH_KDB
	/* remap dbtss up high and put new high addr into gdt */
	haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS,
					(vm_offset_t) &master_dbtss, 1);
	temp_fake_desc = tss_desc_pattern;
	temp_fake_desc.offset = (vm_offset_t) haddr;
	fix_desc(&temp_fake_desc, 1);
	*(struct fake_descriptor *)&master_gdt[sel_idx(DEBUG_TSS)] = temp_fake_desc;
	ttss = (struct i386_tss *)haddr;
	kprintf("DBTSS: 0x%x, ", haddr);
#endif	/* MACH_KDB */

	/* remap dftss up high and put new high addr into gdt */
	haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
					(vm_offset_t) &master_dftss, 1);
	temp_fake_desc = tss_desc_pattern;
	temp_fake_desc.offset = (vm_offset_t) haddr;
	fix_desc(&temp_fake_desc, 1);
	*(struct fake_descriptor *) &master_gdt[sel_idx(DF_TSS)] = temp_fake_desc;
	kprintf("DFTSS: 0x%x\n", haddr);

	/* remap mctss up high and put new high addr into gdt */
	haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
					(vm_offset_t) &master_mctss, 1);
	temp_fake_desc = tss_desc_pattern;
	temp_fake_desc.offset = (vm_offset_t) haddr;
	fix_desc(&temp_fake_desc, 1);
	*(struct fake_descriptor *) &master_gdt[sel_idx(MC_TSS)] = temp_fake_desc;
	kprintf("MCTSS: 0x%x\n", haddr);

	cpu_desc_load(&cpu_data_master);
}
/*
 *	Bootstrap the system enough to run with virtual memory.
 *	Map the kernel's code and data, and allocate the system page table.
 *	Called with mapping OFF.  Page_size must already be set.
 */
void
pmap_bootstrap(
	__unused vm_offset_t	load_start,
	boolean_t		IA32e)
{
	vm_offset_t	va;
	pt_entry_t	*pte;
	int		i;
	pd_entry_t	*pdpt;

	vm_last_addr = VM_MAX_KERNEL_ADDRESS;	/* Set the highest address
						 * known to VM */
	/*
	 *	The kernel's pmap is statically allocated so we don't
	 *	have to use pmap_create, which is unlikely to work
	 *	correctly at this part of the boot sequence.
	 */
	kernel_pmap = &kernel_pmap_store;
	kernel_pmap->ref_count = 1;
	kernel_pmap->nx_enabled = FALSE;
	kernel_pmap->pm_task_map = TASK_MAP_32BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
	kernel_pmap->pdirbase = (pmap_paddr_t)((int)IdlePTD);
	pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE);
	kernel_pmap->pm_pdpt = pdpt;
	kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePDPT);

	va = (vm_offset_t)kernel_pmap->dirbase;
	/* setup self referential mapping(s) */
	for (i = 0; i < NPGPTD; i++, pdpt++) {
		pmap_paddr_t pa;

		pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
		pmap_store_pte(
			(pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i),
			(pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
			INTEL_PTE_MOD | INTEL_PTE_WIRED);
		pmap_store_pte(pdpt, pa | INTEL_PTE_VALID);
	}

	cpu_64bit = IA32e;

	lo_kernel_cr3 = kernel_pmap->pm_cr3;
	current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* save the value we stuff into created pmaps to share the gdts etc */
	high_shared_pde = *pmap_pde(kernel_pmap, HIGH_MEM_BASE);
	/* make sure G bit is on for high shared pde entry */
	high_shared_pde |= INTEL_PTE_GLOBAL;
	pmap_store_pte(pmap_pde(kernel_pmap, HIGH_MEM_BASE), high_shared_pde);

	OSAddAtomic(NKPT, &inuse_ptepages_count);

	virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
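
/*
 * Illustrative expansion of the SYSMAP() reservation macro above (not part of
 * the original source).  Given the running "va" cursor and "pte" pointer, a
 * call such as SYSMAP(caddr_t, DMAP1, DADDR1, 1) behaves roughly like the
 * hypothetical expansion below: it hands one page of reserved kernel VA to
 * DADDR1 and the matching pte slot to DMAP1, then advances both cursors.
 */
#if 0	/* documentation-only example */
	DADDR1 = (caddr_t)va;		/* virtual address handed out */
	va    += 1 * INTEL_PGBYTES;	/* advance the VA cursor */
	DMAP1  = pte;			/* pte slot that backs it */
	pte   += 1;			/* advance the pte cursor */
#endif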
	va = virtual_avail;
	pte = vtopte(va);

	for (i = 0; i < PMAP_NWINDOWS; i++) {
		SYSMAP(caddr_t,
		       (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
		       (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
		       1);
		*current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
	}

	/* DMAP user for debugger */
	SYSMAP(caddr_t, DMAP1, DADDR1, 1);
	SYSMAP(caddr_t, DMAP2, DADDR2, 1);	/* XXX temporary - can remove */

	virtual_avail = va;

	if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
		if (0 != ((npvhash + 1) & npvhash)) {
			kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",
				npvhash, NPVHASH);
			npvhash = NPVHASH;
		}
	} else {
		npvhash = NPVHASH;
	}
	printf("npvhash=%d\n", npvhash);

	simple_lock_init(&kernel_pmap->lock, 0);
	simple_lock_init(&pv_hashed_free_list_lock, 0);
	simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
	simple_lock_init(&pv_hash_table_lock, 0);

	pmap_init_high_shared();

	pde_mapped_size = PDE_MAPPED_SIZE;

	if (cpu_64bit) {
		pdpt_entry_t	*ppdpt   = IdlePDPT;
		pdpt_entry_t	*ppdpt64 = (pdpt_entry_t *)IdlePDPT64;
		pdpt_entry_t	*ppml4   = (pdpt_entry_t *)IdlePML4;
		int		istate = ml_set_interrupts_enabled(FALSE);

		/*
		 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
		 * with page bits set for the correct IA-32e operation and so that
		 * the legacy-mode IdlePDPT is retained for slave processor start-up.
		 * This is necessary due to the incompatible use of page bits between
		 * 64-bit and legacy modes.
		 */
		kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePML4); /* setup in start.s for us */
		kernel_pmap->pm_pml4 = IdlePML4;
		kernel_pmap->pm_pdpt = (pd_entry_t *)
			((unsigned int)IdlePDPT64 | KERNBASE);
#define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
		pmap_store_pte(kernel_pmap->pm_pml4,
			       (uint32_t)IdlePDPT64 | PAGE_BITS);
		pmap_store_pte((ppdpt64+0), *(ppdpt+0) | PAGE_BITS);
		pmap_store_pte((ppdpt64+1), *(ppdpt+1) | PAGE_BITS);
		pmap_store_pte((ppdpt64+2), *(ppdpt+2) | PAGE_BITS);
		pmap_store_pte((ppdpt64+3), *(ppdpt+3) | PAGE_BITS);

		/*
		 * The kernel is also mapped in the uber-space at the 4GB starting
		 * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
		 */
		pmap_store_pte((ppml4+KERNEL_UBER_PML4_INDEX), *(ppml4+0));

		kernel64_cr3 = (addr64_t) kernel_pmap->pm_cr3;

		/* Re-initialize descriptors and prepare to switch modes */
		cpu_desc_init64(&cpu_data_master);
		current_cpu_datap()->cpu_is64bit = TRUE;
		current_cpu_datap()->cpu_active_cr3 = kernel64_cr3;

		pde_mapped_size = 512*4096;

		ml_set_interrupts_enabled(istate);
	}

	/* Sets 64-bit mode if required. */
	cpu_mode_init(&cpu_data_master);
	/* Update in-kernel CPUID information if we're now in 64-bit mode */

	kernel_pmap->pm_hold = (vm_offset_t)kernel_pmap->pm_pml4;

	kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
		VADDR(KPTDI,0), virtual_end);
	printf("PAE enabled\n");
	if (cpu_64bit) {
		printf("64 bit mode enabled\n"); kprintf("64 bit mode enabled\n");
	}
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
		avail_start, avail_end);

	/*
	 * By default for 64-bit users loaded at 4GB, share kernel mapping.
	 * But this may be overridden by the -no_shared_cr3 boot-arg.
	 */
	if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3, sizeof (no_shared_cr3))) {
		kprintf("Shared kernel address space disabled\n");
	}

#ifdef	PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif	/* PMAP_TRACES */
}
void
pmap_virtual_space(
	vm_offset_t	*startp,
	vm_offset_t	*endp)
{
	*startp = virtual_avail;
	*endp = virtual_end;
}
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	long		npages;
	vm_map_offset_t	vaddr;
	vm_offset_t	addr;
	vm_size_t	s;
	unsigned int	i;
	ppnum_t		ppn;

	/*
	 *	Allocate memory for the pv_head_table and its lock bits,
	 *	the modify bit array, and the pte_page table.
	 */

	/*
	 *	zero bias all these arrays now instead of off avail_start
	 *	so we cover all memory
	 */
	npages = (long)i386_btop(avail_end);
	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
			 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
			 + pv_lock_table_size(npages)
			 + pv_hash_lock_table_size((npvhash+1))
			 + npages);

	if (kernel_memory_allocate(kernel_map, &addr, s, 0,
				   KMA_KOBJECT | KMA_PERMANENT)
	    != KERN_SUCCESS)
		panic("pmap_init");

	memset((char *)addr, 0, s);

	if (0 == npvhash) panic("npvhash not initialized");

	/*
	 *	Allocate the structures first to preserve word-alignment.
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));

	pmap_phys_attributes = (char *) addr;

	{
		unsigned int		pn;
		ppnum_t			last_pn;
		pmap_memory_region_t	*pmptr = pmap_memory_regions;

		last_pn = (ppnum_t)i386_btop(avail_end);

		for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
			if (pmptr->type == kEfiConventionalMemory) {
				for (pn = pmptr->base; pn <= pmptr->end; pn++) {
					if (pn < last_pn) {
						pmap_phys_attributes[pn] |= PHYS_MANAGED;
						if (pn > last_managed_page)
							last_managed_page = pn;
					}
					if (pn < lowest_lo)
						pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
					else if (pn >= lowest_hi && pn <= highest_hi)
						pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}

	ppn = pmap_find_phys(kernel_pmap, vaddr);
	if (ppn)
		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

	/*
	 *	Create the zone of physical maps,
	 *	and of the physical-to-virtual entries.
	 */
	s = (vm_size_t) sizeof(struct pmap);
	pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
	zone_change(pmap_zone, Z_NOENCRYPT, TRUE);

	s = (vm_size_t) sizeof(struct pv_hashed_entry);
	pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
	zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);

	s = 63;
	pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
	zone_change(pdpt_zone, Z_NOENCRYPT, TRUE);

	kptobj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG), kptobj);
	kernel_pmap->pm_obj = kptobj;

	/* create pv entries for kernel pages mapped by low level
	   startup code.  these have to exist so we can pmap_remove()
	   e.g. kext pages from the middle of our addr space */
	vaddr = (vm_map_offset_t)0;
	for (ppn = 0; ppn < i386_btop(avail_start); ppn++) {
		pv_rooted_entry_t pv_e;

		pv_e = pai_to_pvh(ppn);
		pv_e->va = vaddr;
		vaddr += PAGE_SIZE;
		pv_e->pmap = kernel_pmap;
		queue_init(&pv_e->qlink);
	}

	pmap_initialized = TRUE;

	/*
	 *	Initialize pmap cache.
	 */
	pmap_cache_list = PMAP_NULL;
	pmap_cache_count = 0;
	simple_lock_init(&pmap_cache_lock, 0);

	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
}

#define managed_page(x)	( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
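
/*
 * Illustrative sketch (not part of the original source) of how managed_page()
 * is typically used by the routines below: convert a ppnum_t to a physical
 * attribute index and bail out early for pages the pmap keeps no attributes
 * for.  The function itself is hypothetical.
 */
#if 0	/* documentation-only example */
static boolean_t
example_page_is_managed(ppnum_t pn)
{
	int pai;

	if (pn == vm_page_guard_addr)
		return FALSE;		/* guard pages are never managed */
	pai = ppn_to_pai(pn);
	if (!managed_page(pai))
		return FALSE;		/* device or out-of-range page */
	return TRUE;
}
#endif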
/*
 *	this function is only used for debugging from the vm layer
 */
boolean_t
pmap_verify_free(
	ppnum_t	pn)
{
	pv_rooted_entry_t	pv_h;
	int			pai;
	boolean_t		result;

	assert(pn != vm_page_fictitious_addr);

	if (!pmap_initialized)
		return (TRUE);

	if (pn == vm_page_guard_addr)
		return TRUE;

	pai = ppn_to_pai(pn);
	if (!managed_page(pai))
		return (FALSE);
	pv_h = pai_to_pvh(pn);
	result = (pv_h->pmap == PMAP_NULL);
	return (result);
}
boolean_t
pmap_is_empty(
	pmap_t		pmap,
	vm_map_offset_t	va_start,
	vm_map_offset_t	va_end)
{
	vm_map_offset_t	offset;
	ppnum_t		phys_page;

	if (pmap == PMAP_NULL) {
		return TRUE;
	}
	/*
	 * Check the resident page count
	 * - if it's zero, the pmap is completely empty.
	 * This short-circuit test prevents a virtual address scan which is
	 * painfully slow for 64-bit spaces.
	 * This assumes the count is correct
	 * .. the debug kernel ought to be checking perhaps by page table walk.
	 */
	if (pmap->stats.resident_count == 0)
		return TRUE;

	for (offset = va_start;
	     offset < va_end;
	     offset += PAGE_SIZE_64) {
		phys_page = pmap_find_phys(pmap, offset);
		if (phys_page) {
			if (pmap != kernel_pmap &&
			    pmap->pm_task_map == TASK_MAP_32BIT &&
			    offset >= HIGH_MEM_BASE) {
				/*
				 * The "high_shared_pde" is used to share
				 * the entire top-most 2MB of address space
				 * between the kernel and all 32-bit tasks.
				 * So none of this can be removed from 32-bit
				 * tasks.
				 * Let's pretend there's nothing up
				 * there...
				 */
				continue;
			}
			kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
				"page %d at 0x%llx\n",
				pmap, va_start, va_end, phys_page, offset);
			return FALSE;
		}
	}

	return TRUE;
}
/*
 *	Create and return a physical map.
 *
 *	If the size specified for the map
 *	is zero, the map is an actual physical
 *	map, and may be referenced by the
 *	hardware.
 *
 *	If the size specified is non-zero,
 *	the map will be used in software only, and
 *	is bounded by that size.
 */
pmap_t
pmap_create(
	vm_map_size_t	sz,
	boolean_t	is_64bit)
{
	pmap_t		p;
	unsigned int	i;
	vm_offset_t	va;
	vm_size_t	size;
	pdpt_entry_t	*pdpt;
	pml4_entry_t	*pml4p;
	pd_entry_t	*pdp;
	pt_entry_t	template;
	pmap_paddr_t	pa;

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
		   (int) (sz>>32), (int) sz, (int) is_64bit, 0, 0);

	size = (vm_size_t) sz;

	/*
	 *	A software use-only map doesn't even need a map.
	 */
	if (size != 0) {
		return (PMAP_NULL);
	}

	p = (pmap_t) zalloc(pmap_zone);
	if (PMAP_NULL == p)
		panic("pmap_create zalloc");

	/* init counts now since we'll be bumping some */
	simple_lock_init(&p->lock, 0);
	p->stats.resident_count = 0;
	p->stats.resident_max = 0;
	p->stats.wired_count = 0;
	p->ref_count = 1;
	p->pm_shared = FALSE;

	assert(!is_64bit || cpu_64bit);
	p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;

	if (!cpu_64bit) {
		/* legacy 32 bit setup */
		/* in the legacy case the pdpt layer is hardwired to 4 entries and each
		 * entry covers 1GB of addr space */
		if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
			panic("pmap_create kmem_alloc_kobject");
		p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
		if ((vm_offset_t)NULL == p->pm_hold) {
			panic("pdpt zalloc");
		}
		pdpt = (pdpt_entry_t *) ((p->pm_hold + 31) & ~31);
		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)pdpt);
		if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG))))
			panic("pmap_create vm_object_allocate");

		memset((char *)p->dirbase, 0, NBPTD);

		va = (vm_offset_t)p->dirbase;
		p->pdirbase = kvtophys(va);

		template = INTEL_PTE_VALID;
		for (i = 0; i < NPGPTD; i++, pdpt++) {
			pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
			pmap_store_pte(pdpt, pa | template);
		}

		/* map the high shared pde */
		pmap_store_pte(pmap_pde(p, HIGH_MEM_BASE), high_shared_pde);
	} else {
		/* 64 bit setup */

		/* alloc the pml4 page in kernel vm */
		if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_hold), PAGE_SIZE))
			panic("pmap_create kmem_alloc_kobject pml4");

		memset((char *)p->pm_hold, 0, PAGE_SIZE);
		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold);

		OSAddAtomic(1, &inuse_ptepages_count);

		/* allocate the vm_objs to hold the pdpt, pde and pte pages */

		if (NULL == (p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS))))
			panic("pmap_create pdpt obj");

		if (NULL == (p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS))))
			panic("pmap_create pdpt obj");

		if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS))))
			panic("pmap_create pte obj");

		/* uber space points to uber mapped kernel */
		pml4p = pmap64_pml4(p, 0ULL);
		pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX), *kernel_pmap->pm_pml4);

		if (!is_64bit) {
			while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) {
				pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another pde entry */
			}
			pmap_store_pte(pdp, high_shared_pde);
		}
	}

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
		   (int) p, is_64bit, 0, 0, 0);

	return (p);
}
/*
 * The following routines implement the shared address optimization for 64-bit
 * users with a 4GB page zero.
 *
 * pmap_set_4GB_pagezero()
 *	is called in the exec and fork paths to mirror the kernel's
 *	mapping in the bottom 4G of the user's pmap. The task mapping changes
 *	from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
 *	without doing anything if the -no_shared_cr3 boot-arg is set.
 *
 * pmap_clear_4GB_pagezero()
 *	is called in the exec/exit paths to undo this mirror. The task mapping
 *	reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
 *	CR3 by calling pmap_load_kernel_cr3().
 *
 * pmap_load_kernel_cr3()
 *	loads cr3 with the kernel's page table. In addition to being called
 *	by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
 *	when we go idle in the context of a shared map.
 *
 * Further notes on per-cpu data used:
 *
 *	cpu_kernel_cr3	is the cr3 for the kernel's pmap.
 *			This is loaded in a trampoline on entering the kernel
 *			from a 32-bit user (or non-shared-cr3 64-bit user).
 *	cpu_task_cr3	is the cr3 for the current thread.
 *			This is loaded in a trampoline as we exit the kernel.
 *	cpu_active_cr3	reflects the cr3 currently loaded.
 *			However, the low order bit is set when the
 *			processor is idle or interrupts are disabled
 *			while the system pmap lock is held. It is used by
 *			tlb shoot-down.
 *	cpu_task_map	indicates whether the task cr3 belongs to
 *			a 32-bit, a 64-bit or a 64-bit shared map.
 *			The latter allows the avoidance of the cr3 load
 *			on kernel entry and exit.
 *	cpu_tlb_invalid	set TRUE when a tlb flush is requested.
 *			If the cr3 is "inactive" (the cpu is idle or the
 *			system-wide pmap lock is held) this is not serviced by
 *			an IPI but at the time when the cr3 becomes "active".
 */
void
pmap_set_4GB_pagezero(pmap_t p)
{
	pdpt_entry_t	*user_pdptp;
	pdpt_entry_t	*kern_pdptp;

	assert(p->pm_task_map != TASK_MAP_32BIT);

	/* Kernel-shared cr3 may be disabled by boot arg. */
	if (no_shared_cr3)
		return;

	/*
	 * Set the bottom 4 3rd-level pte's to be the kernel's.
	 */
	PMAP_LOCK(p);
	while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
		PMAP_UNLOCK(p);
		pmap_expand_pml4(p, 0x0);
		PMAP_LOCK(p);
	}
	kern_pdptp = kernel_pmap->pm_pdpt;
	pmap_store_pte(user_pdptp+0, *(kern_pdptp+0));
	pmap_store_pte(user_pdptp+1, *(kern_pdptp+1));
	pmap_store_pte(user_pdptp+2, *(kern_pdptp+2));
	pmap_store_pte(user_pdptp+3, *(kern_pdptp+3));
	p->pm_task_map = TASK_MAP_64BIT_SHARED;
	PMAP_UNLOCK(p);
}

void
pmap_clear_4GB_pagezero(pmap_t p)
{
	pdpt_entry_t	*user_pdptp;

	if (p->pm_task_map != TASK_MAP_64BIT_SHARED)
		return;

	PMAP_LOCK(p);

	p->pm_task_map = TASK_MAP_64BIT;

	pmap_load_kernel_cr3();

	user_pdptp = pmap64_pdpt(p, 0x0);
	pmap_store_pte(user_pdptp+0, 0);
	pmap_store_pte(user_pdptp+1, 0);
	pmap_store_pte(user_pdptp+2, 0);
	pmap_store_pte(user_pdptp+3, 0);

	PMAP_UNLOCK(p);
}

void
pmap_load_kernel_cr3(void)
{
	uint64_t	kernel_cr3;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	/*
	 * Reload cr3 with the true kernel cr3.
	 */
	kernel_cr3 = current_cpu_datap()->cpu_kernel_cr3;
	set64_cr3(kernel_cr3);
	current_cpu_datap()->cpu_active_cr3 = kernel_cr3;
	current_cpu_datap()->cpu_tlb_invalid = FALSE;
	__asm__ volatile("mfence");
}
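
/*
 * Illustrative sketch (not part of the original source) of the exec-path
 * sequence described in the comment block above.  The surrounding function
 * and its flag are hypothetical; only the two pmap calls and the task-map
 * transitions they perform come from this file.
 */
#if 0	/* documentation-only example */
static void
example_exec_transition(pmap_t new_pmap, boolean_t wants_4gb_pagezero)
{
	if (wants_4gb_pagezero)
		pmap_set_4GB_pagezero(new_pmap);	/* TASK_MAP_64BIT -> TASK_MAP_64BIT_SHARED */
	else
		pmap_clear_4GB_pagezero(new_pmap);	/* back to TASK_MAP_64BIT, kernel cr3 reloaded */
}
#endif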
/*
 *	Retire the given physical map from service.
 *	Should only be called if the map contains
 *	no valid mappings.
 */
void
pmap_destroy(
	register pmap_t	p)
{
	register int	c;

	if (p == PMAP_NULL)
		return;

	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
		   (int) p, 0, 0, 0, 0);

	PMAP_LOCK(p);

	c = --p->ref_count;

	if (c == 0) {
		/*
		 * If some cpu is not using the physical pmap pointer that it
		 * is supposed to be (see set_dirbase), we might be using the
		 * pmap that is being destroyed! Make sure we are
		 * physically on the right pmap:
		 */
		PMAP_UPDATE_TLBS(p, (addr64_t)0x0ULL,
				 0xFFFFFFFFFFFFF000ULL);
	}

	PMAP_UNLOCK(p);

	if (c != 0) {
		PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
			   (int) p, 1, 0, 0, 0);
		return;	/* still in use */
	}

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	if (!cpu_64bit) {
		OSAddAtomic(-p->pm_obj->resident_page_count, &inuse_ptepages_count);

		kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
		zfree(pdpt_zone, (void *)p->pm_hold);

		vm_object_deallocate(p->pm_obj);
	} else {
		/* 64 bit */
		int inuse_ptepages = 0;

		/* free 64 bit mode structs */
		kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE);

		inuse_ptepages += p->pm_obj_pml4->resident_page_count;
		vm_object_deallocate(p->pm_obj_pml4);

		inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
		vm_object_deallocate(p->pm_obj_pdpt);

		inuse_ptepages += p->pm_obj->resident_page_count;
		vm_object_deallocate(p->pm_obj);

		OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
	}
	zfree(pmap_zone, p);

	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}
/*
 *	Add a reference to the specified pmap.
 */
void
pmap_reference(
	register pmap_t	p)
{
	if (p != PMAP_NULL) {
		PMAP_LOCK(p);
		p->ref_count++;
		PMAP_UNLOCK(p);
	}
}

/*
 * Remove phys addr if mapped in specified map
 */
void
pmap_remove_some_phys(
	__unused pmap_t		map,
	__unused ppnum_t	pn)
{

	/* Implement to support working set code */

}

/*
 *	Disconnect all mappings for this page and return reference and change status
 *	in generic format.
 */
unsigned int pmap_disconnect(
	ppnum_t pa)
{
	pmap_page_protect(pa, 0);	/* disconnect the page */
	return (pmap_get_refmod(pa));	/* return ref/chg status */
}
/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 *	Will not increase permissions.
 */
void
pmap_protect(
	pmap_t		map,
	vm_map_offset_t	sva,
	vm_map_offset_t	eva,
	vm_prot_t	prot)
{
	register pt_entry_t	*pde;
	register pt_entry_t	*spte, *epte;
	vm_map_offset_t		lva;
	vm_map_offset_t		orig_sva;
	boolean_t		set_NX;

	if (map == PMAP_NULL)
		return;

	if (prot == VM_PROT_NONE) {
		pmap_remove(map, sva, eva);
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
		   (int) map,
		   (int) (sva>>32), (int) sva,
		   (int) (eva>>32), (int) eva);

	if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled )
		set_NX = FALSE;
	else
		set_NX = TRUE;

	PMAP_LOCK(map);

	orig_sva = sva;
	while (sva < eva) {
		lva = (sva + pde_mapped_size) & ~(pde_mapped_size-1);
		if (lva > eva)
			lva = eva;
		pde = pmap_pde(map, sva);
		if (pde && (*pde & INTEL_PTE_VALID)) {
			spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size-1)));
			spte = &spte[ptenum(sva)];
			epte = &spte[intel_btop(lva-sva)];

			while (spte < epte) {

				if (*spte & INTEL_PTE_VALID) {

					if (prot & VM_PROT_WRITE)
						pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
					else
						pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));

					if (set_NX == TRUE)
						pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
					else
						pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));
				}
				spte++;
			}
		}
		sva = lva;
	}
	PMAP_UPDATE_TLBS(map, orig_sva, eva);

	PMAP_UNLOCK(map);

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}
/* Map a (possibly) autogenned block */
void
pmap_map_block(
	pmap_t		pmap,
	addr64_t	va,
	ppnum_t		pa,
	uint32_t	size,
	vm_prot_t	prot,
	int		attr,
	__unused unsigned int	flags)
{
	uint32_t	page;

	for (page = 0; page < size; page++) {
		pmap_enter(pmap, va, pa, prot, attr, TRUE);
		va += PAGE_SIZE;
		pa++;
	}
}
/*
 *	Routine:	pmap_change_wiring
 *	Function:	Change the wiring attribute for a map/virtual-address
 *			pair.
 *	In/out conditions:
 *			The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(
	register pmap_t	map,
	vm_map_offset_t	vaddr,
	boolean_t	wired)
{
	register pt_entry_t	*pte;

	/*
	 *	We must grab the pmap system lock because we may
	 *	change a pte_page queue.
	 */
	PMAP_LOCK(map);

	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
		panic("pmap_change_wiring: pte missing");

	if (wired && !iswired(*pte)) {
		/*
		 *	wiring down mapping
		 */
		OSAddAtomic(+1, &map->stats.wired_count);
		pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
	}
	else if (!wired && iswired(*pte)) {
		/*
		 *	unwiring mapping
		 */
		assert(map->stats.wired_count >= 1);
		OSAddAtomic(-1, &map->stats.wired_count);
		pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
	}

	PMAP_UNLOCK(map);
}
/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 *		Change to shim for backwards compatibility but will not
 *		work for 64 bit systems.  Some old drivers that we cannot
 *		change need this.
 */
vm_offset_t
pmap_extract(
	register pmap_t	pmap,
	vm_map_offset_t	vaddr)
{
	ppnum_t		ppn;
	vm_offset_t	paddr;

	paddr = (vm_offset_t)0;
	ppn = pmap_find_phys(pmap, vaddr);

	if (ppn) {
		paddr = ((vm_offset_t)i386_ptob(ppn)) | ((vm_offset_t)vaddr & INTEL_OFFMASK);
	}
	return (paddr);
}
void
pmap_expand_pml4(
	pmap_t		map,
	vm_map_offset_t	vaddr)
{
	register vm_page_t	m;
	register pmap_paddr_t	pa;
	uint64_t		i;
	ppnum_t			pn;
	pml4_entry_t		*pml4p;

	if (kernel_pmap == map) panic("expand kernel pml4");

	pml4p = pmap64_pml4(map, vaddr);
	if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");

	/*
	 *	Allocate a VM page for the pml4 page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL)
		VM_PAGE_WAIT();

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = m->phys_page;
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);

	/* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		PMAP_UNLOCK(map);
		vm_object_unlock(map->pm_obj_pml4);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		return;
	}
	pmap_set_noencrypt(pn);

	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
		      map, map->pm_obj_pml4, vaddr, i);
	}
	vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	pmap_store_pte(pml4p, pa_to_pte(pa)
				| INTEL_PTE_VALID
				| INTEL_PTE_USER
				| INTEL_PTE_WRITE);

	PMAP_UNLOCK(map);

	return;
}
void
pmap_expand_pdpt(
	pmap_t		map,
	vm_map_offset_t	vaddr)
{
	register vm_page_t	m;
	register pmap_paddr_t	pa;
	uint64_t		i;
	ppnum_t			pn;
	pdpt_entry_t		*pdptp;

	if (kernel_pmap == map) panic("expand kernel pdpt");

	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */
	}

	/*
	 *	Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL)
		VM_PAGE_WAIT();

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = m->phys_page;
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pdpt);

	PMAP_LOCK(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
		PMAP_UNLOCK(map);
		vm_object_unlock(map->pm_obj_pdpt);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		return;
	}
	pmap_set_noencrypt(pn);

	if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
		panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
		      map, map->pm_obj_pdpt, vaddr, i);
	}
	vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
	vm_object_unlock(map->pm_obj_pdpt);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

	pmap_store_pte(pdptp, pa_to_pte(pa)
				| INTEL_PTE_VALID
				| INTEL_PTE_USER
				| INTEL_PTE_WRITE);

	PMAP_UNLOCK(map);

	return;
}
/*
 *	Routine:	pmap_expand
 *
 *	Expands a pmap to be able to map the specified virtual address.
 *
 *	Allocates new virtual memory for the P0 or P1 portion of the
 *	pmap, then re-maps the physical pages that were in the old
 *	pmap to be in the new pmap.
 *
 *	Must be called with the pmap system and the pmap unlocked,
 *	since these must be unlocked to use vm_allocate or vm_deallocate.
 *	Thus it must be called in a loop that checks whether the map
 *	has been expanded enough.
 *	(We won't loop forever, since page tables aren't shrunk.)
 */
void
pmap_expand(
	pmap_t		map,
	vm_map_offset_t	vaddr)
{
	pt_entry_t		*pdp;
	register vm_page_t	m;
	register pmap_paddr_t	pa;
	uint64_t		i;
	ppnum_t			pn;

	/*
	 * if not the kernel map (while we are still compat kernel mode)
	 * and we are 64 bit, propagate expand upwards
	 */
	if (cpu_64bit && (map != kernel_pmap)) {
		while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
			pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */
		}
	}

	/*
	 *	Allocate a VM page for the pde entries.
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL)
		VM_PAGE_WAIT();

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = m->phys_page;
	pa = i386_ptob(pn);
	i = pdeidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj);

	PMAP_LOCK(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
		PMAP_UNLOCK(map);
		vm_object_unlock(map->pm_obj);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		return;
	}
	pmap_set_noencrypt(pn);

	if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
		panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
		      map, map->pm_obj, vaddr, i);
	}
	vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
	vm_object_unlock(map->pm_obj);

	/*
	 *	refetch while locked
	 */
	pdp = pmap_pde(map, vaddr);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pmap_store_pte(pdp, pa_to_pte(pa)
				| INTEL_PTE_VALID
				| INTEL_PTE_USER
				| INTEL_PTE_WRITE);

	PMAP_UNLOCK(map);

	return;
}
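
/*
 * Illustrative sketch (not part of the original source) of the calling
 * contract described above: pmap_expand() is invoked, unlocked, inside a
 * loop that re-checks whether the map now covers the target address.  The
 * function name is hypothetical.
 */
#if 0	/* documentation-only example */
static void
example_ensure_pte(pmap_t map, vm_map_offset_t vaddr)
{
	pt_entry_t *pte;

	while ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
		/* not mapped yet: grow the page-table hierarchy and retry */
		pmap_expand(map, vaddr);
	}
	/* pte now points at the (possibly still invalid) entry for vaddr */
}
#endif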
/*
 * pmap_sync_page_data_phys(ppnum_t pa)
 *
 * Invalidates all of the instruction cache on a physical page and
 * pushes any dirty data from the data cache for the same physical page.
 * Not required in i386.
 */
void
pmap_sync_page_data_phys(__unused ppnum_t pa)
{
	return;
}

/*
 * pmap_sync_page_attributes_phys(ppnum_t pa)
 *
 * Write back and invalidate all cachelines on a physical page.
 */
void
pmap_sync_page_attributes_phys(ppnum_t pa)
{
	cache_flush_page_phys(pa);
}
#ifdef CURRENTLY_UNUSED_AND_UNTESTED

/*
 *	Routine:	pmap_collect
 *	Function:
 *		Garbage collects the physical map system for
 *		pages which are no longer used.
 *		Success need not be guaranteed -- that is, there
 *		may well be pages which are not referenced, but
 *		others may be collected.
 *	Usage:
 *		Called by the pageout daemon when pages are scarce.
 */
void
pmap_collect(
	pmap_t		p)
{
	register pt_entry_t	*pdp, *ptp;
	pt_entry_t		*eptp;
	int			wired;

	if (p == PMAP_NULL)
		return;

	if (p == kernel_pmap)
		return;

	/*
	 *	Garbage collect map.
	 */
	PMAP_LOCK(p);

	for (pdp = (pt_entry_t *)p->dirbase;
	     pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
	     pdp++)
	{
		if (*pdp & INTEL_PTE_VALID) {
			if (*pdp & INTEL_PTE_REF) {
				pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
			} else {
				ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
				eptp = ptp + NPTEPG;

				/*
				 * If the pte page has any wired mappings, we cannot
				 * free it.
				 */
				wired = 0;
				{
					register pt_entry_t *ptep;
					for (ptep = ptp; ptep < eptp; ptep++) {
						if (iswired(*ptep)) {
							wired = 1;
							break;
						}
					}
				}
				if (!wired) {
					/*
					 * Remove the virtual addresses mapped by this pte page.
					 */
					pmap_remove_range(p,
						pdetova(pdp - (pt_entry_t *)p->dirbase),
						ptp, eptp);

					/*
					 * Invalidate the page directory pointer.
					 */
					pmap_store_pte(pdp, 0x0);

					PMAP_UNLOCK(p);

					/*
					 * And free the pte page itself.
					 */
					{
						register vm_page_t m;

						vm_object_lock(p->pm_obj);

						m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
						if (m == VM_PAGE_NULL)
							panic("pmap_collect: pte page not in object");

						VM_PAGE_FREE(m);
						OSAddAtomic(-1, &inuse_ptepages_count);

						vm_object_unlock(p->pm_obj);
					}

					PMAP_LOCK(p);
				}
			}
		}
	}

	PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
	PMAP_UNLOCK(p);
	return;
}
#endif	/* CURRENTLY_UNUSED_AND_UNTESTED */
void
pmap_copy_page(ppnum_t src, ppnum_t dst)
{
	bcopy_phys((addr64_t)i386_ptob(src),
		   (addr64_t)i386_ptob(dst),
		   PAGE_SIZE);
}

/*
 *	Routine:	pmap_pageable
 *	Function:
 *		Make the specified pages (by pmap, offset)
 *		pageable (or not) as requested.
 *
 *		A page which is not pageable may not take
 *		a fault; therefore, its page table entry
 *		must remain valid for the duration.
 *
 *		This routine is merely advisory; pmap_enter
 *		will specify that these pages are to be wired
 *		down (or not) as appropriate.
 */
void
pmap_pageable(
	__unused pmap_t			pmap,
	__unused vm_map_offset_t	start_addr,
	__unused vm_map_offset_t	end_addr,
	__unused boolean_t		pageable)
{
#ifdef	lint
	pmap++; start_addr++; end_addr++; pageable++;
#endif	/* lint */
}
/*
 *	Clear specified attribute bits.
 */
void
phys_attribute_clear(
	ppnum_t		pn,
	int		bits)
{
	pv_rooted_entry_t		pv_h;
	register pv_hashed_entry_t	pv_e;
	register pt_entry_t		*pte;
	int				pai;
	register pmap_t			pmap;
	vm_map_offset_t			va;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr)
		return;

	pai = ppn_to_pai(pn);

	if (!managed_page(pai)) {
		/*
		 *	Not a managed page.
		 */
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
		   (int) pn, bits, 0, 0, 0);

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	/*
	 * Walk down PV list, clearing all modify or reference bits.
	 * We do not have to lock the pv_list because we have
	 * the entire pmap system locked.
	 */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */

		pv_e = (pv_hashed_entry_t)pv_h;

		do {
			pmap = pv_e->pmap;
			va = pv_e->va;

			/*
			 * Clear modify and/or reference bits.
			 */
			pte = pmap_pte(pmap, va);
			pmap_update_pte(pte, *pte, (*pte & ~bits));
			/* Ensure all processors using this translation
			 * invalidate this TLB entry. The invalidation *must* follow
			 * the PTE update, to ensure that the TLB shadow of the
			 * 'D' bit (in particular) is synchronized with the
			 * updated PTE.
			 */
			PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);

		} while (pv_e != (pv_hashed_entry_t)pv_h);
	}
	pmap_phys_attributes[pai] &= ~bits;

	UNLOCK_PVH(pai);

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}
/*
 *	Check specified attribute bits.
 */
int
phys_attribute_test(
	ppnum_t		pn,
	int		bits)
{
	pv_rooted_entry_t		pv_h;
	register pv_hashed_entry_t	pv_e;
	register pt_entry_t		*pte;
	int				pai;
	register pmap_t			pmap;
	int				attributes = 0;
	vm_map_offset_t			va;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr)
		return 0;

	pai = ppn_to_pai(pn);

	if (!managed_page(pai)) {
		/*
		 *	Not a managed page.
		 */
		return (0);
	}

	/*
	 * super fast check...  if bits already collected
	 * no need to take any locks...
	 * if not set, we need to recheck after taking
	 * the lock in case they got pulled in while
	 * we were waiting for the lock
	 */
	if ( (pmap_phys_attributes[pai] & bits) == bits)
		return (bits);

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	attributes = pmap_phys_attributes[pai] & bits;

	/*
	 * Walk down PV list, checking the mappings until we
	 * reach the end or we've found the attributes we've asked for
	 * We do not have to lock the pv_list because we have
	 * the entire pmap system locked.
	 */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */
		pv_e = (pv_hashed_entry_t)pv_h;
		if (attributes != bits) do {

			pmap = pv_e->pmap;
			va = pv_e->va;
			/*
			 * first make sure any processor actively
			 * using this pmap, flushes its TLB state
			 */
			PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);

			/*
			 * pick up modify and/or reference bits from this mapping
			 */
			pte = pmap_pte(pmap, va);
			attributes |= (int)(*pte & bits);

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);

		} while ((attributes != bits) &&
			 (pv_e != (pv_hashed_entry_t)pv_h));
	}

	UNLOCK_PVH(pai);
	return (attributes);
}
/*
 *	Set specified attribute bits.
 */
void
phys_attribute_set(
	ppnum_t		pn,
	int		bits)
{
	int		pai;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr)
		return;

	pai = ppn_to_pai(pn);

	if (!managed_page(pai)) {
		/*
		 *	Not a managed page.
		 */
		return;
	}

	LOCK_PVH(pai);

	pmap_phys_attributes[pai] |= bits;

	UNLOCK_PVH(pai);
}
/*
 *	Set the modify bit on the specified physical page.
 */
void pmap_set_modify(
	ppnum_t	pn)
{
	phys_attribute_set(pn, PHYS_MODIFIED);
}

/*
 *	Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(
	ppnum_t	pn)
{
	phys_attribute_clear(pn, PHYS_MODIFIED);
}

/*
 *	pmap_is_modified:
 *
 *	Return whether or not the specified physical page is modified
 *	by any physical maps.
 */
boolean_t
pmap_is_modified(
	ppnum_t	pn)
{
	if (phys_attribute_test(pn, PHYS_MODIFIED))
		return TRUE;
	return FALSE;
}

/*
 *	pmap_clear_reference:
 *
 *	Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(
	ppnum_t	pn)
{
	phys_attribute_clear(pn, PHYS_REFERENCED);
}

void
pmap_set_reference(ppnum_t pn)
{
	phys_attribute_set(pn, PHYS_REFERENCED);
}

/*
 *	pmap_is_referenced:
 *
 *	Return whether or not the specified physical page is referenced
 *	by any physical maps.
 */
boolean_t
pmap_is_referenced(
	ppnum_t	pn)
{
	if (phys_attribute_test(pn, PHYS_REFERENCED))
		return TRUE;
	return FALSE;
}
/*
 * pmap_get_refmod(phys)
 *  returns the referenced and modified bits of the specified
 *  physical page.
 */
unsigned int
pmap_get_refmod(ppnum_t pa)
{
	int		refmod;
	unsigned int	retval = 0;

	refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);

	if (refmod & PHYS_MODIFIED)
		retval |= VM_MEM_MODIFIED;
	if (refmod & PHYS_REFERENCED)
		retval |= VM_MEM_REFERENCED;

	return (retval);
}
/*
 * pmap_clear_refmod(phys, mask)
 *  clears the referenced and modified bits as specified by the mask
 *  of the specified physical page.
 */
void
pmap_clear_refmod(ppnum_t pa, unsigned int mask)
{
	unsigned int	x86Mask;

	x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
		  | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
	phys_attribute_clear(pa, x86Mask);
}
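/*
 * Illustrative usage sketch (not from the original source; "pn" stands for
 * an arbitrary managed page number): the VM layer thinks in VM_MEM_* terms
 * and the two routines above translate to and from the PHYS_* bits, e.g.
 *
 *	unsigned int refmod = pmap_get_refmod(pn);
 *	if (refmod & VM_MEM_MODIFIED)
 *		;	// the page has been written through some mapping
 *	pmap_clear_refmod(pn, VM_MEM_REFERENCED);  // clears only PHYS_REFERENCED
 */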
void
invalidate_icache(__unused vm_offset_t	addr,
		  __unused unsigned	cnt,
		  __unused int		phys)
{
	return;
}

void
flush_dcache(__unused vm_offset_t	addr,
	     __unused unsigned		count,
	     __unused int		phys)
{
	return;
}
#if CONFIG_DTRACE
/*
 * Constrain DTrace copyin/copyout actions
 */
extern kern_return_t dtrace_copyio_preflight(addr64_t);
extern kern_return_t dtrace_copyio_postflight(addr64_t);

kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
{
	thread_t thread = current_thread();

	if (current_map() == kernel_map)
		return KERN_FAILURE;
	else if (thread->machine.specFlags & CopyIOActive)
		return KERN_FAILURE;
	else
		return KERN_SUCCESS;
}

kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
{
	return KERN_SUCCESS;
}
#endif /* CONFIG_DTRACE */
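/*
 * Note (not from the original source): preflight refuses DTrace copyio in
 * the two cases checked above -- the current address space is the kernel
 * map (so there is no user mapping to copy against), or the thread already
 * has CopyIOActive set in machine.specFlags, presumably to avoid re-entering
 * an in-progress copy window.
 */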
#if	MACH_KDB

/* show phys page mappings and attributes */

extern void	db_show_page(pmap_paddr_t pa);

#if 0
void
db_show_page(pmap_paddr_t pa)
{
	pv_entry_t	pv_h;
	int		pai;
	char		attr;

	pai = pa_index(pa);
	pv_h = pai_to_pvh(pai);

	attr = pmap_phys_attributes[pai];
	printf("phys page %llx ", pa);
	if (attr & PHYS_MODIFIED)
		printf("modified, ");
	if (attr & PHYS_REFERENCED)
		printf("referenced, ");
	if (pv_h->pmap || pv_h->next)
		printf(" mapped at\n");
	else
		printf(" not mapped\n");
	for (; pv_h; pv_h = pv_h->next)
		printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
}
#endif

#endif /* MACH_KDB */
#if	MACH_KDB
void db_kvtophys(vm_offset_t);
void db_show_vaddrs(pt_entry_t  *);

/*
 *	print out the results of kvtophys(arg)
 */
void
db_kvtophys(vm_offset_t vaddr)
{
	db_printf("0x%qx", kvtophys(vaddr));
}

/*
 *	Walk the pages tables.
 */
void
db_show_vaddrs(pt_entry_t *dirbase)
{
	pt_entry_t	*ptep, *pdep, tmp;
	unsigned int	x, y, pdecnt, ptecnt;

	if (dirbase == 0) {
		dirbase = kernel_pmap->dirbase;
	}
	if (dirbase == 0) {
		db_printf("need a dirbase...\n");
		return;
	}
	dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);

	db_printf("dirbase: 0x%x\n", dirbase);

	pdecnt = ptecnt = 0;
	pdep = &dirbase[0];
	for (y = 0; y < NPDEPG; y++, pdep++) {
		if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
			continue;
		}
		pdecnt++;
		ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
		db_printf("dir[%4d]: 0x%x\n", y, *pdep);
		for (x = 0; x < NPTEPG; x++, ptep++) {
			if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
				continue;
			}
			ptecnt++;
			db_printf("   tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
				x,
				*ptep,
				(y << 22) | (x << 12),
				*ptep & ~INTEL_OFFMASK);
		}
	}

	db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
}
#endif	/* MACH_KDB */
#include <mach_vm_debug.h>
#if	MACH_VM_DEBUG
#include <vm/vm_debug.h>

int
pmap_list_resident_pages(
	__unused pmap_t		pmap,
	__unused vm_offset_t	*listp,
	__unused int		space)
{
	return 0;
}
#endif	/* MACH_VM_DEBUG */
/* temporary workaround */
boolean_t
coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
{
#if 0
	pt_entry_t	*ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep)
		return FALSE;
	return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
#else
	return TRUE;
#endif
}

boolean_t
phys_page_exists(
	      ppnum_t pn)
{
	assert(pn != vm_page_fictitious_addr);

	if (!pmap_initialized)
		return (TRUE);

	if (pn == vm_page_guard_addr)
		return FALSE;

	if (!managed_page(ppn_to_pai(pn)))
		return FALSE;

	return TRUE;
}
void
pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
{
	int		i;
	pt_entry_t	*opte, *npte;
	pt_entry_t	pte;
	spl_t		s;

	for (i = 0; i < cnt; i++) {
		s = splhigh();
		opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
		if (0 == opte)
			panic("kernel_commpage");
		pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
		pte &= ~INTEL_PTE_WRITE;	// ensure read only
		npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
		if (0 == npte)
			panic("user_commpage");
		pmap_store_pte(npte, pte);
		splx(s);
		kernel_commpage += INTEL_PGBYTES;
		user_commpage += INTEL_PGBYTES;
	}
}
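/*
 * Note (not from the original source): each pass of the loop above aliases
 * one kernel commpage frame at the matching user commpage address; the
 * copied PTE picks up INTEL_PTE_USER | INTEL_PTE_GLOBAL and has
 * INTEL_PTE_WRITE cleared, so user mode sees the shared page read-only.
 */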
#define PMAP_COMMPAGE64_CNT  (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];

void
pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
{
	int		i;
	pt_entry_t	*kptep;

	PMAP_LOCK(kernel_pmap);

	for (i = 0; i < cnt; i++) {
		kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
		if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
			panic("pmap_commpage64_init pte");
		pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
	}
	PMAP_UNLOCK(kernel_pmap);
}
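/*
 * Note (assumption, not stated in this file): the read-only, user-accessible
 * PTE templates captured in pmap_commpage64_ptes[] are presumably what gets
 * installed when the 64-bit commpage is wired into a user address space
 * elsewhere in the pmap code.
 */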
static cpu_pmap_t		cpu_pmap_master;

struct cpu_pmap *
pmap_cpu_alloc(boolean_t is_boot_cpu)
{
	int			ret;
	int			i;
	cpu_pmap_t		*cp;
	vm_offset_t		address;
	vm_map_address_t	mapaddr;
	vm_map_entry_t		entry;
	pt_entry_t		*pte;

	if (is_boot_cpu) {
		cp = &cpu_pmap_master;
	} else {
		/*
		 * The per-cpu pmap data structure itself.
		 */
		ret = kmem_alloc(kernel_map,
				 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
		if (ret != KERN_SUCCESS) {
			printf("pmap_cpu_alloc() failed ret=%d\n", ret);
			return NULL;
		}
		bzero((void *)cp, sizeof(cpu_pmap_t));

		/*
		 * The temporary windows used for copy/zero - see loose_ends.c
		 */
		ret = vm_map_find_space(kernel_map,
		    &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
		if (ret != KERN_SUCCESS) {
			printf("pmap_cpu_alloc() "
				"vm_map_find_space ret=%d\n", ret);
			pmap_cpu_free(cp);
			return NULL;
		}
		address = (vm_offset_t)mapaddr;

		for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
			spl_t s;
			s = splhigh();
			while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
				pmap_expand(kernel_pmap, (vm_map_offset_t)address);
			* (int *) pte = 0;
			cp->mapwindow[i].prv_CADDR = (caddr_t) address;
			cp->mapwindow[i].prv_CMAP = pte;
			splx(s);
		}
		vm_map_unlock(kernel_map);
	}

	cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
	cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
	cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;

	return cp;
}
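/*
 * Note (not from the original source): each cpu ends up with its own array
 * of PMAP_NWINDOWS temporary map windows; window 0 is reserved for
 * pmap_pte() (see the comment in pmap_get_mapwindow() below), and the
 * pdpt/pde/pte window indices initialized above presumably give the page
 * hierarchy walkers distinct starting slots within that array.
 */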
void
pmap_cpu_free(struct cpu_pmap *cp)
{
	if (cp != NULL && cp != &cpu_pmap_master) {
		kfree((void *) cp, sizeof(cpu_pmap_t));
	}
}
mapwindow_t *
pmap_get_mapwindow(pt_entry_t pentry)
{
	mapwindow_t	*mp;
	int		i;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	/*
	 * Note: 0th map reserved for pmap_pte()
	 */
	for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
		mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];

		if (*mp->prv_CMAP == 0) {
			pmap_store_pte(mp->prv_CMAP, pentry);

			invlpg((uintptr_t)mp->prv_CADDR);

			return (mp);
		}
	}
	panic("pmap_get_mapwindow: no windows available");

	return NULL;
}
void
pmap_put_mapwindow(mapwindow_t *mp)
{
	pmap_store_pte(mp->prv_CMAP, 0);
}
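/*
 * Illustrative usage sketch (not from the original source; the PTE bits and
 * variable names are examples): the copy/zero routines referenced from
 * pmap_cpu_alloc() use these windows roughly as follows:
 *
 *	mapwindow_t *map;
 *
 *	map = pmap_get_mapwindow(pa_to_pte(pa) | INTEL_PTE_VALID |
 *				 INTEL_PTE_REF | INTEL_PTE_MOD);
 *	... touch the frame through map->prv_CADDR ...
 *	pmap_put_mapwindow(map);	// releases the slot by zeroing prv_CMAP
 */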
void
pmap_switch(pmap_t tpmap)
{
	spl_t	s;

	s = splhigh();		/* Make sure interruptions are disabled */

	set_dirbase(tpmap, current_thread());

	splx(s);
}


/*
 * disable no-execute capability on
 * the specified pmap
 */
void pmap_disable_NX(pmap_t pmap) {

	pmap->nx_enabled = 0;
}
void
pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
		  vm_size_t *alloc_size, int *collectable, int *exhaustable)
{
	*count      = inuse_ptepages_count;
	*cur_size   = PAGE_SIZE * inuse_ptepages_count;
	*max_size   = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
	*elem_size  = PAGE_SIZE;
	*alloc_size = PAGE_SIZE;

	*collectable = 1;
	*exhaustable = 0;
}
vm_offset_t  pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
{
	enum high_fixed_addresses	a;
	a = e + HIGH_CPU_END * cpu;
	return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
}

vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
{
	return pmap_cpu_high_map_vaddr(cpu_number(), e);
}

vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
{
	enum high_fixed_addresses	a;
	vm_offset_t			vaddr;

	a = e + HIGH_CPU_END * cpu_number();
	vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
	pmap_store_pte(pte_unique_base + a, pte);

	/* TLB flush for this page for this cpu */
	invlpg((uintptr_t)vaddr);

	return vaddr;
}
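/*
 * Worked example (not from the original source): with HIGH_CPU_END fixed
 * slots reserved per cpu, cpu 2's copy of a slot e lives at fixed index
 * a = e + HIGH_CPU_END * 2, i.e. at pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a).
 * pmap_high_map() installs the caller's PTE at pte_unique_base + a and then
 * invalidates just that one page on the local cpu.
 */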
static void
pmap_cpuset_NMIPI(cpu_set cpu_mask) {
	unsigned int	cpu, cpu_bit;
	uint64_t	deadline;

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (cpu_mask & cpu_bit)
			cpu_NMI_interrupt(cpu);
	}
	deadline = mach_absolute_time() + (LockTimeOut);
	while (mach_absolute_time() < deadline)
		cpu_pause();
}
/*
 * Called with pmap locked, we:
 *  - scan through per-cpu data to see which other cpus need to flush
 *  - send an IPI to each non-idle cpu to be flushed
 *  - wait for all to signal back that they are inactive or we see that
 *    they are in an interrupt handler or at a safe point
 *  - flush the local tlb if active for this pmap
 *  - return ... the caller will unlock the pmap
 */
void
pmap_flush_tlbs(pmap_t	pmap)
{
	unsigned int	cpu;
	unsigned int	cpu_bit;
	cpu_set		cpus_to_signal;
	unsigned int	my_cpu = cpu_number();
	pmap_paddr_t	pmap_cr3 = pmap->pm_cr3;
	boolean_t	flush_self = FALSE;
	uint64_t	deadline;

	assert((processor_avail_count < 2) ||
	       (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 * Note: for the kernel pmap we look for 64-bit shared address maps.
	 */
	cpus_to_signal = 0;
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_datap(cpu)->cpu_running)
			continue;
		if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
		    (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
		    (pmap->pm_shared) ||
		    ((pmap == kernel_pmap) &&
		     (!CPU_CR3_IS_ACTIVE(cpu) ||
		      cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
			__asm__ volatile("mfence");

			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
		   (int) pmap, cpus_to_signal, flush_self, 0, 0);

	if (cpus_to_signal) {
		cpu_set	cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() + LockTimeOut;
		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			if (mach_absolute_time() > deadline) {
				if (mp_recent_debugger_activity())
					continue;
				if (!panic_active()) {
					pmap_tlb_flush_timeout = TRUE;
					pmap_cpuset_NMIPI(cpus_to_respond);
				}
				panic("pmap_flush_tlbs() timeout: "
				    "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
				    pmap, cpus_to_respond);
			}
			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				if ((cpus_to_respond & cpu_bit) != 0) {
					if (!cpu_datap(cpu)->cpu_running ||
					    cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
					    !CPU_CR3_IS_ACTIVE(cpu)) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0)
					break;
			}
		}
	}
	/*
	 * Flush local tlb if required.
	 * We need this flush even if the pmap being changed
	 * is the user map... in case we do a copyin/out
	 * before returning to user mode.
	 */
	if (flush_self)
		flush_tlb();

	if ((pmap == kernel_pmap) && (flush_self != TRUE)) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

	PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
		   (int) pmap, cpus_to_signal, flush_self, 0, 0);
}
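/*
 * Note (not from the original source): the receiving side of the
 * MP_TLB_FLUSH signal sent above is pmap_update_interrupt() below, which
 * calls process_pmap_updates() to flush the local TLB and clear
 * cpu_tlb_invalid -- the same flag the wait loop polls (together with
 * CPU_CR3_IS_ACTIVE()) to decide that a cpu no longer needs to respond.
 */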
void
process_pmap_updates(void)
{
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	flush_tlb();

	current_cpu_datap()->cpu_tlb_invalid = FALSE;
	__asm__ volatile("mfence");
}

void
pmap_update_interrupt(void)
{
	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
		   0, 0, 0, 0, 0);

	process_pmap_updates();

	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}
unsigned int pmap_cache_attributes(ppnum_t pn) {

	if (!managed_page(ppn_to_pai(pn)))
		return (VM_WIMG_IO);

	return (VM_WIMG_COPYBACK);
}
void
pmap_dump(pmap_t p)
{
	int i;

	kprintf("pmap 0x%x\n",p);

	kprintf("  pm_cr3 0x%llx\n",p->pm_cr3);
	kprintf("  pm_pml4 0x%x\n",p->pm_pml4);
	kprintf("  pm_pdpt 0x%x\n",p->pm_pdpt);

	kprintf("    pml4[0] 0x%llx\n",*p->pm_pml4);
	for (i = 0; i < 8; i++)
		kprintf("    pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
}

void pmap_dump_wrap(void)
{
	pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
}
void
dump_4GB_pdpt(pmap_t p)
{
	int		spl;
	pdpt_entry_t	*user_pdptp;
	pdpt_entry_t	*kern_pdptp;
	pdpt_entry_t	*pml4p;

	spl = splhigh();
	while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
		splx(spl);
		pmap_expand_pml4(p, 0x0);
		spl = splhigh();
	}
	kern_pdptp = kernel_pmap->pm_pdpt;
	if (kern_pdptp == NULL)
		panic("kern_pdptp == NULL");
	kprintf("dump_4GB_pdpt(%p)\n"
		"kern_pdptp=%p (phys=0x%016llx)\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"user_pdptp=%p (phys=0x%016llx)\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n",
		p, kern_pdptp, kvtophys(kern_pdptp),
		kern_pdptp+0, *(kern_pdptp+0),
		kern_pdptp+1, *(kern_pdptp+1),
		kern_pdptp+2, *(kern_pdptp+2),
		kern_pdptp+3, *(kern_pdptp+3),
		kern_pdptp+4, *(kern_pdptp+4),
		user_pdptp, kvtophys(user_pdptp),
		user_pdptp+0, *(user_pdptp+0),
		user_pdptp+1, *(user_pdptp+1),
		user_pdptp+2, *(user_pdptp+2),
		user_pdptp+3, *(user_pdptp+3),
		user_pdptp+4, *(user_pdptp+4));
	kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
		p->pm_cr3, p->pm_hold, p->pm_pml4);
	pml4p = (pdpt_entry_t *)p->pm_hold;
	if (pml4p == NULL)
		panic("user pml4p == NULL");
	kprintf("\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n",
		pml4p+0, *(pml4p+0),
		pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
	kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
		kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
	pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
	if (pml4p == NULL)
		panic("kern pml4p == NULL");
	kprintf("\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n",
		pml4p+0, *(pml4p+0),
		pml4p+511, *(pml4p+511));
	splx(spl);
}

void dump_4GB_pdpt_thread(thread_t tp)
{
	dump_4GB_pdpt(tp->map->pmap);