2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
64 * Physical Map management code for Intel i386, i486, and i860.
66 * Manages physical address maps.
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
93 #include <mach_ldebug.h>
95 #include <libkern/OSAtomic.h>
97 #include <mach/machine/vm_types.h>
99 #include <mach/boolean.h>
100 #include <kern/thread.h>
101 #include <kern/zalloc.h>
102 #include <kern/queue.h>
104 #include <kern/lock.h>
105 #include <kern/kalloc.h>
106 #include <kern/spl.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_kern.h>
111 #include <mach/vm_param.h>
112 #include <mach/vm_prot.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
116 #include <mach/machine/vm_param.h>
117 #include <machine/thread.h>
119 #include <kern/misc_protos.h> /* prototyping */
120 #include <i386/misc_protos.h>
122 #include <i386/cpuid.h>
123 #include <i386/cpu_data.h>
124 #include <i386/cpu_number.h>
125 #include <i386/machine_cpu.h>
126 #include <i386/seg.h>
127 #include <i386/serial_io.h>
128 #include <i386/cpu_capabilities.h>
129 #include <i386/machine_routines.h>
130 #include <i386/proc_reg.h>
131 #include <i386/tsc.h>
132 #include <i386/acpi.h>
133 #include <i386/pmap_internal.h>
136 #include <ddb/db_command.h>
137 #include <ddb/db_output.h>
138 #include <ddb/db_sym.h>
139 #include <ddb/db_print.h>
140 #endif /* MACH_KDB */
142 #include <vm/vm_protos.h>
145 #include <i386/mp_desc.h>
146 #include <i386/i386_lowmem.h>
147 #include <i386/lowglobals.h>
150 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151 #ifdef DEBUGINTERRUPTS
152 #define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
154 #define pmap_intr_assert()
160 #define POSTCODE_DELAY 1
161 #include <i386/postcode.h>
162 #endif /* IWANTTODEBUG */
void dump_pmap(pmap_t);
void dump_4GB_pdpt(pmap_t p);
void dump_4GB_pdpt_thread(thread_t tp);
int nx_enabled = 1;			/* enable no-execute protection */
#ifdef CONFIG_EMBEDDED
int allow_data_exec  = 0;		/* no exec from data, embedded is hardcore like that */
#else
int allow_data_exec  = VM_ABI_32;	/* 32-bit apps may execute data by default, 64-bit apps may not */
#endif
int allow_stack_exec = 0;		/* No apps may execute from the stack by default */

boolean_t	cpu_64bit = FALSE;
const boolean_t	cpu_64bit = TRUE;

boolean_t	pmap_trace = FALSE;

uint64_t	max_preemption_latency_tsc = 0;

pv_hashed_entry_t	*pv_hash_table;		/* hash lists */

uint32_t	npvhash = 0;
/*
 * pv_list entries are kept on a list that can only be accessed
 * with the pmap system locked (at SPLVM, not in the cpus_active set).
 * The list is refilled from the pv_hashed_list_zone if it becomes empty.
 */
pv_rooted_entry_t	pv_free_list = PV_ROOTED_ENTRY_NULL;	/* free list at SPLVM */
pv_hashed_entry_t	pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
pv_hashed_entry_t	pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
decl_simple_lock_data(,pv_hashed_free_list_lock)
decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
decl_simple_lock_data(,pv_hash_table_lock)

zone_t			pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

static zone_t		pdpt_zone;
/*
 *	First and last physical addresses that we maintain any information
 *	for.  Initialized to zero so that pmap operations done before
 *	pmap_init won't touch any non-existent structures.
 */
boolean_t	pmap_initialized = FALSE;	/* Has pmap_init completed? */

static struct vm_object kptobj_object_store;
static vm_object_t	kptobj;

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

/*
 *	Array of physical page attributes for managed pages.
 *	One byte per physical page.
 */
char		*pmap_phys_attributes;
unsigned int	last_managed_page = 0;

uint64_t	pde_mapped_size;
231 * Locking and TLB invalidation
235 * Locking Protocols: (changed 2/2007 JK)
237 * There are two structures in the pmap module that need locking:
238 * the pmaps themselves, and the per-page pv_lists (which are locked
239 * by locking the pv_lock_table entry that corresponds to the pv_head
240 * for the list in question.) Most routines want to lock a pmap and
241 * then do operations in it that require pv_list locking -- however
242 * pmap_remove_all and pmap_copy_on_write operate on a physical page
243 * basis and want to do the locking in the reverse order, i.e. lock
244 * a pv_list and then go through all the pmaps referenced by that list.
246 * The system wide pmap lock has been removed. Now, paths take a lock
247 * on the pmap before changing its 'shape' and the reverse order lockers
248 * (coming in by phys ppn) take a lock on the corresponding pv and then
249 * retest to be sure nothing changed during the window before they locked
250 * and can then run up/down the pv lists holding the list lock. This also
251 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
#define LOCK_PVH(index)		{	\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index)	{	\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
274 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
276 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
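/*
 * Illustrative sketch only (not part of the original source): the
 * reverse-order locking pattern described in the locking-protocol comment
 * above, using the LOCK_PVH/UNLOCK_PVH macros.  A physical-page path locks
 * the pv entry first, retests that the mapping it found is still current,
 * and only then walks the pv list.  The helper name and the exact retest
 * condition shown here are assumptions made purely for illustration.
 *
 *	static void
 *	example_reverse_order_locker(int pai, pmap_t expected_pmap)
 *	{
 *		pv_rooted_entry_t pv_h;
 *
 *		LOCK_PVH(pai);			// pv lock taken before any pmap lock
 *		pv_h = pai_to_pvh(pai);
 *		if (pv_h->pmap == expected_pmap) {
 *			// nothing changed in the window before we locked;
 *			// safe to run up/down the pv list under the pv lock
 *		}
 *		UNLOCK_PVH(pai);
 *	}
 */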
279 extern int max_lock_loops
;
281 unsigned int loop_count; \
282 loop_count = disable_serial_output ? max_lock_loops \
284 #define LOOP_CHECK(msg, pmap) \
285 if (--loop_count == 0) { \
286 mp_disable_preemption(); \
287 kprintf("%s: cpu %d pmap %x\n", \
288 msg, cpu_number(), pmap); \
289 Debugger("deadlock detection"); \
290 mp_enable_preemption(); \
291 loop_count = max_lock_loops; \
293 #else /* USLOCK_DEBUG */
295 #define LOOP_CHECK(msg, pmap)
296 #endif /* USLOCK_DEBUG */
298 unsigned pmap_memory_region_count
;
299 unsigned pmap_memory_region_current
;
301 pmap_memory_region_t pmap_memory_regions
[PMAP_MEMORY_REGIONS_SIZE
];
304 * Other useful macros.
306 #define current_pmap() (vm_map_pmap(current_thread()->map))
struct pmap	kernel_pmap_store;

pd_entry_t	high_shared_pde;
pd_entry_t	commpage64_pde;

struct zone	*pmap_zone;		/* zone of pmap structures */

int		pmap_debug = 0;		/* flag for debugging prints */

unsigned int	inuse_ptepages_count = 0;
long long	alloc_ptepages_count __attribute__((aligned(8))) = 0LL; /* aligned for atomic access */
unsigned int	bootstrap_wired_pages = 0;
int		pt_fake_zone_index = -1;

extern long	NMIPI_acks;
PMAP_ZINFO_SALLOC(vm_size_t bytes)
{
	current_thread()->tkm_shared.alloc += bytes;
}

PMAP_ZINFO_SFREE(vm_size_t bytes)
{
	current_thread()->tkm_shared.free += (bytes);
}
addr64_t	kernel64_cr3;
boolean_t	no_shared_cr3 = FALSE;	/* -no_shared_cr3 boot arg */

boolean_t	kernel_text_ps_4K = TRUE;
boolean_t	wpkernel = TRUE;

pt_entry_t	*DMAP1, *DMAP2;
/*
 * for legacy, returns the address of the pde entry.
 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
 * then returns the mapped address of the pde entry in that page
 */
pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	if (!cpu_64bit || (m == kernel_pmap)) {
		pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
	} else {
		assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
		pde = pmap64_pde(m, v);
	}
	return pde;
}
370 * the single pml4 page per pmap is allocated at pmap create time and exists
371 * for the duration of the pmap. we allocate this page in kernel vm (to save us one
372 * level of page table dynamic mapping.
373 * this returns the address of the requested pml4 entry in the top level page.
377 pmap64_pml4(pmap_t pmap
, vm_map_offset_t vaddr
)
379 return ((pml4_entry_t
*)pmap
->pm_hold
+ ((vm_offset_t
)((vaddr
>>PML4SHIFT
)&(NPML4PG
-1))));
383 * maps in the pml4 page, if any, containing the pdpt entry requested
384 * and returns the address of the pdpt entry in that mapped page
387 pmap64_pdpt(pmap_t pmap
, vm_map_offset_t vaddr
)
394 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
395 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) && (vaddr
< 0xFFFF800000000000ULL
)) {
399 pml4
= pmap64_pml4(pmap
, vaddr
);
401 if (pml4
&& ((*pml4
& INTEL_PTE_VALID
))) {
403 newpf
= *pml4
& PG_FRAME
;
406 for (i
=PMAP_PDPT_FIRST_WINDOW
; i
< PMAP_PDPT_FIRST_WINDOW
+PMAP_PDPT_NWINDOWS
; i
++) {
407 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
408 return((pdpt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
409 ((vm_offset_t
)((vaddr
>>PDPTSHIFT
)&(NPDPTPG
-1))));
413 current_cpu_datap()->cpu_pmap
->pdpt_window_index
++;
414 if (current_cpu_datap()->cpu_pmap
->pdpt_window_index
> (PMAP_PDPT_FIRST_WINDOW
+PMAP_PDPT_NWINDOWS
-1))
415 current_cpu_datap()->cpu_pmap
->pdpt_window_index
= PMAP_PDPT_FIRST_WINDOW
;
417 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CMAP
),
418 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
419 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CADDR
));
420 return ((pdpt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CADDR
) +
421 ((vm_offset_t
)((vaddr
>>PDPTSHIFT
)&(NPDPTPG
-1))));
428 * maps in the pdpt page, if any, containing the pde entry requested
429 * and returns the address of the pde entry in that mapped page
432 pmap64_pde(pmap_t pmap
, vm_map_offset_t vaddr
)
439 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
440 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) && (vaddr
< 0xFFFF800000000000ULL
)) {
444 /* if (vaddr & (1ULL << 63)) panic("neg addr");*/
445 pdpt
= pmap64_pdpt(pmap
, vaddr
);
447 if (pdpt
&& ((*pdpt
& INTEL_PTE_VALID
))) {
449 newpf
= *pdpt
& PG_FRAME
;
451 for (i
=PMAP_PDE_FIRST_WINDOW
; i
< PMAP_PDE_FIRST_WINDOW
+PMAP_PDE_NWINDOWS
; i
++) {
452 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
453 return((pd_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
454 ((vm_offset_t
)((vaddr
>>PDSHIFT
)&(NPDPG
-1))));
458 current_cpu_datap()->cpu_pmap
->pde_window_index
++;
459 if (current_cpu_datap()->cpu_pmap
->pde_window_index
> (PMAP_PDE_FIRST_WINDOW
+PMAP_PDE_NWINDOWS
-1))
460 current_cpu_datap()->cpu_pmap
->pde_window_index
= PMAP_PDE_FIRST_WINDOW
;
462 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CMAP
),
463 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
464 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CADDR
));
465 return ((pd_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CADDR
) +
466 ((vm_offset_t
)((vaddr
>>PDSHIFT
)&(NPDPG
-1))));
473 * Because the page tables (top 3 levels) are mapped into per cpu windows,
474 * callers must either disable interrupts or disable preemption before calling
475 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
476 * is in one of those mapped windows and that cannot be allowed to change until
477 * the caller is done using the returned pte pointer. When done, the caller
478 * restores interrupts or preemption to its previous state after which point the
479 * vaddr for the returned pte can no longer be used
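/*
 * Illustrative sketch only (not part of the original source): one safe way
 * for a caller to honor the rule above, by holding preemption disabled for
 * the entire lifetime of the pointer returned by pmap_pte().  The helper
 * name example_read_pte() is an assumption made purely for illustration.
 *
 *	static pt_entry_t
 *	example_read_pte(pmap_t pmap, vm_map_offset_t va)
 *	{
 *		pt_entry_t	*ptep;
 *		pt_entry_t	pte = 0;
 *
 *		mp_disable_preemption();
 *		ptep = pmap_pte(pmap, va);
 *		if (ptep != PT_ENTRY_NULL)
 *			pte = *ptep;		// window mapping is only stable here
 *		mp_enable_preemption();		// after this, ptep must not be used
 *		return (pte);
 *	}
 */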
484 * return address of mapped pte for vaddr va in pmap pmap.
485 * must be called with pre-emption or interrupts disabled
486 * if targeted pmap is not the kernel pmap
487 * since we may be passing back a virtual address that is
488 * associated with this cpu... pre-emption or interrupts
489 * must remain disabled until the caller is done using
490 * the pointer that was passed back.
492 * maps the pde page, if any, containing the pte in and returns
493 * the address of the pte in that mapped page
496 pmap_pte(pmap_t pmap
, vm_map_offset_t vaddr
)
503 pde
= pmap_pde(pmap
,vaddr
);
505 if (pde
&& ((*pde
& INTEL_PTE_VALID
))) {
506 if (*pde
& INTEL_PTE_PS
)
508 if (pmap
== kernel_pmap
)
509 return (vtopte(vaddr
)); /* compat kernel still has pte's mapped */
511 if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
512 panic("pmap_pte: unsafe call");
514 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
516 newpf
= *pde
& PG_FRAME
;
518 for (i
=PMAP_PTE_FIRST_WINDOW
; i
< PMAP_PTE_FIRST_WINDOW
+PMAP_PTE_NWINDOWS
; i
++) {
519 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
520 return((pt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
521 ((vm_offset_t
)i386_btop(vaddr
) & (NPTEPG
-1)));
525 current_cpu_datap()->cpu_pmap
->pte_window_index
++;
526 if (current_cpu_datap()->cpu_pmap
->pte_window_index
> (PMAP_PTE_FIRST_WINDOW
+PMAP_PTE_NWINDOWS
-1))
527 current_cpu_datap()->cpu_pmap
->pte_window_index
= PMAP_PTE_FIRST_WINDOW
;
529 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CMAP
),
530 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
531 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CADDR
));
532 return ((pt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CADDR
) +
533 ((vm_offset_t
)i386_btop(vaddr
) & (NPTEPG
-1)));
541 * Map memory at initialization. The physical addresses being
542 * mapped are not managed and are never unmapped.
544 * For now, VM is already on, we only need to map the
550 vm_map_offset_t start_addr
,
551 vm_map_offset_t end_addr
,
558 while (start_addr
< end_addr
) {
559 pmap_enter(kernel_pmap
, (vm_map_offset_t
)virt
,
560 (ppnum_t
) i386_btop(start_addr
), prot
, flags
, FALSE
);
567 extern pmap_paddr_t first_avail
;
568 extern vm_offset_t virtual_avail
, virtual_end
;
569 extern pmap_paddr_t avail_start
, avail_end
;
570 extern vm_offset_t sHIB
;
571 extern vm_offset_t eHIB
;
572 extern vm_offset_t stext
;
573 extern vm_offset_t etext
;
574 extern vm_offset_t sdata
;
576 extern void *KPTphys
;
582 * Here early in the life of a processor (from cpu_mode_init()).
586 * Initialize the per-cpu, TLB-related fields.
588 current_cpu_datap()->cpu_active_cr3
= kernel_pmap
->pm_cr3
;
589 current_cpu_datap()->cpu_tlb_invalid
= FALSE
;
593 pmap_high_shared_remap(enum high_fixed_addresses e
, vm_offset_t va
, int sz
)
595 vm_offset_t ve
= pmap_index_to_virt(e
);
601 assert(0 == (va
& PAGE_MASK
)); /* expecting page aligned */
603 ptep
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)ve
);
605 for (i
=0; i
< sz
; i
++) {
606 pa
= (pmap_paddr_t
) kvtophys(va
);
607 pmap_store_pte(ptep
, (pa
& PG_FRAME
)
621 pmap_cpu_high_shared_remap(int cpu
, enum high_cpu_types e
, vm_offset_t va
, int sz
)
623 enum high_fixed_addresses a
= e
+ HIGH_CPU_END
* cpu
;
624 return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN
+ a
, va
, sz
);
627 void pmap_init_high_shared(void);
629 extern vm_offset_t gdtptr
, idtptr
;
631 extern uint32_t low_intstack
;
633 extern struct fake_descriptor ldt_desc_pattern
;
634 extern struct fake_descriptor tss_desc_pattern
;
636 extern char hi_remap_text
, hi_remap_etext
;
637 extern char t_zero_div
;
639 pt_entry_t
*pte_unique_base
;
642 pmap_init_high_shared(void)
648 struct i386_tss
*ttss
;
651 cpu_desc_index_t
* cdi
= &cpu_data_master
.cpu_desc_index
;
653 kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
654 HIGH_MEM_BASE
,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN
));
656 pte_unique_base
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN
));
659 if (i386_btop(&hi_remap_etext
- &hi_remap_text
+ 1) >
660 HIGH_FIXED_TRAMPS_END
- HIGH_FIXED_TRAMPS
+ 1)
661 panic("tramps too large");
662 haddr
= pmap_high_shared_remap(HIGH_FIXED_TRAMPS
,
663 (vm_offset_t
) &hi_remap_text
, 3);
664 kprintf("tramp: 0x%x, ",haddr
);
665 /* map gdt up high and update ptr for reload */
666 haddr
= pmap_high_shared_remap(HIGH_FIXED_GDT
,
667 (vm_offset_t
) master_gdt
, 1);
668 cdi
->cdi_gdt
.ptr
= (void *)haddr
;
669 kprintf("GDT: 0x%x, ",haddr
);
670 /* map ldt up high */
671 haddr
= pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN
,
672 (vm_offset_t
) master_ldt
,
673 HIGH_FIXED_LDT_END
- HIGH_FIXED_LDT_BEGIN
+ 1);
674 cdi
->cdi_ldt
= (struct fake_descriptor
*)haddr
;
675 kprintf("LDT: 0x%x, ",haddr
);
676 /* put new ldt addr into gdt */
677 struct fake_descriptor temp_fake_desc
;
678 temp_fake_desc
= ldt_desc_pattern
;
679 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
680 fix_desc(&temp_fake_desc
, 1);
682 *(struct fake_descriptor
*) &master_gdt
[sel_idx(KERNEL_LDT
)] = temp_fake_desc
;
683 *(struct fake_descriptor
*) &master_gdt
[sel_idx(USER_LDT
)] = temp_fake_desc
;
685 /* map idt up high */
686 haddr
= pmap_high_shared_remap(HIGH_FIXED_IDT
,
687 (vm_offset_t
) master_idt
, 1);
688 cdi
->cdi_idt
.ptr
= (void *)haddr
;
689 kprintf("IDT: 0x%x, ", haddr
);
690 /* remap ktss up high and put new high addr into gdt */
691 haddr
= pmap_high_shared_remap(HIGH_FIXED_KTSS
,
692 (vm_offset_t
) &master_ktss
, 1);
694 temp_fake_desc
= tss_desc_pattern
;
695 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
696 fix_desc(&temp_fake_desc
, 1);
697 *(struct fake_descriptor
*) &master_gdt
[sel_idx(KERNEL_TSS
)] = temp_fake_desc
;
698 kprintf("KTSS: 0x%x, ",haddr
);
700 /* remap dbtss up high and put new high addr into gdt */
701 haddr
= pmap_high_shared_remap(HIGH_FIXED_DBTSS
,
702 (vm_offset_t
) &master_dbtss
, 1);
703 temp_fake_desc
= tss_desc_pattern
;
704 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
705 fix_desc(&temp_fake_desc
, 1);
706 *(struct fake_descriptor
*)&master_gdt
[sel_idx(DEBUG_TSS
)] = temp_fake_desc
;
707 ttss
= (struct i386_tss
*)haddr
;
708 kprintf("DBTSS: 0x%x, ",haddr
);
709 #endif /* MACH_KDB */
711 /* remap dftss up high and put new high addr into gdt */
712 haddr
= pmap_high_shared_remap(HIGH_FIXED_DFTSS
,
713 (vm_offset_t
) &master_dftss
, 1);
714 temp_fake_desc
= tss_desc_pattern
;
715 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
716 fix_desc(&temp_fake_desc
, 1);
717 *(struct fake_descriptor
*) &master_gdt
[sel_idx(DF_TSS
)] = temp_fake_desc
;
718 kprintf("DFTSS: 0x%x\n",haddr
);
720 /* remap mctss up high and put new high addr into gdt */
721 haddr
= pmap_high_shared_remap(HIGH_FIXED_DFTSS
,
722 (vm_offset_t
) &master_mctss
, 1);
723 temp_fake_desc
= tss_desc_pattern
;
724 temp_fake_desc
.offset
= (vm_offset_t
) haddr
;
725 fix_desc(&temp_fake_desc
, 1);
726 *(struct fake_descriptor
*) &master_gdt
[sel_idx(MC_TSS
)] = temp_fake_desc
;
727 kprintf("MCTSS: 0x%x\n",haddr
);
729 cpu_desc_load(&cpu_data_master
);
734 * Bootstrap the system enough to run with virtual memory.
735 * Map the kernel's code and data, and allocate the system page table.
736 * Called with mapping OFF. Page_size must already be set.
741 __unused vm_offset_t load_start
,
	vm_last_addr = VM_MAX_KERNEL_ADDRESS;	/* Set the highest address known to VM */
752 * The kernel's pmap is statically allocated so we don't
753 * have to use pmap_create, which is unlikely to work
754 * correctly at this part of the boot sequence.
758 kernel_pmap
= &kernel_pmap_store
;
759 kernel_pmap
->ref_count
= 1;
760 kernel_pmap
->nx_enabled
= FALSE
;
761 kernel_pmap
->pm_task_map
= TASK_MAP_32BIT
;
762 kernel_pmap
->pm_obj
= (vm_object_t
) NULL
;
763 kernel_pmap
->dirbase
= (pd_entry_t
*)((unsigned int)IdlePTD
| KERNBASE
);
764 kernel_pmap
->pdirbase
= (pmap_paddr_t
)((int)IdlePTD
);
765 pdpt
= (pd_entry_t
*)((unsigned int)IdlePDPT
| KERNBASE
);
766 kernel_pmap
->pm_pdpt
= pdpt
;
767 kernel_pmap
->pm_cr3
= (pmap_paddr_t
)((int)IdlePDPT
);
770 va
= (vm_offset_t
)kernel_pmap
->dirbase
;
771 /* setup self referential mapping(s) */
772 for (i
= 0; i
< NPGPTD
; i
++, pdpt
++) {
774 pa
= (pmap_paddr_t
) kvtophys((vm_offset_t
)(va
+ i386_ptob(i
)));
776 (pd_entry_t
*) (kernel_pmap
->dirbase
+ PTDPTDI
+ i
),
777 (pa
& PG_FRAME
) | INTEL_PTE_VALID
| INTEL_PTE_RW
| INTEL_PTE_REF
|
778 INTEL_PTE_MOD
| INTEL_PTE_WIRED
) ;
779 pmap_store_pte(pdpt
, pa
| INTEL_PTE_VALID
);
783 /* 32-bit and legacy support depends on IA32e mode being disabled */
787 lo_kernel_cr3
= kernel_pmap
->pm_cr3
;
788 current_cpu_datap()->cpu_kernel_cr3
= (addr64_t
) kernel_pmap
->pm_cr3
;
790 /* save the value we stuff into created pmaps to share the gdts etc */
791 high_shared_pde
= *pmap_pde(kernel_pmap
, HIGH_MEM_BASE
);
792 /* make sure G bit is on for high shared pde entry */
793 high_shared_pde
|= INTEL_PTE_GLOBAL
;
795 pmap_store_pte(pmap_pde(kernel_pmap
, HIGH_MEM_BASE
), high_shared_pde
);
799 OSAddAtomic(NKPT
, &inuse_ptepages_count
);
800 OSAddAtomic64(NKPT
, &alloc_ptepages_count
);
801 bootstrap_wired_pages
= NKPT
;
803 virtual_avail
= (vm_offset_t
)VADDR(KPTDI
,0) + (vm_offset_t
)first_avail
;
804 virtual_end
= (vm_offset_t
)(VM_MAX_KERNEL_ADDRESS
);
807 * Reserve some special page table entries/VA space for temporary
813 #define SYSMAP(c, p, v, n) \
814 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
816 for (i
=0; i
<PMAP_NWINDOWS
; i
++) {
818 (current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
),
819 (current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
),
821 *current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
= 0;
824 /* DMAP user for debugger */
825 SYSMAP(caddr_t
, DMAP1
, DADDR1
, 1);
826 SYSMAP(caddr_t
, DMAP2
, DADDR2
, 1); /* XXX temporary - can remove */
830 if (PE_parse_boot_argn("npvhash", &npvhash
, sizeof (npvhash
))) {
831 if (0 != ((npvhash
+1) & npvhash
)) {
832 kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash
,NPVHASH
);
838 printf("npvhash=%d\n",npvhash
);
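	/*
	 * Illustrative note (not part of the original source): the boot-arg
	 * validation above relies on the identity that npvhash has the
	 * required ((2^N)-1) form exactly when ((npvhash + 1) & npvhash) == 0,
	 * for example:
	 *
	 *	npvhash = 4095;		// 0x0FFF: (0x1000 & 0x0FFF) == 0, accepted
	 *	npvhash = 4096;		// 0x1000: (0x1001 & 0x1000) != 0, rejected
	 */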
840 simple_lock_init(&kernel_pmap
->lock
, 0);
841 simple_lock_init(&pv_hashed_free_list_lock
, 0);
842 simple_lock_init(&pv_hashed_kern_free_list_lock
, 0);
843 simple_lock_init(&pv_hash_table_lock
,0);
845 pmap_init_high_shared();
847 pde_mapped_size
= PDE_MAPPED_SIZE
;
850 pdpt_entry_t
*ppdpt
= IdlePDPT
;
851 pdpt_entry_t
*ppdpt64
= (pdpt_entry_t
*)IdlePDPT64
;
852 pdpt_entry_t
*ppml4
= (pdpt_entry_t
*)IdlePML4
;
853 int istate
= ml_set_interrupts_enabled(FALSE
);
856 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
857 * with page bits set for the correct IA-32e operation and so that
858 * the legacy-mode IdlePDPT is retained for slave processor start-up.
859 * This is necessary due to the incompatible use of page bits between
860 * 64-bit and legacy modes.
862 kernel_pmap
->pm_cr3
= (pmap_paddr_t
)((int)IdlePML4
); /* setup in start.s for us */
863 kernel_pmap
->pm_pml4
= IdlePML4
;
864 kernel_pmap
->pm_pdpt
= (pd_entry_t
*)
865 ((unsigned int)IdlePDPT64
| KERNBASE
);
866 #define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
867 pmap_store_pte(kernel_pmap
->pm_pml4
,
868 (uint32_t)IdlePDPT64
| PAGE_BITS
);
869 pmap_store_pte((ppdpt64
+0), *(ppdpt
+0) | PAGE_BITS
);
870 pmap_store_pte((ppdpt64
+1), *(ppdpt
+1) | PAGE_BITS
);
871 pmap_store_pte((ppdpt64
+2), *(ppdpt
+2) | PAGE_BITS
);
872 pmap_store_pte((ppdpt64
+3), *(ppdpt
+3) | PAGE_BITS
);
875 * The kernel is also mapped in the uber-space at the 4GB starting
876 * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
878 pmap_store_pte((ppml4
+KERNEL_UBER_PML4_INDEX
), *(ppml4
+0));
880 kernel64_cr3
= (addr64_t
) kernel_pmap
->pm_cr3
;
882 /* Re-initialize descriptors and prepare to switch modes */
883 cpu_desc_init64(&cpu_data_master
);
884 current_cpu_datap()->cpu_is64bit
= TRUE
;
885 current_cpu_datap()->cpu_active_cr3
= kernel64_cr3
;
887 pde_mapped_size
= 512*4096 ;
889 ml_set_interrupts_enabled(istate
);
892 /* Sets 64-bit mode if required. */
893 cpu_mode_init(&cpu_data_master
);
894 /* Update in-kernel CPUID information if we're now in 64-bit mode */
898 kernel_pmap
->pm_hold
= (vm_offset_t
)kernel_pmap
->pm_pml4
;
900 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
901 VADDR(KPTDI
,0), virtual_end
);
902 printf("PAE enabled\n");
904 printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
906 kprintf("Available physical space from 0x%llx to 0x%llx\n",
907 avail_start
, avail_end
);
910 * By default for 64-bit users loaded at 4GB, share kernel mapping.
911 * But this may be overridden by the -no_shared_cr3 boot-arg.
913 if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3
, sizeof (no_shared_cr3
))) {
914 kprintf("Shared kernel address space disabled\n");
918 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace
, sizeof (pmap_trace
))) {
919 kprintf("Kernel traces for pmap operations enabled\n");
921 #endif /* PMAP_TRACES */
929 *startp
= virtual_avail
;
934 * Initialize the pmap module.
935 * Called by vm_init, to initialize any structures that the pmap
936 * system needs to map virtual memory.
942 vm_map_offset_t vaddr
;
948 * Allocate memory for the pv_head_table and its lock bits,
949 * the modify bit array, and the pte_page table.
953 * zero bias all these arrays now instead of off avail_start
954 * so we cover all memory
957 npages
= (long)i386_btop(avail_end
);
958 s
= (vm_size_t
) (sizeof(struct pv_rooted_entry
) * npages
959 + (sizeof (struct pv_hashed_entry_t
*) * (npvhash
+1))
960 + pv_lock_table_size(npages
)
961 + pv_hash_lock_table_size((npvhash
+1))
965 if (kernel_memory_allocate(kernel_map
, &addr
, s
, 0,
966 KMA_KOBJECT
| KMA_PERMANENT
)
970 memset((char *)addr
, 0, s
);
976 if (0 == npvhash
) panic("npvhash not initialized");
980 * Allocate the structures first to preserve word-alignment.
982 pv_head_table
= (pv_rooted_entry_t
) addr
;
983 addr
= (vm_offset_t
) (pv_head_table
+ npages
);
985 pv_hash_table
= (pv_hashed_entry_t
*)addr
;
986 addr
= (vm_offset_t
) (pv_hash_table
+ (npvhash
+ 1));
988 pv_lock_table
= (char *) addr
;
989 addr
= (vm_offset_t
) (pv_lock_table
+ pv_lock_table_size(npages
));
991 pv_hash_lock_table
= (char *) addr
;
992 addr
= (vm_offset_t
) (pv_hash_lock_table
+ pv_hash_lock_table_size((npvhash
+1)));
994 pmap_phys_attributes
= (char *) addr
;
999 pmap_memory_region_t
*pmptr
= pmap_memory_regions
;
1001 last_pn
= (ppnum_t
)i386_btop(avail_end
);
1003 for (i
= 0; i
< pmap_memory_region_count
; i
++, pmptr
++) {
1004 if (pmptr
->type
== kEfiConventionalMemory
) {
1006 for (pn
= pmptr
->base
; pn
<= pmptr
->end
; pn
++) {
1008 pmap_phys_attributes
[pn
] |= PHYS_MANAGED
;
1010 if (pn
> last_managed_page
)
1011 last_managed_page
= pn
;
1013 if (pn
>= lowest_hi
&& pn
<= highest_hi
)
1014 pmap_phys_attributes
[pn
] |= PHYS_NOENCRYPT
;
1021 ppn
= pmap_find_phys(kernel_pmap
, vaddr
);
1023 pmap_phys_attributes
[ppn
] |= PHYS_NOENCRYPT
;
1029 * Create the zone of physical maps,
1030 * and of the physical-to-virtual entries.
1032 s
= (vm_size_t
) sizeof(struct pmap
);
1033 pmap_zone
= zinit(s
, 400*s
, 4096, "pmap"); /* XXX */
1034 zone_change(pmap_zone
, Z_NOENCRYPT
, TRUE
);
1036 s
= (vm_size_t
) sizeof(struct pv_hashed_entry
);
1037 pv_hashed_list_zone
= zinit(s
, 10000*s
/* Expandable zone */,
1038 4096 * 4 /* LCM i386 */, "pv_list");
1039 zone_change(pv_hashed_list_zone
, Z_NOENCRYPT
, TRUE
);
1042 pdpt_zone
= zinit(s
, 400*s
, 4096, "pdpt"); /* XXX */
1043 zone_change(pdpt_zone
, Z_NOENCRYPT
, TRUE
);
1045 kptobj
= &kptobj_object_store
;
1046 _vm_object_allocate((vm_object_size_t
)(NPGPTD
*NPTDPG
), kptobj
);
1047 kernel_pmap
->pm_obj
= kptobj
;
1049 /* create pv entries for kernel pages mapped by low level
1050 startup code. these have to exist so we can pmap_remove()
1051 e.g. kext pages from the middle of our addr space */
1053 vaddr
= (vm_map_offset_t
)0;
1054 for (ppn
= 0; ppn
< i386_btop(avail_start
) ; ppn
++ ) {
1055 pv_rooted_entry_t pv_e
;
1057 pv_e
= pai_to_pvh(ppn
);
1060 pv_e
->pmap
= kernel_pmap
;
1061 queue_init(&pv_e
->qlink
);
1064 pmap_initialized
= TRUE
;
1066 max_preemption_latency_tsc
= tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS
, tscFCvtn2t
);
1071 #define DBG(x...) kprintf("DBG: " x)
1077 * Called once VM is fully initialized so that we can release unused
1078 * sections of low memory to the general pool.
1079 * Also complete the set-up of identity-mapped sections of the kernel:
1080 * 1) write-protect kernel text
1081 * 2) map kernel text using large pages if possible
1082 * 3) read and write-protect page zero (for K32)
1083 * 4) map the global page at the appropriate virtual address.
1085 * Use of large pages
1086 * ------------------
1087 * To effectively map and write-protect all kernel text pages, the text
1088 * must be 2M-aligned at the base, and the data section above must also be
1089 * 2M-aligned. That is, there's padding below and above. This is achieved
1090 * through linker directives. Large pages are used only if this alignment
1091 * exists (and is not overridden by the -kernel_text_ps_4K boot-arg). The
 * memory layout is (from low to high addresses):
 *
 *	eHIB:   ------------------   end of __HIB, zero-padding above
 *	stext:  ==================   2Meg-aligned start of kernel text
 *	etext:  ------------------   end of kernel text, zero-padding above
 *	sdata:  ==================   2Meg-aligned start of __DATA
1116 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1117 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1118 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1119 * The now unused level-1 PTE pages are also freed.
1121 extern uint32_t pmap_reserved_ranges
;
1123 pmap_lowmem_finalize(void)
1128 /* Check the kernel is linked at the expected base address */
1129 if (i386_btop(kvtophys((vm_offset_t
) &IdlePML4
)) !=
1130 I386_KERNEL_IMAGE_BASE_PAGE
)
1131 panic("pmap_lowmem_finalize() unexpected kernel base address");
1134 * Update wired memory statistics for early boot pages
1136 PMAP_ZINFO_PALLOC(bootstrap_wired_pages
* PAGE_SIZE
);
1139 * Free all pages in pmap regions below the base:
1141 * We can't free all the pages to VM that EFI reports available.
1142 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
1143 * There's also a size miscalculation here: pend is one page less
1144 * than it should be, but this is not fixed in order to remain backwards compatible.
1146 * Due to this current EFI limitation, we take only the first
1147 * entry in the memory region table. However, the loop is retained
1148 * (with the intended termination criteria commented out) in the
1149 * hope that some day we can free all low-memory ranges.
1152 // pmap_memory_regions[i].end <= I386_KERNEL_IMAGE_BASE_PAGE;
1153 i
< 1 && (pmap_reserved_ranges
== 0);
1155 vm_offset_t pbase
= (vm_offset_t
)i386_ptob(pmap_memory_regions
[i
].base
);
1156 vm_offset_t pend
= (vm_offset_t
)i386_ptob(pmap_memory_regions
[i
].end
);
1157 // vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end+1);
1159 DBG("ml_static_mfree(%p,%p) for pmap region %d\n",
1160 (void *) ml_static_ptovirt(pbase
),
1161 (void *) (pend
- pbase
), i
);
1162 ml_static_mfree(ml_static_ptovirt(pbase
), pend
- pbase
);
1166 * If text and data are both 2MB-aligned,
1167 * we can map text with large-pages,
1168 * unless the -kernel_text_ps_4K boot-arg overrides.
1170 if ((stext
& I386_LPGMASK
) == 0 && (sdata
& I386_LPGMASK
) == 0) {
1171 kprintf("Kernel text is 2MB aligned");
1172 kernel_text_ps_4K
= FALSE
;
1173 if (PE_parse_boot_argn("-kernel_text_ps_4K",
1175 sizeof (kernel_text_ps_4K
)))
1176 kprintf(" but will be mapped with 4K pages\n");
1178 kprintf(" and will be mapped with 2M pages\n");
1181 (void) PE_parse_boot_argn("wpkernel", &wpkernel
, sizeof (wpkernel
));
1183 kprintf("Kernel text %p-%p to be write-protected\n",
1184 (void *) stext
, (void *) etext
);
1189 * Scan over text if mappings are to be changed:
1190 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
1191 * - Change to large-pages if possible and not overridden.
1193 if (kernel_text_ps_4K
&& wpkernel
) {
1195 for (myva
= stext
; myva
< etext
; myva
+= PAGE_SIZE
) {
1198 ptep
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)myva
);
1200 pmap_store_pte(ptep
, *ptep
& ~INTEL_PTE_RW
);
1204 if (!kernel_text_ps_4K
) {
1208 * Release zero-filled page padding used for 2M-alignment.
1210 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1211 (void *) eHIB
, (void *) (stext
- eHIB
));
1212 ml_static_mfree(eHIB
, stext
- eHIB
);
1213 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1214 (void *) etext
, (void *) (sdata
- etext
));
1215 ml_static_mfree(etext
, sdata
- etext
);
1218 * Coalesce text pages into large pages.
1220 for (myva
= stext
; myva
< sdata
; myva
+= I386_LPGBYTES
) {
1222 vm_offset_t pte_phys
;
1226 pdep
= pmap_pde(kernel_pmap
, (vm_map_offset_t
)myva
);
1227 ptep
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)myva
);
1228 DBG("myva: %p pdep: %p ptep: %p\n",
1229 (void *) myva
, (void *) pdep
, (void *) ptep
);
1230 if ((*ptep
& INTEL_PTE_VALID
) == 0)
1232 pte_phys
= (vm_offset_t
)(*ptep
& PG_FRAME
);
1233 pde
= *pdep
& PTMASK
; /* page attributes from pde */
1234 pde
|= INTEL_PTE_PS
; /* make it a 2M entry */
1235 pde
|= pte_phys
; /* take page frame from pte */
1238 pde
&= ~INTEL_PTE_RW
;
1239 DBG("pmap_store_pte(%p,0x%llx)\n",
1241 pmap_store_pte(pdep
, pde
);
1244 * Free the now-unused level-1 pte.
1245 * Note: ptep is a virtual address to the pte in the
1246 * recursive map. We can't use this address to free
1247 * the page. Instead we need to compute its address
1248 * in the Idle PTEs in "low memory".
1250 vm_offset_t vm_ptep
= (vm_offset_t
) KPTphys
1251 + (pte_phys
>> PTPGSHIFT
);
1252 DBG("ml_static_mfree(%p,0x%x) for pte\n",
1253 (void *) vm_ptep
, PAGE_SIZE
);
1254 ml_static_mfree(vm_ptep
, PAGE_SIZE
);
1257 /* Change variable read by sysctl machdep.pmap */
1258 pmap_kernel_text_ps
= I386_LPGBYTES
;
1261 /* no matter what, kernel page zero is not accessible */
1262 pmap_store_pte(pmap_pte(kernel_pmap
, 0), INTEL_PTE_INVALID
);
1264 /* map lowmem global page into fixed addr */
1265 pt_entry_t
*pte
= NULL
;
1266 if (0 == (pte
= pmap_pte(kernel_pmap
,
1267 VM_MIN_KERNEL_LOADED_ADDRESS
+ 0x2000)))
1268 panic("lowmem pte");
1269 /* make sure it is defined on page boundary */
1270 assert(0 == ((vm_offset_t
) &lowGlo
& PAGE_MASK
));
1271 pmap_store_pte(pte
, kvtophys((vm_offset_t
)&lowGlo
)
1281 #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1284 * this function is only used for debugging from the vm layer
1290 pv_rooted_entry_t pv_h
;
1294 assert(pn
!= vm_page_fictitious_addr
);
1296 if (!pmap_initialized
)
1299 if (pn
== vm_page_guard_addr
)
1302 pai
= ppn_to_pai(pn
);
1303 if (!managed_page(pai
))
1305 pv_h
= pai_to_pvh(pn
);
1306 result
= (pv_h
->pmap
== PMAP_NULL
);
1313 vm_map_offset_t va_start
,
1314 vm_map_offset_t va_end
)
1316 vm_map_offset_t offset
;
1319 if (pmap
== PMAP_NULL
) {
1324 * Check the resident page count
1325 * - if it's zero, the pmap is completely empty.
1326 * This short-circuit test prevents a virtual address scan which is
1327 * painfully slow for 64-bit spaces.
1328 * This assumes the count is correct
1329 * .. the debug kernel ought to be checking perhaps by page table walk.
1331 if (pmap
->stats
.resident_count
== 0)
1334 for (offset
= va_start
;
1336 offset
+= PAGE_SIZE_64
) {
1337 phys_page
= pmap_find_phys(pmap
, offset
);
1339 if (pmap
!= kernel_pmap
&&
1340 pmap
->pm_task_map
== TASK_MAP_32BIT
&&
1341 offset
>= HIGH_MEM_BASE
) {
1343 * The "high_shared_pde" is used to share
1344 * the entire top-most 2MB of address space
1345 * between the kernel and all 32-bit tasks.
1346 * So none of this can be removed from 32-bit
1348 * Let's pretend there's nothing up
1353 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1354 "page %d at 0x%llx\n",
1355 pmap
, va_start
, va_end
, phys_page
, offset
);
1365 * Create and return a physical map.
1367 * If the size specified for the map
1368 * is zero, the map is an actual physical
1369 * map, and may be referenced by the
1372 * If the size specified is non-zero,
1373 * the map will be used in software only, and
1374 * is bounded by that size.
1386 pml4_entry_t
*pml4p
;
1391 PMAP_TRACE(PMAP_CODE(PMAP__CREATE
) | DBG_FUNC_START
,
1392 (int) (sz
>>32), (int) sz
, (int) is_64bit
, 0, 0);
1394 size
= (vm_size_t
) sz
;
1397 * A software use-only map doesn't even need a map.
1404 p
= (pmap_t
) zalloc(pmap_zone
);
1406 panic("pmap_create zalloc");
1408 /* init counts now since we'll be bumping some */
1409 simple_lock_init(&p
->lock
, 0);
1410 p
->stats
.resident_count
= 0;
1411 p
->stats
.resident_max
= 0;
1412 p
->stats
.wired_count
= 0;
1415 p
->pm_shared
= FALSE
;
1417 assert(!is_64bit
|| cpu_64bit
);
1418 p
->pm_task_map
= is_64bit
? TASK_MAP_64BIT
: TASK_MAP_32BIT
;;
1421 /* legacy 32 bit setup */
1422 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1423 * entry covers 1GB of addr space */
1424 if (KERN_SUCCESS
!= kmem_alloc_kobject(kernel_map
, (vm_offset_t
*)(&p
->dirbase
), NBPTD
))
1425 panic("pmap_create kmem_alloc_kobject");
1426 p
->pm_hold
= (vm_offset_t
)zalloc(pdpt_zone
);
1427 if ((vm_offset_t
)NULL
== p
->pm_hold
) {
1428 panic("pdpt zalloc");
1430 pdpt
= (pdpt_entry_t
*) (( p
->pm_hold
+ 31) & ~31);
1431 p
->pm_cr3
= (pmap_paddr_t
)kvtophys((vm_offset_t
)pdpt
);
1432 if (NULL
== (p
->pm_obj
= vm_object_allocate((vm_object_size_t
)(NPGPTD
*NPTDPG
))))
1433 panic("pmap_create vm_object_allocate");
1435 memset((char *)p
->dirbase
, 0, NBPTD
);
1437 va
= (vm_offset_t
)p
->dirbase
;
1438 p
->pdirbase
= kvtophys(va
);
1440 PMAP_ZINFO_SALLOC(NBPTD
);
1442 template = INTEL_PTE_VALID
;
1443 for (i
= 0; i
< NPGPTD
; i
++, pdpt
++ ) {
1445 pa
= (pmap_paddr_t
) kvtophys((vm_offset_t
)(va
+ i386_ptob(i
)));
1446 pmap_store_pte(pdpt
, pa
| template);
1449 /* map the high shared pde */
1451 pmap_store_pte(pmap_pde(p
, HIGH_MEM_BASE
), high_shared_pde
);
1457 /* alloc the pml4 page in kernel vm */
1458 if (KERN_SUCCESS
!= kmem_alloc_kobject(kernel_map
, (vm_offset_t
*)(&p
->pm_hold
), PAGE_SIZE
))
1459 panic("pmap_create kmem_alloc_kobject pml4");
1461 memset((char *)p
->pm_hold
, 0, PAGE_SIZE
);
1462 p
->pm_cr3
= (pmap_paddr_t
)kvtophys((vm_offset_t
)p
->pm_hold
);
1464 OSAddAtomic(1, &inuse_ptepages_count
);
1465 OSAddAtomic64(1, &alloc_ptepages_count
);
1466 PMAP_ZINFO_SALLOC(PAGE_SIZE
);
1468 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1470 if (NULL
== (p
->pm_obj_pml4
= vm_object_allocate((vm_object_size_t
)(NPML4PGS
))))
1471 panic("pmap_create pdpt obj");
1473 if (NULL
== (p
->pm_obj_pdpt
= vm_object_allocate((vm_object_size_t
)(NPDPTPGS
))))
1474 panic("pmap_create pdpt obj");
1476 if (NULL
== (p
->pm_obj
= vm_object_allocate((vm_object_size_t
)(NPDEPGS
))))
1477 panic("pmap_create pte obj");
1479 /* uber space points to uber mapped kernel */
1481 pml4p
= pmap64_pml4(p
, 0ULL);
1482 pmap_store_pte((pml4p
+KERNEL_UBER_PML4_INDEX
),*kernel_pmap
->pm_pml4
);
1486 while ((pdp
= pmap64_pde(p
, (uint64_t)HIGH_MEM_BASE
)) == PD_ENTRY_NULL
) {
1488 pmap_expand_pdpt(p
, (uint64_t)HIGH_MEM_BASE
); /* need room for another pde entry */
1491 pmap_store_pte(pdp
, high_shared_pde
);
1496 PMAP_TRACE(PMAP_CODE(PMAP__CREATE
) | DBG_FUNC_START
,
1497 (int) p
, is_64bit
, 0, 0, 0);
1503 * The following routines implement the shared address optimization for 64-bit
1504 * users with a 4GB page zero.
1506 * pmap_set_4GB_pagezero()
1507 * is called in the exec and fork paths to mirror the kernel's
1508 * mapping in the bottom 4G of the user's pmap. The task mapping changes
1509 * from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1510 * without doing anything if the -no_shared_cr3 boot-arg is set.
1512 * pmap_clear_4GB_pagezero()
1513 * is called in the exec/exit paths to undo this mirror. The task mapping
1514 * reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1515 * CR3 by calling pmap_load_kernel_cr3().
1517 * pmap_load_kernel_cr3()
1518 * loads cr3 with the kernel's page table. In addition to being called
1519 * by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1520 * when we go idle in the context of a shared map.
1522 * Further notes on per-cpu data used:
1524 * cpu_kernel_cr3 is the cr3 for the kernel's pmap.
1525 * This is loaded in a trampoline on entering the kernel
1526 * from a 32-bit user (or non-shared-cr3 64-bit user).
1527 * cpu_task_cr3 is the cr3 for the current thread.
1528 * This is loaded in a trampoline as we exit the kernel.
1529 * cpu_active_cr3 reflects the cr3 currently loaded.
1530 * However, the low order bit is set when the
1531 * processor is idle or interrupts are disabled
1532 * while the system pmap lock is held. It is used by
1534 * cpu_task_map indicates whether the task cr3 belongs to
1535 * a 32-bit, a 64-bit or a 64-bit shared map.
1536 * The latter allows the avoidance of the cr3 load
1537 * on kernel entry and exit.
1538 * cpu_tlb_invalid set TRUE when a tlb flush is requested.
1539 * If the cr3 is "inactive" (the cpu is idle or the
1540 * system-wide pmap lock is held) this is not serviced by
1541 * an IPI but at time when the cr3 becomes "active".
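/*
 * Illustrative sketch only (not part of the original source): how a TLB
 * shootdown path can use the per-cpu state described above to avoid sending
 * an IPI to a cpu whose cr3 is marked inactive, deferring the flush until
 * that cr3 becomes "active" again.  The exact checks shown here are
 * assumptions made purely for illustration.
 *
 *	if (cpu_datap(cpu)->cpu_active_cr3 & 1) {
 *		// cpu is idle, or has interrupts disabled with the pmap
 *		// lock held: mark it and skip the IPI; the flush is
 *		// serviced when its cr3 becomes "active" again.
 *		cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
 *	} else {
 *		// send a shootdown IPI to 'cpu'
 *	}
 */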
1545 pmap_set_4GB_pagezero(pmap_t p
)
1547 pdpt_entry_t
*user_pdptp
;
1548 pdpt_entry_t
*kern_pdptp
;
1550 assert(p
->pm_task_map
!= TASK_MAP_32BIT
);
1552 /* Kernel-shared cr3 may be disabled by boot arg. */
1557 * Set the bottom 4 3rd-level pte's to be the kernel's.
1560 while ((user_pdptp
= pmap64_pdpt(p
, 0x0)) == PDPT_ENTRY_NULL
) {
1562 pmap_expand_pml4(p
, 0x0);
1565 kern_pdptp
= kernel_pmap
->pm_pdpt
;
1566 pmap_store_pte(user_pdptp
+0, *(kern_pdptp
+0));
1567 pmap_store_pte(user_pdptp
+1, *(kern_pdptp
+1));
1568 pmap_store_pte(user_pdptp
+2, *(kern_pdptp
+2));
1569 pmap_store_pte(user_pdptp
+3, *(kern_pdptp
+3));
1570 p
->pm_task_map
= TASK_MAP_64BIT_SHARED
;
1575 pmap_clear_4GB_pagezero(pmap_t p
)
1577 pdpt_entry_t
*user_pdptp
;
1580 if (p
->pm_task_map
!= TASK_MAP_64BIT_SHARED
)
1585 p
->pm_task_map
= TASK_MAP_64BIT
;
1587 istate
= ml_set_interrupts_enabled(FALSE
);
1588 if (current_cpu_datap()->cpu_task_map
== TASK_MAP_64BIT_SHARED
)
1589 current_cpu_datap()->cpu_task_map
= TASK_MAP_64BIT
;
1590 pmap_load_kernel_cr3();
1592 user_pdptp
= pmap64_pdpt(p
, 0x0);
1593 pmap_store_pte(user_pdptp
+0, 0);
1594 pmap_store_pte(user_pdptp
+1, 0);
1595 pmap_store_pte(user_pdptp
+2, 0);
1596 pmap_store_pte(user_pdptp
+3, 0);
1598 ml_set_interrupts_enabled(istate
);
1604 pmap_load_kernel_cr3(void)
1606 uint64_t kernel_cr3
;
1608 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1611 * Reload cr3 with the true kernel cr3.
1613 kernel_cr3
= current_cpu_datap()->cpu_kernel_cr3
;
1614 set64_cr3(kernel_cr3
);
1615 current_cpu_datap()->cpu_active_cr3
= kernel_cr3
;
1616 current_cpu_datap()->cpu_tlb_invalid
= FALSE
;
1617 __asm__
volatile("mfence");
1621 * Retire the given physical map from service.
1622 * Should only be called if the map contains
1623 * no valid mappings.
1635 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_START
,
1636 (int) p
, 0, 0, 0, 0);
1644 * If some cpu is not using the physical pmap pointer that it
1645 * is supposed to be (see set_dirbase), we might be using the
1646 * pmap that is being destroyed! Make sure we are
1647 * physically on the right pmap:
1651 0xFFFFFFFFFFFFF000ULL
);
1657 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_END
,
1658 (int) p
, 1, 0, 0, 0);
1659 return; /* still in use */
1663 * Free the memory maps, then the
1667 OSAddAtomic(-p
->pm_obj
->resident_page_count
, &inuse_ptepages_count
);
1668 PMAP_ZINFO_PFREE(p
->pm_obj
->resident_page_count
* PAGE_SIZE
);
1670 kmem_free(kernel_map
, (vm_offset_t
)p
->dirbase
, NBPTD
);
1671 PMAP_ZINFO_SFREE(NBPTD
);
1673 zfree(pdpt_zone
, (void *)p
->pm_hold
);
1675 vm_object_deallocate(p
->pm_obj
);
1678 int inuse_ptepages
= 0;
1680 /* free 64 bit mode structs */
1681 kmem_free(kernel_map
, (vm_offset_t
)p
->pm_hold
, PAGE_SIZE
);
1682 PMAP_ZINFO_SFREE(PAGE_SIZE
);
1684 inuse_ptepages
+= p
->pm_obj_pml4
->resident_page_count
;
1685 vm_object_deallocate(p
->pm_obj_pml4
);
1687 inuse_ptepages
+= p
->pm_obj_pdpt
->resident_page_count
;
1688 vm_object_deallocate(p
->pm_obj_pdpt
);
1690 inuse_ptepages
+= p
->pm_obj
->resident_page_count
;
1691 vm_object_deallocate(p
->pm_obj
);
1693 OSAddAtomic(-(inuse_ptepages
+1), &inuse_ptepages_count
);
1694 PMAP_ZINFO_PFREE(inuse_ptepages
* PAGE_SIZE
);
1697 zfree(pmap_zone
, p
);
1699 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_END
,
1705 * Add a reference to the specified pmap.
1713 if (p
!= PMAP_NULL
) {
1721 * Remove phys addr if mapped in specified map
1725 pmap_remove_some_phys(
1726 __unused pmap_t map
,
1727 __unused ppnum_t pn
)
1730 /* Implement to support working set code */
1735 * Set the physical protection on the
1736 * specified range of this map as requested.
1737 * Will not increase permissions.
1742 vm_map_offset_t sva
,
1743 vm_map_offset_t eva
,
1746 register pt_entry_t
*pde
;
1747 register pt_entry_t
*spte
, *epte
;
1748 vm_map_offset_t lva
;
1749 vm_map_offset_t orig_sva
;
1755 if (map
== PMAP_NULL
)
1758 if (prot
== VM_PROT_NONE
) {
1759 pmap_remove(map
, sva
, eva
);
1763 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT
) | DBG_FUNC_START
,
1765 (int) (sva
>>32), (int) sva
,
1766 (int) (eva
>>32), (int) eva
);
1768 if ( (prot
& VM_PROT_EXECUTE
) || !nx_enabled
|| !map
->nx_enabled
)
1777 lva
= (sva
+ pde_mapped_size
) & ~(pde_mapped_size
-1);
1780 pde
= pmap_pde(map
, sva
);
1781 if (pde
&& (*pde
& INTEL_PTE_VALID
)) {
1782 spte
= (pt_entry_t
*)pmap_pte(map
, (sva
& ~(pde_mapped_size
-1)));
1783 spte
= &spte
[ptenum(sva
)];
1784 epte
= &spte
[intel_btop(lva
-sva
)];
1786 while (spte
< epte
) {
1788 if (*spte
& INTEL_PTE_VALID
) {
1790 if (prot
& VM_PROT_WRITE
)
1791 pmap_update_pte(spte
, *spte
, (*spte
| INTEL_PTE_WRITE
));
1793 pmap_update_pte(spte
, *spte
, (*spte
& ~INTEL_PTE_WRITE
));
1796 pmap_update_pte(spte
, *spte
, (*spte
| INTEL_PTE_NX
));
1798 pmap_update_pte(spte
, *spte
, (*spte
& ~INTEL_PTE_NX
));
1809 PMAP_UPDATE_TLBS(map
, orig_sva
, eva
);
1814 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT
) | DBG_FUNC_END
,
1819 /* Map a (possibly) autogenned block */
1828 __unused
unsigned int flags
)
1832 for (page
= 0; page
< size
; page
++) {
1833 pmap_enter(pmap
, va
, pa
, prot
, attr
, TRUE
);
1840 * Routine: pmap_extract
1842 * Extract the physical page address associated
1843 * with the given map/virtual_address pair.
1844 * Change to shim for backwards compatibility but will not
1845 * work for 64 bit systems. Some old drivers that we cannot
1851 register pmap_t pmap
,
1852 vm_map_offset_t vaddr
)
1857 paddr
= (vm_offset_t
)0;
1858 ppn
= pmap_find_phys(pmap
, vaddr
);
1861 paddr
= ((vm_offset_t
)i386_ptob(ppn
)) | ((vm_offset_t
)vaddr
& INTEL_OFFMASK
);
1869 vm_map_offset_t vaddr
)
1871 register vm_page_t m
;
1872 register pmap_paddr_t pa
;
1876 pml4_entry_t
*pml4p
;
1878 if (kernel_pmap
== map
) panic("expand kernel pml4");
1881 pml4p
= pmap64_pml4(map
, vaddr
);
1883 if (PML4_ENTRY_NULL
== pml4p
) panic("pmap_expand_pml4 no pml4p");
1886 * Allocate a VM page for the pml4 page
1888 while ((m
= vm_page_grab()) == VM_PAGE_NULL
)
1892 * put the page into the pmap's obj list so it
1893 * can be found later.
1897 i
= pml4idx(map
, vaddr
);
1904 vm_page_lockspin_queues();
1906 vm_page_unlock_queues();
1908 OSAddAtomic(1, &inuse_ptepages_count
);
1909 OSAddAtomic64(1, &alloc_ptepages_count
);
1910 PMAP_ZINFO_PALLOC(PAGE_SIZE
);
1912 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1913 vm_object_lock(map
->pm_obj_pml4
);
1917 * See if someone else expanded us first
1919 if (pmap64_pdpt(map
, vaddr
) != PDPT_ENTRY_NULL
) {
1921 vm_object_unlock(map
->pm_obj_pml4
);
1925 OSAddAtomic(-1, &inuse_ptepages_count
);
1926 PMAP_ZINFO_PFREE(PAGE_SIZE
);
1929 pmap_set_noencrypt(pn
);
1932 if (0 != vm_page_lookup(map
->pm_obj_pml4
, (vm_object_offset_t
)i
)) {
1933 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1934 map
, map
->pm_obj_pml4
, vaddr
, i
);
1937 vm_page_insert(m
, map
->pm_obj_pml4
, (vm_object_offset_t
)i
);
1938 vm_object_unlock(map
->pm_obj_pml4
);
1941 * Set the page directory entry for this page table.
1943 pml4p
= pmap64_pml4(map
, vaddr
); /* refetch under lock */
1945 pmap_store_pte(pml4p
, pa_to_pte(pa
)
1959 vm_map_offset_t vaddr
)
1961 register vm_page_t m
;
1962 register pmap_paddr_t pa
;
1966 pdpt_entry_t
*pdptp
;
1968 if (kernel_pmap
== map
) panic("expand kernel pdpt");
1971 while ((pdptp
= pmap64_pdpt(map
, vaddr
)) == PDPT_ENTRY_NULL
) {
1973 pmap_expand_pml4(map
, vaddr
); /* need room for another pdpt entry */
1979 * Allocate a VM page for the pdpt page
1981 while ((m
= vm_page_grab()) == VM_PAGE_NULL
)
1985 * put the page into the pmap's obj list so it
1986 * can be found later.
1990 i
= pdptidx(map
, vaddr
);
1997 vm_page_lockspin_queues();
1999 vm_page_unlock_queues();
2001 OSAddAtomic(1, &inuse_ptepages_count
);
2002 OSAddAtomic64(1, &alloc_ptepages_count
);
2003 PMAP_ZINFO_PALLOC(PAGE_SIZE
);
2005 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2006 vm_object_lock(map
->pm_obj_pdpt
);
2010 * See if someone else expanded us first
2012 if (pmap64_pde(map
, vaddr
) != PD_ENTRY_NULL
) {
2014 vm_object_unlock(map
->pm_obj_pdpt
);
2018 OSAddAtomic(-1, &inuse_ptepages_count
);
2019 PMAP_ZINFO_PFREE(PAGE_SIZE
);
2022 pmap_set_noencrypt(pn
);
2025 if (0 != vm_page_lookup(map
->pm_obj_pdpt
, (vm_object_offset_t
)i
)) {
2026 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2027 map
, map
->pm_obj_pdpt
, vaddr
, i
);
2030 vm_page_insert(m
, map
->pm_obj_pdpt
, (vm_object_offset_t
)i
);
2031 vm_object_unlock(map
->pm_obj_pdpt
);
2034 * Set the page directory entry for this page table.
2036 pdptp
= pmap64_pdpt(map
, vaddr
); /* refetch under lock */
2038 pmap_store_pte(pdptp
, pa_to_pte(pa
)
2052 * Routine: pmap_expand
2054 * Expands a pmap to be able to map the specified virtual address.
2056 * Allocates new virtual memory for the P0 or P1 portion of the
2057 * pmap, then re-maps the physical pages that were in the old
2058 * pmap to be in the new pmap.
2060 * Must be called with the pmap system and the pmap unlocked,
2061 * since these must be unlocked to use vm_allocate or vm_deallocate.
2062 * Thus it must be called in a loop that checks whether the map
2063 * has been expanded enough.
2064 * (We won't loop forever, since page tables aren't shrunk.)
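 *
 * Illustrative sketch only (not part of the original source): the caller
 * retry loop implied above, mirroring the pmap_expand_pdpt()/pmap_expand_pml4()
 * retry loops used elsewhere in this file.
 *
 *	while (pmap_pte(map, vaddr) == PT_ENTRY_NULL)
 *		pmap_expand(map, vaddr);	// pmap must be unlocked around this call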
2069 vm_map_offset_t vaddr
)
2072 register vm_page_t m
;
2073 register pmap_paddr_t pa
;
2079 * if not the kernel map (while we are still compat kernel mode)
2080 * and we are 64 bit, propagate expand upwards
2083 if (cpu_64bit
&& (map
!= kernel_pmap
)) {
2085 while ((pdp
= pmap64_pde(map
, vaddr
)) == PD_ENTRY_NULL
) {
2087 pmap_expand_pdpt(map
, vaddr
); /* need room for another pde entry */
2094 * Allocate a VM page for the pde entries.
2096 while ((m
= vm_page_grab()) == VM_PAGE_NULL
)
2100 * put the page into the pmap's obj list so it
2101 * can be found later.
2105 i
= pdeidx(map
, vaddr
);
2112 vm_page_lockspin_queues();
2114 vm_page_unlock_queues();
2116 OSAddAtomic(1, &inuse_ptepages_count
);
2117 OSAddAtomic64(1, &alloc_ptepages_count
);
2118 PMAP_ZINFO_PALLOC(PAGE_SIZE
);
2120 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2121 vm_object_lock(map
->pm_obj
);
2125 * See if someone else expanded us first
2128 if (pmap_pte(map
, vaddr
) != PT_ENTRY_NULL
) {
2130 vm_object_unlock(map
->pm_obj
);
2134 OSAddAtomic(-1, &inuse_ptepages_count
);
2135 PMAP_ZINFO_PFREE(PAGE_SIZE
);
2138 pmap_set_noencrypt(pn
);
2141 if (0 != vm_page_lookup(map
->pm_obj
, (vm_object_offset_t
)i
)) {
2142 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2143 map
, map
->pm_obj
, vaddr
, i
);
2146 vm_page_insert(m
, map
->pm_obj
, (vm_object_offset_t
)i
);
2147 vm_object_unlock(map
->pm_obj
);
2150 * refetch while locked
2153 pdp
= pmap_pde(map
, vaddr
);
2156 * Set the page directory entry for this page table.
2158 pmap_store_pte(pdp
, pa_to_pte(pa
)
2170 * pmap_sync_page_data_phys(ppnum_t pa)
2172 * Invalidates all of the instruction cache on a physical page and
2173 * pushes any dirty data from the data cache for the same physical page
2174 * Not required in i386.
2177 pmap_sync_page_data_phys(__unused ppnum_t pa
)
2183 * pmap_sync_page_attributes_phys(ppnum_t pa)
2185 * Write back and invalidate all cachelines on a physical page.
2188 pmap_sync_page_attributes_phys(ppnum_t pa
)
2190 cache_flush_page_phys(pa
);
#ifdef CURRENTLY_UNUSED_AND_UNTESTED

/*
 *	Routine:	pmap_collect
 *	Function:
 *		Garbage collects the physical map system for
 *		pages which are no longer used.
 *		Success need not be guaranteed -- that is, there
 *		may well be pages which are not referenced, but
 *		others may be collected.
 *	Usage:
 *		Called by the pageout daemon when pages are scarce.
 */
void
pmap_collect(
	pmap_t		p)
{
	register pt_entry_t	*pdp, *ptp;
	pt_entry_t		*eptp;
	int			wired;

	if (p == kernel_pmap)
		return;

	/*
	 *	Garbage collect map.
	 */
	for (pdp = (pt_entry_t *)p->dirbase;
	     pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
	     pdp++) {
		if (*pdp & INTEL_PTE_VALID) {
			if (*pdp & INTEL_PTE_REF) {
				pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
				continue;
			}
			ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
			eptp = ptp + NPTEPG;

			/*
			 * If the pte page has any wired mappings, we cannot
			 * free it.
			 */
			wired = 0;
			{
				register pt_entry_t *ptep;
				for (ptep = ptp; ptep < eptp; ptep++) {
					if (iswired(*ptep)) {
						wired = 1;
						break;
					}
				}
			}
			if (wired)
				continue;

			/*
			 * Remove the virtual addresses mapped by this pte page.
			 */
			pmap_remove_range(p,
					  pdetova(pdp - (pt_entry_t *)p->dirbase),
					  ptp,
					  eptp);

			/*
			 * Invalidate the page directory pointer.
			 */
			pmap_store_pte(pdp, 0x0);

			/*
			 * And free the pte page itself.
			 */
			{
				register vm_page_t m;

				vm_object_lock(p->pm_obj);

				m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
				if (m == VM_PAGE_NULL)
					panic("pmap_collect: pte page not in object");

				VM_PAGE_FREE(m);
				vm_object_unlock(p->pm_obj);

				OSAddAtomic(-1, &inuse_ptepages_count);
				PMAP_ZINFO_PFREE(PAGE_SIZE);
			}
		}
	}

	PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);

	return;
}
#endif /* CURRENTLY_UNUSED_AND_UNTESTED */
void
pmap_copy_page(ppnum_t src, ppnum_t dst)
{
	bcopy_phys((addr64_t)i386_ptob(src),
		   (addr64_t)i386_ptob(dst),
		   PAGE_SIZE);
}
/*
 *	Routine:	pmap_pageable
 *	Function:
 *		Make the specified pages (by pmap, offset)
 *		pageable (or not) as requested.
 *
 *		A page which is not pageable may not take
 *		a fault; therefore, its page table entry
 *		must remain valid for the duration.
 *
 *		This routine is merely advisory; pmap_enter
 *		will specify that these pages are to be wired
 *		down (or not) as appropriate.
 */
void
pmap_pageable(
	__unused pmap_t			pmap,
	__unused vm_map_offset_t	start_addr,
	__unused vm_map_offset_t	end_addr,
	__unused boolean_t		pageable)
{
#ifdef	lint
	pmap++; start_addr++; end_addr++; pageable++;
#endif	/* lint */
}
void
invalidate_icache(__unused vm_offset_t	addr,
		  __unused unsigned	cnt,
		  __unused int		phys)
{
	return;
}

void
flush_dcache(__unused vm_offset_t	addr,
	     __unused unsigned		count,
	     __unused int		phys)
{
	return;
}
#if CONFIG_DTRACE
/*
 * Constrain DTrace copyin/copyout actions
 */
extern kern_return_t dtrace_copyio_preflight(addr64_t);
extern kern_return_t dtrace_copyio_postflight(addr64_t);

kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
{
	thread_t thread = current_thread();

	if (current_map() == kernel_map)
		return KERN_FAILURE;
	else if (thread->machine.specFlags & CopyIOActive)
		return KERN_FAILURE;
	else
		return KERN_SUCCESS;
}

kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
{
	return KERN_SUCCESS;
}
#endif /* CONFIG_DTRACE */
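/*
 * Illustrative sketch (not compiled): a DTrace copy action would be
 * expected to bracket its user access with the hooks above, backing off
 * when the preflight refuses (kernel map current, or a copy window is
 * already active on this thread).  'uaddr' and the elided copy step are
 * placeholders, not code from this file.
 *
 *	if (dtrace_copyio_preflight(uaddr) == KERN_SUCCESS) {
 *		... perform the user copy ...
 *		dtrace_copyio_postflight(uaddr);
 *	}
 */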
#if	MACH_KDB

/* show phys page mappings and attributes */

extern void	db_show_page(pmap_paddr_t pa);

void
db_show_page(pmap_paddr_t pa)
{
	pv_entry_t	pv_h;
	int		pai;
	char		attr;

	pai = pa_index(pa);
	pv_h = pai_to_pvh(pai);

	attr = pmap_phys_attributes[pai];
	printf("phys page %llx ", pa);
	if (attr & PHYS_MODIFIED)
		printf("modified, ");
	if (attr & PHYS_REFERENCED)
		printf("referenced, ");
	if (pv_h->pmap || pv_h->next)
		printf(" mapped at\n");
	else
		printf(" not mapped\n");
	for (; pv_h; pv_h = pv_h->next)
		printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
}

#endif /* MACH_KDB */
#if	MACH_KDB
void db_kvtophys(vm_offset_t);
void db_show_vaddrs(pt_entry_t *);

/*
 *	print out the results of kvtophys(arg)
 */
void
db_kvtophys(
	vm_offset_t	vaddr)
{
	db_printf("0x%qx", kvtophys(vaddr));
}

/*
 *	Walk the page tables.
 */
void
db_show_vaddrs(
	pt_entry_t	*dirbase)
{
	pt_entry_t	*ptep, *pdep, tmp;
	unsigned int	x, y, pdecnt, ptecnt;

	if (dirbase == 0) {
		dirbase = kernel_pmap->dirbase;
	}
	if (dirbase == 0) {
		db_printf("need a dirbase...\n");
		return;
	}
	dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);

	db_printf("dirbase: 0x%x\n", dirbase);

	pdecnt = ptecnt = 0;
	pdep = &dirbase[0];
	for (y = 0; y < NPDEPG; y++, pdep++) {
		if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
			continue;
		}
		pdecnt++;
		ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
		db_printf("dir[%4d]: 0x%x\n", y, *pdep);
		for (x = 0; x < NPTEPG; x++, ptep++) {
			if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
				continue;
			}
			ptecnt++;
			db_printf("   tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
				x,
				*ptep,
				(y << 22) | (x << 12),
				*ptep & ~INTEL_OFFMASK);
		}
	}

	db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
}
#endif	/* MACH_KDB */
#include <mach_vm_debug.h>
#if	MACH_VM_DEBUG
#include <vm/vm_debug.h>

int
pmap_list_resident_pages(
	__unused pmap_t		pmap,
	__unused vm_offset_t	*listp,
	__unused int		space)
{
	return 0;
}
#endif /* MACH_VM_DEBUG */
/* temporary workaround */
boolean_t
coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
{
#if 0
	pt_entry_t	*ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep)
		return FALSE;
	return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
#else
	return TRUE;
#endif
}
boolean_t
phys_page_exists(ppnum_t pn)
{
	assert(pn != vm_page_fictitious_addr);

	if (!pmap_initialized)
		return TRUE;

	if (pn == vm_page_guard_addr)
		return FALSE;

	if (!managed_page(ppn_to_pai(pn)))
		return FALSE;

	return TRUE;
}
void
pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
{
	int		i;
	pt_entry_t	*opte, *npte;
	pt_entry_t	pte;

	for (i = 0; i < cnt; i++) {
		opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
		if (0 == opte)
			panic("kernel_commpage");
		pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
		pte &= ~INTEL_PTE_WRITE;	// ensure read only
		npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
		if (0 == npte)
			panic("user_commpage");
		pmap_store_pte(npte, pte);

		kernel_commpage += INTEL_PGBYTES;
		user_commpage += INTEL_PGBYTES;
	}
}
#define PMAP_COMMPAGE64_CNT  (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];

void
pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
{
	int		i;
	pt_entry_t	*kptep;

	PMAP_LOCK(kernel_pmap);

	for (i = 0; i < cnt; i++) {
		kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
		if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
			panic("pmap_commpage64_init pte");
		pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
	}
	PMAP_UNLOCK(kernel_pmap);
}
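/*
 * Illustrative sketch (not compiled): the entries captured above are
 * ready-made user/read-only PTE templates for the 64-bit commpage, so a
 * consumer that has located the destination PTE slots can install them
 * directly.  'dst_ptep(i)' is a placeholder for however those slots are
 * found; it is not a function in this file.
 *
 *	for (i = 0; i < PMAP_COMMPAGE64_CNT; i++)
 *		pmap_store_pte(dst_ptep(i), pmap_commpage64_ptes[i]);
 */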
static cpu_pmap_t	cpu_pmap_master;

struct cpu_pmap *
pmap_cpu_alloc(boolean_t is_boot_cpu)
{
	int			ret;
	int			i;
	cpu_pmap_t		*cp;
	vm_offset_t		address;
	vm_map_address_t	mapaddr;
	vm_map_entry_t		entry;
	pt_entry_t		*pte;

	if (is_boot_cpu) {
		cp = &cpu_pmap_master;
	} else {
		/*
		 * The per-cpu pmap data structure itself.
		 */
		ret = kmem_alloc(kernel_map,
				 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
		if (ret != KERN_SUCCESS) {
			printf("pmap_cpu_alloc() failed ret=%d\n", ret);
			return NULL;
		}
		bzero((void *)cp, sizeof(cpu_pmap_t));

		/*
		 * The temporary windows used for copy/zero - see loose_ends.c
		 */
		ret = vm_map_find_space(kernel_map,
			&mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
		if (ret != KERN_SUCCESS) {
			printf("pmap_cpu_alloc() "
			       "vm_map_find_space ret=%d\n", ret);
			pmap_cpu_free(cp);
			return NULL;
		}
		address = (vm_offset_t)mapaddr;

		for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
			while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
				pmap_expand(kernel_pmap, (vm_map_offset_t)address);

			cp->mapwindow[i].prv_CADDR = (caddr_t) address;
			cp->mapwindow[i].prv_CMAP = pte;
		}
		vm_map_unlock(kernel_map);
	}

	cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
	cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
	cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;

	return cp;
}

void
pmap_cpu_free(struct cpu_pmap *cp)
{
	if (cp != NULL && cp != &cpu_pmap_master) {
		kfree((void *) cp, sizeof(cpu_pmap_t));
	}
}
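/*
 * Illustrative sketch (not compiled): per-cpu bring-up would be expected
 * to attach the allocation to its cpu_data and release it on teardown.
 * 'cdp' is a placeholder cpu_data pointer, not a variable defined here.
 *
 *	cdp->cpu_pmap = pmap_cpu_alloc(is_boot_cpu);
 *	...
 *	pmap_cpu_free(cdp->cpu_pmap);
 */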
mapwindow_t *
pmap_get_mapwindow(pt_entry_t pentry)
{
	mapwindow_t	*mp;
	int		i;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	/* fold in cache attributes for this physical page */
	pentry |= pmap_get_cache_attributes(i386_btop(pte_to_pa(pentry)));
	/*
	 * Note: 0th map reserved for pmap_pte()
	 */
	for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
		mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];

		if (*mp->prv_CMAP == 0) {
			pmap_store_pte(mp->prv_CMAP, pentry);

			invlpg((uintptr_t)mp->prv_CADDR);

			return (mp);
		}
	}
	panic("pmap_get_mapwindow: no windows available");

	return NULL;
}


void
pmap_put_mapwindow(mapwindow_t *mp)
{
	pmap_store_pte(mp->prv_CMAP, 0);
}
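/*
 * Illustrative sketch (not compiled): a caller builds a PTE for the
 * physical page it wants to touch, borrows a window, works through the
 * window's per-cpu virtual address, then releases it.  'pa', 'buf' and
 * 'len' are placeholders.
 *
 *	mapwindow_t *mp = pmap_get_mapwindow(pa_to_pte(pa) | INTEL_PTE_VALID |
 *					     INTEL_PTE_REF | INTEL_PTE_MOD);
 *	bcopy(mp->prv_CADDR, buf, len);
 *	pmap_put_mapwindow(mp);
 */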
void
pmap_switch(pmap_t tpmap)
{
	spl_t	s;

	s = splhigh();		/* Make sure interruptions are disabled */
	set_dirbase(tpmap, current_thread());
	splx(s);
}
/*
 * disable no-execute capability on
 * the specified pmap
 */
void pmap_disable_NX(pmap_t pmap) {
	pmap->nx_enabled = 0;
}
void
pt_fake_zone_init(int zone_index)
{
	pt_fake_zone_index = zone_index;
}

void
pt_fake_zone_info(int *count,
		  vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size,
		  uint64_t *sum_size, int *collectable, int *exhaustable, int *caller_acct)
{
	*count      = inuse_ptepages_count;
	*cur_size   = PAGE_SIZE * inuse_ptepages_count;
	*max_size   = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
	*elem_size  = PAGE_SIZE;
	*alloc_size = PAGE_SIZE;
	*sum_size   = alloc_ptepages_count * PAGE_SIZE;

	*collectable = 1;
	*exhaustable = 0;
	*caller_acct = 1;
}
vm_offset_t
pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
{
	enum high_fixed_addresses	a;

	a = e + HIGH_CPU_END * cpu;
	return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
}

vm_offset_t
pmap_high_map_vaddr(enum high_cpu_types e)
{
	return pmap_cpu_high_map_vaddr(cpu_number(), e);
}

vm_offset_t
pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
{
	enum high_fixed_addresses	a;
	vm_offset_t			vaddr;

	a = e + HIGH_CPU_END * cpu_number();
	vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
	pmap_store_pte(pte_unique_base + a, pte);

	/* TLB flush for this page for this cpu */
	invlpg((uintptr_t)vaddr);

	return vaddr;
}
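/*
 * Illustrative sketch (not compiled): each cpu owns HIGH_CPU_END
 * consecutive slots in the high fixed-address region, so type 'e' on
 * cpu 'n' lands at fixed index HIGH_FIXED_CPUS_BEGIN + e + n*HIGH_CPU_END.
 * For example (placeholder values), cpu 2 would get:
 *
 *	vaddr = pmap_cpu_high_map_vaddr(2, e);
 *	        == pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + e + 2*HIGH_CPU_END)
 */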
void
pmap_cpuset_NMIPI(cpu_set cpu_mask) {
	unsigned int	cpu, cpu_bit;
	uint64_t	deadline;

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (cpu_mask & cpu_bit)
			cpu_NMI_interrupt(cpu);
	}
	deadline = mach_absolute_time() + (((uint64_t)LockTimeOut) * 3);
	while (mach_absolute_time() < deadline)
		cpu_pause();
}
/*
 * Called with pmap locked, we:
 *  - scan through per-cpu data to see which other cpus need to flush
 *  - send an IPI to each non-idle cpu to be flushed
 *  - wait for all to signal back that they are inactive or we see that
 *    they are in an interrupt handler or at a safe point
 *  - flush the local tlb if active for this pmap
 *  - return ... the caller will unlock the pmap
 */
void
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv)
{
	unsigned int	cpu;
	unsigned int	cpu_bit;
	cpu_set		cpus_to_signal;
	unsigned int	my_cpu = cpu_number();
	pmap_paddr_t	pmap_cr3 = pmap->pm_cr3;
	boolean_t	flush_self = FALSE;
	uint64_t	deadline;

	assert((processor_avail_count < 2) ||
	       (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 * Note: for the kernel pmap we look for 64-bit shared address maps.
	 */
	cpus_to_signal = 0;
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_datap(cpu)->cpu_running)
			continue;
		if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
		    (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
		    (pmap->pm_shared) ||
		    ((pmap == kernel_pmap) &&
		     (!CPU_CR3_IS_ACTIVE(cpu) ||
		      cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
			__asm__ volatile("mfence");

			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
			    (uintptr_t) pmap, cpus_to_signal, flush_self, startv, 0);

	if (cpus_to_signal) {
		cpu_set	cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() + LockTimeOut;
		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				if ((cpus_to_respond & cpu_bit) != 0) {
					if (!cpu_datap(cpu)->cpu_running ||
					    cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
					    !CPU_CR3_IS_ACTIVE(cpu)) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0)
					break;
			}

			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended())
					continue;
				pmap_tlb_flush_timeout = TRUE;
				orig_acks = NMIPI_acks;
				pmap_cpuset_NMIPI(cpus_to_respond);

				panic("TLB invalidation IPI timeout: "
				      "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%lx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
				      cpus_to_respond, orig_acks, NMIPI_acks);
			}
		}
	}
	/*
	 * Flush local tlb if required.
	 * We need this flush even if the pmap being changed
	 * is the user map... in case we do a copyin/out
	 * before returning to user mode.
	 */
	if (flush_self)
		flush_tlb();

	if ((pmap == kernel_pmap) && (flush_self != TRUE)) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d",
		      kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
			    (uintptr_t) pmap, cpus_to_signal, startv, endv, 0);
}
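/*
 * Illustrative sketch (not compiled): the responder half of the shootdown
 * is the interrupt path defined below.  The initiator marks the target's
 * cpu_tlb_invalid and signals it; the target's pmap_update_interrupt()
 * runs process_pmap_updates(), which flushes its TLB and clears the flag
 * that the initiator spins on:
 *
 *	initiator:  cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
 *	            i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
 *	responder:  pmap_update_interrupt() -> process_pmap_updates()
 *	            ... local flush, cpu_tlb_invalid = FALSE ...
 */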
void
process_pmap_updates(void)
{
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	flush_tlb();

	current_cpu_datap()->cpu_tlb_invalid = FALSE;
	__asm__ volatile("mfence");
}

void
pmap_update_interrupt(void)
{
	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
		   0, 0, 0, 0, 0);

	process_pmap_updates();

	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}
void
pmap_dump(pmap_t p)
{
	int i;

	kprintf("pmap 0x%x\n",p);

	kprintf("  pm_cr3 0x%llx\n",p->pm_cr3);
	kprintf("  pm_pml4 0x%x\n",p->pm_pml4);
	kprintf("  pm_pdpt 0x%x\n",p->pm_pdpt);

	kprintf("    pml4[0] 0x%llx\n",*p->pm_pml4);
	for (i = 0; i < 4; i++)
		kprintf("    pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
}

void pmap_dump_wrap(void)
{
	pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
}

void
dump_4GB_pdpt(pmap_t p)
{
	pdpt_entry_t	*user_pdptp;
	pdpt_entry_t	*kern_pdptp;
	pdpt_entry_t	*pml4p;

	while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
		pmap_expand_pml4(p, 0x0);
	}
	kern_pdptp = kernel_pmap->pm_pdpt;
	if (kern_pdptp == NULL)
		panic("kern_pdptp == NULL");
	kprintf("dump_4GB_pdpt(%p)\n"
		"kern_pdptp=%p (phys=0x%016llx)\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"user_pdptp=%p (phys=0x%016llx)\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n",
		p, kern_pdptp, kvtophys(kern_pdptp),
		kern_pdptp+0, *(kern_pdptp+0),
		kern_pdptp+1, *(kern_pdptp+1),
		kern_pdptp+2, *(kern_pdptp+2),
		kern_pdptp+3, *(kern_pdptp+3),
		kern_pdptp+4, *(kern_pdptp+4),
		user_pdptp, kvtophys(user_pdptp),
		user_pdptp+0, *(user_pdptp+0),
		user_pdptp+1, *(user_pdptp+1),
		user_pdptp+2, *(user_pdptp+2),
		user_pdptp+3, *(user_pdptp+3),
		user_pdptp+4, *(user_pdptp+4));
	kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
		p->pm_cr3, p->pm_hold, p->pm_pml4);
	pml4p = (pdpt_entry_t *)p->pm_hold;
	if (pml4p == NULL)
		panic("user pml4p == NULL");
	kprintf("\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n",
		pml4p+0, *(pml4p+0),
		pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
	kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
		kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
	pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
	if (pml4p == NULL)
		panic("kern pml4p == NULL");
	kprintf("\t 0x%08x: 0x%016llx\n"
		"\t 0x%08x: 0x%016llx\n",
		pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX),
		pml4p+511, *(pml4p+511));
}

void dump_4GB_pdpt_thread(thread_t tp)
{
	dump_4GB_pdpt(tp->map->pmap);
}