2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
64 * Physical Map management code for Intel i386, i486, and i860.
66 * Manages physical address maps.
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
94 #include <mach_ldebug.h>
96 #include <libkern/OSAtomic.h>
98 #include <mach/machine/vm_types.h>
100 #include <mach/boolean.h>
101 #include <kern/thread.h>
102 #include <kern/zalloc.h>
103 #include <kern/queue.h>
105 #include <kern/lock.h>
106 #include <kern/kalloc.h>
107 #include <kern/spl.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
123 #include <i386/cpuid.h>
124 #include <i386/cpu_data.h>
125 #include <i386/cpu_number.h>
126 #include <i386/machine_cpu.h>
127 #include <i386/mp_slave_boot.h>
128 #include <i386/seg.h>
129 #include <i386/serial_io.h>
130 #include <i386/cpu_capabilities.h>
131 #include <i386/machine_routines.h>
132 #include <i386/proc_reg.h>
133 #include <i386/tsc.h>
136 #include <ddb/db_command.h>
137 #include <ddb/db_output.h>
138 #include <ddb/db_sym.h>
139 #include <ddb/db_print.h>
140 #endif /* MACH_KDB */
142 #include <vm/vm_protos.h>
145 #include <i386/mp_desc.h>
147 #include <sys/kdebug.h>
149 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
150 #ifdef DEBUGINTERRUPTS
151 #define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
153 #define pmap_intr_assert()
159 #define POSTCODE_DELAY 1
160 #include <i386/postcode.h>
161 #endif /* IWANTTODEBUG */
163 //#define PMAP_TRACES 1
165 boolean_t pmap_trace
= FALSE
;
166 #define PMAP_TRACE(x,a,b,c,d,e) \
168 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \
171 #define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e)
172 #endif /* PMAP_TRACES */
175 * Forward declarations for internal functions.
177 void pmap_expand_pml4(
181 void pmap_expand_pdpt(
185 void pmap_remove_range(
191 void phys_attribute_clear(
195 int phys_attribute_test(
199 void phys_attribute_set(
203 void pmap_set_reference(
211 boolean_t
phys_page_exists(
216 void dump_pmap(pmap_t
);
217 void dump_4GB_pdpt(pmap_t p
);
218 void dump_4GB_pdpt_thread(thread_t tp
);
221 #define iswired(pte) ((pte) & INTEL_PTE_WIRED)
223 int nx_enabled
= 1; /* enable no-execute protection */
224 #ifdef CONFIG_EMBEDDED
225 int allow_data_exec
= 0; /* no exec from data, embedded is hardcore like that */
227 int allow_data_exec
= VM_ABI_32
; /* 32-bit apps may execute data by default, 64-bit apps may not */
229 int allow_stack_exec
= 0; /* No apps may execute from the stack by default */
234 * when spinning through pmap_remove
235 * ensure that we don't spend too much
236 * time with preemption disabled.
237 * I'm setting the current threshold
240 #define MAX_PREEMPTION_LATENCY_NS 20000
242 uint64_t max_preemption_latency_tsc
= 0;
246 * Private data structures.
250 * For each vm_page_t, there is a list of all currently
251 * valid virtual mappings of that page. An entry is
252 * a pv_rooted_entry_t; the list is the pv_table.
254 * N.B. with the new combo rooted/hashed scheme it is
255 * only possibly to remove individual non-rooted entries
256 * if they are found via the hashed chains as there is no
257 * way to unlink the singly linked hashed entries if navigated to
258 * via the queue list off the rooted entries. Think of it as
259 * hash/walk/pull, keeping track of the prev pointer while walking
260 * the singly linked hash list. All of this is to save memory and
261 * keep both types of pv_entries as small as possible.
266 PV HASHING Changes - JK 1/2007
268 Pve's establish physical to virtual mappings. These are used for aliasing of a
269 physical page to (potentially many) virtual addresses within pmaps. In the previous
270 implementation the structure of the pv_entries (each 16 bytes in size) was
272 typedef struct pv_entry {
273 struct pv_entry_t next;
278 An initial array of these is created at boot time, one per physical page of memory,
279 indexed by the physical page number. Additionally, a pool of entries is created from a
280 pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
281 Originally, we kept this pool around because the code in pmap_enter() was unable to
282 block if it needed an entry and none were available - we'd panic. Some time ago I
283 restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
284 a pv structure and restart, removing a panic from the code (in the case of the kernel
285 pmap we cannot block and still panic, so, we keep a separate hot pool for use only on
286 kernel pmaps). The pool has not been removed since there is a large performance gain
287 keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
289 As pmap_enter() created new mappings it linked the new pve's for them off the fixed
290 pv array for that ppn (off the next pointer). These pve's are accessed for several
291 operations, one of them being address space teardown. In that case, we basically do this
293 for (every page/pte in the space) {
294 calc pve_ptr from the ppn in the pte
295 for (every pv in the list for the ppn) {
296 if (this pv is for this pmap/vaddr) {
303 The problem arose when we were running, say 8000 (or even 2000) apache or other processes
304 and one or all terminate. The list hanging off each pv array entry could have thousands of
305 entries. We were continuously linearly searching each of these lists as we stepped through
306 the address space we were tearing down. Because of the locks we hold, likely taking a cache
307 miss for each node, and interrupt disabling for MP issues the system became completely
308 unresponsive for many seconds while we did this.
310 Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
311 for operations like pmap_page_protect and finding and modifying/removing a single pve as
312 part of pmap_enter processing) has led to modifying the pve structures and databases.
314 There are now two types of pve structures. A "rooted" structure which is basically the
315 original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a
316 hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
317 minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
318 pages in the system are not aliased and hence represented by a single pv entry I've kept
319 the rooted entry size as small as possible because there is one of these dedicated for
320 every physical page of memory. The hashed pve's are larger due to the addition of the hash
321 link and the ppn entry needed for matching while running the hash list to find the entry we
322 are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
323 will pay the extra memory price. Both structures have the same first three fields allowing
324 some simplification in the code.
326 They have these shapes
328 typedef struct pv_rooted_entry {
332 } *pv_rooted_entry_t;
335 typedef struct pv_hashed_entry {
340 struct pv_hashed_entry *nexth;
341 } *pv_hashed_entry_t;
343 The main flow difference is that the code is now aware of the rooted entry and the hashed
344 entries. Code that runs the pv list still starts with the rooted entry and then continues
345 down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
346 checks the rooted entry and then hashes and runs the hash list for the match. The hash list
347 lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
351 typedef struct pv_rooted_entry
{ /* first three entries must match pv_hashed_entry_t */
353 vm_map_offset_t va
; /* virtual address for mapping */
354 pmap_t pmap
; /* pmap where mapping lies */
355 } *pv_rooted_entry_t
;
357 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
359 pv_rooted_entry_t pv_head_table
; /* array of entries, one per page */
361 typedef struct pv_hashed_entry
{ /* first three entries must match pv_rooted_entry_t */
366 struct pv_hashed_entry
*nexth
;
367 } *pv_hashed_entry_t
;
369 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
371 #define NPVHASH 4095 /* MUST BE 2^N - 1 */
372 pv_hashed_entry_t
*pv_hash_table
; /* hash lists */
374 uint32_t npvhash
= 0;
376 /* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
378 #define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
380 #define CHK_NPVHASH()
384 * pv_list entries are kept on a list that can only be accessed
385 * with the pmap system locked (at SPLVM, not in the cpus_active set).
386 * The list is refilled from the pv_hashed_list_zone if it becomes empty.
388 pv_rooted_entry_t pv_free_list
= PV_ROOTED_ENTRY_NULL
; /* free list at SPLVM */
389 pv_hashed_entry_t pv_hashed_free_list
= PV_HASHED_ENTRY_NULL
;
390 pv_hashed_entry_t pv_hashed_kern_free_list
= PV_HASHED_ENTRY_NULL
;
391 decl_simple_lock_data(,pv_hashed_free_list_lock
)
392 decl_simple_lock_data(,pv_hashed_kern_free_list_lock
)
393 decl_simple_lock_data(,pv_hash_table_lock
)
395 int pv_free_count
= 0;
396 int pv_hashed_free_count
= 0;
397 int pv_kern_free_count
= 0;
398 int pv_hashed_kern_free_count
= 0;
399 #define PV_HASHED_LOW_WATER_MARK 5000
400 #define PV_HASHED_KERN_LOW_WATER_MARK 100
401 #define PV_HASHED_ALLOC_CHUNK 2000
402 #define PV_HASHED_KERN_ALLOC_CHUNK 50
403 thread_call_t mapping_adjust_call
;
404 static thread_call_data_t mapping_adjust_call_data
;
405 uint32_t mappingrecurse
= 0;
407 #define PV_HASHED_ALLOC(pvh_e) { \
408 simple_lock(&pv_hashed_free_list_lock); \
409 if ((pvh_e = pv_hashed_free_list) != 0) { \
410 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
411 pv_hashed_free_count--; \
412 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
413 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
414 thread_call_enter(mapping_adjust_call); \
416 simple_unlock(&pv_hashed_free_list_lock); \
419 #define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
420 simple_lock(&pv_hashed_free_list_lock); \
421 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
422 pv_hashed_free_list = pvh_eh; \
423 pv_hashed_free_count += pv_cnt; \
424 simple_unlock(&pv_hashed_free_list_lock); \
427 #define PV_HASHED_KERN_ALLOC(pvh_e) { \
428 simple_lock(&pv_hashed_kern_free_list_lock); \
429 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
430 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
431 pv_hashed_kern_free_count--; \
432 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
433 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
434 thread_call_enter(mapping_adjust_call); \
436 simple_unlock(&pv_hashed_kern_free_list_lock); \
439 #define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
440 simple_lock(&pv_hashed_kern_free_list_lock); \
441 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
442 pv_hashed_kern_free_list = pvh_eh; \
443 pv_hashed_kern_free_count += pv_cnt; \
444 simple_unlock(&pv_hashed_kern_free_list_lock); \
447 zone_t pv_hashed_list_zone
; /* zone of pv_hashed_entry structures */
449 static zone_t pdpt_zone
;
452 * Each entry in the pv_head_table is locked by a bit in the
453 * pv_lock_table. The lock bits are accessed by the physical
454 * address of the page they lock.
457 char *pv_lock_table
; /* pointer to array of bits */
458 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
460 char *pv_hash_lock_table
;
461 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
464 * First and last physical addresses that we maintain any information
465 * for. Initialized to zero so that pmap operations done before
466 * pmap_init won't touch any non-existent structures.
468 boolean_t pmap_initialized
= FALSE
;/* Has pmap_init completed? */
470 static struct vm_object kptobj_object_store
;
471 static vm_object_t kptobj
;
474 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
477 #define pa_index(pa) (i386_btop(pa))
478 #define ppn_to_pai(ppn) ((int)ppn)
480 #define pai_to_pvh(pai) (&pv_head_table[pai])
481 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
482 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
484 #define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
485 #define pvhash(idx) (&pv_hash_table[idx])
487 #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
488 #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
491 * Array of physical page attribites for managed pages.
492 * One byte per physical page.
494 char *pmap_phys_attributes
;
495 unsigned int last_managed_page
= 0;
498 * Physical page attributes. Copy bits from PTE definition.
500 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
501 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
502 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
505 * Amount of virtual memory mapped by one
506 * page-directory entry.
508 #define PDE_MAPPED_SIZE (pdetova(1))
509 uint64_t pde_mapped_size
;
512 * Locking and TLB invalidation
516 * Locking Protocols: (changed 2/2007 JK)
518 * There are two structures in the pmap module that need locking:
519 * the pmaps themselves, and the per-page pv_lists (which are locked
520 * by locking the pv_lock_table entry that corresponds to the pv_head
521 * for the list in question.) Most routines want to lock a pmap and
522 * then do operations in it that require pv_list locking -- however
523 * pmap_remove_all and pmap_copy_on_write operate on a physical page
524 * basis and want to do the locking in the reverse order, i.e. lock
525 * a pv_list and then go through all the pmaps referenced by that list.
527 * The system wide pmap lock has been removed. Now, paths take a lock
528 * on the pmap before changing its 'shape' and the reverse order lockers
529 * (coming in by phys ppn) take a lock on the corresponding pv and then
530 * retest to be sure nothing changed during the window before they locked
531 * and can then run up/down the pv lists holding the list lock. This also
532 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
540 #define PMAP_LOCK(pmap) { \
541 simple_lock(&(pmap)->lock); \
544 #define PMAP_UNLOCK(pmap) { \
545 simple_unlock(&(pmap)->lock); \
552 #define LOCK_PVH(index) { \
553 mp_disable_preemption(); \
554 lock_pvh_pai(index); \
557 #define UNLOCK_PVH(index) { \
558 unlock_pvh_pai(index); \
559 mp_enable_preemption(); \
566 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
568 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
571 extern int max_lock_loops
;
573 unsigned int loop_count; \
574 loop_count = disable_serial_output ? max_lock_loops \
576 #define LOOP_CHECK(msg, pmap) \
577 if (--loop_count == 0) { \
578 mp_disable_preemption(); \
579 kprintf("%s: cpu %d pmap %x\n", \
580 msg, cpu_number(), pmap); \
581 Debugger("deadlock detection"); \
582 mp_enable_preemption(); \
583 loop_count = max_lock_loops; \
585 #else /* USLOCK_DEBUG */
587 #define LOOP_CHECK(msg, pmap)
588 #endif /* USLOCK_DEBUG */
591 static void pmap_flush_tlbs(pmap_t pmap
);
593 #define PMAP_UPDATE_TLBS(pmap, s, e) \
594 pmap_flush_tlbs(pmap)
597 #define MAX_TBIS_SIZE 32 /* > this -> TBIA */ /* XXX */
600 pmap_memory_region_t pmap_memory_regions
[PMAP_MEMORY_REGIONS_SIZE
];
603 * Other useful macros.
605 #define current_pmap() (vm_map_pmap(current_thread()->map))
607 struct pmap kernel_pmap_store
;
610 pd_entry_t high_shared_pde
;
611 pd_entry_t commpage64_pde
;
613 struct zone
*pmap_zone
; /* zone of pmap structures */
615 int pmap_debug
= 0; /* flag for debugging prints */
617 unsigned int inuse_ptepages_count
= 0;
619 addr64_t kernel64_cr3
;
620 boolean_t no_shared_cr3
= FALSE
; /* -no_shared_cr3 boot arg */
623 * Pmap cache. Cache is threaded through ref_count field of pmap.
624 * Max will eventually be constant -- variable for experimentation.
626 int pmap_cache_max
= 32;
627 int pmap_alloc_chunk
= 8;
628 pmap_t pmap_cache_list
;
629 int pmap_cache_count
;
630 decl_simple_lock_data(,pmap_cache_lock
)
635 extern uint32_t lowGlo
;
637 pt_entry_t
*DMAP1
, *DMAP2
;
642 void pmap_pvh_unlink(pv_hashed_entry_t pv
);
645 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
646 * properly deals with the anchor.
647 * must be called with the hash locked, does not unlock it
651 void pmap_pvh_unlink(pv_hashed_entry_t pvh
)
653 pv_hashed_entry_t curh
;
654 pv_hashed_entry_t
*pprevh
;
658 pvhash_idx
= pvhashidx(pvh
->pmap
, pvh
->va
);
660 pprevh
= pvhash(pvhash_idx
);
663 if (NULL
== *pprevh
) panic("pvh_unlink null anchor"); /* JK DEBUG */
667 while (PV_HASHED_ENTRY_NULL
!= curh
) {
670 pprevh
= &curh
->nexth
;
673 if (PV_HASHED_ENTRY_NULL
== curh
) panic("pmap_pvh_unlink no pvh");
674 *pprevh
= pvh
->nexth
;
679 * for legacy, returns the address of the pde entry.
680 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
681 * then returns the mapped address of the pde entry in that page
684 pmap_pde(pmap_t m
, vm_map_offset_t v
)
687 if (!cpu_64bit
|| (m
== kernel_pmap
)) {
688 pde
= (&((m
)->dirbase
[(vm_offset_t
)(v
) >> PDESHIFT
]));
691 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
692 pde
= pmap64_pde(m
, v
);
699 * the single pml4 page per pmap is allocated at pmap create time and exists
700 * for the duration of the pmap. we allocate this page in kernel vm (to save us one
701 * level of page table dynamic mapping.
702 * this returns the address of the requested pml4 entry in the top level page.
706 pmap64_pml4(pmap_t pmap
, vm_map_offset_t vaddr
)
708 return ((pml4_entry_t
*)pmap
->pm_hold
+ ((vm_offset_t
)((vaddr
>>PML4SHIFT
)&(NPML4PG
-1))));
712 * maps in the pml4 page, if any, containing the pdpt entry requested
713 * and returns the address of the pdpt entry in that mapped page
716 pmap64_pdpt(pmap_t pmap
, vm_map_offset_t vaddr
)
723 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
724 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) && (vaddr
< 0xFFFF800000000000ULL
)) {
728 pml4
= pmap64_pml4(pmap
, vaddr
);
730 if (pml4
&& ((*pml4
& INTEL_PTE_VALID
))) {
732 newpf
= *pml4
& PG_FRAME
;
735 for (i
=PMAP_PDPT_FIRST_WINDOW
; i
< PMAP_PDPT_FIRST_WINDOW
+PMAP_PDPT_NWINDOWS
; i
++) {
736 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
737 return((pdpt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
738 ((vm_offset_t
)((vaddr
>>PDPTSHIFT
)&(NPDPTPG
-1))));
742 current_cpu_datap()->cpu_pmap
->pdpt_window_index
++;
743 if (current_cpu_datap()->cpu_pmap
->pdpt_window_index
> (PMAP_PDPT_FIRST_WINDOW
+PMAP_PDPT_NWINDOWS
-1))
744 current_cpu_datap()->cpu_pmap
->pdpt_window_index
= PMAP_PDPT_FIRST_WINDOW
;
746 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CMAP
),
747 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
748 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CADDR
));
749 return ((pdpt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pdpt_window_index
].prv_CADDR
) +
750 ((vm_offset_t
)((vaddr
>>PDPTSHIFT
)&(NPDPTPG
-1))));
757 * maps in the pdpt page, if any, containing the pde entry requested
758 * and returns the address of the pde entry in that mapped page
761 pmap64_pde(pmap_t pmap
, vm_map_offset_t vaddr
)
768 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
769 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) && (vaddr
< 0xFFFF800000000000ULL
)) {
773 /* if (vaddr & (1ULL << 63)) panic("neg addr");*/
774 pdpt
= pmap64_pdpt(pmap
, vaddr
);
776 if (pdpt
&& ((*pdpt
& INTEL_PTE_VALID
))) {
778 newpf
= *pdpt
& PG_FRAME
;
780 for (i
=PMAP_PDE_FIRST_WINDOW
; i
< PMAP_PDE_FIRST_WINDOW
+PMAP_PDE_NWINDOWS
; i
++) {
781 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
782 return((pd_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
783 ((vm_offset_t
)((vaddr
>>PDSHIFT
)&(NPDPG
-1))));
787 current_cpu_datap()->cpu_pmap
->pde_window_index
++;
788 if (current_cpu_datap()->cpu_pmap
->pde_window_index
> (PMAP_PDE_FIRST_WINDOW
+PMAP_PDE_NWINDOWS
-1))
789 current_cpu_datap()->cpu_pmap
->pde_window_index
= PMAP_PDE_FIRST_WINDOW
;
791 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CMAP
),
792 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
793 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CADDR
));
794 return ((pd_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pde_window_index
].prv_CADDR
) +
795 ((vm_offset_t
)((vaddr
>>PDSHIFT
)&(NPDPG
-1))));
802 * Because the page tables (top 3 levels) are mapped into per cpu windows,
803 * callers must either disable interrupts or disable preemption before calling
804 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
805 * is in one of those mapped windows and that cannot be allowed to change until
806 * the caller is done using the returned pte pointer. When done, the caller
807 * restores interrupts or preemption to its previous state after which point the
808 * vaddr for the returned pte can no longer be used
813 * return address of mapped pte for vaddr va in pmap pmap.
814 * must be called with pre-emption or interrupts disabled
815 * if targeted pmap is not the kernel pmap
816 * since we may be passing back a virtual address that is
817 * associated with this cpu... pre-emption or interrupts
818 * must remain disabled until the caller is done using
819 * the pointer that was passed back .
821 * maps the pde page, if any, containing the pte in and returns
822 * the address of the pte in that mapped page
825 pmap_pte(pmap_t pmap
, vm_map_offset_t vaddr
)
832 pde
= pmap_pde(pmap
,vaddr
);
834 if (pde
&& ((*pde
& INTEL_PTE_VALID
))) {
835 if (pmap
== kernel_pmap
)
836 return (vtopte(vaddr
)); /* compat kernel still has pte's mapped */
838 if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
839 panic("pmap_pte: unsafe call");
841 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
843 newpf
= *pde
& PG_FRAME
;
845 for (i
=PMAP_PTE_FIRST_WINDOW
; i
< PMAP_PTE_FIRST_WINDOW
+PMAP_PTE_NWINDOWS
; i
++) {
846 if (((*(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
)) & PG_FRAME
) == newpf
) {
847 return((pt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
) +
848 ((vm_offset_t
)i386_btop(vaddr
) & (NPTEPG
-1)));
852 current_cpu_datap()->cpu_pmap
->pte_window_index
++;
853 if (current_cpu_datap()->cpu_pmap
->pte_window_index
> (PMAP_PTE_FIRST_WINDOW
+PMAP_PTE_NWINDOWS
-1))
854 current_cpu_datap()->cpu_pmap
->pte_window_index
= PMAP_PTE_FIRST_WINDOW
;
856 (current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CMAP
),
857 newpf
| INTEL_PTE_RW
| INTEL_PTE_VALID
);
858 invlpg((u_int
)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CADDR
));
859 return ((pt_entry_t
*)(current_cpu_datap()->cpu_pmap
->mapwindow
[current_cpu_datap()->cpu_pmap
->pte_window_index
].prv_CADDR
) +
860 ((vm_offset_t
)i386_btop(vaddr
) & (NPTEPG
-1)));
868 * Map memory at initialization. The physical addresses being
869 * mapped are not managed and are never unmapped.
871 * For now, VM is already on, we only need to map the
877 vm_map_offset_t start_addr
,
878 vm_map_offset_t end_addr
,
885 while (start_addr
< end_addr
) {
886 pmap_enter(kernel_pmap
, (vm_map_offset_t
)virt
,
887 (ppnum_t
) i386_btop(start_addr
), prot
, flags
, FALSE
);
895 * Back-door routine for mapping kernel VM at initialization.
896 * Useful for mapping memory outside the range
897 * Sets no-cache, A, D.
898 * Otherwise like pmap_map.
903 vm_map_offset_t start_addr
,
904 vm_map_offset_t end_addr
,
912 template = pa_to_pte(start_addr
)
918 if(flags
& (VM_MEM_NOT_CACHEABLE
| VM_WIMG_USE_DEFAULT
)) {
919 template |= INTEL_PTE_NCACHE
;
920 if(!(flags
& (VM_MEM_GUARDED
| VM_WIMG_USE_DEFAULT
)))
921 template |= INTEL_PTE_PTA
;
924 if (prot
& VM_PROT_WRITE
)
925 template |= INTEL_PTE_WRITE
;
927 while (start_addr
< end_addr
) {
929 pte
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)virt
);
930 if (pte
== PT_ENTRY_NULL
) {
931 panic("pmap_map_bd: Invalid kernel address\n");
933 pmap_store_pte(pte
, template);
935 pte_increment_pa(template);
937 start_addr
+= PAGE_SIZE
;
944 extern char *first_avail
;
945 extern vm_offset_t virtual_avail
, virtual_end
;
946 extern pmap_paddr_t avail_start
, avail_end
;
947 extern vm_offset_t etext
;
948 extern void *sectHIBB
;
949 extern int sectSizeHIB
;
955 * Here early in the life of a processor (from cpu_mode_init()).
956 * If we're not in 64-bit mode, enable the global TLB feature.
957 * Note: regardless of mode we continue to set the global attribute
958 * bit in ptes for all (32-bit) global pages such as the commpage.
961 set_cr4(get_cr4() | CR4_PGE
);
965 * Initialize the per-cpu, TLB-related fields.
967 current_cpu_datap()->cpu_active_cr3
= kernel_pmap
->pm_cr3
;
968 current_cpu_datap()->cpu_tlb_invalid
= FALSE
;
972 pmap_high_shared_remap(enum high_fixed_addresses e
, vm_offset_t va
, int sz
)
974 vm_offset_t ve
= pmap_index_to_virt(e
);
980 assert(0 == (va
& PAGE_MASK
)); /* expecting page aligned */
982 ptep
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)ve
);
984 for (i
=0; i
< sz
; i
++) {
985 pa
= (pmap_paddr_t
) kvtophys(va
);
986 pmap_store_pte(ptep
, (pa
& PG_FRAME
)
1000 pmap_cpu_high_shared_remap(int cpu
, enum high_cpu_types e
, vm_offset_t va
, int sz
)
1002 enum high_fixed_addresses a
= e
+ HIGH_CPU_END
* cpu
;
1003 return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN
+ a
, va
, sz
);
1006 void pmap_init_high_shared(void);
1008 extern vm_offset_t gdtptr
, idtptr
;
1010 extern uint32_t low_intstack
;
1012 extern struct fake_descriptor ldt_desc_pattern
;
1013 extern struct fake_descriptor tss_desc_pattern
;
1015 extern char hi_remap_text
, hi_remap_etext
;
1016 extern char t_zero_div
;
1018 pt_entry_t
*pte_unique_base
;
1021 pmap_init_high_shared(void)
1025 struct __gdt_desc_struct gdt_desc
= {0,0,0};
1026 struct __idt_desc_struct idt_desc
= {0,0,0};
1029 struct i386_tss
*ttss
;
1032 kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
1033 HIGH_MEM_BASE
,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN
));
1035 pte_unique_base
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN
));
1038 if (i386_btop(&hi_remap_etext
- &hi_remap_text
+ 1) >
1039 HIGH_FIXED_TRAMPS_END
- HIGH_FIXED_TRAMPS
+ 1)
1040 panic("tramps too large");
1041 haddr
= pmap_high_shared_remap(HIGH_FIXED_TRAMPS
,
1042 (vm_offset_t
) &hi_remap_text
, 3);
1043 kprintf("tramp: 0x%x, ",haddr
);
1044 printf("hi mem tramps at 0x%x\n",haddr
);
1045 /* map gdt up high and update ptr for reload */
1046 haddr
= pmap_high_shared_remap(HIGH_FIXED_GDT
,
1047 (vm_offset_t
) master_gdt
, 1);
1048 __asm__
__volatile__("sgdt %0": "=m" (gdt_desc
): :"memory");
1049 gdt_desc
.address
= haddr
;
1050 kprintf("GDT: 0x%x, ",haddr
);
1051 /* map ldt up high */
1052 haddr
= pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN
,
1053 (vm_offset_t
) master_ldt
,
1054 HIGH_FIXED_LDT_END
- HIGH_FIXED_LDT_BEGIN
+ 1);
1055 kprintf("LDT: 0x%x, ",haddr
);
1056 /* put new ldt addr into gdt */
1057 master_gdt
[sel_idx(KERNEL_LDT
)] = ldt_desc_pattern
;
1058 master_gdt
[sel_idx(KERNEL_LDT
)].offset
= (vm_offset_t
) haddr
;
1059 fix_desc(&master_gdt
[sel_idx(KERNEL_LDT
)], 1);
1060 master_gdt
[sel_idx(USER_LDT
)] = ldt_desc_pattern
;
1061 master_gdt
[sel_idx(USER_LDT
)].offset
= (vm_offset_t
) haddr
;
1062 fix_desc(&master_gdt
[sel_idx(USER_LDT
)], 1);
1064 /* map idt up high */
1065 haddr
= pmap_high_shared_remap(HIGH_FIXED_IDT
,
1066 (vm_offset_t
) master_idt
, 1);
1067 __asm__
__volatile__("sidt %0" : "=m" (idt_desc
));
1068 idt_desc
.address
= haddr
;
1069 kprintf("IDT: 0x%x, ", haddr
);
1070 /* remap ktss up high and put new high addr into gdt */
1071 haddr
= pmap_high_shared_remap(HIGH_FIXED_KTSS
,
1072 (vm_offset_t
) &master_ktss
, 1);
1073 master_gdt
[sel_idx(KERNEL_TSS
)] = tss_desc_pattern
;
1074 master_gdt
[sel_idx(KERNEL_TSS
)].offset
= (vm_offset_t
) haddr
;
1075 fix_desc(&master_gdt
[sel_idx(KERNEL_TSS
)], 1);
1076 kprintf("KTSS: 0x%x, ",haddr
);
1078 /* remap dbtss up high and put new high addr into gdt */
1079 haddr
= pmap_high_shared_remap(HIGH_FIXED_DBTSS
,
1080 (vm_offset_t
) &master_dbtss
, 1);
1081 master_gdt
[sel_idx(DEBUG_TSS
)] = tss_desc_pattern
;
1082 master_gdt
[sel_idx(DEBUG_TSS
)].offset
= (vm_offset_t
) haddr
;
1083 fix_desc(&master_gdt
[sel_idx(DEBUG_TSS
)], 1);
1084 ttss
= (struct i386_tss
*)haddr
;
1085 kprintf("DBTSS: 0x%x, ",haddr
);
1086 #endif /* MACH_KDB */
1088 /* remap dftss up high and put new high addr into gdt */
1089 haddr
= pmap_high_shared_remap(HIGH_FIXED_DFTSS
,
1090 (vm_offset_t
) &master_dftss
, 1);
1091 master_gdt
[sel_idx(DF_TSS
)] = tss_desc_pattern
;
1092 master_gdt
[sel_idx(DF_TSS
)].offset
= (vm_offset_t
) haddr
;
1093 fix_desc(&master_gdt
[sel_idx(DF_TSS
)], 1);
1094 kprintf("DFTSS: 0x%x\n",haddr
);
1096 /* remap mctss up high and put new high addr into gdt */
1097 haddr
= pmap_high_shared_remap(HIGH_FIXED_DFTSS
,
1098 (vm_offset_t
) &master_mctss
, 1);
1099 master_gdt
[sel_idx(MC_TSS
)] = tss_desc_pattern
;
1100 master_gdt
[sel_idx(MC_TSS
)].offset
= (vm_offset_t
) haddr
;
1101 fix_desc(&master_gdt
[sel_idx(MC_TSS
)], 1);
1102 kprintf("MCTSS: 0x%x\n",haddr
);
1104 __asm__
__volatile__("lgdt %0": "=m" (gdt_desc
));
1105 __asm__
__volatile__("lidt %0": "=m" (idt_desc
));
1106 kprintf("gdt/idt reloaded, ");
1108 kprintf("tr reset to KERNEL_TSS\n");
1113 * Bootstrap the system enough to run with virtual memory.
1114 * Map the kernel's code and data, and allocate the system page table.
1115 * Called with mapping OFF. Page_size must already be set.
1118 * load_start: PA where kernel was loaded
1119 * avail_start PA of first available physical page -
1120 * after kernel page tables
1121 * avail_end PA of last available physical page
1122 * virtual_avail VA of first available page -
1123 * after kernel page tables
1124 * virtual_end VA of last available page -
1125 * end of kernel address space
1127 * &start_text start of kernel text
1128 * &etext end of kernel text
1133 __unused vm_offset_t load_start
,
1139 int wpkernel
, boot_arg
;
1143 vm_last_addr
= VM_MAX_KERNEL_ADDRESS
; /* Set the highest address
1146 * The kernel's pmap is statically allocated so we don't
1147 * have to use pmap_create, which is unlikely to work
1148 * correctly at this part of the boot sequence.
1152 kernel_pmap
= &kernel_pmap_store
;
1153 kernel_pmap
->ref_count
= 1;
1154 kernel_pmap
->nx_enabled
= FALSE
;
1155 kernel_pmap
->pm_task_map
= TASK_MAP_32BIT
;
1156 kernel_pmap
->pm_obj
= (vm_object_t
) NULL
;
1157 kernel_pmap
->dirbase
= (pd_entry_t
*)((unsigned int)IdlePTD
| KERNBASE
);
1158 kernel_pmap
->pdirbase
= (pmap_paddr_t
)((int)IdlePTD
);
1159 pdpt
= (pd_entry_t
*)((unsigned int)IdlePDPT
| KERNBASE
);
1160 kernel_pmap
->pm_pdpt
= pdpt
;
1161 kernel_pmap
->pm_cr3
= (pmap_paddr_t
)((int)IdlePDPT
);
1163 va
= (vm_offset_t
)kernel_pmap
->dirbase
;
1164 /* setup self referential mapping(s) */
1165 for (i
= 0; i
< NPGPTD
; i
++, pdpt
++) {
1167 pa
= (pmap_paddr_t
) kvtophys(va
+ i386_ptob(i
));
1169 (pd_entry_t
*) (kernel_pmap
->dirbase
+ PTDPTDI
+ i
),
1170 (pa
& PG_FRAME
) | INTEL_PTE_VALID
| INTEL_PTE_RW
| INTEL_PTE_REF
|
1171 INTEL_PTE_MOD
| INTEL_PTE_WIRED
) ;
1172 pmap_store_pte(pdpt
, pa
| INTEL_PTE_VALID
);
1177 lo_kernel_cr3
= kernel_pmap
->pm_cr3
;
1178 current_cpu_datap()->cpu_kernel_cr3
= (addr64_t
) kernel_pmap
->pm_cr3
;
1180 /* save the value we stuff into created pmaps to share the gdts etc */
1181 high_shared_pde
= *pmap_pde(kernel_pmap
, HIGH_MEM_BASE
);
1182 /* make sure G bit is on for high shared pde entry */
1183 high_shared_pde
|= INTEL_PTE_GLOBAL
;
1185 pmap_store_pte(pmap_pde(kernel_pmap
, HIGH_MEM_BASE
), high_shared_pde
);
1189 inuse_ptepages_count
+= NKPT
;
1191 virtual_avail
= (vm_offset_t
)VADDR(KPTDI
,0) + (vm_offset_t
)first_avail
;
1192 virtual_end
= (vm_offset_t
)(VM_MAX_KERNEL_ADDRESS
);
1195 * Reserve some special page table entries/VA space for temporary
1198 #define SYSMAP(c, p, v, n) \
1199 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
1204 for (i
=0; i
<PMAP_NWINDOWS
; i
++) {
1206 (current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
),
1207 (current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CADDR
),
1209 *current_cpu_datap()->cpu_pmap
->mapwindow
[i
].prv_CMAP
= 0;
1212 /* DMAP user for debugger */
1213 SYSMAP(caddr_t
, DMAP1
, DADDR1
, 1);
1214 SYSMAP(caddr_t
, DMAP2
, DADDR2
, 1); /* XXX temporary - can remove */
1218 if (PE_parse_boot_arg("npvhash", &npvhash
)) {
1219 if (0 != ((npvhash
+1) & npvhash
)) {
1220 kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash
,NPVHASH
);
1226 printf("npvhash=%d\n",npvhash
);
1229 if (PE_parse_boot_arg("wpkernel", &boot_arg
)) {
1236 /* Remap kernel text readonly unless the "wpkernel" boot-arg is present
1244 for (myva
= i386_round_page(MP_BOOT
+ MP_BOOTSTACK
); myva
< etext
; myva
+= PAGE_SIZE
) {
1245 if (myva
>= (vm_offset_t
)sectHIBB
&& myva
< ((vm_offset_t
)sectHIBB
+ sectSizeHIB
))
1247 ptep
= pmap_pte(kernel_pmap
, (vm_map_offset_t
)myva
);
1249 pmap_store_pte(ptep
, *ptep
& ~INTEL_PTE_RW
);
1253 /* no matter what, kernel page zero is not accessible */
1254 pte
= pmap_pte(kernel_pmap
, 0);
1255 pmap_store_pte(pte
, INTEL_PTE_INVALID
);
1257 /* map lowmem global page into fixed addr 0x2000 */
1258 if (0 == (pte
= pmap_pte(kernel_pmap
,0x2000))) panic("lowmem pte");
1259 assert(0 == ((vm_offset_t
) &lowGlo
& PAGE_MASK
)); /* make sure it is defined on page boundary */
1260 pmap_store_pte(pte
, kvtophys((vm_offset_t
)&lowGlo
)|INTEL_PTE_VALID
|INTEL_PTE_REF
|INTEL_PTE_MOD
|INTEL_PTE_WIRED
|INTEL_PTE_RW
);
1264 simple_lock_init(&kernel_pmap
->lock
, 0);
1265 simple_lock_init(&pv_hashed_free_list_lock
, 0);
1266 simple_lock_init(&pv_hashed_kern_free_list_lock
, 0);
1267 simple_lock_init(&pv_hash_table_lock
,0);
1269 pmap_init_high_shared();
1271 pde_mapped_size
= PDE_MAPPED_SIZE
;
1274 pdpt_entry_t
*ppdpt
= (pdpt_entry_t
*)IdlePDPT
;
1275 pdpt_entry_t
*ppdpt64
= (pdpt_entry_t
*)IdlePDPT64
;
1276 pdpt_entry_t
*ppml4
= (pdpt_entry_t
*)IdlePML4
;
1277 int istate
= ml_set_interrupts_enabled(FALSE
);
1280 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1281 * with page bits set for the correct IA-32e operation and so that
1282 * the legacy-mode IdlePDPT is retained for slave processor start-up.
1283 * This is necessary due to the incompatible use of page bits between
1284 * 64-bit and legacy modes.
1286 kernel_pmap
->pm_cr3
= (pmap_paddr_t
)((int)IdlePML4
); /* setup in start.s for us */
1287 kernel_pmap
->pm_pml4
= IdlePML4
;
1288 kernel_pmap
->pm_pdpt
= (pd_entry_t
*)
1289 ((unsigned int)IdlePDPT64
| KERNBASE
);
1290 #define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1291 pmap_store_pte(kernel_pmap
->pm_pml4
,
1292 (uint32_t)IdlePDPT64
| PAGE_BITS
);
1293 pmap_store_pte((ppdpt64
+0), *(ppdpt
+0) | PAGE_BITS
);
1294 pmap_store_pte((ppdpt64
+1), *(ppdpt
+1) | PAGE_BITS
);
1295 pmap_store_pte((ppdpt64
+2), *(ppdpt
+2) | PAGE_BITS
);
1296 pmap_store_pte((ppdpt64
+3), *(ppdpt
+3) | PAGE_BITS
);
1299 * The kernel is also mapped in the uber-sapce at the 4GB starting
1300 * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1302 pmap_store_pte((ppml4
+KERNEL_UBER_PML4_INDEX
), *(ppml4
+0));
1304 kernel64_cr3
= (addr64_t
) kernel_pmap
->pm_cr3
;
1306 /* Re-initialize descriptors and prepare to switch modes */
1307 cpu_desc_init64(&cpu_data_master
, TRUE
);
1308 current_cpu_datap()->cpu_is64bit
= TRUE
;
1309 current_cpu_datap()->cpu_active_cr3
= kernel64_cr3
;
1311 pde_mapped_size
= 512*4096 ;
1313 ml_set_interrupts_enabled(istate
);
1316 /* Set 64-bit mode if required. */
1317 cpu_mode_init(&cpu_data_master
);
1319 kernel_pmap
->pm_hold
= (vm_offset_t
)kernel_pmap
->pm_pml4
;
1321 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1322 VADDR(KPTDI
,0), virtual_end
);
1323 printf("PAE enabled\n");
1325 printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1327 kprintf("Available physical space from 0x%llx to 0x%llx\n",
1328 avail_start
, avail_end
);
1331 * By default for 64-bit users loaded at 4GB, share kernel mapping.
1332 * But this may be overridden by the -no_shared_cr3 boot-arg.
1334 if (PE_parse_boot_arg("-no_shared_cr3", &no_shared_cr3
)) {
1335 kprintf("Shared kernel address space disabled\n");
1339 if (PE_parse_boot_arg("-pmap_trace", &pmap_trace
)) {
1340 kprintf("Kernel traces for pmap operations enabled\n");
1342 #endif /* PMAP_TRACES */
1347 vm_offset_t
*startp
,
1350 *startp
= virtual_avail
;
1351 *endp
= virtual_end
;
1355 * Initialize the pmap module.
1356 * Called by vm_init, to initialize any structures that the pmap
1357 * system needs to map virtual memory.
1362 register long npages
;
1364 register vm_size_t s
;
1365 vm_map_offset_t vaddr
;
1369 * Allocate memory for the pv_head_table and its lock bits,
1370 * the modify bit array, and the pte_page table.
1374 * zero bias all these arrays now instead of off avail_start
1375 * so we cover all memory
1378 npages
= i386_btop(avail_end
);
1379 s
= (vm_size_t
) (sizeof(struct pv_rooted_entry
) * npages
1380 + (sizeof (struct pv_hashed_entry_t
*) * (npvhash
+1))
1381 + pv_lock_table_size(npages
)
1382 + pv_hash_lock_table_size((npvhash
+1))
1386 if (kmem_alloc_wired(kernel_map
, &addr
, s
) != KERN_SUCCESS
)
1389 memset((char *)addr
, 0, s
);
1392 if (0 == npvhash
) panic("npvhash not initialized");
1396 * Allocate the structures first to preserve word-alignment.
1398 pv_head_table
= (pv_rooted_entry_t
) addr
;
1399 addr
= (vm_offset_t
) (pv_head_table
+ npages
);
1401 pv_hash_table
= (pv_hashed_entry_t
*)addr
;
1402 addr
= (vm_offset_t
) (pv_hash_table
+ (npvhash
+ 1));
1404 pv_lock_table
= (char *) addr
;
1405 addr
= (vm_offset_t
) (pv_lock_table
+ pv_lock_table_size(npages
));
1407 pv_hash_lock_table
= (char *) addr
;
1408 addr
= (vm_offset_t
) (pv_hash_lock_table
+ pv_hash_lock_table_size((npvhash
+1)));
1410 pmap_phys_attributes
= (char *) addr
;
1415 pmap_memory_region_t
*pmptr
= pmap_memory_regions
;
1417 last_pn
= i386_btop(avail_end
);
1419 for (i
= 0; i
< pmap_memory_region_count
; i
++, pmptr
++) {
1420 if (pmptr
->type
== kEfiConventionalMemory
) {
1421 for (pn
= pmptr
->base
; pn
<= pmptr
->end
; pn
++) {
1423 pmap_phys_attributes
[pn
] |= PHYS_MANAGED
;
1425 if (pn
> last_managed_page
)
1426 last_managed_page
= pn
;
1434 * Create the zone of physical maps,
1435 * and of the physical-to-virtual entries.
1437 s
= (vm_size_t
) sizeof(struct pmap
);
1438 pmap_zone
= zinit(s
, 400*s
, 4096, "pmap"); /* XXX */
1439 s
= (vm_size_t
) sizeof(struct pv_hashed_entry
);
1440 pv_hashed_list_zone
= zinit(s
, 10000*s
, 4096, "pv_list"); /* XXX */
1442 pdpt_zone
= zinit(s
, 400*s
, 4096, "pdpt"); /* XXX */
1444 kptobj
= &kptobj_object_store
;
1445 _vm_object_allocate((vm_object_size_t
)(NPGPTD
*NPTDPG
), kptobj
);
1446 kernel_pmap
->pm_obj
= kptobj
;
1448 /* create pv entries for kernel pages mapped by low level
1449 startup code. these have to exist so we can pmap_remove()
1450 e.g. kext pages from the middle of our addr space */
1452 vaddr
= (vm_map_offset_t
)0;
1453 for (ppn
= 0; ppn
< i386_btop(avail_start
) ; ppn
++ ) {
1454 pv_rooted_entry_t pv_e
;
1456 pv_e
= pai_to_pvh(ppn
);
1459 pv_e
->pmap
= kernel_pmap
;
1460 queue_init(&pv_e
->qlink
);
1463 pmap_initialized
= TRUE
;
1466 * Initialize pmap cache.
1468 pmap_cache_list
= PMAP_NULL
;
1469 pmap_cache_count
= 0;
1470 simple_lock_init(&pmap_cache_lock
, 0);
1472 max_preemption_latency_tsc
= tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS
, tscFCvtn2t
);
1477 x86_lowmem_free(void)
1479 /* free lowmem pages back to the vm system. we had to defer doing this
1480 until the vm system was fully up.
1481 the actual pages that are released are determined by which
1482 pages the memory sizing code puts into the region table */
1484 ml_static_mfree((vm_offset_t
) i386_ptob(pmap_memory_regions
[0].base
),
1485 (vm_size_t
) i386_ptob(pmap_memory_regions
[0].end
- pmap_memory_regions
[0].base
));
1489 #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1492 * this function is only used for debugging fron the vm layer
1498 pv_rooted_entry_t pv_h
;
1502 assert(pn
!= vm_page_fictitious_addr
);
1504 if (!pmap_initialized
)
1507 if (pn
== vm_page_guard_addr
)
1510 pai
= ppn_to_pai(pn
);
1511 if (!managed_page(pai
))
1513 pv_h
= pai_to_pvh(pn
);
1514 result
= (pv_h
->pmap
== PMAP_NULL
);
1521 vm_map_offset_t vstart
,
1522 vm_map_offset_t vend
)
1524 vm_map_offset_t offset
;
1527 if (pmap
== PMAP_NULL
) {
1530 for (offset
= vstart
;
1532 offset
+= PAGE_SIZE_64
) {
1533 phys_page
= pmap_find_phys(pmap
, offset
);
1535 if (pmap
!= kernel_pmap
&&
1536 pmap
->pm_task_map
== TASK_MAP_32BIT
&&
1537 offset
>= HIGH_MEM_BASE
) {
1539 * The "high_shared_pde" is used to share
1540 * the entire top-most 2MB of address space
1541 * between the kernel and all 32-bit tasks.
1542 * So none of this can be removed from 32-bit
1544 * Let's pretend there's nothing up
1549 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1550 "page %d at 0x%llx\n",
1551 pmap
, vstart
, vend
, phys_page
, offset
);
1561 * Create and return a physical map.
1563 * If the size specified for the map
1564 * is zero, the map is an actual physical
1565 * map, and may be referenced by the
1568 * If the size specified is non-zero,
1569 * the map will be used in software only, and
1570 * is bounded by that size.
1582 pml4_entry_t
*pml4p
;
1587 PMAP_TRACE(PMAP_CODE(PMAP__CREATE
) | DBG_FUNC_START
,
1588 (int) (sz
>>32), (int) sz
, (int) is_64bit
, 0, 0);
1590 size
= (vm_size_t
) sz
;
1593 * A software use-only map doesn't even need a map.
1600 p
= (pmap_t
) zalloc(pmap_zone
);
1602 panic("pmap_create zalloc");
1604 /* init counts now since we'll be bumping some */
1605 simple_lock_init(&p
->lock
, 0);
1606 p
->stats
.resident_count
= 0;
1607 p
->stats
.resident_max
= 0;
1608 p
->stats
.wired_count
= 0;
1611 p
->pm_shared
= FALSE
;
1613 assert(!is_64bit
|| cpu_64bit
);
1614 p
->pm_task_map
= is_64bit
? TASK_MAP_64BIT
: TASK_MAP_32BIT
;;
1617 /* legacy 32 bit setup */
1618 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1619 * entry covers 1GB of addr space */
1620 if (KERN_SUCCESS
!= kmem_alloc_wired(kernel_map
, (vm_offset_t
*)(&p
->dirbase
), NBPTD
))
1621 panic("pmap_create kmem_alloc_wired");
1622 p
->pm_hold
= (vm_offset_t
)zalloc(pdpt_zone
);
1623 if ((vm_offset_t
)NULL
== p
->pm_hold
) {
1624 panic("pdpt zalloc");
1626 pdpt
= (pdpt_entry_t
*) (( p
->pm_hold
+ 31) & ~31);
1627 p
->pm_cr3
= (pmap_paddr_t
)kvtophys((vm_offset_t
)pdpt
);
1628 if (NULL
== (p
->pm_obj
= vm_object_allocate((vm_object_size_t
)(NPGPTD
*NPTDPG
))))
1629 panic("pmap_create vm_object_allocate");
1631 memset((char *)p
->dirbase
, 0, NBPTD
);
1633 va
= (vm_offset_t
)p
->dirbase
;
1634 p
->pdirbase
= kvtophys(va
);
1636 template = cpu_64bit
? INTEL_PTE_VALID
|INTEL_PTE_RW
|INTEL_PTE_USER
|INTEL_PTE_REF
: INTEL_PTE_VALID
;
1637 for (i
= 0; i
< NPGPTD
; i
++, pdpt
++ ) {
1639 pa
= (pmap_paddr_t
) kvtophys(va
+ i386_ptob(i
));
1640 pmap_store_pte(pdpt
, pa
| template);
1643 /* map the high shared pde */
1645 pmap_store_pte(pmap_pde(p
, HIGH_MEM_BASE
), high_shared_pde
);
1651 /* alloc the pml4 page in kernel vm */
1652 if (KERN_SUCCESS
!= kmem_alloc_wired(kernel_map
, (vm_offset_t
*)(&p
->pm_hold
), PAGE_SIZE
))
1653 panic("pmap_create kmem_alloc_wired pml4");
1655 memset((char *)p
->pm_hold
, 0, PAGE_SIZE
);
1656 p
->pm_cr3
= (pmap_paddr_t
)kvtophys((vm_offset_t
)p
->pm_hold
);
1658 vm_page_lock_queues();
1659 inuse_ptepages_count
++;
1660 vm_page_unlock_queues();
1662 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1664 if (NULL
== (p
->pm_obj_pml4
= vm_object_allocate((vm_object_size_t
)(NPML4PGS
))))
1665 panic("pmap_create pdpt obj");
1667 if (NULL
== (p
->pm_obj_pdpt
= vm_object_allocate((vm_object_size_t
)(NPDPTPGS
))))
1668 panic("pmap_create pdpt obj");
1670 if (NULL
== (p
->pm_obj
= vm_object_allocate((vm_object_size_t
)(NPDEPGS
))))
1671 panic("pmap_create pte obj");
1673 /* uber space points to uber mapped kernel */
1675 pml4p
= pmap64_pml4(p
, 0ULL);
1676 pmap_store_pte((pml4p
+KERNEL_UBER_PML4_INDEX
),*kernel_pmap
->pm_pml4
);
1680 while ((pdp
= pmap64_pde(p
, (uint64_t)HIGH_MEM_BASE
)) == PD_ENTRY_NULL
) {
1682 pmap_expand_pdpt(p
, (uint64_t)HIGH_MEM_BASE
); /* need room for another pde entry */
1685 pmap_store_pte(pdp
, high_shared_pde
);
1690 PMAP_TRACE(PMAP_CODE(PMAP__CREATE
) | DBG_FUNC_START
,
1691 (int) p
, is_64bit
, 0, 0, 0);
1697 * The following routines implement the shared address optmization for 64-bit
1698 * users with a 4GB page zero.
1700 * pmap_set_4GB_pagezero()
1701 * is called in the exec and fork paths to mirror the kernel's
1702 * mapping in the bottom 4G of the user's pmap. The task mapping changes
1703 * from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1704 * without doing anything if the -no_shared_cr3 boot-arg is set.
1706 * pmap_clear_4GB_pagezero()
1707 * is called in the exec/exit paths to undo this mirror. The task mapping
1708 * reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1709 * CR3 by calling pmap_load_kernel_cr3().
1711 * pmap_load_kernel_cr3()
1712 * loads cr3 with the kernel's page table. In addition to being called
1713 * by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1714 * when we go idle in the context of a shared map.
1716 * Further notes on per-cpu data used:
1718 * cpu_kernel_cr3 is the cr3 for the kernel's pmap.
1719 * This is loaded in a trampoline on entering the kernel
1720 * from a 32-bit user (or non-shared-cr3 64-bit user).
1721 * cpu_task_cr3 is the cr3 for the current thread.
1722 * This is loaded in a trampoline as we exit the kernel.
1723 * cpu_active_cr3 reflects the cr3 currently loaded.
1724 * However, the low order bit is set when the
1725 * processor is idle or interrupts are disabled
1726 * while the system pmap lock is held. It is used by
1728 * cpu_task_map indicates whether the task cr3 belongs to
1729 * a 32-bit, a 64-bit or a 64-bit shared map.
1730 * The latter allows the avoidance of the cr3 load
1731 * on kernel entry and exit.
1732 * cpu_tlb_invalid set TRUE when a tlb flush is requested.
1733 * If the cr3 is "inactive" (the cpu is idle or the
1734 * system-wide pmap lock is held) this not serviced by
1735 * an IPI but at time when the cr3 becomes "active".
1739 pmap_set_4GB_pagezero(pmap_t p
)
1741 pdpt_entry_t
*user_pdptp
;
1742 pdpt_entry_t
*kern_pdptp
;
1744 assert(p
->pm_task_map
!= TASK_MAP_32BIT
);
1746 /* Kernel-shared cr3 may be disabled by boot arg. */
1751 * Set the bottom 4 3rd-level pte's to be the kernel's.
1754 while ((user_pdptp
= pmap64_pdpt(p
, 0x0)) == PDPT_ENTRY_NULL
) {
1756 pmap_expand_pml4(p
, 0x0);
1759 kern_pdptp
= kernel_pmap
->pm_pdpt
;
1760 pmap_store_pte(user_pdptp
+0, *(kern_pdptp
+0));
1761 pmap_store_pte(user_pdptp
+1, *(kern_pdptp
+1));
1762 pmap_store_pte(user_pdptp
+2, *(kern_pdptp
+2));
1763 pmap_store_pte(user_pdptp
+3, *(kern_pdptp
+3));
1764 p
->pm_task_map
= TASK_MAP_64BIT_SHARED
;
1769 pmap_clear_4GB_pagezero(pmap_t p
)
1771 pdpt_entry_t
*user_pdptp
;
1773 if (p
->pm_task_map
!= TASK_MAP_64BIT_SHARED
)
1778 p
->pm_task_map
= TASK_MAP_64BIT
;
1780 pmap_load_kernel_cr3();
1782 user_pdptp
= pmap64_pdpt(p
, 0x0);
1783 pmap_store_pte(user_pdptp
+0, 0);
1784 pmap_store_pte(user_pdptp
+1, 0);
1785 pmap_store_pte(user_pdptp
+2, 0);
1786 pmap_store_pte(user_pdptp
+3, 0);
1792 pmap_load_kernel_cr3(void)
1794 uint64_t kernel_cr3
;
1796 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1799 * Reload cr3 with the true kernel cr3.
1801 kernel_cr3
= current_cpu_datap()->cpu_kernel_cr3
;
1802 set64_cr3(kernel_cr3
);
1803 current_cpu_datap()->cpu_active_cr3
= kernel_cr3
;
1804 current_cpu_datap()->cpu_tlb_invalid
= FALSE
;
1805 __asm__
volatile("mfence");
1809 * Retire the given physical map from service.
1810 * Should only be called if the map contains
1811 * no valid mappings.
1823 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_START
,
1824 (int) p
, 0, 0, 0, 0);
1832 * If some cpu is not using the physical pmap pointer that it
1833 * is supposed to be (see set_dirbase), we might be using the
1834 * pmap that is being destroyed! Make sure we are
1835 * physically on the right pmap:
1839 0xFFFFFFFFFFFFF000ULL
);
1845 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_END
,
1846 (int) p
, 1, 0, 0, 0);
1847 return; /* still in use */
1851 * Free the memory maps, then the
1855 vm_page_lock_queues();
1856 inuse_ptepages_count
-= p
->pm_obj
->resident_page_count
;
1857 vm_page_unlock_queues();
1859 kmem_free(kernel_map
, (vm_offset_t
)p
->dirbase
, NBPTD
);
1860 zfree(pdpt_zone
, (void *)p
->pm_hold
);
1862 vm_object_deallocate(p
->pm_obj
);
1865 int inuse_ptepages
= 0;
1867 /* free 64 bit mode structs */
1869 kmem_free(kernel_map
, (vm_offset_t
)p
->pm_hold
, PAGE_SIZE
);
1871 inuse_ptepages
+= p
->pm_obj_pml4
->resident_page_count
;
1872 vm_object_deallocate(p
->pm_obj_pml4
);
1874 inuse_ptepages
+= p
->pm_obj_pdpt
->resident_page_count
;
1875 vm_object_deallocate(p
->pm_obj_pdpt
);
1877 inuse_ptepages
+= p
->pm_obj
->resident_page_count
;
1878 vm_object_deallocate(p
->pm_obj
);
1880 vm_page_lock_queues();
1881 inuse_ptepages_count
-= inuse_ptepages
;
1882 vm_page_unlock_queues();
1884 zfree(pmap_zone
, p
);
1886 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY
) | DBG_FUNC_END
,
1892 * Add a reference to the specified pmap.
1900 if (p
!= PMAP_NULL
) {
1908 * Remove a range of hardware page-table entries.
1909 * The entries given are the first (inclusive)
1910 * and last (exclusive) entries for the VM pages.
1911 * The virtual address is the va for the first pte.
1913 * The pmap must be locked.
1914 * If the pmap is not the kernel pmap, the range must lie
1915 * entirely within one pte-page. This is NOT checked.
1916 * Assumes that the pte-page exists.
1922 vm_map_offset_t start_vaddr
,
1926 register pt_entry_t
*cpte
;
1927 pv_hashed_entry_t pvh_et
= PV_HASHED_ENTRY_NULL
;
1928 pv_hashed_entry_t pvh_eh
= PV_HASHED_ENTRY_NULL
;
1929 pv_hashed_entry_t pvh_e
;
1931 int num_removed
, num_unwired
, num_found
;
1934 vm_map_offset_t vaddr
;
1942 if (pmap
!= kernel_pmap
&&
1943 pmap
->pm_task_map
== TASK_MAP_32BIT
&&
1944 start_vaddr
>= HIGH_MEM_BASE
) {
1946 * The range is in the "high_shared_pde" which is shared
1947 * between the kernel and all 32-bit tasks. It holds
1948 * the 32-bit commpage but also the trampolines, GDT, etc...
1949 * so we can't let user tasks remove anything from it.
1954 /* invalidate the PTEs first to "freeze" them */
1955 for (cpte
= spte
, vaddr
= start_vaddr
;
1957 cpte
++, vaddr
+= PAGE_SIZE_64
) {
1959 pa
= pte_to_pa(*cpte
);
1969 if (!managed_page(pai
)) {
1971 * Outside range of managed physical memory.
1972 * Just remove the mappings.
1974 pmap_store_pte(cpte
, 0);
1978 /* invalidate the PTE */
1979 pmap_update_pte(cpte
, *cpte
, (*cpte
& ~INTEL_PTE_VALID
));
1982 if (num_found
== 0) {
1983 /* nothing was changed: we're done */
1987 /* propagate the invalidates to other CPUs */
1989 PMAP_UPDATE_TLBS(pmap
, start_vaddr
, vaddr
);
1991 for (cpte
= spte
, vaddr
= start_vaddr
;
1993 cpte
++, vaddr
+= PAGE_SIZE_64
) {
1995 pa
= pte_to_pa(*cpte
);
2003 pa
= pte_to_pa(*cpte
);
2012 * Get the modify and reference bits, then
2013 * nuke the entry in the page table
2015 /* remember reference and change */
2016 pmap_phys_attributes
[pai
] |=
2017 (char)(*cpte
& (PHYS_MODIFIED
| PHYS_REFERENCED
));
2018 /* completely invalidate the PTE */
2019 pmap_store_pte(cpte
, 0);
2022 * Remove the mapping from the pvlist for
2023 * this physical page.
2026 pv_rooted_entry_t pv_h
;
2027 pv_hashed_entry_t
*pprevh
;
2028 ppnum_t ppn
= (ppnum_t
)pai
;
2030 pv_h
= pai_to_pvh(pai
);
2031 pvh_e
= PV_HASHED_ENTRY_NULL
;
2032 if (pv_h
->pmap
== PMAP_NULL
)
2033 panic("pmap_remove_range: null pv_list!");
2035 if (pv_h
->va
== vaddr
&& pv_h
->pmap
== pmap
) { /* rooted or not */
2037 * Header is the pv_rooted_entry. We can't free that. If there is a queued
2038 * entry after this one we remove that
2039 * from the ppn queue, we remove it from the hash chain
2040 * and copy it to the rooted entry. Then free it instead.
            pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
            if (pv_h != (pv_rooted_entry_t)pvh_e) { /* any queued after rooted? */
                pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                LOCK_PV_HASH(pvhash_idx);
                remque(&pvh_e->qlink);
                pprevh = pvhash(pvhash_idx);
                if (PV_HASHED_ENTRY_NULL == *pprevh) {
                    panic("pmap_remove_range empty hash removing rooted pv");
                }
                pmap_pvh_unlink(pvh_e);
                UNLOCK_PV_HASH(pvhash_idx);
                pv_h->pmap = pvh_e->pmap;
                pv_h->va = pvh_e->va;       /* dispose of pvh_e */
            } else {    /* none queued after rooted */
                pv_h->pmap = PMAP_NULL;
                pvh_e = PV_HASHED_ENTRY_NULL;
            }   /* any queued after rooted */
        } else { /* rooted or not */
            /*
             * Not removing the rooted pv:  find it on the hash chain,
             * remove it from the ppn queue and the hash chain, and free it.
             */
            pvhash_idx = pvhashidx(pmap, vaddr);
            LOCK_PV_HASH(pvhash_idx);
            pprevh = pvhash(pvhash_idx);
            if (PV_HASHED_ENTRY_NULL == *pprevh) {
                panic("pmap_remove_range empty hash removing hashed pv");
            }
            pmap_pv_hashlist_walks++;
            while (PV_HASHED_ENTRY_NULL != pvh_e) {
                if (pvh_e->pmap == pmap &&
                    pvh_e->va == vaddr &&
                    pvh_e->ppn == ppn)
                    break;
                pprevh = &pvh_e->nexth;
                pvh_e = pvh_e->nexth;
            }
            pmap_pv_hashlist_cnts += pv_cnt;
            if (pmap_pv_hashlist_max < pv_cnt)
                pmap_pv_hashlist_max = pv_cnt;
            if (PV_HASHED_ENTRY_NULL == pvh_e)
                panic("pmap_remove_range pv not on hash");
            *pprevh = pvh_e->nexth;
            remque(&pvh_e->qlink);
            UNLOCK_PV_HASH(pvhash_idx);
        } /* rooted or not */
        if (pvh_e != PV_HASHED_ENTRY_NULL) {
            pvh_e->qlink.next = (queue_entry_t)pvh_eh;

            if (pvh_et == PV_HASHED_ENTRY_NULL) {

    } /* removing mappings for this phy page */

    if (pvh_eh != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);

    if (pmap->stats.resident_count < num_removed)
        panic("pmap_remove_range: resident_count");
    assert(pmap->stats.resident_count >= num_removed);
    OSAddAtomic(-num_removed, (SInt32 *) &pmap->stats.resident_count);

    if (pmap->stats.wired_count < num_unwired)
        panic("pmap_remove_range: wired_count");
    assert(pmap->stats.wired_count >= num_unwired);
    OSAddAtomic(-num_unwired, (SInt32 *) &pmap->stats.wired_count);
/*
 *  Remove phys addr if mapped in specified map
 */
pmap_remove_some_phys(
    __unused pmap_t     map,
    __unused ppnum_t    pn)

    /* Implement to support working set code */
/*
 *  Remove the given range of addresses
 *  from the specified map.
 *
 *  It is assumed that the start and end are properly
 *  rounded to the hardware page size.
 */

    pt_entry_t      *spte, *epte;

    if (map == PMAP_NULL || s64 == e64)

    PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
               (int) (s64 >> 32), (int) s64,
               (int) (e64 >> 32), (int) e64);

    /*
     * Check that address range in the kernel does not overlap the stacks.
     * We initialize local static min/max variables once to avoid making
     * 2 function calls for every remove.  Note also that these functions
     * both return 0 before kernel stacks have been initialized, and hence
     * the panic is not triggered in this case.
     */
    if (map == kernel_pmap) {
        static vm_offset_t kernel_stack_min = 0;
        static vm_offset_t kernel_stack_max = 0;

        if (kernel_stack_min == 0) {
            kernel_stack_min = min_valid_stack_address();
            kernel_stack_max = max_valid_stack_address();
        }
        if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
            (kernel_stack_min < e64 && e64 <= kernel_stack_max))
            panic("pmap_remove() attempted in kernel stack");

    /*
     * The values of kernel_stack_min and kernel_stack_max are no longer
     * relevant now that we allocate kernel stacks anywhere in the kernel map,
     * so the old code above no longer applies.  If we wanted to check that
     * we weren't removing a mapping of a page in a kernel stack we'd have to
     * mark the PTE with an unused bit and check that here.
     */
    deadline = rdtsc64() + max_preemption_latency_tsc;

        l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);

        pde = pmap_pde(map, s64);

        if (pde && (*pde & INTEL_PTE_VALID)) {
            spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
            spte = &spte[ptenum(s64)];
            epte = &spte[intel_btop(l64 - s64)];

            pmap_remove_range(map, s64, spte, epte);

        if (s64 < e64 && rdtsc64() >= deadline) {

            deadline = rdtsc64() + max_preemption_latency_tsc;

    PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
               (int) map, 0, 0, 0, 0);
/*
 *  Routine:    pmap_page_protect
 *
 *  Function:
 *      Lower the permission for all mappings to a given
 *      page.
 */

    pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
    pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
    pv_hashed_entry_t       nexth;
    pv_rooted_entry_t       pv_h;
    pv_rooted_entry_t       pv_e;
    pv_hashed_entry_t       pvh_e;
    register pmap_t         pmap;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */

    PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
               (int) pn, (int) prot, 0, 0, 0);

    /*
     * Determine the new protection.
     */
    case VM_PROT_READ | VM_PROT_EXECUTE:

        return;     /* nothing to do */
    pv_h = pai_to_pvh(pai);

    /*
     * Walk down PV list, changing or removing all mappings.
     */
    if (pv_h->pmap != PMAP_NULL) {

        pvh_e = (pv_hashed_entry_t)pv_e;    /* cheat */

            register vm_map_offset_t vaddr;

            pte = pmap_pte(pmap, vaddr);
                kprintf("pmap_page_protect pmap %p pn 0x%x vaddr 0x%llx\n",
                        pmap, pn, vaddr);
                panic("pmap_page_protect");

            nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);   /* if there is one */

            /*
             * Remove the mapping if new protection is NONE
             * or if write-protecting a kernel mapping.
             */
            if (remove || pmap == kernel_pmap) {
                /*
                 * Remove the mapping, collecting any modify bits.
                 */
                pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));

                PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);

                pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED | PHYS_REFERENCED);

                pmap_store_pte(pte, 0);

                if (pmap->stats.resident_count < 1)
                    panic("pmap_page_protect: resident_count");
                assert(pmap->stats.resident_count >= 1);
                OSAddAtomic(-1, (SInt32 *) &pmap->stats.resident_count);
                /*
                 * Deal with the pv_rooted_entry.
                 */

                    /*
                     * Fix up head later.
                     */
                    pv_h->pmap = PMAP_NULL;

                    /*
                     * Delete this entry.
                     */
                    pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                    LOCK_PV_HASH(pvhash_idx);
                    remque(&pvh_e->qlink);
                    pmap_pvh_unlink(pvh_e);
                    UNLOCK_PV_HASH(pvhash_idx);

                    pvh_e->qlink.next = (queue_entry_t)pvh_eh;

                    if (pvh_et == PV_HASHED_ENTRY_NULL)

                pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
                PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);

        } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
        /*
         * If pv_head mapping was removed, fix it up.
         */
        if (pv_h->pmap == PMAP_NULL) {
            pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);

            if (pvh_e != (pv_hashed_entry_t)pv_h) {
                pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                LOCK_PV_HASH(pvhash_idx);
                remque(&pvh_e->qlink);
                pmap_pvh_unlink(pvh_e);
                UNLOCK_PV_HASH(pvhash_idx);
                pv_h->pmap = pvh_e->pmap;
                pv_h->va = pvh_e->va;
                pvh_e->qlink.next = (queue_entry_t)pvh_eh;

                if (pvh_et == PV_HASHED_ENTRY_NULL)

    if (pvh_eh != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);

    PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
/*
 *  Disconnect all mappings for this page and return reference and change status
 *  in generic format.
 */
unsigned int pmap_disconnect(

    pmap_page_protect(pa, 0);       /* disconnect the page */
    return (pmap_get_refmod(pa));   /* return ref/chg status */
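/*
 * Illustrative sketch (not part of the original source): how a VM-layer
 * caller might combine pmap_disconnect() with pmap_clear_refmod() to break
 * all mappings of a page and harvest its ref/mod state.  The helper name
 * and the pageout comment are hypothetical; only pmap_disconnect(),
 * pmap_clear_refmod() and the VM_MEM_* bits come from this file.
 */
#if 0   /* example only - not compiled */
static void
example_reclaim_page(ppnum_t pn)
{
    unsigned int refmod;

    refmod = pmap_disconnect(pn);       /* unmap everywhere, collect ref/chg */
    if (refmod & VM_MEM_MODIFIED) {
        /* page is dirty: a real caller would queue it for pageout here */
    }
    pmap_clear_refmod(pn, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
}
#endif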
/*
 *  Set the physical protection on the
 *  specified range of this map as requested.
 *  Will not increase permissions.
 */

    vm_map_offset_t     sva,
    vm_map_offset_t     eva,

    register pt_entry_t *pde;
    register pt_entry_t *spte, *epte;
    vm_map_offset_t     lva;
    vm_map_offset_t     orig_sva;

    if (map == PMAP_NULL)

    if (prot == VM_PROT_NONE) {
        pmap_remove(map, sva, eva);

    PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
               (int) (sva >> 32), (int) sva,
               (int) (eva >> 32), (int) eva);

    if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)

        lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);

        pde = pmap_pde(map, sva);
        if (pde && (*pde & INTEL_PTE_VALID)) {
            spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
            spte = &spte[ptenum(sva)];
            epte = &spte[intel_btop(lva - sva)];

            while (spte < epte) {

                if (*spte & INTEL_PTE_VALID) {

                    if (prot & VM_PROT_WRITE)
                        pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
                    else
                        pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));

                        pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));

                        pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));

    PMAP_UPDATE_TLBS(map, orig_sva, eva);

    PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
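/*
 * Illustrative sketch (not part of the original source): write-protecting a
 * page-aligned range versus dropping it entirely.  As the code above shows,
 * VM_PROT_NONE is turned into a pmap_remove(); anything else only adjusts
 * the write/NX bits.  The wrapper name is hypothetical.
 */
#if 0   /* example only - not compiled */
static void
example_write_protect(pmap_t map, vm_map_offset_t sva, vm_map_offset_t eva)
{
    /* make the range read-only */
    pmap_protect(map, sva, eva, VM_PROT_READ);

    /* VM_PROT_NONE is equivalent to removing the range outright */
    pmap_protect(map, sva, eva, VM_PROT_NONE);
}
#endif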
/* Map a (possibly) autogenned block */

    __unused unsigned int   flags)

    for (page = 0; page < size; page++) {
        pmap_enter(pmap, va, pa, prot, attr, TRUE);
/*
 *  Insert the given physical page (p) at
 *  the specified virtual address (v) in the
 *  target physical map with the protection requested.
 *
 *  If specified, the page will be wired down, meaning
 *  that the related pte cannot be reclaimed.
 *
 *  NB:  This is the only routine which MAY NOT lazy-evaluate
 *  or lose information.  That is, this routine must actually
 *  insert this page into the given map NOW.
 */

    register pmap_t         pmap,
    vm_map_offset_t         vaddr,

    register pt_entry_t     *pte;
    register pv_rooted_entry_t pv_h;
    pv_hashed_entry_t       pvh_e;
    pv_hashed_entry_t       pvh_new;
    pv_hashed_entry_t       *hashp;
    pt_entry_t              template;
    pmap_paddr_t            old_pa;
    pmap_paddr_t            pa = (pmap_paddr_t)i386_ptob(pn);
    boolean_t               need_tlbflush = FALSE;
    boolean_t               old_pa_locked;

    assert(pn != vm_page_fictitious_addr);
    printf("pmap(%qx, %x)\n", vaddr, pn);
    if (pmap == PMAP_NULL)
    if (pn == vm_page_guard_addr)

    PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
               (int) (vaddr >> 32), (int) vaddr,

    if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)

    /*
     * Must allocate a new pvlist entry while we're unlocked;
     * zalloc may cause pageout (which will lock the pmap system).
     * If we determine we need a pvlist entry, we will unlock
     * and allocate one.  Then we will retry, throwing away
     * the allocated entry later (if we no longer need it).
     */
    pvh_new = PV_HASHED_ENTRY_NULL;

    pvh_e = PV_HASHED_ENTRY_NULL;
    /*
     * Expand pmap to include this pte.  Assume that
     * pmap is always expanded to include enough hardware
     * pages to map one VM page.
     */
    while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
        /*
         * Must unlock to expand the pmap.
         */
        pmap_expand(pmap, vaddr);   /* going to grow pde level page(s) */

    old_pa = pte_to_pa(*pte);
    pai = pa_index(old_pa);
    old_pa_locked = FALSE;

    /*
     * If we have a previous managed page, lock the pv entry now.  After
     * we lock it, check to see if someone beat us to the lock and if so
     * drop it.
     */
    if ((0 != old_pa) && managed_page(pai)) {
        old_pa_locked = TRUE;
        old_pa = pte_to_pa(*pte);
            UNLOCK_PVH(pai);    /* some other path beat us to it */
            old_pa_locked = FALSE;
    /*
     * Special case if the incoming physical page is already mapped
     * at this address.
     */

        /*
         * May be changing its wired attribute or protection
         */
        template = pa_to_pte(pa) | INTEL_PTE_VALID;

        if (VM_MEM_NOT_CACHEABLE ==
            (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
            if (!(flags & VM_MEM_GUARDED))
                template |= INTEL_PTE_PTA;
            template |= INTEL_PTE_NCACHE;
        }

        if (pmap != kernel_pmap)
            template |= INTEL_PTE_USER;
        if (prot & VM_PROT_WRITE)
            template |= INTEL_PTE_WRITE;

            template |= INTEL_PTE_NX;

            template |= INTEL_PTE_WIRED;
                OSAddAtomic(+1, (SInt32 *) &pmap->stats.wired_count);

            if (iswired(*pte)) {
                assert(pmap->stats.wired_count >= 1);
                OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);

        /* store modified PTE and preserve RC bits */
        pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));

        if (old_pa_locked) {
            old_pa_locked = FALSE;
        }
        need_tlbflush = TRUE;
    /*
     * Outline of code from here:
     *   1) If va was mapped, update TLBs, remove the mapping
     *      and remove old pvlist entry.
     *   2) Add pvlist entry for new mapping
     *   3) Enter new mapping.
     *
     * If the old physical page is not managed step 1) is skipped
     * (except for updating the TLBs), and the mapping is
     * overwritten at step 3).  If the new physical page is not
     * managed, step 2) is skipped.
     */

    if (old_pa != (pmap_paddr_t) 0) {

        /*
         * Don't do anything to pages outside valid memory here.
         * Instead convince the code that enters a new mapping
         * to overwrite the old one.
         */

        /* invalidate the PTE */
        pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
        /* propagate invalidate everywhere */
        PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
        /* remember reference and change */
        oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
        /* completely invalidate the PTE */
        pmap_store_pte(pte, 0);
        if (managed_page(pai)) {

            if (pmap->stats.resident_count < 1)
                panic("pmap_enter: resident_count");
            assert(pmap->stats.resident_count >= 1);
            OSAddAtomic(-1, (SInt32 *) &pmap->stats.resident_count);

            if (iswired(*pte)) {

                if (pmap->stats.wired_count < 1)
                    panic("pmap_enter: wired_count");
                assert(pmap->stats.wired_count >= 1);
                OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);

            pmap_phys_attributes[pai] |= oattr;

            /*
             * Remove the mapping from the pvlist for
             * this physical page.
             * We'll end up with either a rooted pv or a
             * hashed pv.
             */
            pv_h = pai_to_pvh(pai);

            if (pv_h->pmap == PMAP_NULL) {
                panic("pmap_enter: null pv_list!");

            if (pv_h->va == vaddr && pv_h->pmap == pmap) {
                /*
                 * Header is the pv_rooted_entry.
                 * If there is a next one, copy it to the
                 * header and free the next one (we cannot
                 * free the rooted entry).
                 */
                pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
                if (pvh_e != (pv_hashed_entry_t)pv_h) {
                    pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                    LOCK_PV_HASH(pvhash_idx);
                    remque(&pvh_e->qlink);
                    pmap_pvh_unlink(pvh_e);
                    UNLOCK_PV_HASH(pvhash_idx);
                    pv_h->pmap = pvh_e->pmap;
                    pv_h->va = pvh_e->va;

                    pv_h->pmap = PMAP_NULL;
                    pvh_e = PV_HASHED_ENTRY_NULL;
                pv_hashed_entry_t *pprevh;

                /* wasn't the rooted pv - hash, find it, and unlink it */
                old_ppn = (ppnum_t)pa_index(old_pa);

                pvhash_idx = pvhashidx(pmap, vaddr);
                LOCK_PV_HASH(pvhash_idx);
                pprevh = pvhash(pvhash_idx);

                if (NULL == pprevh) panic("pmap enter 1");

                pmap_pv_hashlist_walks++;

                while (PV_HASHED_ENTRY_NULL != pvh_e) {
                    if (pvh_e->pmap == pmap &&
                        pvh_e->va == vaddr &&
                        pvh_e->ppn == old_ppn)
                        break;
                    pprevh = &pvh_e->nexth;
                    pvh_e = pvh_e->nexth;
                }
                pmap_pv_hashlist_cnts += pv_cnt;
                if (pmap_pv_hashlist_max < pv_cnt)
                    pmap_pv_hashlist_max = pv_cnt;
                if (PV_HASHED_ENTRY_NULL == pvh_e)
                    panic("pmap_enter: pv not in hash list");
                if (NULL == pprevh) panic("pmap enter 2");
                *pprevh = pvh_e->nexth;
                remque(&pvh_e->qlink);
                UNLOCK_PV_HASH(pvhash_idx);

            /*
             * old_pa is not managed.
             * Do removal part of accounting.
             */

            if (iswired(*pte)) {
                assert(pmap->stats.wired_count >= 1);
                OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
    /*
     * If we had a previously managed page locked, unlock it now.
     */
    if (old_pa_locked) {
        old_pa_locked = FALSE;
    }

    pai = pa_index(pa);     /* now working with new incoming phys page */
    if (managed_page(pai)) {

        /*
         * Step 2) Enter the mapping in the PV list for this
         * physical page.
         */
        pv_h = pai_to_pvh(pai);

        if (pv_h->pmap == PMAP_NULL) {
            /*
             * No mappings yet, use rooted pv
             */
            queue_init(&pv_h->qlink);
            /*
             * Add new pv_hashed_entry after header.
             */
            if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
                pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */
            } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
                PV_HASHED_ALLOC(pvh_e);
                if (PV_HASHED_ENTRY_NULL == pvh_e) {
                    /*
                     * The pv list is empty.
                     * If we are on the kernel pmap we'll use one of the special
                     * private kernel pv_e's; else, we need to unlock everything,
                     * zalloc a pv_e, and restart, bringing the pv_e in with us.
                     */
                    if (kernel_pmap == pmap) {
                        PV_HASHED_KERN_ALLOC(pvh_e);

                        pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

            if (PV_HASHED_ENTRY_NULL == pvh_e)
                panic("pvh_e exhaustion");

            pvhash_idx = pvhashidx(pmap, vaddr);
            LOCK_PV_HASH(pvhash_idx);
            insque(&pvh_e->qlink, &pv_h->qlink);
            hashp = pvhash(pvhash_idx);

            if (NULL == hashp) panic("pmap_enter 4");

            pvh_e->nexth = *hashp;

            UNLOCK_PV_HASH(pvhash_idx);

            /*
             * Remember that we used the pvlist entry.
             */
            pvh_e = PV_HASHED_ENTRY_NULL;

        /*
         * only count the mapping
         * for 'managed memory'
         */
        OSAddAtomic(+1, (SInt32 *) &pmap->stats.resident_count);
        if (pmap->stats.resident_count > pmap->stats.resident_max) {
            pmap->stats.resident_max = pmap->stats.resident_count;
    /*
     * Step 3) Enter the mapping.
     *
     * Build a template to speed up entering -
     * only the pfn changes.
     */
    template = pa_to_pte(pa) | INTEL_PTE_VALID;

    if (flags & VM_MEM_NOT_CACHEABLE) {
        if (!(flags & VM_MEM_GUARDED))
            template |= INTEL_PTE_PTA;
        template |= INTEL_PTE_NCACHE;
    }

    if (pmap != kernel_pmap)
        template |= INTEL_PTE_USER;
    if (prot & VM_PROT_WRITE)
        template |= INTEL_PTE_WRITE;

        template |= INTEL_PTE_NX;

        template |= INTEL_PTE_WIRED;
        OSAddAtomic(+1, (SInt32 *) &pmap->stats.wired_count);

    pmap_store_pte(pte, template);

    /*
     * If this was a managed page we delayed unlocking the pv until here
     * to prevent pmap_page_protect et al from finding it until the pte
     * has been stored.
     */
    if (managed_page(pai)) {

    if (need_tlbflush == TRUE)
        PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);

    if (pvh_e != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);

    if (pvh_new != PV_HASHED_ENTRY_NULL) {
        PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);

    PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
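/*
 * Illustrative sketch (not part of the original source): a minimal wired
 * kernel mapping entered with pmap_enter(), using the same argument order
 * seen in pmap_map_block() above.  The wrapper name is hypothetical.
 */
#if 0   /* example only - not compiled */
static void
example_enter_wired(vm_map_offset_t vaddr, ppnum_t pn)
{
    /* default cacheability, writable, wired so the pte cannot be reclaimed */
    pmap_enter(kernel_pmap, vaddr, pn,
               VM_PROT_READ | VM_PROT_WRITE,
               VM_WIMG_USE_DEFAULT, TRUE);
}
#endif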
/*
 *  Routine:    pmap_change_wiring
 *  Function:   Change the wiring attribute for a map/virtual-address
 *              pair.
 *  In/out conditions:
 *      The mapping must already exist in the pmap.
 */

    register pmap_t     map,
    vm_map_offset_t     vaddr,

    register pt_entry_t *pte;

    /*
     * We must grab the pmap system lock because we may
     * change a pte_page queue.
     */

    if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
        panic("pmap_change_wiring: pte missing");

    if (wired && !iswired(*pte)) {
        /*
         * wiring down mapping
         */
        OSAddAtomic(+1, (SInt32 *) &map->stats.wired_count);
        pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
    }
    else if (!wired && iswired(*pte)) {

        assert(map->stats.wired_count >= 1);
        OSAddAtomic(-1, (SInt32 *) &map->stats.wired_count);
        pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
pmap_find_phys(pmap_t pmap, addr64_t va)

    mp_disable_preemption();

    ptp = pmap_pte(pmap, va);
    if (PT_ENTRY_NULL == ptp) {

        ppn = (ppnum_t) i386_btop(pte_to_pa(*ptp));

    mp_enable_preemption();
/*
 *  Routine:    pmap_extract
 *  Function:
 *      Extract the physical page address associated
 *      with the given map/virtual_address pair.
 *      Changed to a shim for backwards compatibility but will not
 *      work for 64 bit systems.  Some old drivers that we cannot
 *      change need this.
 */

    register pmap_t     pmap,
    vm_map_offset_t     vaddr)

    paddr = (vm_offset_t)0;
    ppn = pmap_find_phys(pmap, vaddr);

        paddr = ((vm_offset_t)i386_ptob(ppn)) | (vaddr & INTEL_OFFMASK);
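/*
 * Illustrative sketch (not part of the original source): the 64-bit-safe
 * way to do what pmap_extract() does, composing the physical address from
 * pmap_find_phys() and the page offset.  The helper name is hypothetical.
 */
#if 0   /* example only - not compiled */
static addr64_t
example_va_to_pa(pmap_t pmap, addr64_t va)
{
    ppnum_t ppn;

    ppn = pmap_find_phys(pmap, va);
    if (ppn == 0)
        return 0;   /* not currently mapped */
    return ((addr64_t)i386_ptob(ppn)) | (va & INTEL_OFFMASK);
}
#endif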
    vm_map_offset_t     vaddr)

    register vm_page_t      m;
    register pmap_paddr_t   pa;
    pml4_entry_t            *pml4p;

    if (kernel_pmap == map) panic("expand kernel pml4");

    pml4p = pmap64_pml4(map, vaddr);

    if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");

    /*
     * Allocate a VM page for the pml4 page
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL)

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    i = pml4idx(map, vaddr);

    vm_page_lock_queues();
    inuse_ptepages_count++;
    vm_page_unlock_queues();

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj_pml4);

    /*
     * See if someone else expanded us first
     */
    if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
        vm_object_unlock(map->pm_obj_pml4);

        vm_page_lock_queues();
        inuse_ptepages_count--;
        vm_page_unlock_queues();

    if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
        panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
              map, map->pm_obj_pml4, vaddr, i);

    vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
    vm_object_unlock(map->pm_obj_pml4);

    /*
     * Set the page directory entry for this page table.
     */
    pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

    pmap_store_pte(pml4p, pa_to_pte(pa)
    vm_map_offset_t     vaddr)

    register vm_page_t      m;
    register pmap_paddr_t   pa;
    pdpt_entry_t            *pdptp;

    if (kernel_pmap == map) panic("expand kernel pdpt");

    while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
        pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */

    /*
     * Allocate a VM page for the pdpt page
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL)

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    i = pdptidx(map, vaddr);

    vm_page_lock_queues();
    inuse_ptepages_count++;
    vm_page_unlock_queues();

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj_pdpt);

    /*
     * See if someone else expanded us first
     */
    if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
        vm_object_unlock(map->pm_obj_pdpt);

        vm_page_lock_queues();
        inuse_ptepages_count--;
        vm_page_unlock_queues();

    if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
        panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
              map, map->pm_obj_pdpt, vaddr, i);

    vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
    vm_object_unlock(map->pm_obj_pdpt);

    /*
     * Set the page directory entry for this page table.
     */
    pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

    pmap_store_pte(pdptp, pa_to_pte(pa)
/*
 *  Routine:    pmap_expand
 *
 *  Expands a pmap to be able to map the specified virtual address.
 *
 *  Allocates new virtual memory for the P0 or P1 portion of the
 *  pmap, then re-maps the physical pages that were in the old
 *  pmap to be in the new pmap.
 *
 *  Must be called with the pmap system and the pmap unlocked,
 *  since these must be unlocked to use vm_allocate or vm_deallocate.
 *  Thus it must be called in a loop that checks whether the map
 *  has been expanded enough.
 *  (We won't loop forever, since page tables aren't shrunk.)
 */

    vm_map_offset_t     vaddr)

    register vm_page_t      m;
    register pmap_paddr_t   pa;

    /*
     * if not the kernel map (while we are still compat kernel mode)
     * and we are 64 bit, propagate expand upwards
     */
    if (cpu_64bit && (map != kernel_pmap)) {
        while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
            pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */

    /*
     * Allocate a VM page for the pde entries.
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL)

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    i = pdeidx(map, vaddr);

    vm_page_lock_queues();
    inuse_ptepages_count++;
    vm_page_unlock_queues();

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj);

    /*
     * See if someone else expanded us first
     */
    if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
        vm_object_unlock(map->pm_obj);

        vm_page_lock_queues();
        inuse_ptepages_count--;
        vm_page_unlock_queues();

    if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
        panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
              map, map->pm_obj, vaddr, i);

    vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
    vm_object_unlock(map->pm_obj);

    /*
     * refetch while locked
     */
    pdp = pmap_pde(map, vaddr);

    /*
     * Set the page directory entry for this page table.
     */
    pmap_store_pte(pdp, pa_to_pte(pa)
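/*
 * Illustrative sketch (not part of the original source): the caller-side
 * pattern described in the pmap_expand() header comment - keep expanding
 * until pmap_pte() finds a page table for the address, exactly as
 * pmap_enter() does above.  pmap and vaddr are assumed to be the caller's.
 */
#if 0   /* example only - not compiled */
    pt_entry_t *pte;

    while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
        /* must be unlocked here; pmap_expand() handles expansion races */
        pmap_expand(pmap, vaddr);
    }
#endif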
/*
 *  pmap_sync_page_data_phys(ppnum_t pa)
 *
 *  Invalidates all of the instruction cache on a physical page and
 *  pushes any dirty data from the data cache for the same physical page.
 *  Not required in i386.
 */
pmap_sync_page_data_phys(__unused ppnum_t pa)

/*
 *  pmap_sync_page_attributes_phys(ppnum_t pa)
 *
 *  Write back and invalidate all cachelines on a physical page.
 */
pmap_sync_page_attributes_phys(ppnum_t pa)

    cache_flush_page_phys(pa);
#ifdef CURRENTLY_UNUSED_AND_UNTESTED

/*
 *  Routine:    pmap_collect
 *  Function:
 *      Garbage collects the physical map system for
 *      pages which are no longer used.
 *      Success need not be guaranteed -- that is, there
 *      may well be pages which are not referenced, but
 *      others may be collected.
 *
 *      Called by the pageout daemon when pages are scarce.
 */

    register pt_entry_t *pdp, *ptp;

    if (p == kernel_pmap)

    /*
     * Garbage collect map.
     */
    for (pdp = (pt_entry_t *)p->dirbase;
         pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];

        if (*pdp & INTEL_PTE_VALID) {
            if (*pdp & INTEL_PTE_REF) {
                pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);

            ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
            eptp = ptp + NPTEPG;

            /*
             * If the pte page has any wired mappings, we cannot
             * free it.
             */
                register pt_entry_t *ptep;
                for (ptep = ptp; ptep < eptp; ptep++) {
                    if (iswired(*ptep)) {

            /*
             * Remove the virtual addresses mapped by this pte page.
             */
            pmap_remove_range(p,
                              pdetova(pdp - (pt_entry_t *)p->dirbase),

            /*
             * Invalidate the page directory pointer.
             */
            pmap_store_pte(pdp, 0x0);

            /*
             * And free the pte page itself.
             */
                register vm_page_t m;

                vm_object_lock(p->pm_obj);

                m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
                if (m == VM_PAGE_NULL)
                    panic("pmap_collect: pte page not in object");

                vm_page_lock_queues();
                inuse_ptepages_count--;
                vm_page_unlock_queues();

                vm_object_unlock(p->pm_obj);

    PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
pmap_copy_page(ppnum_t src, ppnum_t dst)

    bcopy_phys((addr64_t)i386_ptob(src),
               (addr64_t)i386_ptob(dst),

/*
 *  Routine:    pmap_pageable
 *
 *  Function:
 *      Make the specified pages (by pmap, offset)
 *      pageable (or not) as requested.
 *
 *      A page which is not pageable may not take
 *      a fault; therefore, its page table entry
 *      must remain valid for the duration.
 *
 *      This routine is merely advisory; pmap_enter
 *      will specify that these pages are to be wired
 *      down (or not) as appropriate.
 */

    __unused pmap_t             pmap,
    __unused vm_map_offset_t    start_addr,
    __unused vm_map_offset_t    end_addr,
    __unused boolean_t          pageable)

    pmap++; start_addr++; end_addr++; pageable++;
/*
 *  Clear specified attribute bits.
 */
phys_attribute_clear(

    pv_rooted_entry_t           pv_h;
    register pv_hashed_entry_t  pv_e;
    register pt_entry_t         *pte;
    register pmap_t             pmap;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */

    PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
               (int) pn, bits, 0, 0, 0);

    pv_h = pai_to_pvh(pai);

    /*
     * Walk down PV list, clearing all modify or reference bits.
     * We do not have to lock the pv_list because we have
     * the entire pmap system locked.
     */
    if (pv_h->pmap != PMAP_NULL) {
        /*
         * There are some mappings.
         */

        pv_e = (pv_hashed_entry_t)pv_h;

            /*
             * first make sure any processor actively
             * using this pmap, flushes its TLB state
             */
            PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);

            /*
             * Clear modify and/or reference bits.
             */
            pte = pmap_pte(pmap, va);
            pmap_update_pte(pte, *pte, (*pte & ~bits));

            pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);

        } while (pv_e != (pv_hashed_entry_t)pv_h);

    pmap_phys_attributes[pai] &= ~bits;

    PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
/*
 *  Check specified attribute bits.
 */
phys_attribute_test(

    pv_rooted_entry_t           pv_h;
    register pv_hashed_entry_t  pv_e;
    register pt_entry_t         *pte;
    register pmap_t             pmap;

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */

    /*
     * super fast check... if bits already collected
     * no need to take any locks...
     * if not set, we need to recheck after taking
     * the lock in case they got pulled in while
     * we were waiting for the lock
     */
    if ((pmap_phys_attributes[pai] & bits) == bits)

    pv_h = pai_to_pvh(pai);

    attributes = pmap_phys_attributes[pai] & bits;

    /*
     * Walk down PV list, checking the mappings until we
     * reach the end or we've found the attributes we've asked for.
     * We do not have to lock the pv_list because we have
     * the entire pmap system locked.
     */
    if (pv_h->pmap != PMAP_NULL) {
        /*
         * There are some mappings.
         */
        pv_e = (pv_hashed_entry_t)pv_h;
        if (attributes != bits) do {

            /*
             * first make sure any processor actively
             * using this pmap, flushes its TLB state
             */
            PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);

            /*
             * pick up modify and/or reference bits from this mapping
             */
            pte = pmap_pte(pmap, va);
            attributes |= *pte & bits;

            pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);

        } while ((attributes != bits) &&
                 (pv_e != (pv_hashed_entry_t)pv_h));

    return (attributes);
/*
 *  Set specified attribute bits.
 */

    assert(pn != vm_page_fictitious_addr);
    if (pn == vm_page_guard_addr)

    pai = ppn_to_pai(pn);

    if (!managed_page(pai)) {
        /*
         * Not a managed page.
         */

    pmap_phys_attributes[pai] |= bits;

/*
 *  Set the modify bit on the specified physical page.
 */
void pmap_set_modify(

    phys_attribute_set(pn, PHYS_MODIFIED);

/*
 *  Clear the modify bits on the specified physical page.
 */

    phys_attribute_clear(pn, PHYS_MODIFIED);

/*
 *  Return whether or not the specified physical page is modified
 *  by any physical maps.
 */

    if (phys_attribute_test(pn, PHYS_MODIFIED))

/*
 *  pmap_clear_reference:
 *
 *  Clear the reference bit on the specified physical page.
 */
pmap_clear_reference(

    phys_attribute_clear(pn, PHYS_REFERENCED);

pmap_set_reference(ppnum_t pn)

    phys_attribute_set(pn, PHYS_REFERENCED);

/*
 *  pmap_is_referenced:
 *
 *  Return whether or not the specified physical page is referenced
 *  by any physical maps.
 */

    if (phys_attribute_test(pn, PHYS_REFERENCED))
/*
 *  pmap_get_refmod(phys)
 *  returns the referenced and modified bits of the specified
 *  physical page.
 */
pmap_get_refmod(ppnum_t pa)

    unsigned int retval = 0;

    refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);

    if (refmod & PHYS_MODIFIED)
        retval |= VM_MEM_MODIFIED;
    if (refmod & PHYS_REFERENCED)
        retval |= VM_MEM_REFERENCED;

/*
 *  pmap_clear_refmod(phys, mask)
 *  clears the referenced and modified bits as specified by the mask
 *  of the specified physical page.
 */
pmap_clear_refmod(ppnum_t pa, unsigned int mask)

    unsigned int x86Mask;

    x86Mask = ( ((mask & VM_MEM_MODIFIED)?   PHYS_MODIFIED : 0)
              | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
    phys_attribute_clear(pa, x86Mask);
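/*
 * Illustrative sketch (not part of the original source): reading and then
 * clearing the referenced bit for a page, as an active/inactive scan might.
 * The helper name is hypothetical; the API calls and VM_MEM_* bits are the
 * ones defined above.
 */
#if 0   /* example only - not compiled */
static boolean_t
example_test_and_clear_ref(ppnum_t pn)
{
    boolean_t was_referenced;

    was_referenced = (pmap_get_refmod(pn) & VM_MEM_REFERENCED) ? TRUE : FALSE;
    if (was_referenced)
        pmap_clear_refmod(pn, VM_MEM_REFERENCED);
    return was_referenced;
}
#endif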
invalidate_icache(__unused vm_offset_t addr,
                  __unused unsigned    cnt,

flush_dcache(__unused vm_offset_t addr,
             __unused unsigned    count,

/*
 * Constrain DTrace copyin/copyout actions
 */
extern kern_return_t dtrace_copyio_preflight(addr64_t);
extern kern_return_t dtrace_copyio_postflight(addr64_t);

kern_return_t dtrace_copyio_preflight(__unused addr64_t va)

    thread_t thread = current_thread();

    if (current_map() == kernel_map)
        return KERN_FAILURE;
    else if (thread->machine.specFlags & CopyIOActive)
        return KERN_FAILURE;

    return KERN_SUCCESS;

kern_return_t dtrace_copyio_postflight(__unused addr64_t va)

    return KERN_SUCCESS;

#endif /* CONFIG_DTRACE */
/* show phys page mappings and attributes */

extern void db_show_page(pmap_paddr_t pa);

db_show_page(pmap_paddr_t pa)

    pv_h = pai_to_pvh(pai);

    attr = pmap_phys_attributes[pai];
    printf("phys page %llx ", pa);
    if (attr & PHYS_MODIFIED)
        printf("modified, ");
    if (attr & PHYS_REFERENCED)
        printf("referenced, ");
    if (pv_h->pmap || pv_h->next)
        printf(" mapped at\n");
    else
        printf(" not mapped\n");
    for (; pv_h; pv_h = pv_h->next)
        printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);

#endif /* MACH_KDB */
void db_kvtophys(vm_offset_t);
void db_show_vaddrs(pt_entry_t *);

/*
 *  print out the results of kvtophys(arg)
 */

    db_printf("0x%qx", kvtophys(vaddr));

/*
 *  Walk the page tables.
 */

    pt_entry_t  *dirbase)

    pt_entry_t      *ptep, *pdep, tmp;
    unsigned int    x, y, pdecnt, ptecnt;

        dirbase = kernel_pmap->dirbase;

        db_printf("need a dirbase...\n");

    dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);

    db_printf("dirbase: 0x%x\n", dirbase);

    pdecnt = ptecnt = 0;

    for (y = 0; y < NPDEPG; y++, pdep++) {
        if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {

        ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
        db_printf("dir[%4d]: 0x%x\n", y, *pdep);
        for (x = 0; x < NPTEPG; x++, ptep++) {
            if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {

            db_printf("   tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
                      (y << 22) | (x << 12),
                      *ptep & ~INTEL_OFFMASK);

    db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);

#endif /* MACH_KDB */
#include <mach_vm_debug.h>
#include <vm/vm_debug.h>

pmap_list_resident_pages(
    __unused pmap_t         pmap,
    __unused vm_offset_t    *listp,

#endif /* MACH_VM_DEBUG */

/* temporary workaround */

coredumpok(__unused vm_map_t map, __unused vm_offset_t va)

    ptep = pmap_pte(map->pmap, va);

    return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));

    assert(pn != vm_page_fictitious_addr);

    if (!pmap_initialized)

    if (pn == vm_page_guard_addr)

    if (!managed_page(ppn_to_pai(pn)))
mapping_free_prime(void)

    pv_hashed_entry_t   pvh_e;
    pv_hashed_entry_t   pvh_eh;
    pv_hashed_entry_t   pvh_et;

    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
        pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

        pvh_e->qlink.next = (queue_entry_t)pvh_eh;

        if (pvh_et == PV_HASHED_ENTRY_NULL)

    PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);

    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
        pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

        pvh_e->qlink.next = (queue_entry_t)pvh_eh;

        if (pvh_et == PV_HASHED_ENTRY_NULL)

    PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
mapping_adjust(void)

    pv_hashed_entry_t   pvh_e;
    pv_hashed_entry_t   pvh_eh;
    pv_hashed_entry_t   pvh_et;

    if (mapping_adjust_call == NULL) {
        thread_call_setup(&mapping_adjust_call_data,
                          (thread_call_func_t) mapping_adjust,
                          (thread_call_param_t) NULL);
        mapping_adjust_call = &mapping_adjust_call_data;
    }

    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
        for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
            pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

            pvh_e->qlink.next = (queue_entry_t)pvh_eh;

            if (pvh_et == PV_HASHED_ENTRY_NULL)

        PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);

    pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
    if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
        for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
            pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);

            pvh_e->qlink.next = (queue_entry_t)pvh_eh;

            if (pvh_et == PV_HASHED_ENTRY_NULL)

        PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)

    pt_entry_t *opte, *npte;

    for (i = 0; i < cnt; i++) {

        opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
            panic("kernel_commpage");
        pte = *opte | INTEL_PTE_USER | INTEL_PTE_GLOBAL;
        pte &= ~INTEL_PTE_WRITE;                // ensure read only
        npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
            panic("user_commpage");
        pmap_store_pte(npte, pte);

        kernel_commpage += INTEL_PGBYTES;
        user_commpage += INTEL_PGBYTES;

#define PMAP_COMMPAGE64_CNT  (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];

pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)

    PMAP_LOCK(kernel_pmap);

    for (i = 0; i < cnt; i++) {
        kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i * PAGE_SIZE));
        if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
            panic("pmap_commpage64_init pte");
        pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);

    PMAP_UNLOCK(kernel_pmap);
static cpu_pmap_t cpu_pmap_master;

pmap_cpu_alloc(boolean_t is_boot_cpu)

    vm_offset_t         address;
    vm_map_address_t    mapaddr;
    vm_map_entry_t      entry;

        cp = &cpu_pmap_master;

        /*
         * The per-cpu pmap data structure itself.
         */
        ret = kmem_alloc(kernel_map,
                         (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
        if (ret != KERN_SUCCESS) {
            printf("pmap_cpu_alloc() failed ret=%d\n", ret);

        bzero((void *)cp, sizeof(cpu_pmap_t));

        /*
         * The temporary windows used for copy/zero - see loose_ends.c
         */
        ret = vm_map_find_space(kernel_map,
                                &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
        if (ret != KERN_SUCCESS) {
            printf("pmap_cpu_alloc() "
                   "vm_map_find_space ret=%d\n", ret);

        address = (vm_offset_t)mapaddr;

        for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {

            while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
                pmap_expand(kernel_pmap, (vm_map_offset_t)address);

            cp->mapwindow[i].prv_CADDR = (caddr_t) address;
            cp->mapwindow[i].prv_CMAP = pte;

        vm_map_unlock(kernel_map);

    cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
    cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
    cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;

pmap_cpu_free(struct cpu_pmap *cp)

    if (cp != NULL && cp != &cpu_pmap_master) {
        kfree((void *) cp, sizeof(cpu_pmap_t));
pmap_get_mapwindow(pt_entry_t pentry)

    assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

    /*
     * Note: 0th map reserved for pmap_pte()
     */
    for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
        mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];

        if (*mp->prv_CMAP == 0) {
            pmap_store_pte(mp->prv_CMAP, pentry);

            invlpg((uintptr_t)mp->prv_CADDR);

    panic("pmap_get_mapwindow: no windows available");

pmap_put_mapwindow(mapwindow_t *mp)

    pmap_store_pte(mp->prv_CMAP, 0);
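/*
 * Illustrative sketch (not part of the original source): the get/use/put
 * pattern for a per-cpu mapping window, as used by the copy/zero routines
 * referenced in pmap_cpu_alloc() (loose_ends.c).  Preemption must already
 * be disabled (see the assert in pmap_get_mapwindow()); the helper name
 * and the exact PTE bits chosen here are assumptions.
 */
#if 0   /* example only - not compiled */
static void
example_zero_phys_page(pmap_paddr_t pa)
{
    mapwindow_t *map;

    map = pmap_get_mapwindow((pt_entry_t)(pa_to_pte(pa) |
                                          INTEL_PTE_VALID | INTEL_PTE_WRITE));
    /* map->prv_CADDR is now a kernel virtual alias of the physical page */
    bzero(map->prv_CADDR, PAGE_SIZE);
    pmap_put_mapwindow(map);
}
#endif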
/*
 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
 * on a NBPDE boundary.
 */
uint64_t pmap_nesting_size_min = NBPDE;
uint64_t pmap_nesting_size_max = 0 - (uint64_t)NBPDE; /* no limit, really... */

/*
 *  kern_return_t pmap_nest(grand, subord, vstart, size)
 *
 *  grand  = the pmap that we will nest subord into
 *  subord = the pmap that goes into the grand
 *  vstart = start of range in pmap to be inserted
 *  nstart = start of range in the nested pmap
 *  size   = Size of nest area (up to 16TB)
 *
 *  Inserts a pmap into another.  This is used to implement shared segments.
 *
 *  On x86 this is very limited right now:  it must be exactly 1 segment.
 *
 *  Note that we depend upon higher level VM locks to ensure that things don't change while
 *  we are doing this.  For example, VM should not be doing any pmap enters while it is nesting
 *  or do 2 nests at once.
 */

kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t vstart, addr64_t nstart, uint64_t size) {

    vm_map_offset_t vaddr, nvaddr;
    pd_entry_t      *pde, *npde;

    // do validity tests
    if (size & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
    if (vstart & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
    if (nstart & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
    if ((size >> 28) > 65536)  return KERN_INVALID_VALUE;  /* Max size we can nest is 16TB */

        panic("pmap_nest: size is invalid - %016llX\n", size);

    PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
               (int) grand, (int) subord,
               (int) (vstart >> 32), (int) vstart, 0);
    subord->pm_shared = TRUE;
    nvaddr = (vm_map_offset_t)nstart;
    num_pde = size >> PDESHIFT;

    for (i = 0; i < num_pde; i++) {
        npde = pmap_pde(subord, nvaddr);
        while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
            PMAP_UNLOCK(subord);
            pmap_expand(subord, nvaddr);  // pmap_expand handles races

            npde = pmap_pde(subord, nvaddr);

    PMAP_UNLOCK(subord);

    vaddr = (vm_map_offset_t)vstart;

    for (i = 0; i < num_pde; i++) {

        npde = pmap_pde(subord, nstart);

            panic("pmap_nest: no npde, subord %p nstart 0x%llx", subord, nstart);

        pde = pmap_pde(grand, vaddr);
        /* Legacy mode does not require expansion.
         * DRK: consider a debug mode test to verify that no PTEs are extant
         * within this range.
         */
        if ((0 == pde) && cpu_64bit) {

            pmap_expand_pdpt(grand, vaddr);

            pde = pmap_pde(grand, vaddr);

            panic("pmap_nest: no pde, grand %p vaddr 0x%llx", grand, vaddr);

        pmap_store_pte(pde, tpde);

    /* XXX FBDP: why do we need to flush here ? */
    PMAP_UPDATE_TLBS(grand, vstart, vstart + size - 1);

    PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);

    return KERN_SUCCESS;
/*
 *  kern_return_t pmap_unnest(grand, vaddr)
 *
 *  grand = the pmap from which the nested subordinate range is removed
 *  vaddr = start of range in pmap to be unnested
 *
 *  Removes a pmap from another.  This is used to implement shared segments.
 *  On the current PPC processors, this is limited to segment (256MB) aligned
 *  segment sized ranges.
 */

kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {

    unsigned int num_pde;
    addr64_t vstart, vend;

    PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
               (int) (vaddr >> 32), (int) vaddr, 0, 0);

    if ((size & (pmap_nesting_size_min-1)) ||
        (vaddr & (pmap_nesting_size_min-1))) {
        panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...\n",
              grand, vaddr, size);

    /* align everything to PDE boundaries */
    vstart = vaddr & ~(NBPDE-1);
    vend = (vaddr + size + NBPDE - 1) & ~(NBPDE-1);
    size = vend - vstart;

    // invalidate all pdes for segment at vaddr in pmap grand

    num_pde = size >> PDESHIFT;

    for (i = 0; i < num_pde; i++, pde++) {
        pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
        if (pde == 0) panic("pmap_unnest: no pde, grand %p vaddr 0x%llx\n", grand, vaddr);
        pmap_store_pte(pde, (pd_entry_t)0);

    PMAP_UPDATE_TLBS(grand, vstart, vend);

    PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);

    return KERN_SUCCESS;
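/*
 * Illustrative sketch (not part of the original source): nesting a shared
 * region pmap into a task pmap and later removing it, observing the
 * alignment rules checked above (NBPDE-aligned start and size).  The pmaps
 * and the nest_base/nest_size values are hypothetical.
 */
#if 0   /* example only - not compiled */
    kern_return_t kr;

    kr = pmap_nest(task_pmap, shared_region_pmap,
                   nest_base, nest_base, nest_size);  /* both NBPDE aligned */
    if (kr != KERN_SUCCESS)
        panic("example: pmap_nest failed %d", kr);

    /* ... later, tear the nesting down ... */
    kr = pmap_unnest(task_pmap, nest_base, nest_size);
#endif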
pmap_switch(pmap_t tpmap)

    s = splhigh();      /* Make sure interruptions are disabled */
    my_cpu = cpu_number();

    set_dirbase(tpmap, my_cpu);

/*
 * disable no-execute capability on
 * the specified pmap
 */
void pmap_disable_NX(pmap_t pmap) {

    pmap->nx_enabled = 0;

pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
                  vm_size_t *alloc_size, int *collectable, int *exhaustable)

    *count      = inuse_ptepages_count;
    *cur_size   = PAGE_SIZE * inuse_ptepages_count;
    *max_size   = PAGE_SIZE * (inuse_ptepages_count +
                               vm_page_inactive_count +
                               vm_page_active_count +
                               vm_page_free_count);
    *elem_size  = PAGE_SIZE;
    *alloc_size = PAGE_SIZE;
vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)

    enum high_fixed_addresses a;
    a = e + HIGH_CPU_END * cpu;
    return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);

vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)

    return pmap_cpu_high_map_vaddr(cpu_number(), e);

vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)

    enum high_fixed_addresses a;

    a = e + HIGH_CPU_END * cpu_number();
    vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
    pmap_store_pte(pte_unique_base + a, pte);

    /* TLB flush for this page for this cpu */
    invlpg((uintptr_t)vaddr);

pmap_cpuset_NMIPI(cpu_set cpu_mask) {

    unsigned int cpu, cpu_bit;

    for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
        if (cpu_mask & cpu_bit)
            cpu_NMI_interrupt(cpu);
    }
    deadline = mach_absolute_time() + (LockTimeOut >> 2);
    while (mach_absolute_time() < deadline)
/*
 * Called with pmap locked, we:
 *  - scan through per-cpu data to see which other cpus need to flush
 *  - send an IPI to each non-idle cpu to be flushed
 *  - wait for all to signal back that they are inactive or we see that
 *    they are in an interrupt handler or at a safe point
 *  - flush the local tlb if active for this pmap
 *  - return ... the caller will unlock the pmap
 */

pmap_flush_tlbs(pmap_t pmap)

    unsigned int    cpu_bit;
    cpu_set         cpus_to_signal;
    unsigned int    my_cpu = cpu_number();
    pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
    boolean_t       flush_self = FALSE;

    assert((processor_avail_count < 2) ||
           (ml_get_interrupts_enabled() && get_preemption_level() != 0));

    /*
     * Scan other cpus for matching active or task CR3.
     * For idle cpus (with no active map) we mark them invalid but
     * don't signal -- they'll check as they go busy.
     * Note: for the kernel pmap we look for 64-bit shared address maps.
     */
    for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
        if (!cpu_datap(cpu)->cpu_running)

        if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
            (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
            (pmap->pm_shared) ||
            ((pmap == kernel_pmap) &&
             (!CPU_CR3_IS_ACTIVE(cpu) ||
              cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
            if (cpu == my_cpu) {

            cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
            __asm__ volatile("mfence");

            if (CPU_CR3_IS_ACTIVE(cpu)) {
                cpus_to_signal |= cpu_bit;
                i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);

    PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
               (int) pmap, cpus_to_signal, flush_self, 0, 0);

    if (cpus_to_signal) {
        cpu_set cpus_to_respond = cpus_to_signal;

        deadline = mach_absolute_time() + LockTimeOut;
        /*
         * Wait for those other cpus to acknowledge
         */
        while (cpus_to_respond != 0) {
            if (mach_absolute_time() > deadline) {
                pmap_tlb_flush_timeout = TRUE;
                pmap_cpuset_NMIPI(cpus_to_respond);
                panic("pmap_flush_tlbs() timeout: "
                      "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
                      pmap, cpus_to_respond);

            for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
                if ((cpus_to_respond & cpu_bit) != 0) {
                    if (!cpu_datap(cpu)->cpu_running ||
                        cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
                        !CPU_CR3_IS_ACTIVE(cpu)) {
                        cpus_to_respond &= ~cpu_bit;

                if (cpus_to_respond == 0)

    /*
     * Flush local tlb if required.
     * We need this flush even if the pmap being changed
     * is the user map... in case we do a copyin/out
     * before returning to user mode.
     */

    PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
               (int) pmap, cpus_to_signal, flush_self, 0, 0);
process_pmap_updates(void)

    assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

    current_cpu_datap()->cpu_tlb_invalid = FALSE;
    __asm__ volatile("mfence");

pmap_update_interrupt(void)

    PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,

    process_pmap_updates();

    PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
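/*
 * Illustrative sketch (not part of the original source): the two halves of
 * the TLB shootdown protocol implemented above.  The initiator marks each
 * target cpu's cpu_tlb_invalid and sends MP_TLB_FLUSH; the target runs
 * pmap_update_interrupt() -> process_pmap_updates(), which clears the flag,
 * and that is what the initiator's wait loop polls for.
 */
#if 0   /* example only - not compiled */
    /* initiator, with the pmap locked: */
    pmap_flush_tlbs(pmap);

    /* each signalled cpu, from its MP_TLB_FLUSH interrupt path: */
    pmap_update_interrupt();
#endif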
unsigned int pmap_cache_attributes(ppnum_t pn) {

    if (!managed_page(ppn_to_pai(pn)))
        return (VM_WIMG_IO);

    return (VM_WIMG_COPYBACK);

    kprintf("pmap 0x%x\n", p);

    kprintf("  pm_cr3 0x%llx\n", p->pm_cr3);
    kprintf("  pm_pml4 0x%x\n", p->pm_pml4);
    kprintf("  pm_pdpt 0x%x\n", p->pm_pdpt);

    kprintf("  pml4[0] 0x%llx\n", *p->pm_pml4);

        kprintf("  pdpt[%d] 0x%llx\n", i, p->pm_pdpt[i]);

void pmap_dump_wrap(void)

    pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
dump_4GB_pdpt(pmap_t p)

    pdpt_entry_t    *user_pdptp;
    pdpt_entry_t    *kern_pdptp;
    pdpt_entry_t    *pml4p;

    while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
        pmap_expand_pml4(p, 0x0);

    kern_pdptp = kernel_pmap->pm_pdpt;
    if (kern_pdptp == NULL)
        panic("kern_pdptp == NULL");
    kprintf("dump_4GB_pdpt(%p)\n"
            "kern_pdptp=%p (phys=0x%016llx)\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "user_pdptp=%p (phys=0x%016llx)\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n",
            p, kern_pdptp, kvtophys(kern_pdptp),
            kern_pdptp+0, *(kern_pdptp+0),
            kern_pdptp+1, *(kern_pdptp+1),
            kern_pdptp+2, *(kern_pdptp+2),
            kern_pdptp+3, *(kern_pdptp+3),
            kern_pdptp+4, *(kern_pdptp+4),
            user_pdptp, kvtophys(user_pdptp),
            user_pdptp+0, *(user_pdptp+0),
            user_pdptp+1, *(user_pdptp+1),
            user_pdptp+2, *(user_pdptp+2),
            user_pdptp+3, *(user_pdptp+3),
            user_pdptp+4, *(user_pdptp+4));
    kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
            p->pm_cr3, p->pm_hold, p->pm_pml4);
    pml4p = (pdpt_entry_t *)p->pm_hold;
        panic("user pml4p == NULL");
    kprintf("\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n",
            pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
    kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
            kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
    pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
        panic("kern pml4p == NULL");
    kprintf("\t 0x%08x: 0x%016llx\n"
            "\t 0x%08x: 0x%016llx\n",
            pml4p+511, *(pml4p+511));

void dump_4GB_pdpt_thread(thread_t tp)

    dump_4GB_pdpt(tp->map->pmap);