/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#ifndef _I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_

#ifdef MACH_KERNEL_PRIVATE

#include <sys/kdebug.h>
#include <kern/ledger.h>
#define PMAP_LOCK(pmap) {		\
    simple_lock(&(pmap)->lock);		\
}

#define PMAP_UNLOCK(pmap) {		\
    simple_unlock(&(pmap)->lock);	\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)	\
    pmap_flush_tlbs(pmap, s, e, 0, NULL)

#define PMAP_DELAY_TLB_FLUSH		0x01

#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c)	\
    pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)
#define iswired(pte)	((pte) & INTEL_PTE_WIRED)
#ifdef PMAP_TRACES
extern boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)					\
    if (pmap_trace) {						\
        KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);			\
    }
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)	\
    KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);
kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);
void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(ppnum_t pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

extern const boolean_t	cpu_64bit;
/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B. with the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries if navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */
/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry_t	next;
    pmap_t		pmap;
    vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain from keeping
freed pv's around for reuse rather than suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings, it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that case,
we basically do this:

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				remove it
				break
			}
		}
	}
The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all terminated.  The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, likely taking a cache miss for each node,
and interrupt disabling for MP issues, the system became completely unresponsive
for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since the vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes:

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;

typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/
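/*
 * Illustrative sketch (not part of the original header): how a lookup of a
 * specific [pmap, vaddr] mapping for a given ppn proceeds under the combined
 * rooted/hashed scheme described above.  The structure of this sketch follows
 * the open-coded lookups in pmap_pv_remove() and friends further below, which
 * must run with the pv_head_table entry and the relevant hash bucket locked.
 *
 *	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *
 *	if (pv_h->pmap == pmap && pv_h->va == vaddr)
 *		return pv_h;				// hit in the rooted entry
 *
 *	// Otherwise hash [pmap, vaddr] and walk the singly linked chain,
 *	// matching on pmap, va and ppn.
 *	pv_hashed_entry_t pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *	while (pvh_e != PV_HASHED_ENTRY_NULL) {
 *		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
 *			return pvh_e;			// hit in the hashed chain
 *		pvh_e = pvh_e->nexth;
 *	}
 *	return NULL;					// no such mapping
 */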
typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH(x)
#endif

#define NPVHASH 4095   /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200
extern volatile uint32_t	mappingrecurse;
extern uint32_t	pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry
						 * structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */

#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;		/* array of entries, one per page */

extern event_t mapping_replenish_event;
static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) &&
	    ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}
static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}
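/*
 * Usage sketch (illustrative, not part of the original header): a caller in
 * pmap_enter()-style code would typically try the free list first and fall
 * back to allocating from pv_hashed_list_zone when PV_HASHED_ALLOC() leaves
 * the entry NULL.  The zalloc fallback shown here is an assumption about the
 * caller, not code from this file; kernel-pmap paths use
 * PV_HASHED_KERN_ALLOC() instead, since they cannot block.
 *
 *	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;
 *
 *	PV_HASHED_ALLOC(&pvh_e);
 *	if (pvh_e == PV_HASHED_ENTRY_NULL)
 *		pvh_e = (pv_hashed_entry_t)zalloc(pv_hashed_list_zone);
 *	...
 *	// To release a singly linked chain of pv_cnt entries running from
 *	// pvh_eh (head) to pvh_et (tail) back onto the free list:
 *	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
 */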
extern unsigned pmap_kern_reserve_alloc_stat;
static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}
static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}
extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;
static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))
/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define	PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define	PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define	PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define	PHYS_REUSABLE	INTEL_PTE_WRITE	/* page is "reusable" */
extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE	(0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT	(PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER	(PMAP_OPTIONS_NOENTER)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))
/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system-wide pmap lock has been removed.  Now, paths take a lock
 *	on the pmap before changing its 'shape', and the reverse-order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked,
 *	and can then run up/down the pv lists holding the list lock.  This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	previously.
 */
#define LOCK_PVH(index) {		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
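/*
 * Illustrative sketch of the two lock orders described above (not part of the
 * original header).  The pmap-first order is the common case; the ppn-first
 * order must retest the PTEs it finds after taking the pv lock, because the
 * pmap may have changed shape in the window.  Names such as pn and va are
 * placeholders.
 *
 *	// pmap-first (pmap_enter/pmap_remove style):
 *	PMAP_LOCK(pmap);
 *	LOCK_PVH(ppn_to_pai(pn));
 *	... manipulate the pv list and PTE ...
 *	UNLOCK_PVH(ppn_to_pai(pn));
 *	PMAP_UNLOCK(pmap);
 *
 *	// ppn-first (pmap_page_protect style):
 *	LOCK_PVH(ppn_to_pai(pn));
 *	... walk the pv list, retesting each PTE before acting on it ...
 *	UNLOCK_PVH(ppn_to_pai(pn));
 */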
extern uint64_t pde_mapped_size;

extern char	*pmap_phys_attributes;
extern ppnum_t	last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;
/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {						\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())	\
		panic("pmap interrupt assert %s, %d", __FILE__, __LINE__); \
}
#else
#define pmap_intr_assert()
#endif
extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked; does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}
static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}
static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int	pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}
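/*
 * Worked example (illustrative, not part of the original header): popcnt1(x)
 * is TRUE when x has at most one bit set, so popcnt1(a ^ b) is TRUE when a
 * and b differ in at most one bit, i.e. their Hamming distance is 0 or 1.
 * The corruption classifier below uses this to recognize likely single-bit
 * flips in pv entries and page frame numbers:
 *
 *	popcnt1(0x1000 ^ 0x1400)	// TRUE:  one differing bit (a flip)
 *	popcnt1(0x1000 ^ 0x1401)	// FALSE: two differing bits
 *	popcnt1(0x1000 ^ 0x1000)	// TRUE:  identical values
 */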
/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field. These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs etc.), or due
 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are recorded in the pmap_pagetable_corruption_records array.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */
typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;
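/*
 * Illustrative sketch of how a caller dispatches on the action returned by
 * pmap_classify_pagetable_corruption() (not part of the original header; the
 * real dispatch is in pmap_pv_remove() below, which also re-takes the
 * pv_head_table lock for PMAP_ACTION_RETRY_RELOCK):
 *
 *	pmap_pagetable_corruption_action_t pac =
 *	    pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
 *
 *	switch (pac) {
 *	case PMAP_ACTION_IGNORE:	// suppress; skip the operation
 *	case PMAP_ACTION_RETRY:		// entry was repaired in place; retry
 *	case PMAP_ACTION_RETRY_RELOCK:	// retry after re-taking LOCK_PVH(*ppnp)
 *	case PMAP_ACTION_ASSERT:	// unrecoverable; panic
 *		break;
 *	}
 */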
#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t		incident;
	pmap_pagetable_corruption_t	reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t				pmap;
	vm_map_offset_t			vaddr;
	pt_entry_t			pte;
	ppnum_t				ppn;
	pmap_t				pvpmap;
	vm_map_offset_t			pvva;
	uint64_t			abstime;
} pmap_pagetable_corruption_record_t;
extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;
static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}
static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t		cpte = *ptep;
	ppnum_t			cpn = pa_index(pte_to_pa(cpte));
	ppnum_t			ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t		bitdex;
	pmap_t			pvpmap = pv_h->pmap;
	vm_map_offset_t		pvva = pv_h->va;
	boolean_t		ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}
/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			      pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}
extern int	pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}
static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}
static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}
extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

#define HIGH_MEM_BASE	((uint32_t)( -NBPDE) )	/* shared gdt etc seg addr */ /* XXX64 ?? */
int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);

void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);
//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)	\
	do {				\
		kprintf(fmt, ##args);	\
		printf(fmt, ##args);	\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);
/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq	%2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}
extern uint32_t pmap_update_clear_pte_count;

static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	} while (!pmap_cmpx_pte(mptep, opte, npte));
}
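/*
 * Usage sketch (illustrative, not part of the original header): atomically
 * clear the write bit and set the wired bit on a mapping's PTE.  The retry
 * loop above reloads the PTE and recomputes the new value whenever the
 * compare-exchange loses a race with a concurrent update; a zero (cleared)
 * PTE is left alone.  The ptep/vaddr names here are placeholders, and
 * pmap_pte() is defined further below.
 *
 *	pt_entry_t *ptep = pmap_pte(pmap, vaddr);
 *	if (ptep != NULL)
 *		pmap_update_pte(ptep, INTEL_PTE_WRITE, INTEL_PTE_WIRED);
 *
 * A TLB shootdown (PMAP_UPDATE_TLBS) remains the caller's responsibility
 * when permissions are reduced.
 */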
#if defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
			 (vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	DEBUG
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}
/*
 * Returns the address of the requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}
/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	pde = pmap64_pde(m, v);

	return pde;
}
/*
 * Return the address of the mapped pte for vaddr in pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	pde = pmap64_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
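/*
 * Usage sketch (illustrative, not part of the original header): translating a
 * virtual address to its physical page number by walking to the PTE.  The
 * caller is assumed to hold the pmap lock; names here are placeholders.
 *
 *	pt_entry_t *ptep = pmap_pte(pmap, vaddr);
 *	if (ptep != NULL && (*ptep & INTEL_PTE_VALID)) {
 *		ppnum_t pn = (ppnum_t)pa_index(pte_to_pa(*ptep));
 *		...
 *	}
 *
 * Note that when the PDE maps a superpage, pmap_pte() returns the PDE itself
 * (see above), so callers must account for the larger page size.
 */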
#endif /* __x86_64__ */

#if	DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */