/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#ifndef	_I386_PMAP_INTERNAL_
#define	_I386_PMAP_INTERNAL_
#ifdef	MACH_KERNEL_PRIVATE

#include <sys/kdebug.h>
#include <kern/ledger.h>
#include <kern/simple_lock.h>
#include <i386/bit_routines.h>
#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)			\
	pmap_flush_tlbs(pmap, s, e, 0, NULL)

#define	PMAP_DELAY_TLB_FLUSH		0x01

#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c)		\
	pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)

#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)
#ifdef	PMAP_TRACES
extern	boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)						\
	if (pmap_trace) {						\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);			\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)				\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);
kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(
			ppnum_t		pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

extern const boolean_t	cpu_64bit;
/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B. with the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries if navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */
/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings. These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
	struct pv_entry_t	next;
	pmap_t			pmap;
	vm_map_offset_t		va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number. Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings. Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic. Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer). These pve's are accessed
for several operations, one of them being address space teardown. In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say 8000 (or even 2000) apache or
other processes and one or all terminate. The list hanging off each pv array
entry could have thousands of entries. We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down. Because of the locks we hold, likely taking a cache miss for each node,
and interrupt disabling for MP issues the system became completely unresponsive
for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures. A "rooted" structure which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster. Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory. The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for. This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
structures have the same first three fields allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;

typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries. Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries. Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match. The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/
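/*
 * Illustrative sketch of the two lookup paths described above (not part of
 * the original header; pseudocode only, in the style of the teardown loop
 * shown earlier):
 *
 *	lookup(pmap, vaddr, ppn):
 *		pv_h = pai_to_pvh(ppn)
 *		if (pv_h->pmap == pmap && PVE_VA(pv_h) == vaddr)
 *			return pv_h		// rooted entry matches, no hashing needed
 *		for (pvh_e = *pvhash(pvhashidx(pmap, vaddr)); pvh_e != NULL; pvh_e = pvh_e->nexth)
 *			if (pvh_e->pmap == pmap && PVE_VA(pvh_e) == vaddr && pvh_e->ppn == ppn)
 *				return pvh_e	// found on the (short) hash chain
 *		return NULL
 */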
typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va_and_flags;	/* virtual address for mapping */
	pmap_t			pmap;		/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va_and_flags;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

#define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK)
#define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK)
#define PVE_IS_ALTACCT 0x001
#define PVE_IS_ALTACCT_PAGE(pve) \
	(((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE)
//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if(0 == npvhashmask) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH(x)
#endif

#define NPVHASHBUCKETS (4096)
#define NPVHASHMASK ((NPVHASHBUCKETS) - 1)	/* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT	5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT	2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL		2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL	200
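/*
 * Illustrative compile-time check (not part of the original header): the
 * NPVHASHMASK definition above only acts as a modulus if NPVHASHBUCKETS is a
 * power of two, i.e. (hash & NPVHASHMASK) == (hash % NPVHASHBUCKETS).
 */
_Static_assert((NPVHASHBUCKETS & (NPVHASHBUCKETS - 1)) == 0,
    "NPVHASHBUCKETS must be a power of two");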
extern volatile uint32_t	mappingrecurse;
extern uint32_t		pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhashmask;
extern pv_hashed_entry_t	*pv_hash_table;	/* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)
decl_simple_lock_data(extern, phys_backup_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */

#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
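/*
 * Worked example (illustrative, not part of the original header): with
 * 0x100000 managed 4K pages (4 GB of physical memory) and BYTE_SIZE == 8,
 * pv_lock_table_size(0x100000) == 0x20000 bytes, i.e. 128 KB of lock bits.
 */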
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;		/* array of entries, one per page */

extern event_t mapping_replenish_event;
static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}
static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}
extern unsigned pmap_kern_reserve_alloc_stat;

static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}
static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;
static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))
#define IS_ALTACCT_PAGE(x,pve)				\
	(IS_MANAGED_PAGE((x)) &&			\
	 (PVE_IS_ALTACCT_PAGE((pve))))
/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define	PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define	PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define	PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define	PHYS_REUSABLE	INTEL_PTE_WRITE	/* page is "reusable" */

extern boolean_t	pmap_disable_kheap_nx;
extern boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE	(0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT	(PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER	(PMAP_OPTIONS_NOENTER)
/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))
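/*
 * Worked example (illustrative, not part of the original header): with 4K
 * pages and 512 PTEs per page table, one page-directory entry maps
 * 512 * 4 KB = 2 MB, so PDE_MAPPED_SIZE == pdetova(1) == 0x200000.
 */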
/*
 * Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system wide pmap lock has been removed. Now, paths take a lock
 *	on the pmap before changing its 'shape' and the reverse order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked
 *	and can then run up/down the pv lists holding the list lock. This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	previously.
 */
#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
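/*
 * Illustrative sketch (not part of the original header) of the reverse-order
 * ("by physical page") pattern described in the Locking Protocols comment
 * above: take the per-page PV lock first, then retest the rooted entry
 * before walking the list, since the mapping may have changed before the
 * lock was acquired. The function name is hypothetical.
 */
static inline boolean_t
pmap_pv_revalidate_sketch(int pai, pmap_t expected_pmap)
{
	pv_rooted_entry_t pv_h;

	LOCK_PVH(pai);				/* disables preemption, takes the per-page lock bit */
	pv_h = pai_to_pvh(pai);
	if (pv_h->pmap != expected_pmap) {	/* retest after acquiring the lock */
		UNLOCK_PVH(pai);
		return FALSE;
	}
	/* ... safe to run up/down the pv list here ... */
	UNLOCK_PVH(pai);
	return TRUE;
}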
extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;
/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
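/*
 * Illustrative sketch (not part of the original header) of how the budget
 * above is typically consumed: compare elapsed TSC ticks against
 * max_preemption_latency_tsc and, once exceeded, the caller drops its locks
 * and briefly re-enables preemption before continuing. Assumes rdtsc64()
 * from <i386/proc_reg.h> is visible here; the function name is hypothetical.
 */
static inline boolean_t
pmap_preemption_budget_exceeded_sketch(uint64_t start_tsc)
{
	return ((rdtsc64() - start_tsc) > max_preemption_latency_tsc) ? TRUE : FALSE;
}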
/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
		npvhashmask;
	return hashidx;
}
/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh));

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
}
static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}
static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int	pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}
/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field. These can be either software induced (wild
 * stores to the mapwindows where applicable, use after free errors
 * (typically of pages addressed physically), mis-directed DMAs etc., or due
 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are recorded in the log below.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */
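/*
 * Illustrative sketch (not part of the original header): the single-bit
 * error test used by the recovery heuristics below. Two values at Hamming
 * distance 1 XOR to a power of two, which popcnt1() above reports as TRUE.
 * The function name is hypothetical.
 */
static inline boolean_t
pmap_is_single_bitflip_sketch(uint64_t observed, uint64_t expected)
{
	/* XOR isolates the differing bits; popcnt1() accepts exactly one set bit (or zero) */
	return (observed != expected) && popcnt1(observed ^ expected);
}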
typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;
#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)	/* rate-limit window: 6 hours, in seconds */
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t		incident;
	pmap_pagetable_corruption_t	reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t				pmap;
	vm_map_offset_t			vaddr;
	pt_entry_t			pte;
	ppnum_t				ppn;
	pmap_t				pvpmap;
	vm_map_offset_t			pvva;
	uint64_t			abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;
static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}
static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t		cpte = *ptep;
	ppnum_t			cpn = pa_index(pte_to_pa(cpte));
	ppnum_t			ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	unsigned		bitdex;
	pmap_t			pvpmap = pv_h->pmap;
	vm_map_offset_t		pvva = PVE_VA(pv_h);
	vm_map_offset_t		pve_flags;
	boolean_t		ppcd = FALSE;
	boolean_t		is_ept;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
	is_ept = is_ept_pmap(pmap);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) {
			pve_flags = PVE_FLAGS(pv_e);
			pv_e->pmap = pmap;
			pv_h->va_and_flags = vaddr | pve_flags;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/*
	 * Check for malformed/inconsistent entries.
	 * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0
	 */
	if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}
/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte,
	       boolean_t	*was_altacct)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

	*was_altacct = FALSE;
pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
		*was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h);
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va_and_flags = pvh_e->va_and_flags;
			/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			      pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    PVE_VA(pvh_e) == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h));
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		*was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e);

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}
static inline __attribute__((always_inline)) boolean_t
pmap_pv_is_altacct(
	pmap_t		pmap,
	vm_map_offset_t	vaddr,
	ppnum_t		ppn)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	int			pvhash_idx;
	boolean_t		is_altacct;

	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		return FALSE;
	}

	if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 */
		return IS_ALTACCT_PAGE(ppn, pv_h);
	}

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pmap, vaddr);
	LOCK_PV_HASH(pvhash_idx);
	pvh_e = *(pvhash(pvhash_idx));
	if (PV_HASHED_ENTRY_NULL == pvh_e) {
		panic("Possible memory corruption: pmap_pv_is_altacct(%p,0x%llx,0x%x): empty hash",
		      pmap, vaddr, ppn);
	}
	while (PV_HASHED_ENTRY_NULL != pvh_e) {
		if (pvh_e->pmap == pmap &&
		    PVE_VA(pvh_e) == vaddr &&
		    pvh_e->ppn == ppn)
			break;
		pvh_e = pvh_e->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == pvh_e) {
		is_altacct = FALSE;
	} else {
		is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e);
	}
	UNLOCK_PV_HASH(pvhash_idx);

	return is_altacct;
}
extern int	pt_fake_zone_index;

static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}
extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */

int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);
//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)					\
	do {								\
		kprintf(fmt, ##args);					\
		printf(fmt, ##args);					\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);
/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t	ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}
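/*
 * Illustrative sketch (not part of the original header): the same PTE
 * compare-and-exchange expressed with the GCC/clang __sync builtin, to
 * clarify the semantics of the inline assembly above. The function name is
 * hypothetical.
 */
static inline boolean_t
pmap_cmpx_pte_builtin_sketch(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	/* atomically: if (*entryp == old) { *entryp = new; return TRUE; } else return FALSE; */
	return __sync_bool_compare_and_swap(entryp, old, new) ? TRUE : FALSE;
}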
extern uint32_t pmap_update_clear_pte_count;
static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	} while (!pmap_cmpx_pte(mptep, opte, npte));
}
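/*
 * Usage sketch (illustrative, not part of the original header): atomically
 * drop write permission and set the no-execute bit on a PTE in a single
 * update. Assumes INTEL_PTE_NX from the PTE definitions is available here;
 * the function name is hypothetical.
 */
static inline void
pmap_pte_make_readonly_nx_sketch(pt_entry_t *ptep)
{
	pmap_update_pte(ptep, INTEL_PTE_WRITE, INTEL_PTE_NX);
}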
#if	defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap creation time and exists
 * for the duration of the pmap. We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
		(vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	DEBUG
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}
/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;
	boolean_t	is_ept;

	pml4 = pmap64_pml4(pmap, vaddr);
	is_ept = is_ept_pmap(pmap);

	if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}

/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;
	boolean_t	is_ept;

	pdpt = pmap64_pdpt(pmap, vaddr);
	is_ept = is_ept_pmap(pmap);

	if (pdpt && (*pdpt & PTE_VALID_MASK(is_ept))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	pde = pmap64_pde(m, v);

	return pde;
}
/*
 * return address of mapped pte for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;
	boolean_t	is_ept;

	assert(pmap);
	pde = pmap64_pde(pmap, vaddr);

	is_ept = is_ept_pmap(pmap);

	if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
		if (*pde & PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
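/*
 * Illustrative sketch (not part of the original header): translate a virtual
 * address to a physical page number by walking the hierarchy with pmap_pte()
 * above. Assumes the caller holds the pmap lock; the function name is
 * hypothetical.
 */
static inline ppnum_t
pmap_vtophys_sketch(pmap_t pmap, vm_map_offset_t vaddr)
{
	pt_entry_t *ptep = pmap_pte(pmap, vaddr);

	if (ptep == NULL || (*ptep & PTE_VALID_MASK(is_ept_pmap(pmap))) == 0)
		return 0;
	return (ppnum_t) pa_index(pte_to_pa(*ptep));
}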
#endif /* __x86_64__ */

#if	DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */