2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 #ifndef _I386_PMAP_INTERNAL_
31 #define _I386_PMAP_INTERNAL_
32 #ifdef MACH_KERNEL_PRIVATE
35 #include <sys/kdebug.h>
36 #include <kern/ledger.h>
37 #include <kern/simple_lock.h>
38 #include <i386/bit_routines.h>
44 #define PMAP_LOCK(pmap) { \
45 simple_lock(&(pmap)->lock); \
48 #define PMAP_UNLOCK(pmap) { \
49 simple_unlock(&(pmap)->lock); \
52 #define PMAP_UPDATE_TLBS(pmap, s, e) \
53 pmap_flush_tlbs(pmap, s, e, 0, NULL)
56 #define PMAP_DELAY_TLB_FLUSH 0x01
58 #define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \
59 pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)
62 #define iswired(pte) ((pte) & INTEL_PTE_WIRED)
65 extern boolean_t pmap_trace
;
66 #define PMAP_TRACE(...) \
68 KDBG_RELEASE(__VA_ARGS__); \
71 #define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__)
72 #endif /* PMAP_TRACES */
74 #define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__)
76 kern_return_t
pmap_expand_pml4(
79 unsigned int options
);
81 kern_return_t
pmap_expand_pdpt(
84 unsigned int options
);
86 void phys_attribute_set(
90 void pmap_set_reference(
93 boolean_t
phys_page_exists(
97 pmap_flush_tlbs(pmap_t
, vm_map_offset_t
, vm_map_offset_t
, int, pmap_flush_context
*);
100 pmap_update_cache_attributes_locked(ppnum_t
, unsigned);
102 extern const boolean_t cpu_64bit
;
105 * Private data structures.
109 * For each vm_page_t, there is a list of all currently
110 * valid virtual mappings of that page. An entry is
111 * a pv_rooted_entry_t; the list is the pv_table.
113 * N.B. with the new combo rooted/hashed scheme it is
114 * only possible to remove individual non-rooted entries
115 * if they are found via the hashed chains as there is no
116 * way to unlink the singly linked hashed entries if navigated to
117 * via the queue list off the rooted entries. Think of it as
118 * hash/walk/pull, keeping track of the prev pointer while walking
119 * the singly linked hash list. All of this is to save memory and
120 * keep both types of pv_entries as small as possible.
125 PV HASHING Changes - JK 1/2007
127 Pve's establish physical to virtual mappings. These are used for aliasing of a
128 physical page to (potentially many) virtual addresses within pmaps. In the
129 previous implementation the structure of the pv_entries (each 16 bytes in size) was
131 typedef struct pv_entry {
132 struct pv_entry_t next;
137 An initial array of these is created at boot time, one per physical page of
138 memory, indexed by the physical page number. Additionally, a pool of entries
139 is created from a pv_zone to be used as needed by pmap_enter() when it is
140 creating new mappings. Originally, we kept this pool around because the code
141 in pmap_enter() was unable to block if it needed an entry and none were
142 available - we'd panic. Some time ago I restructured the pmap_enter() code
143 so that for user pmaps it can block while zalloc'ing a pv structure and restart,
144 removing a panic from the code (in the case of the kernel pmap we cannot block
145 and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
146 The pool has not been removed since there is a large performance gain keeping
147 freed pv's around for reuse and not suffering the overhead of zalloc for every
150 As pmap_enter() created new mappings it linked the new pve's for them off the
151 fixed pv array for that ppn (off the next pointer). These pve's are accessed
152 for several operations, one of them being address space teardown. In that case,
155 for (every page/pte in the space) {
156 calc pve_ptr from the ppn in the pte
157 for (every pv in the list for the ppn) {
158 if (this pv is for this pmap/vaddr) {
165 The problem arose when we were running, say 8000 (or even 2000) apache or
166 other processes and one or all terminate. The list hanging off each pv array
167 entry could have thousands of entries. We were continuously linearly searching
168 each of these lists as we stepped through the address space we were tearing
169 down. Because of the locks we hold, likely taking a cache miss for each node,
170 and interrupt disabling for MP issues the system became completely unresponsive
171 for many seconds while we did this.
173 Realizing that pve's are accessed in two distinct ways (linearly running the
174 list by ppn for operations like pmap_page_protect and finding and
175 modifying/removing a single pve as part of pmap_enter processing) has led to
176 modifying the pve structures and databases.
178 There are now two types of pve structures. A "rooted" structure which is
179 basically the original structure accessed in an array by ppn, and a ''hashed''
180 structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
181 designed with the two goals of minimizing wired memory and making the lookup of
182 a ppn faster. Since a vast majority of pages in the system are not aliased
183 and hence represented by a single pv entry I've kept the rooted entry size as
184 small as possible because there is one of these dedicated for every physical
185 page of memory. The hashed pve's are larger due to the addition of the hash
186 link and the ppn entry needed for matching while running the hash list to find
187 the entry we are looking for. This way, only systems that have lots of
188 aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
189 structures have the same first three fields allowing some simplification in
192 They have these shapes
194 typedef struct pv_rooted_entry {
198 } *pv_rooted_entry_t;
201 typedef struct pv_hashed_entry {
206 struct pv_hashed_entry *nexth;
207 } *pv_hashed_entry_t;
209 The main flow difference is that the code is now aware of the rooted entry and
210 the hashed entries. Code that runs the pv list still starts with the rooted
211 entry and then continues down the qlink onto the hashed entries. Code that is
212 looking up a specific pv entry first checks the rooted entry and then hashes
213 and runs the hash list for the match. The hash list lengths are much smaller
214 than the original pv lists that contained all aliases for the specific ppn.
218 typedef struct pv_rooted_entry
{
219 /* first three entries must match pv_hashed_entry_t */
221 vm_map_offset_t va_and_flags
; /* virtual address for mapping */
222 pmap_t pmap
; /* pmap where mapping lies */
223 } *pv_rooted_entry_t
;
225 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
227 typedef struct pv_hashed_entry
{
228 /* first three entries must match pv_rooted_entry_t */
230 vm_map_offset_t va_and_flags
;
233 struct pv_hashed_entry
*nexth
;
234 } *pv_hashed_entry_t
;
236 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
238 #define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK)
239 #define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK)
240 #define PVE_IS_ALTACCT 0x001
241 #define PVE_IS_ALTACCT_PAGE(pve) \
242 (((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE)
244 //#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */
246 #define CHK_NPVHASH() if(0 == npvhashmask) panic("npvhash uninitialized");
248 #define CHK_NPVHASH(x)
251 #define NPVHASHBUCKETS (4096)
252 #define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */
253 #define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
254 #define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
255 #define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
256 #define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200
258 extern volatile uint32_t mappingrecurse
;
259 extern uint32_t pv_hashed_low_water_mark
, pv_hashed_kern_low_water_mark
;
265 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
266 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
267 extern uint32_t npvhashmask
;
268 extern pv_hashed_entry_t
*pv_hash_table
; /* hash lists */
269 extern pv_hashed_entry_t pv_hashed_free_list
;
270 extern pv_hashed_entry_t pv_hashed_kern_free_list
;
271 decl_simple_lock_data(extern, pv_hashed_free_list_lock
)
272 decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock
)
273 decl_simple_lock_data(extern, pv_hash_table_lock
)
274 decl_simple_lock_data(extern, phys_backup_lock
)
276 extern zone_t pv_hashed_list_zone
; /* zone of pv_hashed_entry
279 extern uint32_t pv_hashed_free_count
;
280 extern uint32_t pv_hashed_kern_free_count
;
282 * Each entry in the pv_head_table is locked by a bit in the
283 * pv_lock_table. The lock bits are accessed by the address of
284 * the frame they lock.
286 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
287 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
288 extern char *pv_lock_table
; /* pointer to array of bits */
289 extern char *pv_hash_lock_table
;
290 extern pv_rooted_entry_t pv_head_table
; /* array of entries, one per page */
292 extern event_t mapping_replenish_event
;
294 static inline void PV_HASHED_ALLOC(pv_hashed_entry_t
*pvh_ep
) {
295 pmap_assert(*pvh_ep
== PV_HASHED_ENTRY_NULL
);
296 simple_lock(&pv_hashed_free_list_lock
);
297 /* If the kernel reserved pool is low, let non-kernel mappings allocate
298 * synchronously, possibly subject to a throttle.
300 if ((pv_hashed_kern_free_count
> pv_hashed_kern_low_water_mark
) && ((*pvh_ep
= pv_hashed_free_list
) != 0)) {
301 pv_hashed_free_list
= (pv_hashed_entry_t
)(*pvh_ep
)->qlink
.next
;
302 pv_hashed_free_count
--;
305 simple_unlock(&pv_hashed_free_list_lock
);
307 if (pv_hashed_free_count
<= pv_hashed_low_water_mark
) {
308 if (!mappingrecurse
&& hw_compare_and_store(0,1, &mappingrecurse
))
309 thread_wakeup(&mapping_replenish_event
);
313 static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh
, pv_hashed_entry_t pvh_et
, int pv_cnt
) {
314 simple_lock(&pv_hashed_free_list_lock
);
315 pvh_et
->qlink
.next
= (queue_entry_t
)pv_hashed_free_list
;
316 pv_hashed_free_list
= pvh_eh
;
317 pv_hashed_free_count
+= pv_cnt
;
318 simple_unlock(&pv_hashed_free_list_lock
);
321 extern unsigned pmap_kern_reserve_alloc_stat
;
323 static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t
*pvh_e
) {
324 pmap_assert(*pvh_e
== PV_HASHED_ENTRY_NULL
);
325 simple_lock(&pv_hashed_kern_free_list_lock
);
327 if ((*pvh_e
= pv_hashed_kern_free_list
) != 0) {
328 pv_hashed_kern_free_list
= (pv_hashed_entry_t
)(*pvh_e
)->qlink
.next
;
329 pv_hashed_kern_free_count
--;
330 pmap_kern_reserve_alloc_stat
++;
333 simple_unlock(&pv_hashed_kern_free_list_lock
);
335 if (pv_hashed_kern_free_count
< pv_hashed_kern_low_water_mark
) {
336 if (!mappingrecurse
&& hw_compare_and_store(0,1, &mappingrecurse
))
337 thread_wakeup(&mapping_replenish_event
);
341 static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh
, pv_hashed_entry_t pvh_et
, int pv_cnt
) {
342 simple_lock(&pv_hashed_kern_free_list_lock
);
343 pvh_et
->qlink
.next
= (queue_entry_t
)pv_hashed_kern_free_list
;
344 pv_hashed_kern_free_list
= pvh_eh
;
345 pv_hashed_kern_free_count
+= pv_cnt
;
346 simple_unlock(&pv_hashed_kern_free_list_lock
);
349 extern uint64_t pmap_pv_throttle_stat
, pmap_pv_throttled_waiters
;
350 extern event_t pmap_user_pv_throttle_event
;
352 static inline void pmap_pv_throttle(__unused pmap_t p
) {
353 pmap_assert(p
!= kernel_pmap
);
354 /* Apply throttle on non-kernel mappings */
355 if (pv_hashed_kern_free_count
< (pv_hashed_kern_low_water_mark
/ 2)) {
356 pmap_pv_throttle_stat
++;
357 /* This doesn't need to be strictly accurate, merely a hint
358 * to eliminate the timeout when the reserve is replenished.
360 pmap_pv_throttled_waiters
++;
361 assert_wait_timeout(&pmap_user_pv_throttle_event
, THREAD_UNINT
, 1, 1000 * NSEC_PER_USEC
);
362 thread_block(THREAD_CONTINUE_NULL
);
367 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
/* Physical address -> physical page index. */
#define pa_index(pa)		(i386_btop(pa))
/*
 * Physical page number -> pv_head_table index.  The argument is fully
 * parenthesized so the cast applies to the whole expression rather than
 * only to its first operand (e.g. ppn_to_pai(x >> 4)).
 */
#define ppn_to_pai(ppn)		((int)(ppn))

/* Rooted pv entry for a page index, and the bit lock that guards it. */
#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
/* Hash bucket head for a bucket index, and the bit lock that guards it. */
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)
380 #define IS_MANAGED_PAGE(x) \
381 ((unsigned int)(x) <= last_managed_page && \
382 (pmap_phys_attributes[x] & PHYS_MANAGED))
383 #define IS_INTERNAL_PAGE(x) \
384 (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
385 #define IS_REUSABLE_PAGE(x) \
386 (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))
387 #define IS_ALTACCT_PAGE(x,pve) \
388 (IS_MANAGED_PAGE((x)) && \
389 (PVE_IS_ALTACCT_PAGE((pve))))
392 * Physical page attributes. Copy bits from PTE definition.
394 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
395 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
396 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
397 #define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */
398 #define PHYS_NCACHE INTEL_PTE_NCACHE
399 #define PHYS_PTA INTEL_PTE_PTA
400 #define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
401 #define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */
402 #define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */
404 extern boolean_t pmap_disable_kheap_nx
;
405 extern boolean_t pmap_disable_kstack_nx
;
407 #define PMAP_EXPAND_OPTIONS_NONE (0x0)
408 #define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
409 #define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)
410 #define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U)
412 * Amount of virtual memory mapped by one
413 * page-directory entry.
415 #define PDE_MAPPED_SIZE (pdetova(1))
419 * Locking and TLB invalidation
423 * Locking Protocols: (changed 2/2007 JK)
425 * There are two structures in the pmap module that need locking:
426 * the pmaps themselves, and the per-page pv_lists (which are locked
427 * by locking the pv_lock_table entry that corresponds to the pv_head
428 * for the list in question.) Most routines want to lock a pmap and
429 * then do operations in it that require pv_list locking -- however
430 * pmap_remove_all and pmap_copy_on_write operate on a physical page
431 * basis and want to do the locking in the reverse order, i.e. lock
432 * a pv_list and then go through all the pmaps referenced by that list.
434 * The system wide pmap lock has been removed. Now, paths take a lock
435 * on the pmap before changing its 'shape' and the reverse order lockers
436 * (coming in by phys ppn) take a lock on the corresponding pv and then
437 * retest to be sure nothing changed during the window before they locked
438 * and can then run up/down the pv lists holding the list lock. This also
439 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
447 #define LOCK_PVH(index) { \
448 mp_disable_preemption(); \
449 lock_pvh_pai(index); \
452 #define UNLOCK_PVH(index) { \
453 unlock_pvh_pai(index); \
454 mp_enable_preemption(); \
457 extern uint64_t pde_mapped_size
;
459 extern char *pmap_phys_attributes
;
460 extern ppnum_t last_managed_page
;
462 extern ppnum_t lowest_lo
;
463 extern ppnum_t lowest_hi
;
464 extern ppnum_t highest_hi
;
467 * when spinning through pmap_remove
468 * ensure that we don't spend too much
469 * time with preemption disabled.
470 * I'm setting the current threshold
473 #define MAX_PREEMPTION_LATENCY_NS 20000
474 extern uint64_t max_preemption_latency_tsc
;
477 #define PMAP_INTR_DEBUG (1)
481 #define pmap_intr_assert() { \
482 if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
483 panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \
486 #define pmap_intr_assert()
489 extern int nx_enabled
;
490 extern unsigned int inuse_ptepages_count
;
492 static inline uint32_t
493 pvhashidx(pmap_t pmap
, vm_map_offset_t va
)
495 uint32_t hashidx
= ((uint32_t)(uintptr_t)pmap
^
496 ((uint32_t)(va
>> PAGE_SHIFT
) & 0xFFFFFFFF)) &
502 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
503 * properly deals with the anchor.
504 * must be called with the hash locked, does not unlock it
507 pmap_pvh_unlink(pv_hashed_entry_t pvh
)
509 pv_hashed_entry_t curh
;
510 pv_hashed_entry_t
*pprevh
;
514 pvhash_idx
= pvhashidx(pvh
->pmap
, PVE_VA(pvh
));
516 pprevh
= pvhash(pvhash_idx
);
520 panic("pvh_unlink null anchor"); /* JK DEBUG */
524 while (PV_HASHED_ENTRY_NULL
!= curh
) {
527 pprevh
= &curh
->nexth
;
530 if (PV_HASHED_ENTRY_NULL
== curh
) panic("pmap_pvh_unlink no pvh");
531 *pprevh
= pvh
->nexth
;
536 pv_hash_add(pv_hashed_entry_t pvh_e
,
537 pv_rooted_entry_t pv_h
)
539 pv_hashed_entry_t
*hashp
;
543 pvhash_idx
= pvhashidx(pvh_e
->pmap
, PVE_VA(pvh_e
));
544 LOCK_PV_HASH(pvhash_idx
);
545 insque(&pvh_e
->qlink
, &pv_h
->qlink
);
546 hashp
= pvhash(pvhash_idx
);
549 panic("pv_hash_add(%p) null hash bucket", pvh_e
);
551 pvh_e
->nexth
= *hashp
;
553 UNLOCK_PV_HASH(pvhash_idx
);
/*
 * pv_hash_remove: detach pvh_e from both lists it is linked on.
 * The bucket index is derived from the entry's own pmap and PVE_VA();
 * with that bucket's hash lock held, the entry is taken off its qlink
 * queue with remque() and off the hash chain with pmap_pvh_unlink(),
 * after which the hash lock is released.
 */
557 pv_hash_remove(pv_hashed_entry_t pvh_e
)
562 pvhash_idx
= pvhashidx(pvh_e
->pmap
,PVE_VA(pvh_e
));
563 LOCK_PV_HASH(pvhash_idx
);
564 remque(&pvh_e
->qlink
);
565 pmap_pvh_unlink(pvh_e
);
566 UNLOCK_PV_HASH(pvhash_idx
);
569 static inline boolean_t
popcnt1(uint64_t distance
) {
570 return ((distance
& (distance
- 1)) == 0);
574 * Routines to handle suppression of/recovery from some forms of pagetable corruption
575 * incidents observed in the field. These can be either software induced (wild
576 * stores to the mapwindows where applicable, use after free errors
577 * (typically of pages addressed physically), mis-directed DMAs etc.), or due
578 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
579 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
580 * still assert on potential software races, but attempt recovery from incidents
581 * identifiable as occurring due to issues beyond the control of the pmap module.
582 * The latter includes single-bit errors and malformed pagetable entries.
583 * We currently limit ourselves to recovery/suppression of one incident per
584 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
586 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
593 PTE_SUPERVISOR
= 0x4,
596 PTE_INVALID_CACHEABILITY
= 0x20
597 } pmap_pagetable_corruption_t
;
602 } pmap_pv_assertion_t
;
605 PMAP_ACTION_IGNORE
= 0x0,
606 PMAP_ACTION_ASSERT
= 0x1,
607 PMAP_ACTION_RETRY
= 0x2,
608 PMAP_ACTION_RETRY_RELOCK
= 0x4
609 } pmap_pagetable_corruption_action_t
;
611 #define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
612 extern uint64_t pmap_pagetable_corruption_interval_abstime
;
614 extern uint32_t pmap_pagetable_corruption_incidents
;
615 #define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
617 pmap_pv_assertion_t incident
;
618 pmap_pagetable_corruption_t reason
;
619 pmap_pagetable_corruption_action_t action
;
621 vm_map_offset_t vaddr
;
625 vm_map_offset_t pvva
;
627 } pmap_pagetable_corruption_record_t
;
629 extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records
[];
630 extern uint64_t pmap_pagetable_corruption_last_abstime
;
631 extern thread_call_t pmap_pagetable_corruption_log_call
;
632 extern boolean_t pmap_pagetable_corruption_timeout
;
635 pmap_pagetable_corruption_log(pmap_pv_assertion_t incident
, pmap_pagetable_corruption_t suppress_reason
, pmap_pagetable_corruption_action_t action
, pmap_t pmap
, vm_map_offset_t vaddr
, pt_entry_t
*ptep
, ppnum_t ppn
, pmap_t pvpmap
, vm_map_offset_t pvva
) {
636 uint32_t pmap_pagetable_corruption_log_index
;
637 pmap_pagetable_corruption_log_index
= pmap_pagetable_corruption_incidents
++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG
;
638 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].incident
= incident
;
639 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].reason
= suppress_reason
;
640 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].action
= action
;
641 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].pmap
= pmap
;
642 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].vaddr
= vaddr
;
643 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].pte
= *ptep
;
644 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].ppn
= ppn
;
645 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].pvpmap
= pvpmap
;
646 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].pvva
= pvva
;
647 pmap_pagetable_corruption_records
[pmap_pagetable_corruption_log_index
].abstime
= mach_absolute_time();
648 /* Asynchronously log */
649 thread_call_enter(pmap_pagetable_corruption_log_call
);
652 static inline pmap_pagetable_corruption_action_t
653 pmap_classify_pagetable_corruption(pmap_t pmap
, vm_map_offset_t vaddr
, ppnum_t
*ppnp
, pt_entry_t
*ptep
, pmap_pv_assertion_t incident
) {
654 pmap_pagetable_corruption_action_t action
= PMAP_ACTION_ASSERT
;
655 pmap_pagetable_corruption_t suppress_reason
= PTE_VALID
;
656 ppnum_t suppress_ppn
= 0;
657 pt_entry_t cpte
= *ptep
;
658 ppnum_t cpn
= pa_index(pte_to_pa(cpte
));
660 pv_rooted_entry_t pv_h
= pai_to_pvh(ppn_to_pai(ppn
));
661 pv_rooted_entry_t pv_e
= pv_h
;
663 pmap_t pvpmap
= pv_h
->pmap
;
664 vm_map_offset_t pvva
= PVE_VA(pv_h
);
665 vm_map_offset_t pve_flags
;
666 boolean_t ppcd
= FALSE
;
669 /* Ideally, we'd consult the Mach VM here to definitively determine
670 * the nature of the mapping for this address space and address.
671 * As that would be a layering violation in this context, we
672 * use various heuristics to recover from single bit errors,
673 * malformed pagetable entries etc. These are not intended
674 * to be comprehensive.
677 /* As a precautionary measure, mark A+D */
678 pmap_phys_attributes
[ppn_to_pai(ppn
)] |= (PHYS_MODIFIED
| PHYS_REFERENCED
);
679 is_ept
= is_ept_pmap(pmap
);
682 * Correct potential single bit errors in either (but not both) element
686 if ((popcnt1((uintptr_t)pv_e
->pmap
^ (uintptr_t)pmap
) && PVE_VA(pv_e
) == vaddr
) ||
687 (pv_e
->pmap
== pmap
&& popcnt1(PVE_VA(pv_e
) ^ vaddr
))) {
688 pve_flags
= PVE_FLAGS(pv_e
);
690 pv_h
->va_and_flags
= vaddr
| pve_flags
;
691 suppress_reason
= PV_BITFLIP
;
692 action
= PMAP_ACTION_RETRY
;
695 } while (((pv_e
= (pv_rooted_entry_t
) queue_next(&pv_e
->qlink
))) && (pv_e
!= pv_h
));
697 /* Discover root entries with a Hamming
698 * distance of 1 from the supplied
699 * physical page frame.
701 for (bitdex
= 0; bitdex
< (sizeof(ppnum_t
) << 3); bitdex
++) {
702 ppnum_t npn
= cpn
^ (ppnum_t
) (1ULL << bitdex
);
703 if (IS_MANAGED_PAGE(npn
)) {
704 pv_rooted_entry_t npv_h
= pai_to_pvh(ppn_to_pai(npn
));
705 if (PVE_VA(npv_h
) == vaddr
&& npv_h
->pmap
== pmap
) {
706 suppress_reason
= PTE_BITFLIP
;
708 action
= PMAP_ACTION_RETRY_RELOCK
;
709 UNLOCK_PVH(ppn_to_pai(ppn
));
716 if (pmap
== kernel_pmap
) {
717 action
= PMAP_ACTION_ASSERT
;
722 * Check for malformed/inconsistent entries.
723 * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0
725 if (!is_ept
&& ((cpte
& (INTEL_PTE_NCACHE
| INTEL_PTE_WTHRU
| INTEL_PTE_PTA
)) == (INTEL_PTE_NCACHE
| INTEL_PTE_WTHRU
))) {
726 action
= PMAP_ACTION_IGNORE
;
727 suppress_reason
= PTE_INVALID_CACHEABILITY
;
729 else if (cpte
& INTEL_PTE_RSVD
) {
730 action
= PMAP_ACTION_IGNORE
;
731 suppress_reason
= PTE_RSVD
;
733 else if ((pmap
!= kernel_pmap
) && (!is_ept
) && ((cpte
& INTEL_PTE_USER
) == 0)) {
734 action
= PMAP_ACTION_IGNORE
;
735 suppress_reason
= PTE_SUPERVISOR
;
738 PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd
, sizeof(ppcd
));
740 if (debug_boot_arg
&& !ppcd
) {
741 action
= PMAP_ACTION_ASSERT
;
744 if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime
) < pmap_pagetable_corruption_interval_abstime
) {
745 action
= PMAP_ACTION_ASSERT
;
746 pmap_pagetable_corruption_timeout
= TRUE
;
750 pmap_pagetable_corruption_last_abstime
= mach_absolute_time();
752 pmap_pagetable_corruption_log(incident
, suppress_reason
, action
, pmap
, vaddr
, &cpte
, *ppnp
, pvpmap
, pvva
);
757 * Remove pv list entry.
758 * Called with pv_head_table entry locked.
759 * Returns pv entry to be freed (or NULL).
761 static inline __attribute__((always_inline
)) pv_hashed_entry_t
762 pmap_pv_remove(pmap_t pmap
,
763 vm_map_offset_t vaddr
,
766 boolean_t
*was_altacct
)
768 pv_hashed_entry_t pvh_e
;
769 pv_rooted_entry_t pv_h
;
770 pv_hashed_entry_t
*pprevh
;
775 *was_altacct
= FALSE
;
776 pmap_pv_remove_retry
:
778 pvh_e
= PV_HASHED_ENTRY_NULL
;
779 pv_h
= pai_to_pvh(ppn_to_pai(ppn
));
781 if (__improbable(pv_h
->pmap
== PMAP_NULL
)) {
782 pmap_pagetable_corruption_action_t pac
= pmap_classify_pagetable_corruption(pmap
, vaddr
, ppnp
, pte
, ROOT_ABSENT
);
783 if (pac
== PMAP_ACTION_IGNORE
)
784 goto pmap_pv_remove_exit
;
785 else if (pac
== PMAP_ACTION_ASSERT
)
786 panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap
, vaddr
, ppn
, *pte
, ppnp
, pte
, pmap_pagetable_corruption_incidents
);
787 else if (pac
== PMAP_ACTION_RETRY_RELOCK
) {
788 LOCK_PVH(ppn_to_pai(*ppnp
));
789 pmap_phys_attributes
[ppn_to_pai(*ppnp
)] |= (PHYS_MODIFIED
| PHYS_REFERENCED
);
790 goto pmap_pv_remove_retry
;
792 else if (pac
== PMAP_ACTION_RETRY
)
793 goto pmap_pv_remove_retry
;
796 if (PVE_VA(pv_h
) == vaddr
&& pv_h
->pmap
== pmap
) {
797 *was_altacct
= IS_ALTACCT_PAGE(ppn_to_pai(*ppnp
), pv_h
);
799 * Header is the pv_rooted_entry.
800 * We can't free that. If there is a queued
801 * entry after this one we remove that
802 * from the ppn queue, we remove it from the hash chain
803 * and copy it to the rooted entry. Then free it instead.
805 pvh_e
= (pv_hashed_entry_t
) queue_next(&pv_h
->qlink
);
806 if (pv_h
!= (pv_rooted_entry_t
) pvh_e
) {
808 * Entry queued to root, remove this from hash
809 * and install as new root.
812 pvhash_idx
= pvhashidx(pvh_e
->pmap
, PVE_VA(pvh_e
));
813 LOCK_PV_HASH(pvhash_idx
);
814 remque(&pvh_e
->qlink
);
815 pprevh
= pvhash(pvhash_idx
);
816 if (PV_HASHED_ENTRY_NULL
== *pprevh
) {
817 panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
818 "empty hash, removing rooted, priors: %d",
819 pmap
, vaddr
, ppn
, pmap_pagetable_corruption_incidents
);
821 pmap_pvh_unlink(pvh_e
);
822 UNLOCK_PV_HASH(pvhash_idx
);
823 pv_h
->pmap
= pvh_e
->pmap
;
824 pv_h
->va_and_flags
= pvh_e
->va_and_flags
;
825 /* dispose of pvh_e */
827 /* none queued after rooted */
828 pv_h
->pmap
= PMAP_NULL
;
829 pvh_e
= PV_HASHED_ENTRY_NULL
;
833 * not removing rooted pv. find it on hash chain, remove from
834 * ppn queue and hash chain and free it
837 pvhash_idx
= pvhashidx(pmap
, vaddr
);
838 LOCK_PV_HASH(pvhash_idx
);
839 pprevh
= pvhash(pvhash_idx
);
840 if (PV_HASHED_ENTRY_NULL
== *pprevh
) {
841 panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d",
842 pmap
, vaddr
, ppn
, *pte
, pte
, pmap_pagetable_corruption_incidents
);
845 pmap_pv_hashlist_walks
++;
847 while (PV_HASHED_ENTRY_NULL
!= pvh_e
) {
849 if (pvh_e
->pmap
== pmap
&&
850 PVE_VA(pvh_e
) == vaddr
&&
853 pprevh
= &pvh_e
->nexth
;
854 pvh_e
= pvh_e
->nexth
;
857 if (PV_HASHED_ENTRY_NULL
== pvh_e
) {
858 pmap_pagetable_corruption_action_t pac
= pmap_classify_pagetable_corruption(pmap
, vaddr
, ppnp
, pte
, ROOT_PRESENT
);
860 if (pac
== PMAP_ACTION_ASSERT
)
861 panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap
, vaddr
, ppn
, *pte
, ppnp
, pte
, pv_h
->pmap
, PVE_VA(pv_h
), pmap_pagetable_corruption_incidents
);
863 UNLOCK_PV_HASH(pvhash_idx
);
864 if (pac
== PMAP_ACTION_RETRY_RELOCK
) {
865 LOCK_PVH(ppn_to_pai(*ppnp
));
866 pmap_phys_attributes
[ppn_to_pai(*ppnp
)] |= (PHYS_MODIFIED
| PHYS_REFERENCED
);
867 goto pmap_pv_remove_retry
;
869 else if (pac
== PMAP_ACTION_RETRY
) {
870 goto pmap_pv_remove_retry
;
872 else if (pac
== PMAP_ACTION_IGNORE
) {
873 goto pmap_pv_remove_exit
;
878 *was_altacct
= IS_ALTACCT_PAGE(ppn_to_pai(*ppnp
), pvh_e
);
880 pmap_pv_hashlist_cnts
+= pv_cnt
;
881 if (pmap_pv_hashlist_max
< pv_cnt
)
882 pmap_pv_hashlist_max
= pv_cnt
;
883 *pprevh
= pvh_e
->nexth
;
884 remque(&pvh_e
->qlink
);
885 UNLOCK_PV_HASH(pvhash_idx
);
891 static inline __attribute__((always_inline
)) boolean_t
894 vm_map_offset_t vaddr
,
897 pv_hashed_entry_t pvh_e
;
898 pv_rooted_entry_t pv_h
;
900 boolean_t is_altacct
;
902 pvh_e
= PV_HASHED_ENTRY_NULL
;
903 pv_h
= pai_to_pvh(ppn_to_pai(ppn
));
905 if (__improbable(pv_h
->pmap
== PMAP_NULL
)) {
909 if (PVE_VA(pv_h
) == vaddr
&& pv_h
->pmap
== pmap
) {
911 * Header is the pv_rooted_entry.
913 return IS_ALTACCT_PAGE(ppn
, pv_h
);
917 pvhash_idx
= pvhashidx(pmap
, vaddr
);
918 LOCK_PV_HASH(pvhash_idx
);
919 pvh_e
= *(pvhash(pvhash_idx
));
920 if (PV_HASHED_ENTRY_NULL
== pvh_e
) {
921 panic("Possible memory corruption: pmap_pv_is_altacct(%p,0x%llx,0x%x): empty hash",
924 while (PV_HASHED_ENTRY_NULL
!= pvh_e
) {
925 if (pvh_e
->pmap
== pmap
&&
926 PVE_VA(pvh_e
) == vaddr
&&
929 pvh_e
= pvh_e
->nexth
;
931 if (PV_HASHED_ENTRY_NULL
== pvh_e
) {
934 is_altacct
= IS_ALTACCT_PAGE(ppn
, pvh_e
);
936 UNLOCK_PV_HASH(pvhash_idx
);
941 extern int pt_fake_zone_index
;
943 PMAP_ZINFO_PALLOC(pmap_t pmap
, vm_size_t bytes
)
945 pmap_ledger_credit(pmap
, task_ledgers
.tkm_private
, bytes
);
949 PMAP_ZINFO_PFREE(pmap_t pmap
, vm_size_t bytes
)
951 pmap_ledger_debit(pmap
, task_ledgers
.tkm_private
, bytes
);
955 PMAP_ZINFO_SALLOC(pmap_t pmap
, vm_size_t bytes
)
957 pmap_ledger_credit(pmap
, task_ledgers
.tkm_shared
, bytes
);
961 PMAP_ZINFO_SFREE(pmap_t pmap
, vm_size_t bytes
)
963 pmap_ledger_debit(pmap
, task_ledgers
.tkm_shared
, bytes
);
966 extern boolean_t pmap_initialized
;/* Has pmap_init completed? */
967 #define valid_page(x) (pmap_initialized && pmap_valid_page(x))
969 int phys_attribute_test(
972 void phys_attribute_clear(
975 unsigned int options
,
978 //#define PCID_DEBUG 1
980 #define pmap_pcid_log(fmt, args...) \
982 kprintf(fmt, ##args); \
983 printf(fmt, ##args); \
986 #define pmap_pcid_log(fmt, args...)
988 void pmap_pcid_configure(void);
992 * Atomic 64-bit compare and exchange of a page table entry.
994 static inline boolean_t
995 pmap_cmpx_pte(pt_entry_t
*entryp
, pt_entry_t old
, pt_entry_t
new)
1000 * Load the old value into %rax
1001 * Load the new value into another register
1002 * Compare-exchange-quad at address entryp
1003 * If the compare succeeds, the new value is stored, return TRUE.
1004 * Otherwise, no swap is made, return FALSE.
1007 " lock; cmpxchgq %2,(%3) \n\t"
1018 extern uint32_t pmap_update_clear_pte_count
;
1020 static inline void pmap_update_pte(pt_entry_t
*mptep
, uint64_t pclear_bits
, uint64_t pset_bits
) {
1021 pt_entry_t npte
, opte
;
1024 if (__improbable(opte
== 0)) {
1025 pmap_update_clear_pte_count
++;
1028 npte
= opte
& ~(pclear_bits
);
1030 } while (!pmap_cmpx_pte(mptep
, opte
, npte
));
1034 * The single pml4 page per pmap is allocated at pmap create time and exists
1035 * for the duration of the pmap. we allocate this page in kernel vm.
1036 * this returns the address of the requested pml4 entry in the top level page.
1040 pmap64_pml4(pmap_t pmap
, vm_map_offset_t vaddr
)
1042 if (__improbable((vaddr
> 0x00007FFFFFFFFFFFULL
) &&
1043 (vaddr
< 0xFFFF800000000000ULL
))) {
1048 return PHYSMAP_PTOV(&((pml4_entry_t
*)pmap
->pm_cr3
)[(vaddr
>> PML4SHIFT
) & (NPML4PG
-1)]);
1050 return &pmap
->pm_pml4
[(vaddr
>> PML4SHIFT
) & (NPML4PG
-1)];
1054 static inline pml4_entry_t
*
1055 pmap64_user_pml4(pmap_t pmap
, vm_map_offset_t vaddr
)
1057 if (__improbable((vaddr
> 0x00007FFFFFFFFFFFULL
) &&
1058 (vaddr
< 0xFFFF800000000000ULL
))) {
1063 return PHYSMAP_PTOV(&((pml4_entry_t
*)pmap
->pm_ucr3
)[(vaddr
>> PML4SHIFT
) & (NPML4PG
-1)]);
1065 return &pmap
->pm_upml4
[(vaddr
>> PML4SHIFT
) & (NPML4PG
-1)];
1070 * Returns address of requested PDPT entry in the physmap.
1072 static inline pdpt_entry_t
*
1073 pmap64_pdpt(pmap_t pmap
, vm_map_offset_t vaddr
)
1079 pml4
= pmap64_pml4(pmap
, vaddr
);
1080 is_ept
= is_ept_pmap(pmap
);
1082 if (pml4
&& (*pml4
& PTE_VALID_MASK(is_ept
))) {
1083 newpf
= *pml4
& PG_FRAME
;
1084 return &((pdpt_entry_t
*) PHYSMAP_PTOV(newpf
))
1085 [(vaddr
>> PDPTSHIFT
) & (NPDPTPG
-1)];
1090 * Returns the address of the requested PDE entry in the physmap.
1092 static inline pd_entry_t
*
1093 pmap64_pde(pmap_t pmap
, vm_map_offset_t vaddr
)
1099 pdpt
= pmap64_pdpt(pmap
, vaddr
);
1100 is_ept
= is_ept_pmap(pmap
);
1102 if (pdpt
&& (*pdpt
& PTE_VALID_MASK(is_ept
))) {
1103 newpf
= *pdpt
& PG_FRAME
;
1104 return &((pd_entry_t
*) PHYSMAP_PTOV(newpf
))
1105 [(vaddr
>> PDSHIFT
) & (NPDPG
-1)];
1110 static inline pd_entry_t
*
1111 pmap_pde(pmap_t m
, vm_map_offset_t v
)
1115 pde
= pmap64_pde(m
, v
);
1122 * return address of mapped pte for vaddr va in pmap pmap.
1124 * In case the pde maps a superpage, return the pde, which, in this case
1125 * is the actual page table entry.
1127 static inline pt_entry_t
*
1128 pmap_pte(pmap_t pmap
, vm_map_offset_t vaddr
)
1135 pde
= pmap64_pde(pmap
, vaddr
);
1137 is_ept
= is_ept_pmap(pmap
);
1139 if (pde
&& (*pde
& PTE_VALID_MASK(is_ept
))) {
1142 newpf
= *pde
& PG_FRAME
;
1143 return &((pt_entry_t
*)PHYSMAP_PTOV(newpf
))
1144 [i386_btop(vaddr
) & (ppnum_t
)(NPTEPG
-1)];
1148 extern void pmap_alias(
1150 vm_map_offset_t start
,
1151 vm_map_offset_t end
,
1153 unsigned int options
);
1156 #define DPRINTF(x...) kprintf(x)
1158 #define DPRINTF(x...)
1161 #endif /* MACH_KERNEL_PRIVATE */
1162 #endif /* _I386_PMAP_INTERNAL_ */