/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/kdebug.h>

#ifdef MACH_KERNEL_PRIVATE

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)	\
	pmap_flush_tlbs(pmap, s, e)

#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)
#if	defined(PMAP_TRACES)
extern boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)				\
	if (pmap_trace) {				\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);	\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)	\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e)
void		pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v);

void		pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(
			ppnum_t		pn);

void		pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t);

void		pmap_update_cache_attributes_locked(ppnum_t, unsigned);

#if	CONFIG_YONAH
extern boolean_t	cpu_64bit;
#else
extern const boolean_t	cpu_64bit;
#endif
/*
 * Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B. with the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries if navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */
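
/*
 * Illustrative sketch of the hash/walk/pull removal described above
 * (hypothetical, simplified node type; the real routine is
 * pmap_pvh_unlink() below): walk the singly linked chain while remembering
 * the address of the previous link, so the matching node can be unlinked
 * in place.
 *
 *	struct node { struct node *next; int key; };
 *
 *	static void
 *	unlink_key(struct node **bucket, int key)
 *	{
 *		struct node **prevp = bucket;	// the link that will be patched
 *		struct node *cur = *bucket;
 *
 *		while (cur != NULL && cur->key != key) {
 *			prevp = &cur->next;	// remember the previous link
 *			cur = cur->next;
 *		}
 *		if (cur != NULL)
 *			*prevp = cur->next;	// pull the node from the chain
 *	}
 */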
/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
	struct pv_entry_t	next;
	pmap_t			pmap;
	vm_map_offset_t		va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				pull it off the list and free it
			}
		}
	}

The problem arose when we were running, say 8000 (or even 2000) apache or
other processes and one or all terminate.  The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, likely taking a cache miss for each node,
and the interrupt disabling for MP issues, the system became completely
unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;

typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/
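
/*
 * Illustrative sketch of the two access paths (simplified: no locking, no
 * corruption recovery; the real declarations and walkers follow below):
 *
 *	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *
 *	By ppn (pmap_page_protect and friends) -- circulate the qlink queue,
 *	which threads the rooted entry and all of its hashed aliases:
 *
 *		pv_e = pv_h;
 *		do {
 *			// operate on pv_e->pmap / pv_e->va
 *		} while ((pv_e = (pv_rooted_entry_t)queue_next(&pv_e->qlink)) != pv_h);
 *
 *	By (pmap, vaddr) (pmap_enter/pmap_remove) -- check the rooted entry,
 *	then hash [pmap, vaddr] and walk the short nexth chain:
 *
 *		if (pv_h->pmap == pmap && pv_h->va == vaddr) {
 *			pvh_e = (pv_hashed_entry_t)pv_h;	// rooted entry matches
 *		} else {
 *			for (pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *			     pvh_e != PV_HASHED_ENTRY_NULL;
 *			     pvh_e = pvh_e->nexth) {
 *				if (pvh_e->pmap == pmap && pvh_e->va == vaddr)
 *					break;			// hashed alias matches
 *			}
 *		}
 */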
typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
//#define PV_DEBUG 1	/* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH()	if (0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH(x)
#endif

#define NPVHASH 4095	/* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT	5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT	2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL		2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL	200

extern volatile uint32_t	mappingrecurse;
extern uint32_t			pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;
#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t			npvhash;
extern pv_hashed_entry_t	*pv_hash_table;		/* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t			pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

extern uint32_t			pv_hashed_free_count;
extern uint32_t			pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */

#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char			*pv_lock_table;		/* pointer to array of bits */
extern char			*pv_hash_lock_table;
extern pv_rooted_entry_t	pv_head_table;		/* array of entries, one per page */

extern event_t			mapping_replenish_event;
static inline void
PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep)
{
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count >= pv_hashed_kern_low_water_mark) &&
	    (*pvh_ep = pv_hashed_free_list) != 0) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count < pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}
static inline void
PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
{
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;
static inline void
PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e)
{
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}
static inline void
PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
{
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;
static inline void
pmap_pv_throttle(__unused pmap_t p)
{
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
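
/*
 * Sketch of how a caller such as pmap_enter() is expected to combine the
 * helpers above (hypothetical and simplified; retry loops and the
 * kernel-pmap reserve path are omitted):
 *
 *	pv_hashed_entry_t pvh_new = PV_HASHED_ENTRY_NULL;
 *
 *	PV_HASHED_ALLOC(&pvh_new);
 *	if (pvh_new == PV_HASHED_ENTRY_NULL) {
 *		// cache empty or kernel reserve low: throttle user mappings,
 *		// then fall back to a blocking zalloc
 *		pmap_pv_throttle(pmap);
 *		pvh_new = (pv_hashed_entry_t)zalloc(pv_hashed_list_zone);
 *	}
 *	// ... install pvh_new, or return it with PV_HASHED_FREE_LIST() ...
 */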
/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)
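
/*
 * Usage sketch for the index/lock macros above (simplified; assumes a
 * managed page and a caller that has already disabled preemption, as the
 * LOCK_PVH/UNLOCK_PVH wrappers below do):
 *
 *	int pai = ppn_to_pai(ppn);
 *	lock_pvh_pai(pai);				// per-frame bit lock
 *	pv_rooted_entry_t pv_h = pai_to_pvh(pai);
 *	// ... inspect or edit the pv list rooted at pv_h ...
 *	unlock_pvh_pai(pai);
 */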
#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define	PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define	PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK	(INTEL_PTE_PTA | INTEL_PTE_NCACHE)
/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))
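
/*
 * For example, with the usual x86 long-mode geometry (4 KiB pages, 512
 * entries per page table), pdetova(1) works out to 512 * 4096 bytes,
 * i.e. 2 MiB of virtual space per page-directory entry.
 */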
/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system wide pmap lock has been removed.  Now, paths take a lock
 *	on the pmap before changing its 'shape' and the reverse order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked
 *	and can then run up/down the pv lists holding the list lock.  This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	the previous implementation.
 */
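
/*
 * Sketch of the reverse-order (physical-page-first) path described above
 * (simplified pseudocode; the real retest logic lives in the pv walkers
 * elsewhere in the pmap module):
 *
 *	LOCK_PVH(pai);				// lock the pv list first
 *	if (pv_h->pmap != PMAP_NULL) {		// retest: the mapping may have
 *						// changed before we got the lock
 *		// walk the pv list under the list lock, visiting each
 *		// pmap named by an entry
 *	}
 *	UNLOCK_PVH(pai);
 */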
#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
extern uint64_t		pde_mapped_size;

extern char		*pmap_phys_attributes;
extern unsigned int	last_managed_page;

extern ppnum_t		lowest_lo;
extern ppnum_t		lowest_hi;
extern ppnum_t		highest_hi;
/*
 * When spinning through pmap_remove,
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d", __FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif
extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
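
/*
 * Because npvhash is of the form 2^n - 1 (see NPVHASH above), the final
 * mask is equivalent to reducing the mixed value modulo 2^n, so the result
 * is always a valid index into pv_hash_table[0 .. npvhash].
 */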
/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain,
 * properly dealing with the anchor.
 * Must be called with the hash locked; does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	uint32_t		pvhash_idx;

	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#ifdef PV_DEBUG
	if (PV_HASHED_ENTRY_NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
}
static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	uint32_t		pvhash_idx;

	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#ifdef PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}
static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	uint32_t pvhash_idx;

	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}
/* TRUE if at most one bit is set in the argument, i.e. it is zero or a power of two */
static inline boolean_t
popcnt1(uint64_t distance)
{
	return ((distance & (distance - 1)) == 0);
}
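
/*
 * popcnt1(a ^ b) is the idiom used below to ask whether two values differ
 * in at most one bit (Hamming distance <= 1).  A minimal illustration,
 * assuming ordinary integer values:
 *
 *	popcnt1(0x1000 ^ 0x1400)	is TRUE  (only bit 10 differs)
 *	popcnt1(0x1000 ^ 0x1401)	is FALSE (two bits differ)
 *
 * pmap_classify_pagetable_corruption() below applies this to a pv entry's
 * pmap pointer and va to tell single-bit flips apart from genuine
 * software errors.
 */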
/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are recorded in pmap_pagetable_corruption_records[].
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */
typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT	= 0,
	ROOT_ABSENT	= 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;
#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t			incident;
	pmap_pagetable_corruption_t		reason;
	pmap_pagetable_corruption_action_t	action;
	pmap_t			pmap;
	vm_map_offset_t		vaddr;
	pt_entry_t		pte;
	ppnum_t			ppn;
	pmap_t			pvpmap;
	vm_map_offset_t		pvva;
	uint64_t		abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;
static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}
static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t		cpte = *ptep;
	ppnum_t			cpn = pa_index(pte_to_pa(cpte));
	ppnum_t			ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t		bitdex;
	pmap_t			pvpmap = pv_h->pmap;
	vm_map_offset_t		pvva = pv_h->va;
	boolean_t		ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV entry.
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while ((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}

pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	} else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}

	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}
/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	uint32_t		pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (pv_h->pmap == PMAP_NULL) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * Not removing the rooted pv.  Find it on the hash chain,
		 * remove it from the ppn queue and the hash chain, and free it.
		 */
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash",
			      pmap, vaddr, ppn);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}

pmap_pv_remove_exit:
	return pvh_e;
}
extern int	pt_fake_zone_index;

static inline void
PMAP_ZINFO_PALLOC(vm_size_t bytes)
{
	thread_t	thr = current_thread();
	task_t		task;
	zinfo_usage_t	zinfo;

	thr->tkm_private.alloc += bytes;
	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(vm_size_t bytes)
{
	thread_t	thr = current_thread();
	task_t		task;
	zinfo_usage_t	zinfo;

	thr->tkm_private.free += bytes;
	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}
extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x)	(pmap_initialized && pmap_valid_page(x))

#define HIGH_MEM_BASE	((uint32_t)( -NBPDE) )	/* shared gdt etc seg addr */ /* XXX64 ?? */

int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits);
//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)	\
	do {				\
		kprintf(fmt, ##args);	\
		printf(fmt, ##args);	\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);
#if defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}
/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	assert(pmap);
	if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
	    (vaddr < 0xFFFF800000000000ULL)) {
		/* non-canonical address: no mapping */
		return (NULL);
	}

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}
937 * Returns the address of the requested PDE entry in the physmap.
939 static inline pd_entry_t
*
940 pmap64_pde(pmap_t pmap
, vm_map_offset_t vaddr
)
946 if ((vaddr
> 0x00007FFFFFFFFFFFULL
) &&
947 (vaddr
< 0xFFFF800000000000ULL
)) {
951 pdpt
= pmap64_pdpt(pmap
, vaddr
);
953 if (pdpt
&& ((*pdpt
& INTEL_PTE_VALID
))) {
954 newpf
= *pdpt
& PG_FRAME
;
955 return &((pd_entry_t
*) PHYSMAP_PTOV(newpf
))
956 [(vaddr
>> PDSHIFT
) & (NPDPG
-1)];
static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	pde = pmap64_pde(m, v);

	return pde;
}
/*
 * Return the address of the mapped pte for vaddr in pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
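
/*
 * Putting the accessors above together: a canonical 64-bit virtual address
 * is decomposed into four 9-bit table indices plus a 12-bit page offset.
 * A minimal standalone sketch of the index extraction (assuming the usual
 * 4 KiB-page long-mode constants, i.e. 512 entries per level):
 *
 *	#include <stdint.h>
 *
 *	#define IDX_MASK	0x1FFULL	// 9 bits per level
 *
 *	static inline unsigned pml4_index(uint64_t va) { return (va >> 39) & IDX_MASK; }
 *	static inline unsigned pdpt_index(uint64_t va) { return (va >> 30) & IDX_MASK; }
 *	static inline unsigned pde_index (uint64_t va) { return (va >> 21) & IDX_MASK; }
 *	static inline unsigned pte_index (uint64_t va) { return (va >> 12) & IDX_MASK; }
 *
 * pmap64_pml4/pmap64_pdpt/pmap64_pde/pmap_pte perform the same shifts via
 * PML4SHIFT/PDPTSHIFT/PDSHIFT and i386_btop, but additionally chase each
 * level's physical frame through the physmap and stop early when an entry
 * is invalid or maps a superpage.
 */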
#endif /* __x86_64__ */

#endif /* MACH_KERNEL_PRIVATE */