/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/debug.h>

#ifdef MACH_KERNEL_PRIVATE

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}


#define PMAP_UPDATE_TLBS(pmap, s, e)	\
	pmap_flush_tlbs(pmap)

#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)

#ifdef	PMAP_TRACES
extern boolean_t pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)				\
	if (pmap_trace) {				\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);	\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

void		pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v);

void		pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v);
extern void	pmap_flush_tlbs(pmap_t pmap);

#if	defined(__x86_64__)
extern const boolean_t cpu_64bit;
#else
extern boolean_t cpu_64bit;
#endif

/*
 *	Private data structures.
 */
/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page.  An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B.  with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains, as there is no
 * way to unlink the singly linked hashed entries when navigated to
 * via the queue list off the rooted entries.  Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list.  All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size)
was

typedef struct pv_entry {
	struct pv_entry	*next;
	pmap_t		pmap;
	vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code so
that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain from
keeping freed pv's around for reuse, avoiding the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that
case, we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all of them terminated.  The list hanging off each
pv array entry could have thousands of entries.  We were continuously linearly
searching each of these lists as we stepped through the address space we were
tearing down.  Because of the locks we hold, the likely cache miss for each
node, and the interrupt disabling needed for MP safety, the system became
completely unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased and
hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/
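
/*
 * Illustrative sketch of the lookup described above (not the canonical code
 * path; see pmap_pv_remove() below for the real logic, and note that the
 * hash-bucket locking is elided here): a [pmap, vaddr, ppn] triple is first
 * checked against the rooted entry, then hashed, and the nexth chain is
 * walked while keeping a pointer to the previous link so the matching entry
 * can later be unlinked ("hash/walk/pull").
 *
 *	pv_rooted_entry_t  pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *	if (pv_h->pmap == pmap && pv_h->va == vaddr)
 *		return;					// hit on the rooted entry
 *	pv_hashed_entry_t *pprevh = pvhash(pvhashidx(pmap, vaddr));
 *	pv_hashed_entry_t  pvh_e  = *pprevh;
 *	while (pvh_e != PV_HASHED_ENTRY_NULL) {
 *		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
 *			break;				// found; *pprevh permits unlink
 *		pprevh = &pvh_e->nexth;
 *		pvh_e  = pvh_e->nexth;
 *	}
 */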

typedef struct pv_rooted_entry {	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)


typedef struct pv_hashed_entry {	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

/* #define PV_DEBUG 1	uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH 4095	/* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK 5000
#define PV_HASHED_KERN_LOW_WATER_MARK 400
#define PV_HASHED_ALLOC_CHUNK 2000
#define PV_HASHED_KERN_ALLOC_CHUNK 200

#define	PV_HASHED_ALLOC(pvh_e) {					\
	simple_lock(&pv_hashed_free_list_lock);				\
	if ((pvh_e = pv_hashed_free_list) != 0) {			\
		pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
		pv_hashed_free_count--;					\
		if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK)	\
			if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
				thread_call_enter(mapping_adjust_call);	\
	}								\
	simple_unlock(&pv_hashed_free_list_lock);			\
}

#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {			\
	simple_lock(&pv_hashed_free_list_lock);				\
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;	\
	pv_hashed_free_list = pvh_eh;					\
	pv_hashed_free_count += pv_cnt;					\
	simple_unlock(&pv_hashed_free_list_lock);			\
}

#define PV_HASHED_KERN_ALLOC(pvh_e) {					\
	simple_lock(&pv_hashed_kern_free_list_lock);			\
	if ((pvh_e = pv_hashed_kern_free_list) != 0) {			\
		pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
		pv_hashed_kern_free_count--;				\
		if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
			if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
				thread_call_enter(mapping_adjust_call);	\
	}								\
	simple_unlock(&pv_hashed_kern_free_list_lock);			\
}

#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {		\
	simple_lock(&pv_hashed_kern_free_list_lock);			\
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;	\
	pv_hashed_kern_free_list = pvh_eh;				\
	pv_hashed_kern_free_count += pv_cnt;				\
	simple_unlock(&pv_hashed_kern_free_list_lock);			\
}
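
/*
 * Usage sketch for the allocation macros (a simplified rendering of the
 * pattern the PV HASHING comment above attributes to pmap_enter(), not a
 * verbatim excerpt): take an entry from the hot free list if one is
 * available, otherwise fall back to zalloc() from pv_hashed_list_zone.
 * The kernel pmap must not block, so it relies only on the separate kernel
 * pool.  pvh_eh/pvh_et/pv_cnt in the free-list step are the head, tail and
 * count of a batch being returned.
 *
 *	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;
 *	if (pmap == kernel_pmap) {
 *		PV_HASHED_KERN_ALLOC(pvh_e);
 *		if (PV_HASHED_ENTRY_NULL == pvh_e)
 *			panic("no kernel pv entries available");
 *	} else {
 *		PV_HASHED_ALLOC(pvh_e);
 *		if (PV_HASHED_ENTRY_NULL == pvh_e)
 *			pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *	}
 *	...
 *	// later, return a batch of freed entries in one shot:
 *	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
 */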

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])

#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 * Locking and TLB invalidation
 */

/*
 * Locking Protocols: (changed 2/2007 JK)
 *
 * There are two structures in the pmap module that need locking:
 * the pmaps themselves, and the per-page pv_lists (which are locked
 * by locking the pv_lock_table entry that corresponds to the pv_head
 * for the list in question.)  Most routines want to lock a pmap and
 * then do operations in it that require pv_list locking -- however
 * pmap_remove_all and pmap_copy_on_write operate on a physical page
 * basis and want to do the locking in the reverse order, i.e. lock
 * a pv_list and then go through all the pmaps referenced by that list.
 *
 * The system wide pmap lock has been removed.  Now, paths take a lock
 * on the pmap before changing its 'shape', and the reverse order lockers
 * (coming in by phys ppn) take a lock on the corresponding pv and then
 * retest to be sure nothing changed during the window before they locked;
 * they can then run up/down the pv lists holding the list lock.  This also
 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
 * previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
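
/*
 * Sketch of the reverse-order ("by ppn") pattern described in the Locking
 * Protocols comment above.  This is an illustration only; the per-PTE work,
 * the precise retest, and error handling are omitted.
 *
 *	LOCK_PVH(pai);				// lock the pv list for this page
 *	pv_e = pai_to_pvh(pai);
 *	if (pv_e->pmap != PMAP_NULL) {
 *		do {
 *			pmap_t pmap = pv_e->pmap;
 *			vm_map_offset_t va = pv_e->va;
 *			// retest: the mapping may have changed in the window
 *			// before we took the pv lock; skip it if so, then
 *			// operate on the pte for (pmap, va) while holding
 *			// the list lock
 *			pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink);
 *		} while (pv_e != pai_to_pvh(pai));
 *	}
 *	UNLOCK_PVH(pai);
 */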
/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;	/* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

extern int		pv_hashed_free_count;
extern int		pv_hashed_kern_free_count;
#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */

extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;		/* array of entries, one per page */
extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern unsigned int	last_managed_page;

/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
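
/*
 * Sketch of how this threshold is intended to be used (illustrative only;
 * the conversion of MAX_PREEMPTION_LATENCY_NS into TSC ticks is assumed to
 * be done once at bootstrap and stored in max_preemption_latency_tsc):
 *
 *	uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;
 *	while (there are PTEs left to remove) {
 *		...remove a batch with the pmap locked / preemption disabled...
 *		if (rdtsc64() > deadline) {
 *			PMAP_UNLOCK(pmap);	// open a preemption window
 *			PMAP_LOCK(pmap);
 *			deadline = rdtsc64() + max_preemption_latency_tsc;
 *		}
 *	}
 */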

/* #define DEBUGINTERRUPTS 1	uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {						\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())	\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
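
/*
 * Worked example of the hash: because NPVHASH is of the form 2^N - 1
 * (4095 == 0x0FFF), the final "& npvhash" reduces the 32-bit mix of the
 * pmap pointer and the virtual page number to a bucket index in
 * [0, npvhash], i.e. it behaves like "% (npvhash + 1)" without a divide.
 * For instance (hypothetical values, assuming npvhash == NPVHASH):
 *
 *	pmap = 0xffffff8012345000, va = 0x00007fff5fbff000, PAGE_SHIFT = 12
 *	(uint32_t)(uintptr_t)pmap      = 0x12345000
 *	(uint32_t)(va >> PAGE_SHIFT)   = 0xfff5fbff
 *	0x12345000 ^ 0xfff5fbff        = 0xedc1abff
 *	0xedc1abff & 0x0fff            = 0xbff   (bucket 3071)
 */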

/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL==hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

/*
 * popcnt1() returns TRUE when its argument has at most one bit set; callers
 * below pass it the XOR of two values to test whether they differ by a
 * single bit (Hamming distance <= 1).
 */
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable
 * corruption incidents observed in the field.  These can be either software
 * induced (wild stores to the mapwindows where applicable, use-after-free
 * errors (typically of pages addressed physically), mis-directed DMAs, etc.)
 * or due to DRAM/memory hierarchy/interconnect errors.  Given the theoretical
 * rarity of these errors, the recording mechanism is deliberately not MP-safe.
 * The overarching goal is to still assert on potential software races, but
 * attempt recovery from incidents identifiable as occurring due to issues
 * beyond the control of the pmap module.  The latter includes single-bit
 * errors and malformed pagetable entries.  We currently limit ourselves to
 * recovery/suppression of one incident per PMAP_PAGETABLE_CORRUPTION_INTERVAL
 * seconds, and details of the incident are logged.
 * Assertions are not suppressed if kernel debugging is enabled.  (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t		pmap;
	vm_map_offset_t	vaddr;
	pt_entry_t	pte;
	ppnum_t		ppn;
	pmap_t		pvpmap;
	vm_map_offset_t	pvva;
	uint64_t	abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
	ppnum_t suppress_ppn = 0;
	pt_entry_t cpte = *ptep;
	ppnum_t	cpn = pa_index(pte_to_pa(cpte));
	ppnum_t	ppn = *ppnp;
	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t pv_e = pv_h;
	uint32_t	bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = pv_h->va;
	boolean_t ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else
	{
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (pv_h->pmap == PMAP_NULL) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}
		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}
		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}

#endif /* MACH_KERNEL_PRIVATE */