/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/debug.h>

#ifdef MACH_KERNEL_PRIVATE

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}


#define PMAP_UPDATE_TLBS(pmap, s, e)	\
	pmap_flush_tlbs(pmap)

#define iswired(pte)	((pte) & INTEL_PTE_WIRED)

#ifdef PMAP_TRACES
extern boolean_t pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)				\
	if (pmap_trace) {				\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);	\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

void pmap_expand_pml4(
	pmap_t		map,
	vm_map_offset_t	v);

void pmap_expand_pdpt(
	pmap_t		map,
	vm_map_offset_t	v);

extern void pmap_flush_tlbs(pmap_t pmap);

#if defined(__x86_64__)
extern const boolean_t cpu_64bit;
#else
extern boolean_t cpu_64bit;
#endif

/*
 * Private data structures.
 */

/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page.  An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B. with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains, as there is no
 * way to unlink the singly linked hashed entries when navigated to
 * via the queue list off the rooted entries.  Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list (see pmap_pvh_unlink() below).  All of
 * this is to save memory and keep both types of pv_entries as small
 * as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the previous
implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
	struct pv_entry	*next;
	pmap_t		pmap;
	vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of memory,
indexed by the physical page number.  Additionally, a pool of entries is created from a
pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
Originally, we kept this pool around because the code in pmap_enter() was unable to
block if it needed an entry and none were available - we'd panic.  Some time ago I
restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
a pv structure and restart, removing a panic from the code (for the kernel pmap we
cannot block, so the panic remains there; hence we keep a separate hot pool for use
only on kernel pmaps).  The pool has not been removed since there is a large performance
gain from keeping freed pv's around for reuse, rather than suffering the overhead of
zalloc for every new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the fixed
pv array for that ppn (off the next pointer).  These pve's are accessed for several
operations, one of them being address space teardown.  In that case, we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) apache or other
processes and one or all terminated.  The list hanging off each pv array entry could
have thousands of entries.  We were continuously linearly searching each of these lists
as we stepped through the address space we were tearing down.  Because of the locks we
held, the likely cache miss on each node, and the interrupt disabling needed for MP
safety, the system became completely unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the list by
ppn for operations like pmap_page_protect, and finding and modifying/removing a single
pve as part of pmap_enter processing) has led to modifying the pve structures and
databases.

There are now two types of pve structures.  A "rooted" structure, which is basically the
original structure accessed in an array by ppn, and a "hashed" structure, accessed on a
hash list via a hash of [pmap, vaddr].  These have been designed with the two goals of
minimizing wired memory and making the lookup of a ppn faster.  Since a vast majority of
pages in the system are not aliased and hence represented by a single pv entry, I've
kept the rooted entry size as small as possible because there is one of these dedicated
for every physical page of memory.  The hashed pve's are larger due to the addition of
the hash link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of aliasing (like
2000+ httpd procs) will pay the extra memory price.  Both structures have the same first
three fields, allowing some simplification in the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
	ppnum_t		ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and the
hashed entries.  Code that runs the pv list still starts with the rooted entry and then
continues down the qlink onto the hashed entries.  Code that is looking up a specific
pv entry first checks the rooted entry and then hashes and runs the hash list for the
match (sketched below, after pvhashidx()).  The hash list lengths are much smaller than
the original pv lists that contained all aliases for the specific ppn.

*/

typedef struct pv_rooted_entry {	/* first three fields must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)


typedef struct pv_hashed_entry {	/* first three fields must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL	((pv_hashed_entry_t) 0)

/* #define PV_DEBUG 1	uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH()	if (0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH				4095	/* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK	5000
#define PV_HASHED_KERN_LOW_WATER_MARK	400
#define PV_HASHED_ALLOC_CHUNK		2000
#define PV_HASHED_KERN_ALLOC_CHUNK	200
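
/*
 * Note: because NPVHASH is of the form 2^N - 1 (4095 == 0xFFF), pvhashidx()
 * below can reduce its hash with a simple mask, "hash & npvhash", selecting
 * one of npvhash + 1 (4096) buckets without a divide; e.g. a raw hash of
 * 0x12345 lands in bucket 0x345.
 */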

#define PV_HASHED_ALLOC(pvh_e) {					\
	simple_lock(&pv_hashed_free_list_lock);				\
	if ((pvh_e = pv_hashed_free_list) != 0) {			\
		pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
		pv_hashed_free_count--;					\
		if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK)	\
			if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
				thread_call_enter(mapping_adjust_call);	\
	}								\
	simple_unlock(&pv_hashed_free_list_lock);			\
}

#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {			\
	simple_lock(&pv_hashed_free_list_lock);				\
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;	\
	pv_hashed_free_list = pvh_eh;					\
	pv_hashed_free_count += pv_cnt;					\
	simple_unlock(&pv_hashed_free_list_lock);			\
}

#define PV_HASHED_KERN_ALLOC(pvh_e) {					\
	simple_lock(&pv_hashed_kern_free_list_lock);			\
	if ((pvh_e = pv_hashed_kern_free_list) != 0) {			\
		pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
		pv_hashed_kern_free_count--;				\
		if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
			if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
				thread_call_enter(mapping_adjust_call);	\
	}								\
	simple_unlock(&pv_hashed_kern_free_list_lock);			\
}

#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {		\
	simple_lock(&pv_hashed_kern_free_list_lock);			\
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;	\
	pv_hashed_kern_free_list = pvh_eh;				\
	pv_hashed_kern_free_count += pv_cnt;				\
	simple_unlock(&pv_hashed_kern_free_list_lock);			\
}
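
/*
 * Calling-pattern sketch (illustrative only, inferred from the macro bodies
 * above): callers accumulate entries to be freed into a chain linked through
 * qlink.next, remembering the head (pvh_eh), the tail (pvh_et) and a count,
 * and then splice the whole chain onto the free list under one lock hold:
 *
 *	pvh_e->qlink.next = (queue_entry_t) pvh_eh;
 *	pvh_eh = pvh_e;
 *	if (pvh_et == PV_HASHED_ENTRY_NULL)
 *		pvh_et = pvh_e;
 *	pv_cnt++;
 *	...
 *	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
 */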

/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])

#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))

/*
 * Physical page attributes.  Copy bits from PTE definition.
 */
#define PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */

/*
 * Amount of virtual memory mapped by one
 * page-directory entry.
 */
#define PDE_MAPPED_SIZE	(pdetova(1))


/*
 * Locking and TLB invalidation
 */

/*
 * Locking Protocols: (changed 2/2007 JK)
 *
 * There are two structures in the pmap module that need locking:
 * the pmaps themselves, and the per-page pv_lists (which are locked
 * by locking the pv_lock_table entry that corresponds to the pv_head
 * for the list in question).  Most routines want to lock a pmap and
 * then do operations in it that require pv_list locking -- however,
 * pmap_remove_all and pmap_copy_on_write operate on a physical page
 * basis and want to do the locking in the reverse order, i.e. lock
 * a pv_list and then go through all the pmaps referenced by that list.
 *
 * The system-wide pmap lock has been removed.  Paths now take a lock
 * on the pmap before changing its 'shape', while the reverse-order
 * lockers (coming in by physical ppn) take a lock on the corresponding
 * pv, retest to be sure nothing changed during the window before they
 * locked, and can then run up/down the pv lists holding the list lock.
 * This also lets the pmap layer run (nearly completely) with interrupts
 * enabled, unlike previously.  (The forward lock order is sketched
 * below, after the PV locking macros.)
 */

/*
 * PV locking
 */

#define LOCK_PVH(index) {		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
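
/*
 * Forward lock-order sketch (illustrative only; "pai" would come from
 * pa_index()/ppn_to_pai() on the physical page being operated on):
 *
 *	PMAP_LOCK(pmap);	pin the pmap's 'shape' first
 *	LOCK_PVH(pai);		then the per-page pv list
 *	... examine/modify the pv list and PTE ...
 *	UNLOCK_PVH(pai);
 *	PMAP_UNLOCK(pmap);
 *
 * Reverse-order (physical-to-virtual) paths instead take LOCK_PVH(pai)
 * first and revalidate the mapping before walking the pv list, as
 * described in the locking protocol comment above.
 */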
/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;	/* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

extern int		pv_hashed_free_count;
extern int		pv_hashed_kern_free_count;
#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */

extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;		/* array of entries, one per page */
extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern unsigned int	last_managed_page;

/*
 * When spinning through pmap_remove, ensure that we don't spend too
 * much time with preemption disabled.  The current threshold is 20us
 * (MAX_PREEMPTION_LATENCY_NS, in nanoseconds); max_preemption_latency_tsc
 * holds the equivalent value in TSC ticks.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1	uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {						\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())	\
		panic("pmap interrupt assert %s, %d", __FILE__, __LINE__); \
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
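
/*
 * Lookup sketch: how a specific [pmap, vaddr] mapping of a page is found
 * under the combined rooted/hashed scheme described earlier -- the rooted
 * entry for the ppn is checked first, and only on a miss is the
 * [pmap, vaddr] hash chain walked.  This helper is purely illustrative and
 * not part of the original pmap interface; callers would be expected to
 * hold the pv list lock for the page, and the hash bucket lock is taken
 * around the chain walk.
 */
static inline boolean_t
pv_lookup_sketch(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t ppn)
{
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_hashed_entry_t	pvh_e;
	int			pvhash_idx;

	if (pv_h->pmap == pmap && pv_h->va == vaddr)
		return TRUE;	/* unaliased page: the rooted entry matches */

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pmap, vaddr);
	LOCK_PV_HASH(pvhash_idx);
	for (pvh_e = *pvhash(pvhash_idx); pvh_e != PV_HASHED_ENTRY_NULL;
	     pvh_e = pvh_e->nexth) {
		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
			break;
	}
	UNLOCK_PV_HASH(pvhash_idx);
	return (pvh_e != PV_HASHED_ENTRY_NULL);
}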

/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain,
 * properly dealing with the anchor.
 * Must be called with the hash locked; it does not unlock it.
 */

static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor");	/* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh)
		panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

/*
 * Returns TRUE if 'distance' has at most one bit set, i.e. if the two
 * values whose XOR produced it differ in at most one bit position.
 */
static inline boolean_t
popcnt1(uint64_t distance)
{
	return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable
 * corruption incidents observed in the field.  These can be either software
 * induced (wild stores to the mapwindows where applicable, use-after-free
 * errors (typically of pages addressed physically), misdirected DMAs, etc.)
 * or due to DRAM/memory hierarchy/interconnect errors.  Given the theoretical
 * rarity of these errors, the recording mechanism is deliberately not MP-safe.
 * The overarching goal is to still assert on potential software races, but
 * attempt recovery from incidents identifiable as occurring due to issues
 * beyond the control of the pmap module.  The latter includes single-bit
 * errors and malformed pagetable entries.  We currently limit ourselves to
 * recovery/suppression of one incident per PMAP_PAGETABLE_CORRUPTION_INTERVAL
 * seconds, and details of the incident are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t		incident;
	pmap_pagetable_corruption_t	reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t				pmap;
	vm_map_offset_t			vaddr;
	pt_entry_t			pte;
	ppnum_t				ppn;
	pmap_t				pvpmap;
	vm_map_offset_t			pvva;
	uint64_t			abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
	ppnum_t suppress_ppn = 0;
	pt_entry_t cpte = *ptep;
	ppnum_t	cpn = pa_index(pte_to_pa(cpte));
	ppnum_t	ppn = *ppnp;
	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t pv_e = pv_h;
	uint32_t bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = pv_h->va;
	boolean_t ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc.  These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while ((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	} else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	} else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	} else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */

static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (pv_h->pmap == PMAP_NULL) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		} else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that.  If there is a queued
		 * entry after this one, we remove it from the
		 * ppn queue and the hash chain, copy it to the
		 * rooted entry, and then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * Not removing the rooted pv.  Find it on the hash chain,
		 * remove it from the ppn queue and the hash chain, and free it.
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}
		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}
		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}

#endif /* MACH_KERNEL_PRIVATE */