/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/pmap.h>
#include <sys/kdebug.h>

#ifdef MACH_KERNEL_PRIVATE

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {               \
        simple_lock(&(pmap)->lock);     \
}

#define PMAP_UNLOCK(pmap) {             \
        simple_unlock(&(pmap)->lock);   \
}

#define PMAP_UPDATE_TLBS(pmap, s, e)    \
        pmap_flush_tlbs(pmap, s, e)

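/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * interface): the macros above are typically used to bracket an update of a
 * pmap's page tables, with a TLB flush of the affected VA range before the
 * lock is dropped.  The helper below is hypothetical, and the real code is
 * more careful about how PTE bits are updated; pmap_pte() is defined near
 * the end of this file.
 *
 *      static void
 *      example_write_protect(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end)
 *      {
 *              vm_map_offset_t va;
 *              pt_entry_t      *ptep;
 *
 *              PMAP_LOCK(pmap);
 *              for (va = start; va < end; va += PAGE_SIZE) {
 *                      ptep = pmap_pte(pmap, va);
 *                      if (ptep && (*ptep & INTEL_PTE_VALID))
 *                              *ptep &= ~INTEL_PTE_WRITE;      // clear write permission
 *              }
 *              PMAP_UPDATE_TLBS(pmap, start, end);             // flush before unlocking
 *              PMAP_UNLOCK(pmap);
 *      }
 */
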
#define iswired(pte)    ((pte) & INTEL_PTE_WIRED)

#ifdef PMAP_TRACES
extern boolean_t        pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)                         \
        if (pmap_trace) {                               \
                KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);     \
        }
#else
#define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)                \
        KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);             \

void            pmap_expand_pml4(
                        pmap_t          map,
                        vm_map_offset_t v);

void            pmap_expand_pdpt(
                        pmap_t          map,
                        vm_map_offset_t v);

void            phys_attribute_set(
                        ppnum_t         phys,
                        int             bits);

void            pmap_set_reference(
                        ppnum_t pn);

boolean_t       phys_page_exists(
                        ppnum_t pn);

void pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

#if CONFIG_YONAH
extern boolean_t cpu_64bit;
#else
extern const boolean_t cpu_64bit;
#endif

/*
 *      Private data structures.
 */

/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page.  An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B. with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains, as there is no
 * way to unlink the singly linked hashed entries if navigated to
 * via the queue list off the rooted entries.  Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list.  All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
        struct pv_entry *next;
        pmap_t          pmap;
        vm_map_offset_t va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain from
keeping freed pv's around for reuse and not suffering the overhead of zalloc
for every new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that case,
we basically do this

        for (every page/pte in the space) {
                calc pve_ptr from the ppn in the pte
                for (every pv in the list for the ppn) {
                        if (this pv is for this pmap/vaddr) {
                                do housekeeping
                                unlink/free the pv
                        }
                }
        }

The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all terminate.  The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, the likely cache miss for each node, and
the interrupt disabling needed for MP safety, the system became completely
unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
        queue_head_t    qlink;
        vm_map_offset_t va;
        pmap_t          pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
        queue_head_t    qlink;
        vm_map_offset_t va;
        pmap_t          pmap;
        ppnum_t         ppn;
        struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/
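
/*
 * Illustrative lookup sketch (editor's addition): with the combined
 * rooted/hashed scheme described above, finding the pv entry for a given
 * (pmap, vaddr, ppn) first checks the rooted entry for the page and only
 * then walks the hash bucket.  A simplified sketch of that flow, using the
 * pai_to_pvh()/pvhash()/pvhashidx() helpers defined later in this header and
 * assuming the pv-hash lock and the per-page lock are already held; the real
 * code is in pmap_pv_remove() below and in pmap_enter().
 *
 *      pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *
 *      if (pv_h->pmap == pmap && pv_h->va == vaddr) {
 *              // hit in the rooted entry; no hash walk needed
 *      } else {
 *              pv_hashed_entry_t pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *              while (pvh_e != PV_HASHED_ENTRY_NULL) {
 *                      if (pvh_e->pmap == pmap && pvh_e->va == vaddr &&
 *                          pvh_e->ppn == ppn)
 *                              break;          // found the aliased mapping
 *                      pvh_e = pvh_e->nexth;
 *              }
 *      }
 */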

typedef struct pv_rooted_entry {
        /* first three entries must match pv_hashed_entry_t */
        queue_head_t    qlink;
        vm_map_offset_t va;     /* virtual address for mapping */
        pmap_t          pmap;   /* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL    ((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
        /* first three entries must match pv_rooted_entry_t */
        queue_head_t    qlink;
        vm_map_offset_t va;
        pmap_t          pmap;
        ppnum_t         ppn;
        struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

//#define PV_DEBUG 1    /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if (0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH 4095    /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t mappingrecurse;
extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)      lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)    unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t        *pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t        pv_hashed_free_list;
extern pv_hashed_entry_t        pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t           pv_hashed_list_zone;    /* zone of pv_hashed_entry
                                                 * structures */

extern uint32_t         pv_hashed_free_count;
extern uint32_t         pv_hashed_kern_free_count;
/*
 * Each entry in the pv_head_table is locked by a bit in the
 * pv_lock_table.  The lock bits are accessed by the address of
 * the frame they lock.
 */
#define pv_lock_table_size(n)           (((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)      (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char     *pv_lock_table;         /* pointer to array of bits */
extern char     *pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {

        simple_lock(&pv_hashed_free_list_lock);
        /* If the kernel reserved pool is low, let non-kernel mappings allocate
         * synchronously, possibly subject to a throttle.
         */
        if ((pv_hashed_kern_free_count >= pv_hashed_kern_low_water_mark) &&
            (*pvh_ep = pv_hashed_free_list) != 0) {
                pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
                pv_hashed_free_count--;
        }

        simple_unlock(&pv_hashed_free_list_lock);

        if (pv_hashed_free_count < pv_hashed_low_water_mark) {
                if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
                        thread_wakeup(&mapping_replenish_event);
        }
}

static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
        simple_lock(&pv_hashed_free_list_lock);
        pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
        pv_hashed_free_list = pvh_eh;
        pv_hashed_free_count += pv_cnt;
        simple_unlock(&pv_hashed_free_list_lock);
}
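
/*
 * Illustrative usage sketch (editor's addition): PV_HASHED_FREE_LIST()
 * takes the head and tail of an already-chained batch of entries plus a
 * count, so callers that tear down many mappings can return entries to the
 * free list with a single lock acquisition.  A minimal sketch of the
 * batching pattern, with pvh_e standing in for each entry being released:
 *
 *      pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;        // batch head
 *      pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;        // batch tail
 *      int pv_cnt = 0;
 *
 *      // for each pv_hashed_entry_t pvh_e removed from a pv list:
 *      pvh_e->qlink.next = (queue_entry_t)pvh_eh;
 *      pvh_eh = pvh_e;
 *      if (pvh_et == PV_HASHED_ENTRY_NULL)
 *              pvh_et = pvh_e;
 *      pv_cnt++;
 *
 *      // once the batch is complete:
 *      if (pv_cnt)
 *              PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
 */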

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
        simple_lock(&pv_hashed_kern_free_list_lock);

        if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
                pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
                pv_hashed_kern_free_count--;
                pmap_kern_reserve_alloc_stat++;
        }

        simple_unlock(&pv_hashed_kern_free_list_lock);

        if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
                if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
                        thread_wakeup(&mapping_replenish_event);
        }
}

static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
        simple_lock(&pv_hashed_kern_free_list_lock);
        pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
        pv_hashed_kern_free_list = pvh_eh;
        pv_hashed_kern_free_count += pv_cnt;
        simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void pmap_pv_throttle(__unused pmap_t p) {
        pmap_assert(p != kernel_pmap);
        /* Apply throttle on non-kernel mappings */
        if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
                pmap_pv_throttle_stat++;
                /* This doesn't need to be strictly accurate, merely a hint
                 * to eliminate the timeout when the reserve is replenished.
                 */
                pmap_pv_throttled_waiters++;
                assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
                thread_block(THREAD_CONTINUE_NULL);
        }
}
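
/*
 * Illustrative allocation sketch (editor's addition): per the PV HASHING
 * comment above, a user-pmap mapping first tries the cached free list and
 * falls back to a (possibly throttled) blocking zalloc, while kernel-pmap
 * mappings draw from the dedicated reserve.  A simplified sketch, assuming
 * the caller is not yet holding the pmap or PV locks (zalloc may block):
 *
 *      pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;
 *
 *      if (pmap == kernel_pmap) {
 *              PV_HASHED_KERN_ALLOC(&pvh_e);           // reserve pool, never blocks
 *      } else {
 *              PV_HASHED_ALLOC(&pvh_e);                // cached entries, if plentiful
 *              if (pvh_e == PV_HASHED_ENTRY_NULL) {
 *                      pmap_pv_throttle(pmap);         // back off if the reserve is low
 *                      pvh_e = (pv_hashed_entry_t)zalloc(pv_hashed_list_zone);
 *              }
 *      }
 */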

/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)            (i386_btop(pa))
#define ppn_to_pai(ppn)         ((int)ppn)

#define pai_to_pvh(pai)         (&pv_head_table[pai])
#define lock_pvh_pai(pai)       bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)     bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)             (&pv_hash_table[idx])
#define lock_hash_hash(hash)    bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)  bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)                              \
        ((unsigned int)(x) <= last_managed_page &&      \
         (pmap_phys_attributes[x] & PHYS_MANAGED))
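
/*
 * Illustrative indexing sketch (editor's addition): a physical page number
 * is converted to a pv_head_table index (pai), which also selects the page's
 * lock bit in pv_lock_table and its attribute byte in pmap_phys_attributes.
 * A minimal sketch of examining a page's pv list; LOCK_PVH()/UNLOCK_PVH()
 * are defined below and disable preemption while the per-page lock is held.
 *
 *      int               pai  = ppn_to_pai(ppn);
 *      pv_rooted_entry_t pv_h = pai_to_pvh(pai);
 *
 *      if (IS_MANAGED_PAGE(pai)) {
 *              LOCK_PVH(pai);
 *              if (pv_h->pmap != PMAP_NULL) {
 *                      // walk pv_h->qlink here while the per-page lock is held
 *              }
 *              UNLOCK_PVH(pai);
 *      }
 */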

/*
 *      Physical page attributes.  Copy bits from PTE definition.
 */
#define PHYS_MODIFIED   INTEL_PTE_MOD   /* page modified */
#define PHYS_REFERENCED INTEL_PTE_REF   /* page referenced */
#define PHYS_MANAGED    INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT  INTEL_PTE_USER  /* no need to encrypt this page in the hibernation image */
#define PHYS_NCACHE     INTEL_PTE_NCACHE
#define PHYS_PTA        INTEL_PTE_PTA
#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)

/*
 * Amount of virtual memory mapped by one
 * page-directory entry.
 */
#define PDE_MAPPED_SIZE (pdetova(1))


/*
 *      Locking and TLB invalidation
 */

/*
 * Locking Protocols: (changed 2/2007 JK)
 *
 * There are two structures in the pmap module that need locking:
 * the pmaps themselves, and the per-page pv_lists (which are locked
 * by locking the pv_lock_table entry that corresponds to the pv_head
 * for the list in question.)  Most routines want to lock a pmap and
 * then do operations in it that require pv_list locking -- however
 * pmap_remove_all and pmap_copy_on_write operate on a physical page
 * basis and want to do the locking in the reverse order, i.e. lock
 * a pv_list and then go through all the pmaps referenced by that list.
 *
 * The system-wide pmap lock has been removed.  Now, paths take a lock
 * on the pmap before changing its 'shape', and the reverse-order lockers
 * (coming in by phys ppn) take a lock on the corresponding pv and then
 * retest to be sure nothing changed during the window before they locked;
 * they can then run up/down the pv lists holding the list lock.  This also
 * lets the pmap layer run (nearly completely) with interrupts enabled,
 * unlike previously.
 */
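
/*
 * Illustrative sketch (editor's addition) of the "reverse order" path
 * described above: a physically-based operation locks the per-page pv list
 * first, then visits each pmap referenced by that list.  A simplified sketch
 * of the walk, assuming pai refers to a managed page; the real code (e.g.
 * pmap_page_protect) also rechecks the PTE under the per-page lock before
 * modifying it.  LOCK_PVH()/UNLOCK_PVH() are defined just below.
 *
 *      pv_rooted_entry_t pv_h = pai_to_pvh(pai);
 *      pv_rooted_entry_t pv_e;
 *
 *      LOCK_PVH(pai);
 *      if (pv_h->pmap != PMAP_NULL) {
 *              pv_e = pv_h;
 *              do {
 *                      pmap_t          pmap = pv_e->pmap;
 *                      vm_map_offset_t va   = pv_e->va;
 *                      pt_entry_t      *pte = pmap_pte(pmap, va);
 *
 *                      // ... modify *pte, then PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE) ...
 *                      pv_e = (pv_rooted_entry_t)queue_next(&pv_e->qlink);
 *              } while (pv_e != pv_h);
 *      }
 *      UNLOCK_PVH(pai);
 */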

/*
 * PV locking
 */

#define LOCK_PVH(index) {               \
        mp_disable_preemption();        \
        lock_pvh_pai(index);            \
}

#define UNLOCK_PVH(index) {             \
        unlock_pvh_pai(index);          \
        mp_enable_preemption();         \
}

extern uint64_t pde_mapped_size;

extern char             *pmap_phys_attributes;
extern unsigned int     last_managed_page;

extern ppnum_t  lowest_lo;
extern ppnum_t  lowest_hi;
extern ppnum_t  highest_hi;

/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {                                                    \
        if (processor_avail_count > 1 && !ml_get_interrupts_enabled())          \
                panic("pmap interrupt assert %s, %d", __FILE__, __LINE__);      \
}
#else
#define pmap_intr_assert()
#endif

extern int              nx_enabled;
extern unsigned int     inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
        return ((uint32_t)(uintptr_t)pmap ^
                ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
               npvhash;
}
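
/*
 * Illustrative note (editor's addition): pvhashidx() mixes the pmap pointer
 * with the virtual page number and masks the result with npvhash, which is
 * expected to be of the form 2^N - 1 (see NPVHASH above), so the index always
 * falls within the hash table.  A minimal sketch of selecting and locking a
 * bucket, assuming the entry is linked or searched while the bucket lock is
 * held:
 *
 *      int pvhash_idx = pvhashidx(pmap, vaddr);
 *      pv_hashed_entry_t *bucket = pvhash(pvhash_idx);
 *
 *      LOCK_PV_HASH(pvhash_idx);
 *      // ... link a pv_hashed_entry_t onto *bucket, or walk it via ->nexth ...
 *      UNLOCK_PV_HASH(pvhash_idx);
 */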


/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
        pv_hashed_entry_t       curh;
        pv_hashed_entry_t       *pprevh;
        int                     pvhash_idx;

        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

        pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
        if (NULL == *pprevh)
                panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
        curh = *pprevh;

        while (PV_HASHED_ENTRY_NULL != curh) {
                if (pvh == curh)
                        break;
                pprevh = &curh->nexth;
                curh = curh->nexth;
        }
        if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
        *pprevh = pvh->nexth;
        return;
}

static inline void
pv_hash_add(pv_hashed_entry_t   pvh_e,
            pv_rooted_entry_t   pv_h)
{
        pv_hashed_entry_t       *hashp;
        int                     pvhash_idx;

        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
        LOCK_PV_HASH(pvhash_idx);
        insque(&pvh_e->qlink, &pv_h->qlink);
        hashp = pvhash(pvhash_idx);
#if PV_DEBUG
        if (NULL == hashp)
                panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
        pvh_e->nexth = *hashp;
        *hashp = pvh_e;
        UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
        int pvhash_idx;

        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
        LOCK_PV_HASH(pvhash_idx);
        remque(&pvh_e->qlink);
        pmap_pvh_unlink(pvh_e);
        UNLOCK_PV_HASH(pvhash_idx);
}

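/*
 * Editor's note: popcnt1() below returns TRUE when its argument has at most
 * one bit set, i.e. when the two values whose XOR produced it differ in at
 * most one bit position (Hamming distance 0 or 1).  For example,
 * popcnt1(0x40) and popcnt1(0) are TRUE, while popcnt1(0x41) is FALSE.  The
 * corruption-recovery code below uses it to detect probable single-bit flips
 * in pv entries and page frame numbers.
 */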
static inline boolean_t popcnt1(uint64_t distance) {
        return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
        PTE_VALID                = 0x0,
        PTE_INVALID              = 0x1,
        PTE_RSVD                 = 0x2,
        PTE_SUPERVISOR           = 0x4,
        PTE_BITFLIP              = 0x8,
        PV_BITFLIP               = 0x10,
        PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
        ROOT_PRESENT = 0,
        ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
        PMAP_ACTION_IGNORE       = 0x0,
        PMAP_ACTION_ASSERT       = 0x1,
        PMAP_ACTION_RETRY        = 0x2,
        PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
        pmap_pv_assertion_t incident;
        pmap_pagetable_corruption_t reason;
        pmap_pagetable_corruption_action_t action;
        pmap_t          pmap;
        vm_map_offset_t vaddr;
        pt_entry_t      pte;
        ppnum_t         ppn;
        pmap_t          pvpmap;
        vm_map_offset_t pvva;
        uint64_t        abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t    pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
        uint32_t pmap_pagetable_corruption_log_index;
        pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
        /* Asynchronously log */
        thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
        pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
        pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
        ppnum_t suppress_ppn = 0;
        pt_entry_t cpte = *ptep;
        ppnum_t cpn = pa_index(pte_to_pa(cpte));
        ppnum_t ppn = *ppnp;
        pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
        pv_rooted_entry_t pv_e = pv_h;
        uint32_t        bitdex;
        pmap_t pvpmap = pv_h->pmap;
        vm_map_offset_t pvva = pv_h->va;
        boolean_t ppcd = FALSE;

        /* Ideally, we'd consult the Mach VM here to definitively determine
         * the nature of the mapping for this address space and address.
         * As that would be a layering violation in this context, we
         * use various heuristics to recover from single bit errors,
         * malformed pagetable entries etc.  These are not intended
         * to be comprehensive.
         */

        /* As a precautionary measure, mark A+D */
        pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

        /*
         * Correct potential single bit errors in either (but not both) element
         * of the PV
         */
        do {
                if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
                    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
                        pv_e->pmap = pmap;
                        pv_e->va = vaddr;
                        suppress_reason = PV_BITFLIP;
                        action = PMAP_ACTION_RETRY;
                        goto pmap_cpc_exit;
                }
        } while ((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);

        /* Discover root entries with a Hamming
         * distance of 1 from the supplied
         * physical page frame.
         */
        for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
                ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
                if (IS_MANAGED_PAGE(npn)) {
                        pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
                        if (npv_h->va == vaddr && npv_h->pmap == pmap) {
                                suppress_reason = PTE_BITFLIP;
                                suppress_ppn = npn;
                                action = PMAP_ACTION_RETRY_RELOCK;
                                UNLOCK_PVH(ppn_to_pai(ppn));
                                *ppnp = npn;
                                goto pmap_cpc_exit;
                        }
                }
        }

        if (pmap == kernel_pmap) {
                action = PMAP_ACTION_ASSERT;
                goto pmap_cpc_exit;
        }

        /* Check for malformed/inconsistent entries */

        if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
                action = PMAP_ACTION_IGNORE;
                suppress_reason = PTE_INVALID_CACHEABILITY;
        } else if (cpte & INTEL_PTE_RSVD) {
                action = PMAP_ACTION_IGNORE;
                suppress_reason = PTE_RSVD;
        } else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
                action = PMAP_ACTION_IGNORE;
                suppress_reason = PTE_SUPERVISOR;
        }
pmap_cpc_exit:
        PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

        if (debug_boot_arg && !ppcd) {
                action = PMAP_ACTION_ASSERT;
        }

        if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
                action = PMAP_ACTION_ASSERT;
                pmap_pagetable_corruption_timeout = TRUE;
        } else {
                pmap_pagetable_corruption_last_abstime = mach_absolute_time();
        }
        pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
        return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t           pmap,
               vm_map_offset_t  vaddr,
               ppnum_t          *ppnp,
               pt_entry_t       *pte)
{
        pv_hashed_entry_t       pvh_e;
        pv_rooted_entry_t       pv_h;
        pv_hashed_entry_t       *pprevh;
        int                     pvhash_idx;
        uint32_t                pv_cnt;
        ppnum_t                 ppn;

pmap_pv_remove_retry:
        ppn = *ppnp;
        pvh_e = PV_HASHED_ENTRY_NULL;
        pv_h = pai_to_pvh(ppn_to_pai(ppn));

        if (pv_h->pmap == PMAP_NULL) {
                pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
                if (pac == PMAP_ACTION_IGNORE)
                        goto pmap_pv_remove_exit;
                else if (pac == PMAP_ACTION_ASSERT)
                        panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
                else if (pac == PMAP_ACTION_RETRY_RELOCK) {
                        LOCK_PVH(ppn_to_pai(*ppnp));
                        pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
                        goto pmap_pv_remove_retry;
                } else if (pac == PMAP_ACTION_RETRY)
                        goto pmap_pv_remove_retry;
        }

        if (pv_h->va == vaddr && pv_h->pmap == pmap) {
                /*
                 * Header is the pv_rooted_entry.
                 * We can't free that.  If there is a queued
                 * entry after this one we remove that
                 * from the ppn queue, we remove it from the hash chain
                 * and copy it to the rooted entry.  Then free it instead.
                 */
                pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
                if (pv_h != (pv_rooted_entry_t) pvh_e) {
                        /*
                         * Entry queued to root, remove this from hash
                         * and install as new root.
                         */
                        CHK_NPVHASH();
                        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                        LOCK_PV_HASH(pvhash_idx);
                        remque(&pvh_e->qlink);
                        pprevh = pvhash(pvhash_idx);
                        if (PV_HASHED_ENTRY_NULL == *pprevh) {
                                panic("pmap_pv_remove(%p,0x%llx,0x%x): "
                                      "empty hash, removing rooted",
                                      pmap, vaddr, ppn);
                        }
                        pmap_pvh_unlink(pvh_e);
                        UNLOCK_PV_HASH(pvhash_idx);
                        pv_h->pmap = pvh_e->pmap;
                        pv_h->va = pvh_e->va;   /* dispose of pvh_e */
                } else {
                        /* none queued after rooted */
                        pv_h->pmap = PMAP_NULL;
                        pvh_e = PV_HASHED_ENTRY_NULL;
                }
        } else {
                /*
                 * not removing rooted pv.  find it on hash chain, remove from
                 * ppn queue and hash chain and free it
                 */
                CHK_NPVHASH();
                pvhash_idx = pvhashidx(pmap, vaddr);
                LOCK_PV_HASH(pvhash_idx);
                pprevh = pvhash(pvhash_idx);
                if (PV_HASHED_ENTRY_NULL == *pprevh) {
                        panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash",
                              pmap, vaddr, ppn);
                }
                pvh_e = *pprevh;
                pmap_pv_hashlist_walks++;
                pv_cnt = 0;
                while (PV_HASHED_ENTRY_NULL != pvh_e) {
                        pv_cnt++;
                        if (pvh_e->pmap == pmap &&
                            pvh_e->va == vaddr &&
                            pvh_e->ppn == ppn)
                                break;
                        pprevh = &pvh_e->nexth;
                        pvh_e = pvh_e->nexth;
                }

                if (PV_HASHED_ENTRY_NULL == pvh_e) {
                        pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

                        if (pac == PMAP_ACTION_ASSERT)
                                panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
                        else {
                                UNLOCK_PV_HASH(pvhash_idx);
                                if (pac == PMAP_ACTION_RETRY_RELOCK) {
                                        LOCK_PVH(ppn_to_pai(*ppnp));
                                        pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
                                        goto pmap_pv_remove_retry;
                                } else if (pac == PMAP_ACTION_RETRY) {
                                        goto pmap_pv_remove_retry;
                                } else if (pac == PMAP_ACTION_IGNORE) {
                                        goto pmap_pv_remove_exit;
                                }
                        }
                }

                pmap_pv_hashlist_cnts += pv_cnt;
                if (pmap_pv_hashlist_max < pv_cnt)
                        pmap_pv_hashlist_max = pv_cnt;
                *pprevh = pvh_e->nexth;
                remque(&pvh_e->qlink);
                UNLOCK_PV_HASH(pvhash_idx);
        }
pmap_pv_remove_exit:
        return pvh_e;
}
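
/*
 * Illustrative caller sketch (editor's addition): pmap_pv_remove() returns
 * the pv_hashed_entry_t that should be recycled, or PV_HASHED_ENTRY_NULL if
 * the rooted entry absorbed the removal.  Teardown paths typically chain the
 * returned entries and give them back in bulk, following the batching
 * pattern shown after PV_HASHED_FREE_LIST() above:
 *
 *      pv_hashed_entry_t pvh_e = pmap_pv_remove(pmap, vaddr, &ppn, ptep);
 *
 *      if (pvh_e != PV_HASHED_ENTRY_NULL) {
 *              pvh_e->qlink.next = (queue_entry_t)pvh_eh;
 *              pvh_eh = pvh_e;
 *              if (pvh_et == PV_HASHED_ENTRY_NULL)
 *                      pvh_et = pvh_e;
 *              pv_cnt++;
 *      }
 *      // ... later: PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); ...
 */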


extern int      pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(vm_size_t bytes)
{
        thread_t thr = current_thread();
        task_t task;
        zinfo_usage_t zinfo;

        thr->tkm_private.alloc += bytes;
        if (pt_fake_zone_index != -1 &&
            (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
                OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(vm_size_t bytes)
{
        thread_t thr = current_thread();
        task_t task;
        zinfo_usage_t zinfo;

        thr->tkm_private.free += bytes;
        if (pt_fake_zone_index != -1 &&
            (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
                OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

extern boolean_t        pmap_initialized;       /* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE   ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int             phys_attribute_test(
                        ppnum_t phys,
                        int     bits);
void            phys_attribute_clear(
                        ppnum_t phys,
                        int     bits);

//#define PCID_DEBUG 1
#if PCID_DEBUG
#define pmap_pcid_log(fmt, args...)             \
        do {                                    \
                kprintf(fmt, ##args);           \
                printf(fmt, ##args);            \
        } while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void    pmap_pcid_configure(void);

#if defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel VM.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
#if PMAP_ASSERT
        return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
        return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
        pml4_entry_t    newpf;
        pml4_entry_t    *pml4;

        assert(pmap);
        if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
            (vaddr < 0xFFFF800000000000ULL)) {
                return (0);
        }

        pml4 = pmap64_pml4(pmap, vaddr);
        if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
                newpf = *pml4 & PG_FRAME;
                return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
                        [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
        }
        return (NULL);
}
/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
        pdpt_entry_t    newpf;
        pdpt_entry_t    *pdpt;

        assert(pmap);
        if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
            (vaddr < 0xFFFF800000000000ULL)) {
                return (0);
        }

        pdpt = pmap64_pdpt(pmap, vaddr);

        if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
                newpf = *pdpt & PG_FRAME;
                return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
                        [(vaddr >> PDSHIFT) & (NPDPG-1)];
        }
        return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
        pd_entry_t      *pde;

        assert(m);
        pde = pmap64_pde(m, v);

        return pde;
}


/*
 * return address of mapped pte for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
        pd_entry_t      *pde;
        pd_entry_t      newpf;

        assert(pmap);
        pde = pmap_pde(pmap, vaddr);

        if (pde && ((*pde & INTEL_PTE_VALID))) {
                if (*pde & INTEL_PTE_PS)
                        return pde;
                newpf = *pde & PG_FRAME;
                return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
                        [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
        }
        return (NULL);
}
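
/*
 * Illustrative translation sketch (editor's addition): chaining the walkers
 * above resolves a virtual address to a physical address entirely through
 * the physmap.  A minimal sketch under the assumption that the pmap is
 * locked so the hierarchy cannot be torn down underneath us; the function
 * name is hypothetical, and pte_to_pa()/i386_btop() come from the x86 pmap
 * headers.
 *
 *      static uint64_t
 *      example_pmap_resolve(pmap_t pmap, vm_map_offset_t vaddr)
 *      {
 *              pt_entry_t *ptep = pmap_pte(pmap, vaddr);
 *
 *              if (ptep == NULL || !(*ptep & INTEL_PTE_VALID))
 *                      return 0;       // unmapped, or in the non-canonical hole
 *              if (*ptep & INTEL_PTE_PS)       // superpage: ptep is really the PDE
 *                      return (pte_to_pa(*ptep) & ~((uint64_t)PDE_MAPPED_SIZE - 1)) +
 *                             (vaddr & (PDE_MAPPED_SIZE - 1));
 *              return pte_to_pa(*ptep) + (vaddr & PAGE_MASK);
 *      }
 */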
#endif
#endif /* MACH_KERNEL_PRIVATE */