/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/pmap.h>
#include <sys/kdebug.h>

#ifdef MACH_KERNEL_PRIVATE

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)	\
	pmap_flush_tlbs(pmap, s, e)

#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)

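/*
 * Usage sketch (illustrative only; the PTE update step is a placeholder,
 * not a code path defined in this header).  A pmap's shape is changed only
 * under its simple lock, and the TLBs of all CPUs on which the pmap is
 * active are flushed before the lock is dropped:
 *
 *	PMAP_LOCK(pmap);
 *	... modify the PTEs covering [s, e) ...
 *	PMAP_UPDATE_TLBS(pmap, s, e);
 *	PMAP_UNLOCK(pmap);
 */
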
#ifdef	PMAP_TRACES
extern	boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)				\
	if (pmap_trace) {				\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);	\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)		\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);		\

void		pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v);

void		pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(
			ppnum_t		pn);

void		pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

#if CONFIG_YONAH
extern boolean_t cpu_64bit;
#else
extern const boolean_t cpu_64bit;
#endif

/*
 *	Private data structures.
 */

/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page.  An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B.  with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains, as there is no
 * way to unlink the singly linked hashed entries if navigated to
 * via the queue list off the rooted entries.  Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list.  All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
	struct pv_entry	*next;
	pmap_t		pmap;
	vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings, it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) Apache or
other processes and one or all of them terminated.  The list hanging off each
pv array entry could have thousands of entries.  We were continuously linearly
searching each of these lists as we stepped through the address space we were
tearing down.  Because of the locks we hold, the likely cache miss for each
node, and the interrupt disabling required for MP safety, the system became
completely unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures: a "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since the vast majority of pages in the system are not aliased,
and hence are represented by a single pv entry, I've kept the rooted entry size
as small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
	ppnum_t		ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/

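/*
 * Illustrative lookup sketch (uses the declarations that follow in this
 * header; it is not itself a routine defined here).  Code looking up a
 * specific pv entry checks the rooted entry first, and only then hashes
 * [pmap, vaddr] and walks the short hash chain:
 *
 *	pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *	if (pv_h->pmap == pmap && pv_h->va == vaddr)
 *		return pv_h;
 *	pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *	while (pvh_e != PV_HASHED_ENTRY_NULL) {
 *		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
 *			return pvh_e;
 *		pvh_e = pvh_e->nexth;
 *	}
 *	return NULL;
 */
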
typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

//#define PV_DEBUG 1	/* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH 4095	/* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t mappingrecurse;
extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry
						 * structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;

/*
 * Each entry in the pv_head_table is locked by a bit in the
 * pv_lock_table.  The lock bits are accessed by the address of
 * the frame they lock.
 */
#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;		/* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {

	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, fail this fast-path allocation
	 * so that non-kernel mappings fall back to allocating synchronously,
	 * possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count >= pv_hashed_kern_low_water_mark) &&
	    (*pvh_ep = pv_hashed_free_list) != 0) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count < pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}

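/*
 * Illustrative allocation sketch (mirrors the pmap_enter() behavior
 * described above; it is not a routine defined in this header).  User
 * pmaps may block and refill from pv_hashed_list_zone, while the kernel
 * pmap must be satisfied from the reserve:
 *
 *	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;
 *
 *	if (pmap == kernel_pmap) {
 *		PV_HASHED_KERN_ALLOC(&pvh_e);
 *	} else {
 *		PV_HASHED_ALLOC(&pvh_e);
 *		if (pvh_e == PV_HASHED_ENTRY_NULL) {
 *			pmap_pv_throttle(pmap);
 *			... drop locks, zalloc from pv_hashed_list_zone, retry ...
 *		}
 *	}
 */
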
/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define	PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define	PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK	(INTEL_PTE_PTA | INTEL_PTE_NCACHE)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system wide pmap lock has been removed.  Now, paths take a lock
 *	on the pmap before changing its 'shape' and the reverse order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked
 *	and can then run up/down the pv lists holding the list lock.  This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
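
/*
 * Illustrative sketch of the reverse-order path (coming in by ppn; not a
 * routine defined in this header).  The pv_head lock is taken first and
 * the rooted entry is re-tested before walking the list, since the pmap
 * may have changed shape before the lock was obtained:
 *
 *	LOCK_PVH(pai);
 *	pv_h = pai_to_pvh(pai);
 *	if (pv_h->pmap != PMAP_NULL) {
 *		... run the qlink list, acting on each (pmap, va) found ...
 *	}
 *	UNLOCK_PVH(pai);
 */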

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern unsigned int	last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * When spinning through pmap_remove, ensure that we don't spend too much
 * time with preemption disabled.  The current threshold is set to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {						\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())	\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
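
/*
 * Example: with NPVHASH == 4095 (0xFFF), the final "& npvhash" above
 * reduces the mixed pmap/VPN value to a bucket index in [0, 4095]
 * (e.g. 0x8badf00d & 0xFFF == 0x00d); this only works because npvhash
 * is of the form 2^N - 1.
 */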

/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * Properly deals with the anchor.
 * Must be called with the hash locked; does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL==hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}

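/*
 * Example: popcnt1(0x10) and popcnt1(0) are TRUE (at most one bit set),
 * while popcnt1(0x11) is FALSE.  The corruption-classification code below
 * XORs two values and uses this to detect a Hamming distance of at most 1,
 * i.e. a potential single-bit flip.
 */
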
/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled.  (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

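/*
 * Illustrative dispatch sketch (the real consumer is pmap_pv_remove()
 * below); callers act on the returned action:
 *
 *	switch (pmap_classify_pagetable_corruption(pmap, vaddr, &ppn, pte, ROOT_ABSENT)) {
 *	case PMAP_ACTION_IGNORE:	... skip the entry ...			break;
 *	case PMAP_ACTION_RETRY:		... redo the lookup ...			break;
 *	case PMAP_ACTION_RETRY_RELOCK:	... retake LOCK_PVH, then redo ...	break;
 *	default:			panic("pagetable corruption");
 *	}
 */
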
#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t	pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t		cpte = *ptep;
	ppnum_t			cpn = pa_index(pte_to_pa(cpte));
	ppnum_t			ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t		bitdex;
	pmap_t			pvpmap = pv_h->pmap;
	vm_map_offset_t		pvva = pv_h->va;
	boolean_t		ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc.  These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	} else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (pv_h->pmap == PMAP_NULL) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that.  If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry.  Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv.  find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash",
			      pmap, vaddr, ppn);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}

extern int	pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	thr->tkm_private.alloc += bytes;
	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	thr->tkm_private.free += bytes;
	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits);

//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)		\
	do {					\
		kprintf(fmt, ##args);		\
		printf(fmt, ##args);		\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);

#if defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns the address of the requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	assert(pmap);
	if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
	    (vaddr < 0xFFFF800000000000ULL)) {
		return (0);
	}

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}

/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	assert(pmap);
	if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
	    (vaddr < 0xFFFF800000000000ULL)) {
		return (0);
	}

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	assert(m);
	pde = pmap64_pde(m, v);

	return pde;
}


/*
 * Returns the address of the mapped PTE for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
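
/*
 * Illustrative use (a sketch, not a code path in this header): translating
 * a VA to its physical frame via the inlines above.
 *
 *	pt_entry_t *ptep = pmap_pte(pmap, vaddr);
 *	if (ptep && (*ptep & INTEL_PTE_VALID)) {
 *		ppnum_t pn = (ppnum_t) pa_index(pte_to_pa(*ptep));
 *		...
 *	}
 *
 * Note that if the PDE maps a superpage, pmap_pte() returns the PDE itself,
 * so callers that care must check INTEL_PTE_PS, as pmap_pte() does.
 */
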
#endif
#endif /* MACH_KERNEL_PRIVATE */