/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef	_I386_PMAP_INTERNAL_
#define	_I386_PMAP_INTERNAL_
#ifdef	MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)	\
	pmap_flush_tlbs(pmap, s, e, 0, NULL)


#define	PMAP_DELAY_TLB_FLUSH		0x01

#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c)	\
	pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)


#define iswired(pte)	((pte) & INTEL_PTE_WIRED)

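/*
 * Illustrative sketch (not part of the original header, never compiled):
 * the difference between the two TLB-update macros above.  PMAP_UPDATE_TLBS()
 * performs the shootdown immediately; PMAP_UPDATE_TLBS_DELAYED() records the
 * affected CPUs in a pmap_flush_context so that a batch of PTE changes can be
 * followed by a single flush.  The pmap_flush_context_init()/pmap_flush()
 * helper names are assumptions here (such helpers live in the pmap
 * implementation, not in this header), so treat this purely as a usage outline.
 */
#if 0
	pmap_flush_context	pfc;

	pmap_flush_context_init(&pfc);
	/* ... clear or modify a batch of PTEs ... */
	PMAP_UPDATE_TLBS_DELAYED(pmap, start, end, &pfc);	/* defer the IPIs */
	/* ... more PTE changes, possibly for other ranges ... */
	pmap_flush(&pfc);					/* one shootdown for the whole batch */
#endif
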
#ifdef PMAP_TRACES
extern boolean_t pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)				\
	if (pmap_trace) {				\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);	\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)	\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);

kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(
			ppnum_t		pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

extern const boolean_t cpu_64bit;

/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B. with the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries if navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
	struct pv_entry	*next;
	pmap_t		pmap;
	vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block,
so the panic remains; we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all terminate.  The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, likely taking a cache miss for each node,
and interrupt disabling for MP issues, the system became completely unresponsive
for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t	qlink;
	vm_map_offset_t	va;
	pmap_t		pmap;
	ppnum_t		ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/

typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
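
/*
 * Illustrative sketch (not part of the original header, never compiled):
 * the "linear by ppn" access pattern described in the PV HASHING comment
 * above.  Starting from the rooted entry for a physical page, every alias
 * is reached by following the circular qlink queue; the rooted entry itself
 * doubles as the queue head.  The helper name and the use of pai_to_pvh()/
 * LOCK_PVH(), which are declared later in this header, are assumptions for
 * illustration only.
 */
#if 0
static void
example_walk_pv_list(ppnum_t pn)
{
	int			pai = ppn_to_pai(pn);
	pv_rooted_entry_t	pv_h = pai_to_pvh(pai);
	pv_rooted_entry_t	pv_e;

	LOCK_PVH(pai);
	if (pv_h->pmap != PMAP_NULL) {
		pv_e = pv_h;
		do {
			/* pv_e->pmap / pv_e->va identify one mapping of pn */
			pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink);
		} while (pv_e != pv_h);
	}
	UNLOCK_PVH(pai);
}
#endif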

//#define PV_DEBUG 1		/* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH()	if (0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH 4095   /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT	5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT	2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL		2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL	200

extern volatile uint32_t	mappingrecurse;
extern uint32_t			pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;		/* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t			pv_hashed_list_zone;	/* zone of pv_hashed_entry
							 * structures */

extern uint32_t			pv_hashed_free_count;
extern uint32_t			pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */
#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char			*pv_lock_table;		/* pointer to array of bits */
extern char			*pv_hash_lock_table;
extern pv_rooted_entry_t	pv_head_table;		/* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
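
/*
 * Illustrative sketch (not part of the original header, never compiled):
 * roughly how the helpers above are meant to be combined for a user-pmap
 * mapping, per the PV HASHING comment earlier in this file.  Try the cached
 * free list first; if it is empty, apply the reserve-based throttle and
 * fall back to a blocking zalloc() from pv_hashed_list_zone.  The helper
 * name and the exact fallback sequence are assumptions for illustration;
 * the real allocation path in pmap_enter() differs in detail.
 */
#if 0
static pv_hashed_entry_t
example_alloc_user_pv(pmap_t pmap)
{
	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;

	PV_HASHED_ALLOC(&pvh_e);
	if (pvh_e == PV_HASHED_ENTRY_NULL) {
		/* Reserve may be low: throttle user mappings, then block in zalloc. */
		pmap_pv_throttle(pmap);
		pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
	}
	return pvh_e;
}
#endif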

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define	PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define	PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define	PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define	PHYS_REUSABLE	INTEL_PTE_WRITE	/* page is "reusable" */

extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question).  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however,
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system-wide pmap lock has been removed.  Now, paths take a lock
 *	on the pmap before changing its 'shape', and the reverse-order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked,
 *	and can then run up/down the pv lists holding the list lock.  This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
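
/*
 * Illustrative sketch (not part of the original header, never compiled):
 * the "pmap first, then pv" lock ordering described in the Locking
 * Protocols comment above, together with the TLB-invalidation step that
 * must follow a PTE permission change.  The helper name and the particular
 * PTE manipulation are assumptions for illustration; real callers such as
 * pmap_enter()/pmap_remove() carry considerably more state.
 */
#if 0
static void
example_write_protect_page(pmap_t pmap, vm_map_offset_t va, ppnum_t pn)
{
	pt_entry_t	*ptep;

	PMAP_LOCK(pmap);			/* forward order: the pmap ... */
	LOCK_PVH(ppn_to_pai(pn));		/* ... then the page's pv entry */

	ptep = pmap_pte(pmap, va);
	if (ptep != NULL && (*ptep & INTEL_PTE_VALID))
		pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);	/* make the mapping read-only */

	UNLOCK_PVH(ppn_to_pai(pn));
	PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);	/* shoot down stale translations */
	PMAP_UNLOCK(pmap);
}
#endif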

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * When spinning through pmap_remove, ensure that we don't spend too
 * much time with preemption disabled.  I'm setting the current
 * threshold to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
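
/*
 * Illustrative sketch (not part of the original header, never compiled):
 * the nanosecond threshold above has to be expressed in TSC ticks before it
 * can be compared against rdtsc() deltas inside the pmap_remove loop, which
 * is what max_preemption_latency_tsc holds.  The use of tmrCvt()/tscFCvtn2t
 * below is an assumption based on the usual i386 timebase-conversion
 * helpers; the actual initialization is done in the pmap startup code, not
 * in this header.
 */
#if 0
	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
#endif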

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d", __FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
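
/*
 * Worked example (added annotation, not in the original header): with the
 * default NPVHASH of 4095 (0xFFF), npvhash acts as a mask that keeps only
 * the low 12 bits of (pmap pointer XOR virtual page number); e.g. an XOR
 * result of 0x12345 yields bucket 0x12345 & 0xFFF = 0x345.  The AND only
 * behaves like a modulo because the table size is a power of two, which is
 * why the NPVHASH definition above insists on "MUST BE 2^N - 1".
 */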


/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * Properly deals with the anchor.
 * Must be called with the hash locked; does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor");	/* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh)
		panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}
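
/*
 * Worked example (added annotation, not in the original header): popcnt1()
 * reports whether at most one bit is set, i.e. whether two values differ by
 * a single bit flip when applied to their XOR.  For distance = 0b01000,
 * distance - 1 = 0b00111 and the AND is 0, so the result is TRUE; for
 * distance = 0b01010, the AND is 0b01000, so the result is FALSE.  Note that
 * a distance of 0 also returns TRUE, so callers must interpret the result in
 * context.
 */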

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.), or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t		incident;
	pmap_pagetable_corruption_t	reason;
	pmap_pagetable_corruption_action_t	action;
	pmap_t				pmap;
	vm_map_offset_t			vaddr;
	pt_entry_t			pte;
	ppnum_t				ppn;
	pmap_t				pvpmap;
	vm_map_offset_t			pvva;
	uint64_t			abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t		cpte = *ptep;
	ppnum_t			cpn = pa_index(pte_to_pa(cpte));
	ppnum_t			ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t		bitdex;
	pmap_t			pvpmap = pv_h->pmap;
	vm_map_offset_t		pvva = pv_h->va;
	boolean_t		ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc.  These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			      pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}


extern int	pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}

extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);

//#define PCID_DEBUG 1
#if PCID_DEBUG
#define pmap_pcid_log(fmt, args...)	\
	do {				\
		kprintf(fmt, ##args);	\
		printf(fmt, ##args);	\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}
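
/*
 * Added annotation (not in the original header): pmap_cmpx_pte() is simply a
 * 64-bit compare-and-swap on the PTE.  On a compiler with the GCC/Clang
 * __sync builtins, the inline assembly above is functionally equivalent to
 * the sketch below, shown for clarity only and never compiled here.
 */
#if 0
static inline boolean_t
pmap_cmpx_pte_builtin(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	/* Atomically: if (*entryp == old) { *entryp = new; return TRUE; } */
	return __sync_bool_compare_and_swap(entryp, old, new) ? TRUE : FALSE;
}
#endif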

extern uint32_t pmap_update_clear_pte_count;

static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	} while (!pmap_cmpx_pte(mptep, opte, npte));
}
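
/*
 * Illustrative sketch (not part of the original header, never compiled):
 * typical uses of pmap_update_pte().  Because the update is performed with
 * a compare-and-swap loop, concurrent hardware A/D-bit updates to the same
 * PTE are not lost.  The caller remains responsible for any TLB shootdown a
 * permission change requires; the names ptep/pmap/vaddr below are free
 * placeholders, not values defined in this header.
 */
#if 0
	/* Revoke write permission from an existing mapping ... */
	pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);
	/* ... and flush stale translations for the affected range. */
	PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);

	/* Mark a mapping wired, clearing nothing. */
	pmap_update_pte(ptep, 0, INTEL_PTE_WIRED);
#endif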

#if	defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
			 (vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns the address of the requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}
/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	pde = pmap64_pde(m, v);

	return pde;
}


/*
 * Returns the address of the mapped pte for vaddr in the given pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap64_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
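
/*
 * Illustrative sketch (not part of the original header, never compiled):
 * putting the walk helpers above together to translate a virtual address to
 * a physical address by a software page-table walk, including the superpage
 * case where the PDE itself is the leaf.  The helper name, and the use of
 * PG_FRAME/NBPDE/PAGE_MASK for the offset arithmetic, are assumptions for
 * illustration; real callers typically use pmap_pte() directly or a helper
 * along the lines of pmap_find_phys() elsewhere in the pmap code.
 */
#if 0
static inline uint64_t
example_pmap_walk_pa(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pdep = pmap64_pde(pmap, vaddr);
	pt_entry_t	*ptep;

	if (pdep == NULL || (*pdep & INTEL_PTE_VALID) == 0)
		return 0;
	if (*pdep & INTEL_PTE_PS)	/* superpage: the PDE is the leaf */
		return (*pdep & PG_FRAME) | (vaddr & (NBPDE - 1));

	ptep = pmap_pte(pmap, vaddr);
	if (ptep == NULL || (*ptep & INTEL_PTE_VALID) == 0)
		return 0;
	return (*ptep & PG_FRAME) | (vaddr & PAGE_MASK);
}
#endif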
#endif
#if DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */