[apple/xnu.git] / osfmk/i386/pmap_internal.h (blob 4f5580dfa2d7a3a3821543f56c61c89e048c789c)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 #ifndef _I386_PMAP_INTERNAL_
31 #define _I386_PMAP_INTERNAL_
32 #ifdef MACH_KERNEL_PRIVATE
33
34 #include <vm/pmap.h>
35 #include <sys/kdebug.h>
36 #include <kern/ledger.h>
37 #include <kern/simple_lock.h>
38 #include <i386/bit_routines.h>
39
40 /*
41 * pmap locking
42 */
43
44 #define PMAP_LOCK(pmap) { \
45 simple_lock(&(pmap)->lock); \
46 }
47
48 #define PMAP_UNLOCK(pmap) { \
49 simple_unlock(&(pmap)->lock); \
50 }
51
52 #define PMAP_UPDATE_TLBS(pmap, s, e) \
53 pmap_flush_tlbs(pmap, s, e, 0, NULL)
54
55
56 #define PMAP_DELAY_TLB_FLUSH 0x01
57
58 #define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \
59 pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)
60
61
62 #define iswired(pte) ((pte) & INTEL_PTE_WIRED)
63
64 #ifdef PMAP_TRACES
65 extern boolean_t pmap_trace;
66 #define PMAP_TRACE(x,a,b,c,d,e) \
67 if (pmap_trace) { \
68 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \
69 }
70 #else
71 #define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e)
72 #endif /* PMAP_TRACES */
73
74 #define PMAP_TRACE_CONSTANT(x,a,b,c,d,e) \
75 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);
76
77 kern_return_t pmap_expand_pml4(
78 pmap_t map,
79 vm_map_offset_t v,
80 unsigned int options);
81
82 kern_return_t pmap_expand_pdpt(
83 pmap_t map,
84 vm_map_offset_t v,
85 unsigned int options);
86
87 void phys_attribute_set(
88 ppnum_t phys,
89 int bits);
90
91 void pmap_set_reference(
92 ppnum_t pn);
93
94 boolean_t phys_page_exists(
95 ppnum_t pn);
96
97 void
98 pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);
99
100 void
101 pmap_update_cache_attributes_locked(ppnum_t, unsigned);
102
103 extern const boolean_t cpu_64bit;
104
105 /*
106 * Private data structures.
107 */
108
109 /*
110 * For each vm_page_t, there is a list of all currently
111 * valid virtual mappings of that page. An entry is
112 * a pv_rooted_entry_t; the list is the pv_table.
113 *
114 * N.B. with the new combo rooted/hashed scheme it is
115 * only possible to remove individual non-rooted entries
116 * if they are found via the hashed chains as there is no
117 * way to unlink the singly linked hashed entries if navigated to
118 * via the queue list off the rooted entries. Think of it as
119 * hash/walk/pull, keeping track of the prev pointer while walking
120 * the singly linked hash list. All of this is to save memory and
121 * keep both types of pv_entries as small as possible.
122 */
123
124 /*
125
126 PV HASHING Changes - JK 1/2007
127
128 Pve's establish physical to virtual mappings. These are used for aliasing of a
129 physical page to (potentially many) virtual addresses within pmaps. In the
130 previous implementation the structure of the pv_entries (each 16 bytes in size) was
131
132 typedef struct pv_entry {
133 struct pv_entry *next;
134 pmap_t pmap;
135 vm_map_offset_t va;
136 } *pv_entry_t;
137
138 An initial array of these is created at boot time, one per physical page of
139 memory, indexed by the physical page number. Additionally, a pool of entries
140 is created from a pv_zone to be used as needed by pmap_enter() when it is
141 creating new mappings. Originally, we kept this pool around because the code
142 in pmap_enter() was unable to block if it needed an entry and none were
143 available - we'd panic. Some time ago I restructured the pmap_enter() code
144 so that for user pmaps it can block while zalloc'ing a pv structure and restart,
145 removing a panic from the code (in the case of the kernel pmap we cannot block
146 and still panic, so we keep a separate hot pool for use only on kernel pmaps).
147 The pool has not been removed since there is a large performance gain keeping
148 freed pv's around for reuse and not suffering the overhead of zalloc for every
149 new pv we need.
150
151 As pmap_enter() created new mappings it linked the new pve's for them off the
152 fixed pv array for that ppn (off the next pointer). These pve's are accessed
153 for several operations, one of them being address space teardown. In that case,
154 we basically do this
155
156 for (every page/pte in the space) {
157 calc pve_ptr from the ppn in the pte
158 for (every pv in the list for the ppn) {
159 if (this pv is for this pmap/vaddr) {
160 do housekeeping
161 unlink/free the pv
162 }
163 }
164 }
165
166 The problem arose when we were running, say, 8000 (or even 2000) Apache or
167 other processes and one or all terminate. The list hanging off each pv array
168 entry could have thousands of entries. We were continuously linearly searching
169 each of these lists as we stepped through the address space we were tearing
170 down. Because of the locks we hold, the likely cache miss on each node,
171 and the interrupt disabling required for MP safety, the system became completely
172 unresponsive for many seconds while we did this.
173
174 Realizing that pve's are accessed in two distinct ways (linearly running the
175 list by ppn for operations like pmap_page_protect, and finding and
176 modifying/removing a single pve as part of pmap_enter processing) has led to
177 modifying the pve structures and databases.
178
179 There are now two types of pve structures. A "rooted" structure which is
180 basically the original structure accessed in an array by ppn, and a "hashed"
181 structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
182 designed with the two goals of minimizing wired memory and making the lookup of
183 a ppn faster. Since the vast majority of pages in the system are not aliased,
184 and hence represented by a single pv entry, I've kept the rooted entry size as
185 small as possible because there is one of these dedicated for every physical
186 page of memory. The hashed pve's are larger due to the addition of the hash
187 link and the ppn entry needed for matching while running the hash list to find
188 the entry we are looking for. This way, only systems that have lots of
189 aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
190 structures have the same first three fields allowing some simplification in
191 the code.
192
193 They have these shapes
194
195 typedef struct pv_rooted_entry {
196 queue_head_t qlink;
197 vm_map_offset_t va;
198 pmap_t pmap;
199 } *pv_rooted_entry_t;
200
201
202 typedef struct pv_hashed_entry {
203 queue_head_t qlink;
204 vm_map_offset_t va;
205 pmap_t pmap;
206 ppnum_t ppn;
207 struct pv_hashed_entry *nexth;
208 } *pv_hashed_entry_t;
209
210 The main flow difference is that the code is now aware of the rooted entry and
211 the hashed entries. Code that runs the pv list still starts with the rooted
212 entry and then continues down the qlink onto the hashed entries. Code that is
213 looking up a specific pv entry first checks the rooted entry and then hashes
214 and runs the hash list for the match. The hash list lengths are much smaller
215 than the original pv lists that contained all aliases for the specific ppn.
216
217 */
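/*
 * Illustrative sketch only (not code used by this header): locating the pv
 * entry for a given (pmap, vaddr, ppn) under the combined rooted/hashed
 * scheme described above.  The authoritative logic is in pmap_pv_remove()
 * below, and assumes the appropriate PV head/hash locks are held.
 *
 *	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *	if (pv_h->pmap == pmap && pv_h->va == vaddr) {
 *		// hit in the rooted entry -- the common, unaliased case
 *	} else {
 *		// alias: hash [pmap, vaddr] and walk the singly linked chain
 *		pv_hashed_entry_t pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *		while (pvh_e != PV_HASHED_ENTRY_NULL &&
 *		    !(pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn))
 *			pvh_e = pvh_e->nexth;
 *	}
 */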
218
219 typedef struct pv_rooted_entry {
220 /* first three entries must match pv_hashed_entry_t */
221 queue_head_t qlink;
222 vm_map_offset_t va; /* virtual address for mapping */
223 pmap_t pmap; /* pmap where mapping lies */
224 } *pv_rooted_entry_t;
225
226 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
227
228 typedef struct pv_hashed_entry {
229 /* first three entries must match pv_rooted_entry_t */
230 queue_head_t qlink;
231 vm_map_offset_t va;
232 pmap_t pmap;
233 ppnum_t ppn;
234 struct pv_hashed_entry *nexth;
235 } *pv_hashed_entry_t;
236
237 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
238
239 //#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */
240 #ifdef PV_DEBUG
241 #define CHK_NPVHASH() if (0 == npvhashmask) panic("npvhash uninitialized");
242 #else
243 #define CHK_NPVHASH()
244 #endif
245
246 #define NPVHASHBUCKETS (4096)
247 #define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */
248 #define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
249 #define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
250 #define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
251 #define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200
252
253 extern volatile uint32_t mappingrecurse;
254 extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;
255
256 /*
257 * PV hash locking
258 */
259
260 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
261 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
262 extern uint32_t npvhashmask;
263 extern pv_hashed_entry_t *pv_hash_table; /* hash lists */
264 extern pv_hashed_entry_t pv_hashed_free_list;
265 extern pv_hashed_entry_t pv_hashed_kern_free_list;
266 decl_simple_lock_data(extern, pv_hashed_free_list_lock)
267 decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
268 decl_simple_lock_data(extern, pv_hash_table_lock)
269 decl_simple_lock_data(extern, phys_backup_lock)
270
271 extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry
272 * structures */
273
274 extern uint32_t pv_hashed_free_count;
275 extern uint32_t pv_hashed_kern_free_count;
276 /*
277 * Each entry in the pv_head_table is locked by a bit in the
278 * pv_lock_table. The lock bits are accessed by the address of
279 * the frame they lock.
280 */
281 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
282 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
283 extern char *pv_lock_table; /* pointer to array of bits */
284 extern char *pv_hash_lock_table;
285 extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
286
287 extern event_t mapping_replenish_event;
288
289 static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
290 pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
291 simple_lock(&pv_hashed_free_list_lock);
292 /* If the kernel reserved pool is low, don't hand out an entry here; non-kernel
293 * mappings must then allocate synchronously, possibly subject to a throttle.
294 */
295 if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
296 pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
297 pv_hashed_free_count--;
298 }
299
300 simple_unlock(&pv_hashed_free_list_lock);
301
302 if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
303 if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
304 thread_wakeup(&mapping_replenish_event);
305 }
306 }
307
308 static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
309 simple_lock(&pv_hashed_free_list_lock);
310 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
311 pv_hashed_free_list = pvh_eh;
312 pv_hashed_free_count += pv_cnt;
313 simple_unlock(&pv_hashed_free_list_lock);
314 }
315
316 extern unsigned pmap_kern_reserve_alloc_stat;
317
318 static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
319 pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
320 simple_lock(&pv_hashed_kern_free_list_lock);
321
322 if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
323 pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
324 pv_hashed_kern_free_count--;
325 pmap_kern_reserve_alloc_stat++;
326 }
327
328 simple_unlock(&pv_hashed_kern_free_list_lock);
329
330 if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
331 if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
332 thread_wakeup(&mapping_replenish_event);
333 }
334 }
335
336 static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
337 simple_lock(&pv_hashed_kern_free_list_lock);
338 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
339 pv_hashed_kern_free_list = pvh_eh;
340 pv_hashed_kern_free_count += pv_cnt;
341 simple_unlock(&pv_hashed_kern_free_list_lock);
342 }
343
344 extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
345 extern event_t pmap_user_pv_throttle_event;
346
347 static inline void pmap_pv_throttle(__unused pmap_t p) {
348 pmap_assert(p != kernel_pmap);
349 /* Apply throttle on non-kernel mappings */
350 if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
351 pmap_pv_throttle_stat++;
352 /* This doesn't need to be strictly accurate, merely a hint
353 * to eliminate the timeout when the reserve is replenished.
354 */
355 pmap_pv_throttled_waiters++;
356 assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
357 thread_block(THREAD_CONTINUE_NULL);
358 }
359 }
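/*
 * Rough usage sketch of the allocators above, following the PV hashing notes
 * earlier in this file (the authoritative caller is pmap_enter() in pmap.c,
 * which differs in detail):
 *
 *	pv_hashed_entry_t pvh_new = PV_HASHED_ENTRY_NULL;
 *
 *	PV_HASHED_ALLOC(&pvh_new);
 *	if (pvh_new == PV_HASHED_ENTRY_NULL) {
 *		if (pmap == kernel_pmap) {
 *			// cannot block: dip into the kernel reserve
 *			PV_HASHED_KERN_ALLOC(&pvh_new);
 *		} else {
 *			// user pmaps may block: throttle, then zalloc a fresh entry
 *			pmap_pv_throttle(pmap);
 *			pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *		}
 *	}
 */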
360
361 /*
362 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
363 */
364
365 #define pa_index(pa) (i386_btop(pa))
366 #define ppn_to_pai(ppn) ((int)ppn)
367
368 #define pai_to_pvh(pai) (&pv_head_table[pai])
369 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
370 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
371 #define pvhash(idx) (&pv_hash_table[idx])
372 #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
373 #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
374
375 #define IS_MANAGED_PAGE(x) \
376 ((unsigned int)(x) <= last_managed_page && \
377 (pmap_phys_attributes[x] & PHYS_MANAGED))
378 #define IS_INTERNAL_PAGE(x) \
379 (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
380 #define IS_REUSABLE_PAGE(x) \
381 (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))
382
383 /*
384 * Physical page attributes. Copy bits from PTE definition.
385 */
386 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
387 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
388 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
389 #define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */
390 #define PHYS_NCACHE INTEL_PTE_NCACHE
391 #define PHYS_PTA INTEL_PTE_PTA
392 #define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
393 #define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */
394 #define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */
395
396 extern boolean_t pmap_disable_kheap_nx;
397 extern boolean_t pmap_disable_kstack_nx;
398
399 #define PMAP_EXPAND_OPTIONS_NONE (0x0)
400 #define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
401 #define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)
402
403 /*
404 * Amount of virtual memory mapped by one
405 * page-directory entry.
406 */
407 #define PDE_MAPPED_SIZE (pdetova(1))
408
409
410 /*
411 * Locking and TLB invalidation
412 */
413
414 /*
415 * Locking Protocols: (changed 2/2007 JK)
416 *
417 * There are two structures in the pmap module that need locking:
418 * the pmaps themselves, and the per-page pv_lists (which are locked
419 * by locking the pv_lock_table entry that corresponds to the pv_head
420 * for the list in question.) Most routines want to lock a pmap and
421 * then do operations in it that require pv_list locking -- however
422 * pmap_remove_all and pmap_copy_on_write operate on a physical page
423 * basis and want to do the locking in the reverse order, i.e. lock
424 * a pv_list and then go through all the pmaps referenced by that list.
425 *
426 * The system-wide pmap lock has been removed. Now, paths take a lock
427 * on the pmap before changing its 'shape', while the reverse-order lockers
428 * (coming in by phys ppn) take a lock on the corresponding pv, retest to be
429 * sure nothing changed during the window before they locked, and can then
430 * run up/down the pv lists holding the list lock. This also lets the pmap
431 * layer run (nearly completely) with interrupts enabled, unlike
432 * previously.
433 */
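/*
 * A minimal sketch of the two acquisition orders described above, using the
 * PMAP_LOCK/LOCK_PVH macros from this file (illustrative only; the real
 * paths live in pmap.c):
 *
 *	// VA-based path (e.g. pmap_enter/pmap_remove): pmap lock first,
 *	// then the pv_head lock of each physical page touched.
 *	PMAP_LOCK(pmap);
 *	LOCK_PVH(pai);
 *	... update the pmap and the pv list ...
 *	UNLOCK_PVH(pai);
 *	PMAP_UNLOCK(pmap);
 *
 *	// ppn-based path (e.g. pmap_page_protect/pmap_remove_all): pv_head
 *	// lock first, then retest that nothing changed before running
 *	// up/down the pv list while holding the list lock.
 *	LOCK_PVH(pai);
 *	... retest, then walk the pv list and adjust the pmaps it names ...
 *	UNLOCK_PVH(pai);
 */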
434
435 /*
436 * PV locking
437 */
438
439 #define LOCK_PVH(index) { \
440 mp_disable_preemption(); \
441 lock_pvh_pai(index); \
442 }
443
444 #define UNLOCK_PVH(index) { \
445 unlock_pvh_pai(index); \
446 mp_enable_preemption(); \
447 }
448
449 extern uint64_t pde_mapped_size;
450
451 extern char *pmap_phys_attributes;
452 extern ppnum_t last_managed_page;
453
454 extern ppnum_t lowest_lo;
455 extern ppnum_t lowest_hi;
456 extern ppnum_t highest_hi;
457
458 /*
459 * when spinning through pmap_remove
460 * ensure that we don't spend too much
461 * time with preemption disabled.
462 * I'm setting the current threshold
463 * to 20us
464 */
465 #define MAX_PREEMPTION_LATENCY_NS 20000
466 extern uint64_t max_preemption_latency_tsc;
467
468 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
469 #ifdef DEBUGINTERRUPTS
470 #define pmap_intr_assert() { \
471 if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
472 panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
473 }
474 #else
475 #define pmap_intr_assert()
476 #endif
477
478 extern int nx_enabled;
479 extern unsigned int inuse_ptepages_count;
480
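/* Hash a (pmap, va) pair to a PV hash bucket index; npvhashmask must be 2^n - 1 (see NPVHASHMASK). */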
481 static inline uint32_t
482 pvhashidx(pmap_t pmap, vm_map_offset_t va)
483 {
484 uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
485 ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
486 npvhashmask;
487 return hashidx;
488 }
489
490
491 /*
492 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain,
493 * properly dealing with the anchor.
494 * Must be called with the hash locked; does not unlock it.
495 */
496 static inline void
497 pmap_pvh_unlink(pv_hashed_entry_t pvh)
498 {
499 pv_hashed_entry_t curh;
500 pv_hashed_entry_t *pprevh;
501 int pvhash_idx;
502
503 CHK_NPVHASH();
504 pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
505
506 pprevh = pvhash(pvhash_idx);
507
508 #if PV_DEBUG
509 if (NULL == *pprevh)
510 panic("pvh_unlink null anchor"); /* JK DEBUG */
511 #endif
512 curh = *pprevh;
513
514 while (PV_HASHED_ENTRY_NULL != curh) {
515 if (pvh == curh)
516 break;
517 pprevh = &curh->nexth;
518 curh = curh->nexth;
519 }
520 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
521 *pprevh = pvh->nexth;
522 return;
523 }
524
525 static inline void
526 pv_hash_add(pv_hashed_entry_t pvh_e,
527 pv_rooted_entry_t pv_h)
528 {
529 pv_hashed_entry_t *hashp;
530 int pvhash_idx;
531
532 CHK_NPVHASH();
533 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
534 LOCK_PV_HASH(pvhash_idx);
535 insque(&pvh_e->qlink, &pv_h->qlink);
536 hashp = pvhash(pvhash_idx);
537 #if PV_DEBUG
538 if (NULL==hashp)
539 panic("pv_hash_add(%p) null hash bucket", pvh_e);
540 #endif
541 pvh_e->nexth = *hashp;
542 *hashp = pvh_e;
543 UNLOCK_PV_HASH(pvhash_idx);
544 }
545
546 static inline void
547 pv_hash_remove(pv_hashed_entry_t pvh_e)
548 {
549 int pvhash_idx;
550
551 CHK_NPVHASH();
552 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
553 LOCK_PV_HASH(pvhash_idx);
554 remque(&pvh_e->qlink);
555 pmap_pvh_unlink(pvh_e);
556 UNLOCK_PV_HASH(pvhash_idx);
557 }
558
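/* TRUE if 'distance' has at most one bit set, i.e. the XOR'ed values it came from differ in at most one bit position. */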
559 static inline boolean_t popcnt1(uint64_t distance) {
560 return ((distance & (distance - 1)) == 0);
561 }
562
563 /*
564 * Routines to handle suppression of/recovery from some forms of pagetable corruption
565 * incidents observed in the field. These can be either software induced (wild
566 * stores to the mapwindows where applicable, use-after-free errors (typically
567 * of pages addressed physically), mis-directed DMAs, etc.) or due to
568 * DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
569 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
570 * still assert on potential software races, but attempt recovery from incidents
571 * identifiable as occurring due to issues beyond the control of the pmap module.
572 * The latter includes single-bit errors and malformed pagetable entries.
573 * We currently limit ourselves to recovery/suppression of one incident per
574 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
575 * are logged.
576 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
577 */
578
579 typedef enum {
580 PTE_VALID = 0x0,
581 PTE_INVALID = 0x1,
582 PTE_RSVD = 0x2,
583 PTE_SUPERVISOR = 0x4,
584 PTE_BITFLIP = 0x8,
585 PV_BITFLIP = 0x10,
586 PTE_INVALID_CACHEABILITY = 0x20
587 } pmap_pagetable_corruption_t;
588
589 typedef enum {
590 ROOT_PRESENT = 0,
591 ROOT_ABSENT = 1
592 } pmap_pv_assertion_t;
593
594 typedef enum {
595 PMAP_ACTION_IGNORE = 0x0,
596 PMAP_ACTION_ASSERT = 0x1,
597 PMAP_ACTION_RETRY = 0x2,
598 PMAP_ACTION_RETRY_RELOCK = 0x4
599 } pmap_pagetable_corruption_action_t;
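/*
 * How callers react to the classification (see pmap_pv_remove() below):
 * IGNORE drops the current operation, ASSERT panics, RETRY re-runs the
 * lookup, and RETRY_RELOCK re-takes the PV head lock before retrying.
 */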
600
601 #define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
602 extern uint64_t pmap_pagetable_corruption_interval_abstime;
603
604 extern uint32_t pmap_pagetable_corruption_incidents;
605 #define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
606 typedef struct {
607 pmap_pv_assertion_t incident;
608 pmap_pagetable_corruption_t reason;
609 pmap_pagetable_corruption_action_t action;
610 pmap_t pmap;
611 vm_map_offset_t vaddr;
612 pt_entry_t pte;
613 ppnum_t ppn;
614 pmap_t pvpmap;
615 vm_map_offset_t pvva;
616 uint64_t abstime;
617 } pmap_pagetable_corruption_record_t;
618
619 extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
620 extern uint64_t pmap_pagetable_corruption_last_abstime;
621 extern thread_call_t pmap_pagetable_corruption_log_call;
622 extern boolean_t pmap_pagetable_corruption_timeout;
623
624 static inline void
625 pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
626 uint32_t pmap_pagetable_corruption_log_index;
627 pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
628 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
629 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
630 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
631 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
632 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
633 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
634 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
635 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
636 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
637 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
638 /* Asynchronously log */
639 thread_call_enter(pmap_pagetable_corruption_log_call);
640 }
641
642 static inline pmap_pagetable_corruption_action_t
643 pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
644 pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
645 pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
646 ppnum_t suppress_ppn = 0;
647 pt_entry_t cpte = *ptep;
648 ppnum_t cpn = pa_index(pte_to_pa(cpte));
649 ppnum_t ppn = *ppnp;
650 pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
651 pv_rooted_entry_t pv_e = pv_h;
652 uint32_t bitdex;
653 pmap_t pvpmap = pv_h->pmap;
654 vm_map_offset_t pvva = pv_h->va;
655 boolean_t ppcd = FALSE;
656 boolean_t is_ept;
657
658 /* Ideally, we'd consult the Mach VM here to definitively determine
659 * the nature of the mapping for this address space and address.
660 * As that would be a layering violation in this context, we
661 * use various heuristics to recover from single bit errors,
662 * malformed pagetable entries etc. These are not intended
663 * to be comprehensive.
664 */
665
666 /* As a precautionary measure, mark A+D */
667 pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
668 is_ept = is_ept_pmap(pmap);
669
670 /*
671 * Correct potential single bit errors in either (but not both) element
672 * of the PV
673 */
674 do {
675 if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
676 (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
677 pv_e->pmap = pmap;
678 pv_e->va = vaddr;
679 suppress_reason = PV_BITFLIP;
680 action = PMAP_ACTION_RETRY;
681 goto pmap_cpc_exit;
682 }
683 } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));
684
685 /* Discover root entries with a Hamming
686 * distance of 1 from the supplied
687 * physical page frame.
688 */
689 for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
690 ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
691 if (IS_MANAGED_PAGE(npn)) {
692 pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
693 if (npv_h->va == vaddr && npv_h->pmap == pmap) {
694 suppress_reason = PTE_BITFLIP;
695 suppress_ppn = npn;
696 action = PMAP_ACTION_RETRY_RELOCK;
697 UNLOCK_PVH(ppn_to_pai(ppn));
698 *ppnp = npn;
699 goto pmap_cpc_exit;
700 }
701 }
702 }
703
704 if (pmap == kernel_pmap) {
705 action = PMAP_ACTION_ASSERT;
706 goto pmap_cpc_exit;
707 }
708
709 /*
710 * Check for malformed/inconsistent entries.
711 * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0
712 */
713 if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) {
714 action = PMAP_ACTION_IGNORE;
715 suppress_reason = PTE_INVALID_CACHEABILITY;
716 }
717 else if (cpte & INTEL_PTE_RSVD) {
718 action = PMAP_ACTION_IGNORE;
719 suppress_reason = PTE_RSVD;
720 }
721 else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) {
722 action = PMAP_ACTION_IGNORE;
723 suppress_reason = PTE_SUPERVISOR;
724 }
725 pmap_cpc_exit:
726 PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));
727
728 if (debug_boot_arg && !ppcd) {
729 action = PMAP_ACTION_ASSERT;
730 }
731
732 if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
733 action = PMAP_ACTION_ASSERT;
734 pmap_pagetable_corruption_timeout = TRUE;
735 }
736 else
737 {
738 pmap_pagetable_corruption_last_abstime = mach_absolute_time();
739 }
740 pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
741 return action;
742 }
743
744 /*
745 * Remove pv list entry.
746 * Called with pv_head_table entry locked.
747 * Returns pv entry to be freed (or NULL).
748 */
749 static inline __attribute__((always_inline)) pv_hashed_entry_t
750 pmap_pv_remove(pmap_t pmap,
751 vm_map_offset_t vaddr,
752 ppnum_t *ppnp,
753 pt_entry_t *pte)
754 {
755 pv_hashed_entry_t pvh_e;
756 pv_rooted_entry_t pv_h;
757 pv_hashed_entry_t *pprevh;
758 int pvhash_idx;
759 uint32_t pv_cnt;
760 ppnum_t ppn;
761
762 pmap_pv_remove_retry:
763 ppn = *ppnp;
764 pvh_e = PV_HASHED_ENTRY_NULL;
765 pv_h = pai_to_pvh(ppn_to_pai(ppn));
766
767 if (__improbable(pv_h->pmap == PMAP_NULL)) {
768 pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
769 if (pac == PMAP_ACTION_IGNORE)
770 goto pmap_pv_remove_exit;
771 else if (pac == PMAP_ACTION_ASSERT)
772 panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
773 else if (pac == PMAP_ACTION_RETRY_RELOCK) {
774 LOCK_PVH(ppn_to_pai(*ppnp));
775 pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
776 goto pmap_pv_remove_retry;
777 }
778 else if (pac == PMAP_ACTION_RETRY)
779 goto pmap_pv_remove_retry;
780 }
781
782 if (pv_h->va == vaddr && pv_h->pmap == pmap) {
783 /*
784 * Header is the pv_rooted_entry.
785 * We can't free that. If there is a queued
786 * entry after this one, we remove it
787 * from the ppn queue and the hash chain,
788 * copy it to the rooted entry, and then free it instead.
789 */
790 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
791 if (pv_h != (pv_rooted_entry_t) pvh_e) {
792 /*
793 * Entry queued to root, remove this from hash
794 * and install as new root.
795 */
796 CHK_NPVHASH();
797 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
798 LOCK_PV_HASH(pvhash_idx);
799 remque(&pvh_e->qlink);
800 pprevh = pvhash(pvhash_idx);
801 if (PV_HASHED_ENTRY_NULL == *pprevh) {
802 panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
803 "empty hash, removing rooted",
804 pmap, vaddr, ppn);
805 }
806 pmap_pvh_unlink(pvh_e);
807 UNLOCK_PV_HASH(pvhash_idx);
808 pv_h->pmap = pvh_e->pmap;
809 pv_h->va = pvh_e->va; /* dispose of pvh_e */
810 } else {
811 /* none queued after rooted */
812 pv_h->pmap = PMAP_NULL;
813 pvh_e = PV_HASHED_ENTRY_NULL;
814 }
815 } else {
816 /*
817 * not removing rooted pv. find it on hash chain, remove from
818 * ppn queue and hash chain and free it
819 */
820 CHK_NPVHASH();
821 pvhash_idx = pvhashidx(pmap, vaddr);
822 LOCK_PV_HASH(pvhash_idx);
823 pprevh = pvhash(pvhash_idx);
824 if (PV_HASHED_ENTRY_NULL == *pprevh) {
825 panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
826 pmap, vaddr, ppn, *pte, pte);
827 }
828 pvh_e = *pprevh;
829 pmap_pv_hashlist_walks++;
830 pv_cnt = 0;
831 while (PV_HASHED_ENTRY_NULL != pvh_e) {
832 pv_cnt++;
833 if (pvh_e->pmap == pmap &&
834 pvh_e->va == vaddr &&
835 pvh_e->ppn == ppn)
836 break;
837 pprevh = &pvh_e->nexth;
838 pvh_e = pvh_e->nexth;
839 }
840
841 if (PV_HASHED_ENTRY_NULL == pvh_e) {
842 pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);
843
844 if (pac == PMAP_ACTION_ASSERT)
845 panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
846 else {
847 UNLOCK_PV_HASH(pvhash_idx);
848 if (pac == PMAP_ACTION_RETRY_RELOCK) {
849 LOCK_PVH(ppn_to_pai(*ppnp));
850 pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
851 goto pmap_pv_remove_retry;
852 }
853 else if (pac == PMAP_ACTION_RETRY) {
854 goto pmap_pv_remove_retry;
855 }
856 else if (pac == PMAP_ACTION_IGNORE) {
857 goto pmap_pv_remove_exit;
858 }
859 }
860 }
861
862 pmap_pv_hashlist_cnts += pv_cnt;
863 if (pmap_pv_hashlist_max < pv_cnt)
864 pmap_pv_hashlist_max = pv_cnt;
865 *pprevh = pvh_e->nexth;
866 remque(&pvh_e->qlink);
867 UNLOCK_PV_HASH(pvhash_idx);
868 }
869 pmap_pv_remove_exit:
870 return pvh_e;
871 }
872
873
874 extern int pt_fake_zone_index;
875 static inline void
876 PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
877 {
878 thread_t thr = current_thread();
879 task_t task;
880 zinfo_usage_t zinfo;
881
882 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
883
884 if (pt_fake_zone_index != -1 &&
885 (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
886 OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
887 }
888
889 static inline void
890 PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
891 {
892 thread_t thr = current_thread();
893 task_t task;
894 zinfo_usage_t zinfo;
895
896 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
897
898 if (pt_fake_zone_index != -1 &&
899 (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
900 OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
901 }
902
903 static inline void
904 PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
905 {
906 pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
907 }
908
909 static inline void
910 PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
911 {
912 pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
913 }
914
915 extern boolean_t pmap_initialized;/* Has pmap_init completed? */
916 #define valid_page(x) (pmap_initialized && pmap_valid_page(x))
917
918 // XXX
919 #define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? */
920 // XXX
921
922
923 int phys_attribute_test(
924 ppnum_t phys,
925 int bits);
926 void phys_attribute_clear(
927 ppnum_t phys,
928 int bits,
929 unsigned int options,
930 void *arg);
931
932 //#define PCID_DEBUG 1
933 #if PCID_DEBUG
934 #define pmap_pcid_log(fmt, args...) \
935 do { \
936 kprintf(fmt, ##args); \
937 printf(fmt, ##args); \
938 } while(0)
939 #else
940 #define pmap_pcid_log(fmt, args...)
941 #endif
942 void pmap_pcid_configure(void);
943
944
945 /*
946 * Atomic 64-bit compare and exchange of a page table entry.
947 */
948 static inline boolean_t
949 pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
950 {
951 boolean_t ret;
952
953 /*
954 * Load the old value into %rax
955 * Load the new value into another register
956 * Compare-exchange-quad at address entryp
957 * If the compare succeeds, the new value is stored, return TRUE.
958 * Otherwise, no swap is made, return FALSE.
959 */
960 asm volatile(
961 " lock; cmpxchgq %2,(%3) \n\t"
962 " setz %%al \n\t"
963 " movzbl %%al,%0"
964 : "=a" (ret)
965 : "a" (old),
966 "r" (new),
967 "r" (entryp)
968 : "memory");
969 return ret;
970 }
971
972 extern uint32_t pmap_update_clear_pte_count;
973
974 static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
975 pt_entry_t npte, opte;
976 do {
977 opte = *mptep;
978 if (__improbable(opte == 0)) {
979 pmap_update_clear_pte_count++;
980 break;
981 }
982 npte = opte & ~(pclear_bits);
983 npte |= pset_bits;
984 } while (!pmap_cmpx_pte(mptep, opte, npte));
985 }
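/*
 * Illustrative example (real callers live in pmap.c): write-protecting a
 * mapping while leaving all other bits intact would be
 *
 *	pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);
 *
 * The compare-exchange loop above ensures that concurrent updates to the
 * entry (e.g. the CPU setting the referenced/modified bits) are not lost.
 */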
986
987 #if defined(__x86_64__)
988 /*
989 * The single pml4 page per pmap is allocated at pmap create time and exists
990 * for the duration of the pmap. We allocate this page in kernel VM.
991 * This returns the address of the requested PML4 entry in the top-level page.
992 */
993 static inline
994 pml4_entry_t *
995 pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
996 {
997 if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
998 (vaddr < 0xFFFF800000000000ULL))) {
999 return (NULL);
1000 }
1001
1002 #if PMAP_ASSERT
1003 return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
1004 #else
1005 return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
1006 #endif
1007 }
1008
1009 /*
1010 * Returns address of requested PDPT entry in the physmap.
1011 */
1012 static inline pdpt_entry_t *
1013 pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
1014 {
1015 pml4_entry_t newpf;
1016 pml4_entry_t *pml4;
1017 boolean_t is_ept;
1018
1019 pml4 = pmap64_pml4(pmap, vaddr);
1020 is_ept = is_ept_pmap(pmap);
1021
1022 if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) {
1023 newpf = *pml4 & PG_FRAME;
1024 return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
1025 [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
1026 }
1027 return (NULL);
1028 }
1029 /*
1030 * Returns the address of the requested PDE entry in the physmap.
1031 */
1032 static inline pd_entry_t *
1033 pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
1034 {
1035 pdpt_entry_t newpf;
1036 pdpt_entry_t *pdpt;
1037 boolean_t is_ept;
1038
1039 pdpt = pmap64_pdpt(pmap, vaddr);
1040 is_ept = is_ept_pmap(pmap);
1041
1042 if (pdpt && (*pdpt & PTE_VALID_MASK(is_ept))) {
1043 newpf = *pdpt & PG_FRAME;
1044 return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
1045 [(vaddr >> PDSHIFT) & (NPDPG-1)];
1046 }
1047 return (NULL);
1048 }
1049
1050 static inline pd_entry_t *
1051 pmap_pde(pmap_t m, vm_map_offset_t v)
1052 {
1053 pd_entry_t *pde;
1054
1055 pde = pmap64_pde(m, v);
1056
1057 return pde;
1058 }
1059
1060
1061 /*
1062 * Return the address of the mapped PTE for vaddr va in pmap pmap.
1063 *
1064 * In case the pde maps a superpage, return the pde, which, in this case
1065 * is the actual page table entry.
1066 */
1067 static inline pt_entry_t *
1068 pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
1069 {
1070 pd_entry_t *pde;
1071 pd_entry_t newpf;
1072 boolean_t is_ept;
1073
1074 assert(pmap);
1075 pde = pmap64_pde(pmap, vaddr);
1076
1077 is_ept = is_ept_pmap(pmap);
1078
1079 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1080 if (*pde & PTE_PS)
1081 return pde;
1082 newpf = *pde & PG_FRAME;
1083 return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
1084 [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
1085 }
1086 return (NULL);
1087 }
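/*
 * Illustrative only: the accessors above chain together to walk the 4-level
 * hierarchy (pmap64_pml4() -> pmap64_pdpt() -> pmap64_pde() -> pmap_pte());
 * pmap_pte() is the usual entry point and also handles 2MB superpages by
 * returning the PDE itself.  A hand translation of a virtual address would
 * look roughly like
 *
 *	pt_entry_t *ptep = pmap_pte(pmap, vaddr);
 *	if (ptep && (*ptep & PTE_VALID_MASK(is_ept_pmap(pmap))))
 *		pa = pte_to_pa(*ptep) + (vaddr & PAGE_MASK);
 */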
1088 #endif
1089 #if DEBUG
1090 #define DPRINTF(x...) kprintf(x)
1091 #else
1092 #define DPRINTF(x...)
1093 #endif
1094
1095 #endif /* MACH_KERNEL_PRIVATE */
1096 #endif /* _I386_PMAP_INTERNAL_ */