/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef _I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_
#ifdef MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>
#include <kern/simple_lock.h>
#include <i386/bit_routines.h>

/*
 * pmap locking
 */

static inline void
PMAP_LOCK_EXCLUSIVE(pmap_t p)
{
    mp_disable_preemption();
    lck_rw_lock_exclusive(&p->pmap_rwl);
}

static inline void
PMAP_LOCK_SHARED(pmap_t p)
{
    mp_disable_preemption();
    lck_rw_lock_shared(&p->pmap_rwl);
}

static inline void
PMAP_LOCK_SHARED_TO_EXCLUSIVE(pmap_t p)
{
    lck_rw_lock_shared_to_exclusive(&p->pmap_rwl);
}

static inline void
PMAP_LOCK_EXCLUSIVE_TO_SHARED(pmap_t p)
{
    lck_rw_lock_exclusive_to_shared(&p->pmap_rwl);
}

static inline void
PMAP_UNLOCK_EXCLUSIVE(pmap_t p)
{
    lck_rw_unlock_exclusive(&p->pmap_rwl);
    mp_enable_preemption();
}

static inline void
PMAP_UNLOCK_SHARED(pmap_t p)
{
    lck_rw_unlock_shared(&p->pmap_rwl);
    mp_enable_preemption();
}

#define iswired(pte)    ((pte) & INTEL_PTE_WIRED)

#ifdef PMAP_TRACES
extern boolean_t pmap_trace;
#define PMAP_TRACE(...) \
    if (pmap_trace) { \
        KDBG_RELEASE(__VA_ARGS__); \
    }
#else
#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__)

kern_return_t pmap_expand_pml4(
    pmap_t map,
    vm_map_offset_t v,
    unsigned int options);

kern_return_t pmap_expand_pdpt(
    pmap_t map,
    vm_map_offset_t v,
    unsigned int options);

void phys_attribute_set(
    ppnum_t phys,
    int bits);

void pmap_set_reference(
    ppnum_t pn);

boolean_t phys_page_exists(
    ppnum_t pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);


static inline void
PMAP_UPDATE_TLBS(pmap_t fp, addr64_t s, addr64_t e)
{
    pmap_flush_tlbs(fp, s, e, 0, NULL);
}

#define PMAP_DELAY_TLB_FLUSH    0x01

static inline void
PMAP_UPDATE_TLBS_DELAYED(pmap_t fp, addr64_t s, addr64_t e, pmap_flush_context *pfc)
{
    pmap_flush_tlbs(fp, s, e, PMAP_DELAY_TLB_FLUSH, pfc);
}

/*
 * Private data structures.
 */

/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page. An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B. with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains as there is no
 * way to unlink the singly linked hashed entries if navigated to
 * via the queue list off the rooted entries. Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list. All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */

/*
 *
 * PV HASHING Changes - JK 1/2007
 *
 * Pve's establish physical to virtual mappings. These are used for aliasing of a
 * physical page to (potentially many) virtual addresses within pmaps. In the
 * previous implementation the structure of the pv_entries (each 16 bytes in size) was
 *
 * typedef struct pv_entry {
 *     struct pv_entry_t next;
 *     pmap_t            pmap;
 *     vm_map_offset_t   va;
 * } *pv_entry_t;
 *
 * An initial array of these is created at boot time, one per physical page of
 * memory, indexed by the physical page number. Additionally, a pool of entries
 * is created from a pv_zone to be used as needed by pmap_enter() when it is
 * creating new mappings. Originally, we kept this pool around because the code
 * in pmap_enter() was unable to block if it needed an entry and none were
 * available - we'd panic. Some time ago I restructured the pmap_enter() code
 * so that for user pmaps it can block while zalloc'ing a pv structure and restart,
 * removing a panic from the code (in the case of the kernel pmap we cannot block
 * and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
 * The pool has not been removed since there is a large performance gain keeping
 * freed pv's around for reuse and not suffering the overhead of zalloc for every
 * new pv we need.
 *
 * As pmap_enter() created new mappings it linked the new pve's for them off the
 * fixed pv array for that ppn (off the next pointer). These pve's are accessed
 * for several operations, one of them being address space teardown. In that case,
 * we basically do this
 *
 *     for (every page/pte in the space) {
 *         calc pve_ptr from the ppn in the pte
 *         for (every pv in the list for the ppn) {
 *             if (this pv is for this pmap/vaddr) {
 *                 do housekeeping
 *                 unlink/free the pv
 *             }
 *         }
 *     }
 *
 * The problem arose when we were running, say 8000 (or even 2000) apache or
 * other processes and one or all terminate. The list hanging off each pv array
 * entry could have thousands of entries. We were continuously linearly searching
 * each of these lists as we stepped through the address space we were tearing
 * down. Because of the locks we hold, likely taking a cache miss for each node,
 * and interrupt disabling for MP issues the system became completely unresponsive
 * for many seconds while we did this.
 *
 * Realizing that pve's are accessed in two distinct ways (linearly running the
 * list by ppn for operations like pmap_page_protect and finding and
 * modifying/removing a single pve as part of pmap_enter processing) has led to
 * modifying the pve structures and databases.
 *
 * There are now two types of pve structures. A "rooted" structure which is
 * basically the original structure accessed in an array by ppn, and a "hashed"
 * structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
 * designed with the two goals of minimizing wired memory and making the lookup of
 * a ppn faster. Since a vast majority of pages in the system are not aliased
 * and hence represented by a single pv entry, I've kept the rooted entry size as
 * small as possible because there is one of these dedicated for every physical
 * page of memory. The hashed pve's are larger due to the addition of the hash
 * link and the ppn entry needed for matching while running the hash list to find
 * the entry we are looking for. This way, only systems that have lots of
 * aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
 * structures have the same first three fields allowing some simplification in
 * the code.
 *
 * They have these shapes
 *
 * typedef struct pv_rooted_entry {
 *     queue_head_t    qlink;
 *     vm_map_offset_t va;
 *     pmap_t          pmap;
 * } *pv_rooted_entry_t;
 *
 *
 * typedef struct pv_hashed_entry {
 *     queue_head_t    qlink;
 *     vm_map_offset_t va;
 *     pmap_t          pmap;
 *     ppnum_t         ppn;
 *     struct pv_hashed_entry *nexth;
 * } *pv_hashed_entry_t;
 *
 * The main flow difference is that the code is now aware of the rooted entry and
 * the hashed entries. Code that runs the pv list still starts with the rooted
 * entry and then continues down the qlink onto the hashed entries. Code that is
 * looking up a specific pv entry first checks the rooted entry and then hashes
 * and runs the hash list for the match. The hash list lengths are much smaller
 * than the original pv lists that contained all aliases for the specific ppn.
 *
 */
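/*
 * Illustrative sketch (not part of this header's interface): how a lookup of a
 * single [pmap, vaddr, ppn] mapping proceeds under the rooted/hashed scheme
 * described above. It uses pai_to_pvh(), ppn_to_pai(), PVE_VA(), pvhashidx()
 * and pvhash(), all defined later in this file; the real lookups live in
 * pmap_pv_remove() and pmap_pv_is_altacct() below, which also take the PV
 * hash bucket lock while walking the chain.
 *
 *     pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *     if (pv_h->pmap == pmap && PVE_VA(pv_h) == vaddr) {
 *         // hit in the rooted entry; no hash walk needed
 *     } else {
 *         uint32_t idx = pvhashidx(pmap, vaddr);
 *         for (pv_hashed_entry_t e = *pvhash(idx); e != PV_HASHED_ENTRY_NULL; e = e->nexth) {
 *             if (e->pmap == pmap && PVE_VA(e) == vaddr && e->ppn == ppn) {
 *                 break;   // hit in the hashed chain
 *             }
 *         }
 *     }
 */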

typedef struct pv_rooted_entry {
    /* first three entries must match pv_hashed_entry_t */
    queue_head_t    qlink;
    vm_map_offset_t va_and_flags;   /* virtual address for mapping */
    pmap_t          pmap;           /* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
    /* first three entries must match pv_rooted_entry_t */
    queue_head_t    qlink;
    vm_map_offset_t va_and_flags;
    pmap_t          pmap;
    ppnum_t         ppn;
    struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

#define PVE_VA(pve)    ((pve)->va_and_flags & (vm_map_offset_t)~PAGE_MASK)
#define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK)
#define PVE_IS_ALTACCT 0x001
#define PVE_IS_ALTACCT_PAGE(pve) \
    (((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE)

//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if (0 == npvhashmask) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASHBUCKETS (4096)
#define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t mappingrecurse;
extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)   lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
extern uint32_t npvhashmask;
extern pv_hashed_entry_t *pv_hash_table;        /* hash lists */
extern pv_hashed_entry_t pv_hashed_free_list;
extern pv_hashed_entry_t pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock);
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock);
decl_simple_lock_data(extern, pv_hash_table_lock);
decl_simple_lock_data(extern, phys_backup_lock);

extern zone_t pv_hashed_list_zone;              /* zone of pv_hashed_entry
                                                 * structures */

extern uint32_t pv_hashed_free_count;
extern uint32_t pv_hashed_kern_free_count;

/*
 * Each entry in the pv_head_table is locked by a bit in the
 * pv_lock_table. The lock bits are accessed by the address of
 * the frame they lock.
 */
#define pv_lock_table_size(n)      (((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char *pv_lock_table;                     /* pointer to array of bits */
extern char *pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;         /* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void
PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep)
{
    pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
    simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL);
    /* If the kernel reserved pool is low, let non-kernel mappings allocate
     * synchronously, possibly subject to a throttle.
     */
    if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
        pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
        pv_hashed_free_count--;
    }

    simple_unlock(&pv_hashed_free_list_lock);

    if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
        if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
            thread_wakeup(&mapping_replenish_event);
        }
    }
}

static inline void
PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
{
    simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL);
    pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
    pv_hashed_free_list = pvh_eh;
    pv_hashed_free_count += (uint32_t)pv_cnt;
    simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void
PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e)
{
    pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
    simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL);

    if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
        pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
        pv_hashed_kern_free_count--;
        pmap_kern_reserve_alloc_stat++;
    }

    simple_unlock(&pv_hashed_kern_free_list_lock);

    if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
        if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
            thread_wakeup(&mapping_replenish_event);
        }
    }
}

static inline void
PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
{
    simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL);
    pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
    pv_hashed_kern_free_list = pvh_eh;
    pv_hashed_kern_free_count += (uint32_t)pv_cnt;
    simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void
pmap_pv_throttle(__unused pmap_t p)
{
    pmap_assert(p != kernel_pmap);
    /* Apply throttle on non-kernel mappings */
    if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
        pmap_pv_throttle_stat++;
        /* This doesn't need to be strictly accurate, merely a hint
         * to eliminate the timeout when the reserve is replenished.
         */
        pmap_pv_throttled_waiters++;
        assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
        thread_block(THREAD_CONTINUE_NULL);
    }
}

/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)            (i386_btop(pa))
#define ppn_to_pai(ppn)         ((int)ppn)

#define pai_to_pvh(pai)         (&pv_head_table[pai])
#define lock_pvh_pai(pai)       bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)     bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)             (&pv_hash_table[idx])
#define lock_hash_hash(hash)    bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)  bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x) \
    ((unsigned int)(x) <= last_managed_page && \
    ((unsigned long long)pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x) \
    (IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x) \
    (IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_REUSABLE))
#define IS_ALTACCT_PAGE(x, pve) \
    (IS_MANAGED_PAGE((x)) && \
    (PVE_IS_ALTACCT_PAGE((pve))))

/*
 * Physical page attributes. Copy bits from PTE definition.
 */
#define PHYS_MODIFIED   INTEL_PTE_MOD    /* page modified */
#define PHYS_REFERENCED INTEL_PTE_REF    /* page referenced */
#define PHYS_MANAGED    INTEL_PTE_VALID  /* page is managed */
#define PHYS_NOENCRYPT  INTEL_PTE_USER   /* no need to encrypt this page in the hibernation image */
#define PHYS_NCACHE     INTEL_PTE_NCACHE
#define PHYS_PAT        INTEL_PTE_PAT
#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PAT | INTEL_PTE_NCACHE)
#define PHYS_INTERNAL   INTEL_PTE_WTHRU  /* page from internal object */
#define PHYS_REUSABLE   INTEL_PTE_WRITE  /* page is "reusable" */

#if DEVELOPMENT || DEBUG
extern boolean_t pmap_disable_kheap_nx;
extern boolean_t pmap_disable_kstack_nx;
#endif

#define PMAP_EXPAND_OPTIONS_NONE     (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT   (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER  (PMAP_OPTIONS_NOENTER)
#define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U)
/*
 * Amount of virtual memory mapped by one
 * page-directory entry.
 */
#define PDE_MAPPED_SIZE (pdetova(1))

/*
 * Locking and TLB invalidation
 */

/*
 * Locking Protocols: (changed 2/2007 JK)
 *
 * There are two structures in the pmap module that need locking:
 * the pmaps themselves, and the per-page pv_lists (which are locked
 * by locking the pv_lock_table entry that corresponds to the pv_head
 * for the list in question.) Most routines want to lock a pmap and
 * then do operations in it that require pv_list locking -- however
 * pmap_remove_all and pmap_copy_on_write operate on a physical page
 * basis and want to do the locking in the reverse order, i.e. lock
 * a pv_list and then go through all the pmaps referenced by that list.
 *
 * The system wide pmap lock has been removed. Now, paths take a lock
 * on the pmap before changing its 'shape' and the reverse order lockers
 * (coming in by phys ppn) take a lock on the corresponding pv and then
 * retest to be sure nothing changed during the window before they locked
 * and can then run up/down the pv lists holding the list lock. This also
 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
 * previously.
 */
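/*
 * Illustrative sketch of the two lock orders described above (schematic only,
 * not a routine defined in this file). Forward paths lock the pmap first and
 * then the per-page PV lock; reverse-order paths (entering by physical page)
 * take the PV lock first and must re-validate before walking the list.
 *
 *     // Forward path (pmap_enter/pmap_remove style):
 *     PMAP_LOCK_EXCLUSIVE(pmap);
 *     LOCK_PVH(pai);                  // per-page lock taken under the pmap lock
 *     ... modify PTE and pv list ...
 *     UNLOCK_PVH(pai);
 *     PMAP_UNLOCK_EXCLUSIVE(pmap);
 *
 *     // Reverse path (pmap_page_protect style, entering by ppn):
 *     LOCK_PVH(pai);
 *     // re-check that the pv entry still refers to the expected [pmap, va];
 *     // if it changed in the window before the lock was taken, retry.
 *     ... walk the pv list, updating each mapping ...
 *     UNLOCK_PVH(pai);
 */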

/*
 * PV locking
 */

#define LOCK_PVH(index) { \
    mp_disable_preemption(); \
    lock_pvh_pai(index); \
}

#define UNLOCK_PVH(index) { \
    unlock_pvh_pai(index); \
    mp_enable_preemption(); \
}

extern uint64_t pde_mapped_size;

extern char *pmap_phys_attributes;
extern ppnum_t last_managed_page;

/*
 * Used to record high memory allocated to kernel before
 * pmap_init() gets called.
 */
extern ppnum_t pmap_high_used_top;
extern ppnum_t pmap_high_used_bottom;
extern ppnum_t pmap_middle_used_top;
extern ppnum_t pmap_middle_used_bottom;

/*
 * When spinning through pmap_remove,
 * ensure that we don't spend too much
 * time with preemption disabled.
 * The current threshold is set to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
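/*
 * Illustrative sketch (an assumption about how the threshold is consumed, not
 * a routine defined here): a long-running teardown loop can bound the time it
 * spends with preemption disabled by checking the TSC against
 * max_preemption_latency_tsc and briefly dropping its locks.
 *
 *     uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;
 *     while (more mappings to remove) {
 *         ... remove a batch of mappings ...
 *         if (rdtsc64() > deadline) {
 *             PMAP_UNLOCK_EXCLUSIVE(pmap);   // allow preemption/interrupt delivery
 *             PMAP_LOCK_EXCLUSIVE(pmap);
 *             deadline = rdtsc64() + max_preemption_latency_tsc;
 *         }
 *     }
 */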

#if DEBUG
#define PMAP_INTR_DEBUG (1)
#endif

#if PMAP_INTR_DEBUG
#define pmap_intr_assert() { \
    if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
        panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \
}
#else
#define pmap_intr_assert()
#endif
#if DEVELOPMENT || DEBUG
extern int nx_enabled;
#endif
extern unsigned int inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
    uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
        ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
        npvhashmask;
    return hashidx;
}

/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
    pv_hashed_entry_t curh;
    pv_hashed_entry_t *pprevh;
    uint32_t pvhash_idx;

    CHK_NPVHASH();
    pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh));

    pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
    if (NULL == *pprevh) {
        panic("pvh_unlink null anchor"); /* JK DEBUG */
    }
#endif
    curh = *pprevh;

    while (PV_HASHED_ENTRY_NULL != curh) {
        if (pvh == curh) {
            break;
        }
        pprevh = &curh->nexth;
        curh = curh->nexth;
    }
    if (PV_HASHED_ENTRY_NULL == curh) {
        panic("pmap_pvh_unlink no pvh");
    }
    *pprevh = pvh->nexth;
    return;
}

static inline void
pv_hash_add(pv_hashed_entry_t pvh_e,
    pv_rooted_entry_t pv_h)
{
    pv_hashed_entry_t *hashp;
    uint32_t pvhash_idx;

    CHK_NPVHASH();
    pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
    LOCK_PV_HASH(pvhash_idx);
    insque(&pvh_e->qlink, &pv_h->qlink);
    hashp = pvhash(pvhash_idx);
#if PV_DEBUG
    if (NULL == hashp) {
        panic("pv_hash_add(%p) null hash bucket", pvh_e);
    }
#endif
    pvh_e->nexth = *hashp;
    *hashp = pvh_e;
    UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
    uint32_t pvhash_idx;

    CHK_NPVHASH();
    pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
    LOCK_PV_HASH(pvhash_idx);
    remque(&pvh_e->qlink);
    pmap_pvh_unlink(pvh_e);
    UNLOCK_PV_HASH(pvhash_idx);
}

static inline boolean_t
popcnt1(uint64_t distance)
{
    return (distance & (distance - 1)) == 0;
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field. These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
    PTE_VALID = 0x0,
    PTE_INVALID = 0x1,
    PTE_RSVD = 0x2,
    PTE_SUPERVISOR = 0x4,
    PTE_BITFLIP = 0x8,
    PV_BITFLIP = 0x10,
    PTE_INVALID_CACHEABILITY = 0x20,
    PTE_NXBITFLIP = 0x40
} pmap_pagetable_corruption_t;

typedef enum {
    ROOT_PRESENT = 0,
    ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
    PMAP_ACTION_IGNORE = 0x0,
    PMAP_ACTION_ASSERT = 0x1,
    PMAP_ACTION_RETRY = 0x2,
    PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
    pmap_pv_assertion_t incident;
    pmap_pagetable_corruption_t reason;
    pmap_pagetable_corruption_action_t action;
    pmap_t pmap;
    vm_map_offset_t vaddr;
    pt_entry_t pte;
    ppnum_t ppn;
    pmap_t pvpmap;
    vm_map_offset_t pvva;
    uint64_t abstime;
    int adj_ptes_count;
#define PMPTCR_MAX_ADJ_PTES (2)
    uint64_t adj_ptes[PMPTCR_MAX_ADJ_PTES];
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline pmap_pagetable_corruption_action_t
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason,
    pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep,
    ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva, int adj_pteps_cnt, uint64_t **adj_pteps)
{
    uint32_t pmap_pagetable_corruption_log_index;
    uint64_t curtime = mach_absolute_time();

    if ((curtime - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
        pmap_pagetable_corruption_timeout = TRUE;
        action = PMAP_ACTION_ASSERT;
    } else {
        pmap_pagetable_corruption_last_abstime = curtime;
    }

    pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
    pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = curtime;
    if (adj_pteps_cnt > 0 && adj_pteps != NULL) {
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count = MIN(adj_pteps_cnt, PMPTCR_MAX_ADJ_PTES);
        for (int i = 0; i < pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count; i++) {
            pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes[i] = *adj_pteps[i];
        }
    }
    /* Asynchronously log */
    thread_call_enter(pmap_pagetable_corruption_log_call);

    return action;
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident)
{
    pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
    pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
    ppnum_t suppress_ppn = 0;
    pt_entry_t cpte = *ptep;
    ppnum_t cpn = pa_index(pte_to_pa(cpte));
    ppnum_t ppn = *ppnp;
    pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
    pv_rooted_entry_t pv_e = pv_h;
    uint32_t bitdex;
    pmap_t pvpmap = pv_h->pmap;
    vm_map_offset_t pvva = PVE_VA(pv_h);
    vm_map_offset_t pve_flags;
    boolean_t ppcd = FALSE;
    boolean_t is_ept;

    /* Ideally, we'd consult the Mach VM here to definitively determine
     * the nature of the mapping for this address space and address.
     * As that would be a layering violation in this context, we
     * use various heuristics to recover from single bit errors,
     * malformed pagetable entries etc. These are not intended
     * to be comprehensive.
     */

    /* As a precautionary measure, mark A+D */
    pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
    is_ept = is_ept_pmap(pmap);

    /*
     * Correct potential single bit errors in either (but not both) element
     * of the PV
     */
    do {
        if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) ||
            (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) {
            pve_flags = PVE_FLAGS(pv_e);
            pv_e->pmap = pmap;
            pv_h->va_and_flags = vaddr | pve_flags;
            suppress_reason = PV_BITFLIP;
            action = PMAP_ACTION_RETRY;
            goto pmap_cpc_exit;
        }
    } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

    /* Discover root entries with a Hamming
     * distance of 1 from the supplied
     * physical page frame.
     */
    for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
        ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
        if (IS_MANAGED_PAGE(npn)) {
            pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
            if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) {
                suppress_reason = PTE_BITFLIP;
                suppress_ppn = npn;
                action = PMAP_ACTION_RETRY_RELOCK;
                UNLOCK_PVH(ppn_to_pai(ppn));
                *ppnp = npn;
                goto pmap_cpc_exit;
            }
        }
    }

    if (pmap == kernel_pmap) {
        action = PMAP_ACTION_ASSERT;
        goto pmap_cpc_exit;
    }

    /*
     * Check for malformed/inconsistent entries.
     * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0
     */
    if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PAT)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) {
        action = PMAP_ACTION_IGNORE;
        suppress_reason = PTE_INVALID_CACHEABILITY;
    } else if (cpte & INTEL_PTE_RSVD) {
        action = PMAP_ACTION_IGNORE;
        suppress_reason = PTE_RSVD;
    } else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) {
        action = PMAP_ACTION_IGNORE;
        suppress_reason = PTE_SUPERVISOR;
    }
pmap_cpc_exit:
    PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

    if (debug_boot_arg && !ppcd) {
        action = PMAP_ACTION_ASSERT;
    }

    return pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva, 0, 0);
}

static inline boolean_t
pmap_compressed_pte_corruption_repair(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep, pmap_t pmap,
    vm_map_offset_t vaddr)
{
    uint64_t *adj_pteps[2];
    int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t);
    pmap_pagetable_corruption_action_t action = PMAP_ACTION_IGNORE;

    /*
     * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of
     * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous
     * 2 PTEs).
     */
    if (pteidx == 0) {
        adj_pteps[0] = ptep + 1;
        adj_pteps[1] = ptep + 2;
    } else if (pteidx == (NPTPG - 1)) {
        adj_pteps[0] = ptep - 2;
        adj_pteps[1] = ptep - 1;
    } else {
        adj_pteps[0] = ptep - 1;
        adj_pteps[1] = ptep + 1;
    }

    /*
     * Since the compressed PTE no longer has a PTE associated, we cannot pass in the pv data to
     * pmap_pagetable_corruption_log, so instead supply adjacent PTEs for logging.
     */
    if (pmap_pagetable_corruption_log(ROOT_ABSENT, (pte & INTEL_PTE_NX) ? PTE_NXBITFLIP : PTE_BITFLIP,
        action, pmap, vaddr, ptep, (ppnum_t)~0UL, 0, 0, sizeof(adj_pteps) / sizeof(adj_pteps[0]),
        adj_pteps) != PMAP_ACTION_ASSERT) {
        /* Correct the flipped bit(s) and continue */
        pmap_store_pte(ptep, pte & INTEL_PTE_COMPRESSED_MASK);
        pmap->corrected_compressed_ptes_count++;
        return TRUE; /* Returning TRUE to indicate this is now a valid compressed PTE (we hope) */
    }

    panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p",
        pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]);
    /*NOTREACHED*/
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t pmap,
    vm_map_offset_t vaddr,
    ppnum_t *ppnp,
    pt_entry_t *pte,
    boolean_t *was_altacct)
{
    pv_hashed_entry_t pvh_e;
    pv_rooted_entry_t pv_h;
    pv_hashed_entry_t *pprevh;
    uint32_t pvhash_idx;
    uint32_t pv_cnt;
    ppnum_t ppn;

    *was_altacct = FALSE;
pmap_pv_remove_retry:
    ppn = *ppnp;
    pvh_e = PV_HASHED_ENTRY_NULL;
    pv_h = pai_to_pvh(ppn_to_pai(ppn));

    if (__improbable(pv_h->pmap == PMAP_NULL)) {
        pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
        if (pac == PMAP_ACTION_IGNORE) {
            goto pmap_pv_remove_exit;
        } else if (pac == PMAP_ACTION_ASSERT) {
            panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents);
        } else if (pac == PMAP_ACTION_RETRY_RELOCK) {
            LOCK_PVH(ppn_to_pai(*ppnp));
            pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
            goto pmap_pv_remove_retry;
        } else if (pac == PMAP_ACTION_RETRY) {
            goto pmap_pv_remove_retry;
        }
    }

    if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
        *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h);
        /*
         * Header is the pv_rooted_entry.
         * We can't free that. If there is a queued
         * entry after this one, we remove it from the ppn queue
         * and from the hash chain, copy it to the rooted entry,
         * and free it instead.
         */
        pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
        if (pv_h != (pv_rooted_entry_t) pvh_e) {
            /*
             * Entry queued to root, remove this from hash
             * and install as new root.
             */
            CHK_NPVHASH();
            pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
            LOCK_PV_HASH(pvhash_idx);
            remque(&pvh_e->qlink);
            pprevh = pvhash(pvhash_idx);
            if (PV_HASHED_ENTRY_NULL == *pprevh) {
                panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
                    "empty hash, removing rooted, priors: %d",
                    pmap, vaddr, ppn, pmap_pagetable_corruption_incidents);
            }
            pmap_pvh_unlink(pvh_e);
            UNLOCK_PV_HASH(pvhash_idx);
            pv_h->pmap = pvh_e->pmap;
            pv_h->va_and_flags = pvh_e->va_and_flags;
            /* dispose of pvh_e */
        } else {
            /* none queued after rooted */
            pv_h->pmap = PMAP_NULL;
            pvh_e = PV_HASHED_ENTRY_NULL;
        }
    } else {
        /*
         * not removing rooted pv. find it on hash chain, remove from
         * ppn queue and hash chain and free it
         */
        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pmap, vaddr);
        LOCK_PV_HASH(pvhash_idx);
        pprevh = pvhash(pvhash_idx);
        if (PV_HASHED_ENTRY_NULL == *pprevh) {
            panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d",
                pmap, vaddr, ppn, *pte, pte, pmap_pagetable_corruption_incidents);
        }
        pvh_e = *pprevh;
        pmap_pv_hashlist_walks++;
        pv_cnt = 0;
        while (PV_HASHED_ENTRY_NULL != pvh_e) {
            pv_cnt++;
            if (pvh_e->pmap == pmap &&
                PVE_VA(pvh_e) == vaddr &&
                pvh_e->ppn == ppn) {
                break;
            }
            pprevh = &pvh_e->nexth;
            pvh_e = pvh_e->nexth;
        }

        if (PV_HASHED_ENTRY_NULL == pvh_e) {
            pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

            if (pac == PMAP_ACTION_ASSERT) {
                panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents);
            } else {
                UNLOCK_PV_HASH(pvhash_idx);
                if (pac == PMAP_ACTION_RETRY_RELOCK) {
                    LOCK_PVH(ppn_to_pai(*ppnp));
                    pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
                    goto pmap_pv_remove_retry;
                } else if (pac == PMAP_ACTION_RETRY) {
                    goto pmap_pv_remove_retry;
                } else if (pac == PMAP_ACTION_IGNORE) {
                    goto pmap_pv_remove_exit;
                }
            }
        }

        *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e);

        pmap_pv_hashlist_cnts += pv_cnt;
        if (pmap_pv_hashlist_max < pv_cnt) {
            pmap_pv_hashlist_max = pv_cnt;
        }
        *pprevh = pvh_e->nexth;
        remque(&pvh_e->qlink);
        UNLOCK_PV_HASH(pvhash_idx);
    }
pmap_pv_remove_exit:
    return pvh_e;
}

static inline __attribute__((always_inline)) boolean_t
pmap_pv_is_altacct(
    pmap_t pmap,
    vm_map_offset_t vaddr,
    ppnum_t ppn)
{
    pv_hashed_entry_t pvh_e;
    pv_rooted_entry_t pv_h;
    uint32_t pvhash_idx;
    boolean_t is_altacct;

    pvh_e = PV_HASHED_ENTRY_NULL;
    pv_h = pai_to_pvh(ppn_to_pai(ppn));

    if (__improbable(pv_h->pmap == PMAP_NULL)) {
        return FALSE;
    }

    if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
        /*
         * Header is the pv_rooted_entry.
         */
        return IS_ALTACCT_PAGE(ppn, pv_h);
    }

    CHK_NPVHASH();
    pvhash_idx = pvhashidx(pmap, vaddr);
    LOCK_PV_HASH(pvhash_idx);
    pvh_e = *(pvhash(pvhash_idx));
    while (PV_HASHED_ENTRY_NULL != pvh_e) {
        if (pvh_e->pmap == pmap &&
            PVE_VA(pvh_e) == vaddr &&
            pvh_e->ppn == ppn) {
            break;
        }
        pvh_e = pvh_e->nexth;
    }
    if (PV_HASHED_ENTRY_NULL == pvh_e) {
        is_altacct = FALSE;
    } else {
        is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e);
    }
    UNLOCK_PV_HASH(pvhash_idx);

    return is_altacct;
}

static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
    pmap_ledger_credit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
    pmap_ledger_debit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
    pmap_ledger_credit(pmap, task_ledgers.tkm_shared, (ledger_amount_t)bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
    pmap_ledger_debit(pmap, task_ledgers.tkm_shared, (ledger_amount_t)bytes);
}

extern boolean_t pmap_initialized;      /* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

int phys_attribute_test(
    ppnum_t phys,
    int bits);
void phys_attribute_clear(
    ppnum_t phys,
    int bits,
    unsigned int options,
    void *arg);

//#define PCID_DEBUG 1
#if PCID_DEBUG
#define pmap_pcid_log(fmt, args...) \
    do { \
        kprintf(fmt, ##args); \
        printf(fmt, ##args); \
    } while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */

#include <machine/atomic.h>
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
    return __c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)entryp, &old, new,
        memory_order_acq_rel_smp, memory_order_relaxed);
}

extern uint32_t pmap_update_clear_pte_count;

static inline void
pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits)
{
    pt_entry_t npte, opte;
    do {
        opte = *mptep;
        if (__improbable(opte == 0)) {
#if DEVELOPMENT || DEBUG
            pmap_update_clear_pte_count++;
#endif
            break;
        }
        npte = opte & ~(pclear_bits);
        npte |= pset_bits;
    } while (!pmap_cmpx_pte(mptep, opte, npte));
}

/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap. We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
    if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
        (vaddr < 0xFFFF800000000000ULL))) {
        return NULL;
    }

#if DEBUG
    return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]);
#else
    return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)];
#endif
}

static inline pml4_entry_t *
pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
    if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
        (vaddr < 0xFFFF800000000000ULL))) {
        return NULL;
    }

#if DEBUG
    return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]);
#else
    return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)];
#endif
}

/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
    pml4_entry_t newpf;
    pml4_entry_t *pml4;
    boolean_t is_ept;

    pml4 = pmap64_pml4(pmap, vaddr);
    is_ept = is_ept_pmap(pmap);

    if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) {
        newpf = *pml4 & PG_FRAME;
        return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
               [(vaddr >> PDPTSHIFT) & (NPDPTPG - 1)];
    }
    return NULL;
}
/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap_pde_internal1(vm_map_offset_t vaddr, boolean_t is_ept, pdpt_entry_t *pdpte)
{
    if (*pdpte & PTE_VALID_MASK(is_ept)) {
        pdpt_entry_t newpf = *pdpte & PG_FRAME;
        return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
               [(vaddr >> PDSHIFT) & (NPDPG - 1)];
    } else {
        return NULL;
    }
}

static inline pd_entry_t *
pmap_pde_internal0(pmap_t pmap, vm_map_offset_t vaddr, boolean_t is_ept)
{
    pdpt_entry_t *pdpt;

    pdpt = pmap64_pdpt(pmap, vaddr);
    if (pdpt) {
        return pmap_pde_internal1(vaddr, is_ept, pdpt);
    } else {
        return NULL;
    }
}


static inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
    pdpt_entry_t *pdpt;
    boolean_t is_ept;

    pdpt = pmap64_pdpt(pmap, vaddr);
    is_ept = is_ept_pmap(pmap);

    if (pdpt) {
        return pmap_pde_internal1(vaddr, is_ept, pdpt);
    } else {
        return NULL;
    }
}


/*
 * return address of mapped pte for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case
 * is the actual page table entry.
 */


static inline pt_entry_t *
pmap_pte_internal(vm_map_offset_t vaddr, boolean_t is_ept, pd_entry_t *pde)
{
    if (*pde & PTE_VALID_MASK(is_ept)) {
        if (__improbable(*pde & PTE_PS)) {
            return pde;
        }
        pd_entry_t newpf = *pde & PG_FRAME;

        return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
               [i386_btop(vaddr) & (ppnum_t)(NPTEPG - 1)];
    } else {
        return NULL;
    }
}

static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
    pd_entry_t *pde;

    boolean_t is_ept;

    is_ept = is_ept_pmap(pmap);

    pde = pmap_pde_internal0(pmap, vaddr, is_ept);

    if (pde) {
        return pmap_pte_internal(vaddr, is_ept, pde);
    } else {
        return NULL;
    }
}

extern void pmap_alias(
    vm_offset_t ava,
    vm_map_offset_t start,
    vm_map_offset_t end,
    vm_prot_t prot,
    unsigned int options);

#if DEBUG
#define DPRINTF(x...) kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */