]>
Commit | Line | Data |
---|---|---|
1c79356b | 1 | /* |
39236c6e | 2 | * Copyright (c) 2000-2012 Apple Inc. All rights reserved. |
1c79356b | 3 | * |
2d21ac55 | 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
0a7de745 | 5 | * |
2d21ac55 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
0a7de745 | 14 | * |
2d21ac55 A |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
0a7de745 | 17 | * |
2d21ac55 A |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
2d21ac55 A |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
0a7de745 | 25 | * |
2d21ac55 | 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
1c79356b | 27 | */ |
1c79356b | 28 | |
1c79356b | 29 | |
0a7de745 | 30 | #ifndef _I386_PMAP_INTERNAL_ |
316670eb | 31 | #define _I386_PMAP_INTERNAL_ |
b0d623f7 | 32 | #ifdef MACH_KERNEL_PRIVATE |
1c79356b | 33 | |
316670eb A |
34 | #include <vm/pmap.h> |
35 | #include <sys/kdebug.h> | |
36 | #include <kern/ledger.h> | |
fe8ab488 A |
37 | #include <kern/simple_lock.h> |
38 | #include <i386/bit_routines.h> | |
316670eb | 39 | |
b0d623f7 A |
40 | /* |
41 | * pmap locking | |
2d21ac55 | 42 | */ |
0b4e3aa0 | 43 | |
0a7de745 A |
44 | static inline void |
45 | PMAP_LOCK_EXCLUSIVE(pmap_t p) | |
46 | { | |
47 | mp_disable_preemption(); | |
48 | lck_rw_lock_exclusive(&p->pmap_rwl); | |
b0d623f7 | 49 | } |
0b4e3aa0 | 50 | |
0a7de745 A |
51 | static inline void |
52 | PMAP_LOCK_SHARED(pmap_t p) | |
53 | { | |
54 | mp_disable_preemption(); | |
55 | lck_rw_lock_shared(&p->pmap_rwl); | |
b0d623f7 | 56 | } |
1c79356b | 57 | |
0a7de745 A |
58 | static inline void |
59 | PMAP_LOCK_SHARED_TO_EXCLUSIVE(pmap_t p) | |
60 | { | |
61 | lck_rw_lock_shared_to_exclusive(&p->pmap_rwl); | |
62 | } | |
39236c6e | 63 | |
0a7de745 A |
64 | static inline void |
65 | PMAP_LOCK_EXCLUSIVE_TO_SHARED(pmap_t p) | |
66 | { | |
67 | lck_rw_lock_exclusive_to_shared(&p->pmap_rwl); | |
68 | } | |
39236c6e | 69 | |
0a7de745 A |
70 | static inline void |
71 | PMAP_UNLOCK_EXCLUSIVE(pmap_t p) | |
72 | { | |
73 | lck_rw_unlock_exclusive(&p->pmap_rwl); | |
74 | mp_enable_preemption(); | |
75 | } | |
39236c6e | 76 | |
0a7de745 A |
77 | static inline void |
78 | PMAP_UNLOCK_SHARED(pmap_t p) | |
79 | { | |
80 | lck_rw_unlock_shared(&p->pmap_rwl); | |
81 | mp_enable_preemption(); | |
82 | } | |
1c79356b | 83 | |
0a7de745 | 84 | #define iswired(pte) ((pte) & INTEL_PTE_WIRED) |
1c79356b | 85 | |
0a7de745 A |
#ifdef PMAP_TRACES
extern  boolean_t       pmap_trace;
/*
 * Emit a KDBG_RELEASE trace point, but only while the pmap_trace flag is set.
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement: "if (c) PMAP_TRACE(...); else ..." then parses as written
 * instead of binding the else to the macro's internal if.
 */
#define PMAP_TRACE(...)                         \
	do {                                    \
	        if (pmap_trace) {               \
	                KDBG_RELEASE(__VA_ARGS__); \
	        }                               \
	} while (0)
#else
/* Without PMAP_TRACES the trace point is forwarded to KDBG_DEBUG. */
#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__)
#endif /* PMAP_TRACES */
1c79356b | 95 | |
5ba3f43e | 96 | #define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__) |
6d2010ae | 97 | |
0a7de745 A |
98 | kern_return_t pmap_expand_pml4( |
99 | pmap_t map, | |
100 | vm_map_offset_t v, | |
101 | unsigned int options); | |
b0d623f7 | 102 | |
0a7de745 A |
103 | kern_return_t pmap_expand_pdpt( |
104 | pmap_t map, | |
105 | vm_map_offset_t v, | |
106 | unsigned int options); | |
b7266188 | 107 | |
0a7de745 A |
108 | void phys_attribute_set( |
109 | ppnum_t phys, | |
110 | int bits); | |
6d2010ae | 111 | |
0a7de745 A |
112 | void pmap_set_reference( |
113 | ppnum_t pn); | |
6d2010ae | 114 | |
0a7de745 A |
115 | boolean_t phys_page_exists( |
116 | ppnum_t pn); | |
6d2010ae | 117 | |
39236c6e | 118 | void |
0a7de745 | 119 | pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *); |
6d2010ae A |
120 | |
121 | void | |
0a7de745 A |
122 | pmap_update_cache_attributes_locked(ppnum_t, unsigned); |
123 | ||
124 | ||
125 | static inline void | |
126 | PMAP_UPDATE_TLBS(pmap_t fp, addr64_t s, addr64_t e) | |
127 | { | |
128 | pmap_flush_tlbs(fp, s, e, 0, NULL); | |
129 | } | |
130 | ||
131 | #define PMAP_DELAY_TLB_FLUSH 0x01 | |
6d2010ae | 132 | |
0a7de745 A |
133 | static inline void |
134 | PMAP_UPDATE_TLBS_DELAYED(pmap_t fp, addr64_t s, addr64_t e, pmap_flush_context *pfc) | |
135 | { | |
136 | pmap_flush_tlbs(fp, s, e, PMAP_DELAY_TLB_FLUSH, pfc); | |
137 | } | |
b0d623f7 | 138 | |
b7266188 A |
139 | /* |
140 | * Private data structures. | |
141 | */ | |
142 | ||
143 | /* | |
144 | * For each vm_page_t, there is a list of all currently | |
145 | * valid virtual mappings of that page. An entry is | |
146 | * a pv_rooted_entry_t; the list is the pv_table. | |
147 | * | |
148 | * N.B. with the new combo rooted/hashed scheme it is | |
149 | * only possible to remove individual non-rooted entries | |
150 | * if they are found via the hashed chains as there is no | |
151 | * way to unlink the singly linked hashed entries if navigated to | |
152 | * via the queue list off the rooted entries. Think of it as | |
153 | * hash/walk/pull, keeping track of the prev pointer while walking | |
154 | * the singly linked hash list. All of this is to save memory and | |
155 | * keep both types of pv_entries as small as possible. | |
156 | */ | |
157 | ||
158 | /* | |
0a7de745 A |
159 | * |
160 | * PV HASHING Changes - JK 1/2007 | |
161 | * | |
162 | * Pve's establish physical to virtual mappings. These are used for aliasing of a | |
163 | * physical page to (potentially many) virtual addresses within pmaps. In the | |
164 | * previous implementation the structure of the pv_entries (each 16 bytes in size) was | |
165 | * | |
166 | * typedef struct pv_entry { | |
167 | * struct pv_entry_t next; | |
168 | * pmap_t pmap; | |
169 | * vm_map_offset_t va; | |
170 | * } *pv_entry_t; | |
171 | * | |
172 | * An initial array of these is created at boot time, one per physical page of | |
173 | * memory, indexed by the physical page number. Additionally, a pool of entries | |
174 | * is created from a pv_zone to be used as needed by pmap_enter() when it is | |
175 | * creating new mappings. Originally, we kept this pool around because the code | |
176 | * in pmap_enter() was unable to block if it needed an entry and none were | |
177 | * available - we'd panic. Some time ago I restructured the pmap_enter() code | |
178 | * so that for user pmaps it can block while zalloc'ing a pv structure and restart, | |
179 | * removing a panic from the code (in the case of the kernel pmap we cannot block | |
180 | * and still panic, so, we keep a separate hot pool for use only on kernel pmaps). | |
181 | * The pool has not been removed since there is a large performance gain keeping | |
182 | * freed pv's around for reuse and not suffering the overhead of zalloc for every | |
183 | * new pv we need. | |
184 | * | |
185 | * As pmap_enter() created new mappings it linked the new pve's for them off the | |
186 | * fixed pv array for that ppn (off the next pointer). These pve's are accessed | |
187 | * for several operations, one of them being address space teardown. In that case, | |
188 | * we basically do this | |
189 | * | |
190 | * for (every page/pte in the space) { | |
191 | * calc pve_ptr from the ppn in the pte | |
192 | * for (every pv in the list for the ppn) { | |
193 | * if (this pv is for this pmap/vaddr) { | |
194 | * do housekeeping | |
195 | * unlink/free the pv | |
196 | * } | |
197 | * } | |
198 | * } | |
199 | * | |
200 | * The problem arose when we were running, say 8000 (or even 2000) apache or | |
201 | * other processes and one or all terminate. The list hanging off each pv array | |
202 | * entry could have thousands of entries. We were continuously linearly searching | |
203 | * each of these lists as we stepped through the address space we were tearing | |
204 | * down. Because of the locks we hold, likely taking a cache miss for each node, | |
205 | * and interrupt disabling for MP issues the system became completely unresponsive | |
206 | * for many seconds while we did this. | |
207 | * | |
208 | * Realizing that pve's are accessed in two distinct ways (linearly running the | |
209 | * list by ppn for operations like pmap_page_protect and finding and | |
210 | * modifying/removing a single pve as part of pmap_enter processing) has led to | |
211 | * modifying the pve structures and databases. | |
212 | * | |
213 | * There are now two types of pve structures. A "rooted" structure which is | |
214 | * basically the original structure accessed in an array by ppn, and a "hashed" | |
215 | * structure accessed on a hash list via a hash of [pmap, vaddr]. These have been | |
216 | * designed with the two goals of minimizing wired memory and making the lookup of | |
217 | * a ppn faster. Since a vast majority of pages in the system are not aliased | |
218 | * and hence represented by a single pv entry I've kept the rooted entry size as | |
219 | * small as possible because there is one of these dedicated for every physical | |
220 | * page of memory. The hashed pve's are larger due to the addition of the hash | |
221 | * link and the ppn entry needed for matching while running the hash list to find | |
222 | * the entry we are looking for. This way, only systems that have lots of | |
223 | * aliasing (like 2000+ httpd procs) will pay the extra memory price. Both | |
224 | * structures have the same first three fields allowing some simplification in | |
225 | * the code. | |
226 | * | |
227 | * They have these shapes | |
228 | * | |
229 | * typedef struct pv_rooted_entry { | |
230 | * queue_head_t qlink; | |
231 | * vm_map_offset_t va; | |
232 | * pmap_t pmap; | |
233 | * } *pv_rooted_entry_t; | |
234 | * | |
235 | * | |
236 | * typedef struct pv_hashed_entry { | |
237 | * queue_head_t qlink; | |
238 | * vm_map_offset_t va; | |
239 | * pmap_t pmap; | |
240 | * ppnum_t ppn; | |
241 | * struct pv_hashed_entry *nexth; | |
242 | * } *pv_hashed_entry_t; | |
243 | * | |
244 | * The main flow difference is that the code is now aware of the rooted entry and | |
245 | * the hashed entries. Code that runs the pv list still starts with the rooted | |
246 | * entry and then continues down the qlink onto the hashed entries. Code that is | |
247 | * looking up a specific pv entry first checks the rooted entry and then hashes | |
248 | * and runs the hash list for the match. The hash list lengths are much smaller | |
249 | * than the original pv lists that contained all aliases for the specific ppn. | |
250 | * | |
251 | */ | |
b7266188 | 252 | |
6d2010ae A |
253 | typedef struct pv_rooted_entry { |
254 | /* first three entries must match pv_hashed_entry_t */ | |
0a7de745 A |
255 | queue_head_t qlink; |
256 | vm_map_offset_t va_and_flags; /* virtual address for mapping */ | |
257 | pmap_t pmap; /* pmap where mapping lies */ | |
b7266188 A |
258 | } *pv_rooted_entry_t; |
259 | ||
0a7de745 | 260 | #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) |
b7266188 | 261 | |
6d2010ae A |
262 | typedef struct pv_hashed_entry { |
263 | /* first three entries must match pv_rooted_entry_t */ | |
0a7de745 A |
264 | queue_head_t qlink; |
265 | vm_map_offset_t va_and_flags; | |
266 | pmap_t pmap; | |
267 | ppnum_t ppn; | |
268 | struct pv_hashed_entry *nexth; | |
b7266188 A |
269 | } *pv_hashed_entry_t; |
270 | ||
271 | #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) | |
272 | ||
39037602 A |
273 | #define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK) |
274 | #define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK) | |
275 | #define PVE_IS_ALTACCT 0x001 | |
276 | #define PVE_IS_ALTACCT_PAGE(pve) \ | |
277 | (((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE) | |
278 | ||
6d2010ae | 279 | //#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ |
/*
 * CHK_NPVHASH(): with PV_DEBUG, panic if npvhashmask is still zero;
 * otherwise a no-op.
 *
 * Both variants now share one parameter list (the debug form took none while
 * the non-debug form took one) and both expand to a single statement that
 * needs the caller's trailing semicolon, so the macro is safe in unbraced
 * if/else bodies.  The parameter list is variadic so that any invocation
 * accepted by either of the old forms still compiles; arguments are ignored.
 */
#ifdef PV_DEBUG
#define CHK_NPVHASH(...)                                \
	do {                                            \
	        if (0 == npvhashmask) {                 \
	                panic("npvhash uninitialized"); \
	        }                                       \
	} while (0)
#else
#define CHK_NPVHASH(...)        do { } while (0)
#endif
285 | ||
fe8ab488 A |
286 | #define NPVHASHBUCKETS (4096) |
287 | #define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */ | |
6d2010ae A |
288 | #define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 |
289 | #define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 | |
290 | #define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 | |
291 | #define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 | |
292 | ||
0a7de745 | 293 | extern volatile uint32_t mappingrecurse; |
6d2010ae A |
294 | extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; |
295 | ||
296 | /* | |
297 | * PV hash locking | |
298 | */ | |
299 | ||
0a7de745 A |
300 | #define LOCK_PV_HASH(hash) lock_hash_hash(hash) |
301 | #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) | |
fe8ab488 | 302 | extern uint32_t npvhashmask; |
0a7de745 A |
303 | extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ |
304 | extern pv_hashed_entry_t pv_hashed_free_list; | |
305 | extern pv_hashed_entry_t pv_hashed_kern_free_list; | |
6d2010ae A |
306 | decl_simple_lock_data(extern, pv_hashed_free_list_lock) |
307 | decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) | |
308 | decl_simple_lock_data(extern, pv_hash_table_lock) | |
fe8ab488 | 309 | decl_simple_lock_data(extern, phys_backup_lock) |
6d2010ae | 310 | |
0a7de745 A |
311 | extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry |
312 | * structures */ | |
6d2010ae | 313 | |
0a7de745 A |
314 | extern uint32_t pv_hashed_free_count; |
315 | extern uint32_t pv_hashed_kern_free_count; | |
6d2010ae A |
316 | /* |
317 | * Each entry in the pv_head_table is locked by a bit in the | |
318 | * pv_lock_table. The lock bits are accessed by the address of | |
319 | * the frame they lock. | |
320 | */ | |
0a7de745 | 321 | #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) |
6d2010ae | 322 | #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) |
0a7de745 A |
323 | extern char *pv_lock_table; /* pointer to array of bits */ |
324 | extern char *pv_hash_lock_table; | |
325 | extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ | |
6d2010ae A |
326 | |
327 | extern event_t mapping_replenish_event; | |
328 | ||
0a7de745 A |
329 | static inline void |
330 | PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) | |
331 | { | |
316670eb | 332 | pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); |
0a7de745 | 333 | simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL); |
6d2010ae A |
334 | /* If the kernel reserved pool is low, let non-kernel mappings allocate |
335 | * synchronously, possibly subject to a throttle. | |
336 | */ | |
316670eb | 337 | if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) { |
6d2010ae A |
338 | pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; |
339 | pv_hashed_free_count--; | |
340 | } | |
341 | ||
342 | simple_unlock(&pv_hashed_free_list_lock); | |
343 | ||
316670eb | 344 | if (pv_hashed_free_count <= pv_hashed_low_water_mark) { |
0a7de745 | 345 | if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) { |
6d2010ae | 346 | thread_wakeup(&mapping_replenish_event); |
0a7de745 | 347 | } |
6d2010ae | 348 | } |
b7266188 A |
349 | } |
350 | ||
0a7de745 A |
351 | static inline void |
352 | PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) | |
353 | { | |
354 | simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL); | |
6d2010ae A |
355 | pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; |
356 | pv_hashed_free_list = pvh_eh; | |
357 | pv_hashed_free_count += pv_cnt; | |
358 | simple_unlock(&pv_hashed_free_list_lock); | |
b7266188 A |
359 | } |
360 | ||
6d2010ae A |
361 | extern unsigned pmap_kern_reserve_alloc_stat; |
362 | ||
0a7de745 A |
363 | static inline void |
364 | PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) | |
365 | { | |
316670eb | 366 | pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); |
0a7de745 | 367 | simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL); |
6d2010ae A |
368 | |
369 | if ((*pvh_e = pv_hashed_kern_free_list) != 0) { | |
370 | pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; | |
371 | pv_hashed_kern_free_count--; | |
372 | pmap_kern_reserve_alloc_stat++; | |
373 | } | |
374 | ||
375 | simple_unlock(&pv_hashed_kern_free_list_lock); | |
376 | ||
377 | if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { | |
0a7de745 | 378 | if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) { |
6d2010ae | 379 | thread_wakeup(&mapping_replenish_event); |
0a7de745 | 380 | } |
6d2010ae | 381 | } |
b7266188 A |
382 | } |
383 | ||
0a7de745 A |
384 | static inline void |
385 | PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) | |
386 | { | |
387 | simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL); | |
6d2010ae A |
388 | pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; |
389 | pv_hashed_kern_free_list = pvh_eh; | |
390 | pv_hashed_kern_free_count += pv_cnt; | |
391 | simple_unlock(&pv_hashed_kern_free_list_lock); | |
392 | } | |
393 | ||
394 | extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; | |
395 | extern event_t pmap_user_pv_throttle_event; | |
396 | ||
0a7de745 A |
397 | static inline void |
398 | pmap_pv_throttle(__unused pmap_t p) | |
399 | { | |
6d2010ae A |
400 | pmap_assert(p != kernel_pmap); |
401 | /* Apply throttle on non-kernel mappings */ | |
402 | if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { | |
403 | pmap_pv_throttle_stat++; | |
404 | /* This doesn't need to be strictly accurate, merely a hint | |
405 | * to eliminate the timeout when the reserve is replenished. | |
406 | */ | |
407 | pmap_pv_throttled_waiters++; | |
408 | assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC); | |
409 | thread_block(THREAD_CONTINUE_NULL); | |
410 | } | |
b7266188 A |
411 | } |
412 | ||
413 | /* | |
414 | * Index into pv_head table, its lock bits, and the modify/reference and managed bits | |
415 | */ | |
416 | ||
0a7de745 A |
/* Physical address -> page index, via i386_btop(). */
#define pa_index(pa)            (i386_btop(pa))
/*
 * Physical page number -> pv_head_table / attribute-array index.
 * The argument is parenthesized so that the cast applies to the whole
 * expression passed in, not just to its first operand.
 */
#define ppn_to_pai(ppn)         ((int)(ppn))
b7266188 | 419 | |
0a7de745 A |
420 | #define pai_to_pvh(pai) (&pv_head_table[pai]) |
421 | #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) | |
422 | #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) | |
423 | #define pvhash(idx) (&pv_hash_table[idx]) | |
424 | #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) | |
425 | #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) | |
b7266188 | 426 | |
0a7de745 A |
427 | #define IS_MANAGED_PAGE(x) \ |
428 | ((unsigned int)(x) <= last_managed_page && \ | |
b7266188 | 429 | (pmap_phys_attributes[x] & PHYS_MANAGED)) |
0a7de745 | 430 | #define IS_INTERNAL_PAGE(x) \ |
39236c6e | 431 | (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL)) |
0a7de745 | 432 | #define IS_REUSABLE_PAGE(x) \ |
39236c6e | 433 | (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE)) |
0a7de745 A |
434 | #define IS_ALTACCT_PAGE(x, pve) \ |
435 | (IS_MANAGED_PAGE((x)) && \ | |
d190cdc3 | 436 | (PVE_IS_ALTACCT_PAGE((pve)))) |
b7266188 A |
437 | |
438 | /* | |
439 | * Physical page attributes. Copy bits from PTE definition. | |
440 | */ | |
0a7de745 A |
441 | #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ |
442 | #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ | |
443 | #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ | |
444 | #define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ | |
445 | #define PHYS_NCACHE INTEL_PTE_NCACHE | |
446 | #define PHYS_PAT INTEL_PTE_PAT | |
447 | #define PHYS_CACHEABILITY_MASK (INTEL_PTE_PAT | INTEL_PTE_NCACHE) | |
448 | #define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */ | |
449 | #define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */ | |
450 | ||
451 | #if DEVELOPMENT || DEBUG | |
452 | extern boolean_t pmap_disable_kheap_nx; | |
453 | extern boolean_t pmap_disable_kstack_nx; | |
454 | #endif | |
316670eb A |
455 | |
456 | #define PMAP_EXPAND_OPTIONS_NONE (0x0) | |
457 | #define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) | |
458 | #define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER) | |
5c9f4661 | 459 | #define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U) |
b7266188 A |
460 | /* |
461 | * Amount of virtual memory mapped by one | |
462 | * page-directory entry. | |
463 | */ | |
0a7de745 | 464 | #define PDE_MAPPED_SIZE (pdetova(1)) |
b7266188 A |
465 | |
466 | /* | |
467 | * Locking and TLB invalidation | |
468 | */ | |
469 | ||
470 | /* | |
471 | * Locking Protocols: (changed 2/2007 JK) | |
472 | * | |
473 | * There are two structures in the pmap module that need locking: | |
474 | * the pmaps themselves, and the per-page pv_lists (which are locked | |
475 | * by locking the pv_lock_table entry that corresponds to the pv_head | |
476 | * for the list in question.) Most routines want to lock a pmap and | |
477 | * then do operations in it that require pv_list locking -- however | |
478 | * pmap_remove_all and pmap_copy_on_write operate on a physical page | |
479 | * basis and want to do the locking in the reverse order, i.e. lock | |
480 | * a pv_list and then go through all the pmaps referenced by that list. | |
481 | * | |
482 | * The system wide pmap lock has been removed. Now, paths take a lock | |
483 | * on the pmap before changing its 'shape' and the reverse order lockers | |
484 | * (coming in by phys ppn) take a lock on the corresponding pv and then | |
485 | * retest to be sure nothing changed during the window before they locked | |
486 | * and can then run up/down the pv lists holding the list lock. This also | |
487 | * lets the pmap layer run (nearly completely) interrupt enabled, unlike | |
488 | * previously. | |
489 | */ | |
490 | ||
491 | /* | |
492 | * PV locking | |
493 | */ | |
494 | ||
0a7de745 A |
/*
 * Disable preemption, then take the pv_lock_table bit lock for page index
 * `index`.  Expands to a single statement (do { } while (0)) so it can be
 * used as the unbraced body of an if/else; the caller supplies the
 * terminating semicolon.
 */
#define LOCK_PVH(index)                 \
	do {                            \
	        mp_disable_preemption(); \
	        lock_pvh_pai(index);    \
	} while (0)
499 | ||
0a7de745 A |
/*
 * Release the pv_lock_table bit lock for page index `index`, then re-enable
 * preemption (the reverse of LOCK_PVH).  Expands to a single statement
 * (do { } while (0)) so it can be used as the unbraced body of an if/else;
 * the caller supplies the terminating semicolon.
 */
#define UNLOCK_PVH(index)               \
	do {                            \
	        unlock_pvh_pai(index);  \
	        mp_enable_preemption(); \
	} while (0)
b7266188 | 504 | |
b7266188 A |
505 | extern uint64_t pde_mapped_size; |
506 | ||
0a7de745 A |
507 | extern char *pmap_phys_attributes; |
508 | extern ppnum_t last_managed_page; | |
b7266188 | 509 | |
0a7de745 A |
510 | extern ppnum_t lowest_lo; |
511 | extern ppnum_t lowest_hi; | |
512 | extern ppnum_t highest_hi; | |
060df5ea | 513 | |
b7266188 A |
514 | /* |
515 | * when spinning through pmap_remove | |
516 | * ensure that we don't spend too much | |
517 | * time with preemption disabled. | |
518 | * I'm setting the current threshold | |
519 | * to 20us | |
520 | */ | |
521 | #define MAX_PREEMPTION_LATENCY_NS 20000 | |
522 | extern uint64_t max_preemption_latency_tsc; | |
523 | ||
5c9f4661 A |
524 | #if DEBUG |
525 | #define PMAP_INTR_DEBUG (1) | |
526 | #endif | |
527 | ||
/*
 * pmap_intr_assert(): with PMAP_INTR_DEBUG, panic if more than one processor
 * is available and interrupts are disabled at the call site; otherwise a
 * no-op.  Both variants expand to a single statement (do { } while (0)) so
 * the macro is safe as the unbraced body of an if/else; the caller supplies
 * the terminating semicolon.
 */
#if PMAP_INTR_DEBUG
#define pmap_intr_assert()                                                      \
	do {                                                                    \
	        if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) { \
	                panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \
	        }                                                               \
	} while (0)
#else
#define pmap_intr_assert()      do { } while (0)
#endif
0a7de745 A |
536 | #if DEVELOPMENT || DEBUG |
537 | extern int nx_enabled; | |
538 | #endif | |
6d2010ae | 539 | extern unsigned int inuse_ptepages_count; |
b7266188 A |
540 | |
541 | static inline uint32_t | |
542 | pvhashidx(pmap_t pmap, vm_map_offset_t va) | |
543 | { | |
fe8ab488 | 544 | uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^ |
0a7de745 A |
545 | ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & |
546 | npvhashmask; | |
547 | return hashidx; | |
b7266188 A |
548 | } |
549 | ||
550 | /* | |
551 | * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. | |
552 | * properly deals with the anchor. | |
553 | * must be called with the hash locked, does not unlock it | |
554 | */ | |
0a7de745 | 555 | static inline void |
b7266188 A |
556 | pmap_pvh_unlink(pv_hashed_entry_t pvh) |
557 | { | |
0a7de745 A |
558 | pv_hashed_entry_t curh; |
559 | pv_hashed_entry_t *pprevh; | |
560 | int pvhash_idx; | |
b7266188 A |
561 | |
562 | CHK_NPVHASH(); | |
39037602 | 563 | pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh)); |
b7266188 A |
564 | |
565 | pprevh = pvhash(pvhash_idx); | |
566 | ||
567 | #if PV_DEBUG | |
0a7de745 | 568 | if (NULL == *pprevh) { |
b7266188 | 569 | panic("pvh_unlink null anchor"); /* JK DEBUG */ |
0a7de745 | 570 | } |
b7266188 A |
571 | #endif |
572 | curh = *pprevh; | |
573 | ||
574 | while (PV_HASHED_ENTRY_NULL != curh) { | |
0a7de745 | 575 | if (pvh == curh) { |
b7266188 | 576 | break; |
0a7de745 | 577 | } |
b7266188 A |
578 | pprevh = &curh->nexth; |
579 | curh = curh->nexth; | |
580 | } | |
0a7de745 A |
581 | if (PV_HASHED_ENTRY_NULL == curh) { |
582 | panic("pmap_pvh_unlink no pvh"); | |
583 | } | |
b7266188 A |
584 | *pprevh = pvh->nexth; |
585 | return; | |
586 | } | |
587 | ||
588 | static inline void | |
0a7de745 A |
589 | pv_hash_add(pv_hashed_entry_t pvh_e, |
590 | pv_rooted_entry_t pv_h) | |
b7266188 A |
591 | { |
592 | pv_hashed_entry_t *hashp; | |
593 | int pvhash_idx; | |
594 | ||
595 | CHK_NPVHASH(); | |
39037602 | 596 | pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); |
b7266188 A |
597 | LOCK_PV_HASH(pvhash_idx); |
598 | insque(&pvh_e->qlink, &pv_h->qlink); | |
599 | hashp = pvhash(pvhash_idx); | |
600 | #if PV_DEBUG | |
0a7de745 | 601 | if (NULL == hashp) { |
b7266188 | 602 | panic("pv_hash_add(%p) null hash bucket", pvh_e); |
0a7de745 | 603 | } |
b7266188 A |
604 | #endif |
605 | pvh_e->nexth = *hashp; | |
606 | *hashp = pvh_e; | |
607 | UNLOCK_PV_HASH(pvhash_idx); | |
608 | } | |
609 | ||
610 | static inline void | |
611 | pv_hash_remove(pv_hashed_entry_t pvh_e) | |
612 | { | |
613 | int pvhash_idx; | |
614 | ||
615 | CHK_NPVHASH(); | |
0a7de745 | 616 | pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); |
b7266188 A |
617 | LOCK_PV_HASH(pvhash_idx); |
618 | remque(&pvh_e->qlink); | |
619 | pmap_pvh_unlink(pvh_e); | |
620 | UNLOCK_PV_HASH(pvhash_idx); | |
0a7de745 | 621 | } |
b7266188 | 622 | |
0a7de745 A |
623 | static inline boolean_t |
624 | popcnt1(uint64_t distance) | |
625 | { | |
626 | return (distance & (distance - 1)) == 0; | |
b7266188 A |
627 | } |
628 | ||
629 | /* | |
630 | * Routines to handle suppression of/recovery from some forms of pagetable corruption | |
631 | * incidents observed in the field. These can be either software induced (wild | |
632 | * stores to the mapwindows where applicable, use after free errors | |
633 | * (typically of pages addressed physically), mis-directed DMAs etc.), or due | |
634 | * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors, | |
635 | * the recording mechanism is deliberately not MP-safe. The overarching goal is to | |
636 | * still assert on potential software races, but attempt recovery from incidents | |
637 | * identifiable as occurring due to issues beyond the control of the pmap module. | |
638 | * The latter includes single-bit errors and malformed pagetable entries. | |
639 | * We currently limit ourselves to recovery/suppression of one incident per | |
640 | * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident | |
641 | * are logged. | |
642 | * Assertions are not suppressed if kernel debugging is enabled. (DRK 09) | |
643 | */ | |
644 | ||
/*
 * Reason codes stored in the `reason` field of a
 * pmap_pagetable_corruption_record_t.  Every non-zero code is a distinct
 * single-bit value.
 */
typedef enum {
	PTE_VALID                = 0,
	PTE_INVALID              = 1 << 0,      /* 0x01 */
	PTE_RSVD                 = 1 << 1,      /* 0x02 */
	PTE_SUPERVISOR           = 1 << 2,      /* 0x04 */
	PTE_BITFLIP              = 1 << 3,      /* 0x08 */
	PV_BITFLIP               = 1 << 4,      /* 0x10 */
	PTE_INVALID_CACHEABILITY = 1 << 5       /* 0x20 */
} pmap_pagetable_corruption_t;
654 | ||
/*
 * Incident kind stored in the `incident` field of a
 * pmap_pagetable_corruption_record_t.
 */
typedef enum {
	ROOT_PRESENT    = 0,
	ROOT_ABSENT     = 1
} pmap_pv_assertion_t;
659 | ||
/*
 * Action codes returned by pmap_classify_pagetable_corruption() and stored
 * in the `action` field of a pmap_pagetable_corruption_record_t.
 */
typedef enum {
	PMAP_ACTION_IGNORE       = 0,
	PMAP_ACTION_ASSERT       = 1 << 0,      /* 0x1 */
	PMAP_ACTION_RETRY        = 1 << 1,      /* 0x2 */
	PMAP_ACTION_RETRY_RELOCK = 1 << 2       /* 0x4 */
} pmap_pagetable_corruption_action_t;
666 | ||
0a7de745 | 667 | #define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) |
b7266188 A |
668 | extern uint64_t pmap_pagetable_corruption_interval_abstime; |
669 | ||
670 | extern uint32_t pmap_pagetable_corruption_incidents; | |
671 | #define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8) | |
672 | typedef struct { | |
673 | pmap_pv_assertion_t incident; | |
674 | pmap_pagetable_corruption_t reason; | |
675 | pmap_pagetable_corruption_action_t action; | |
0a7de745 | 676 | pmap_t pmap; |
b7266188 A |
677 | vm_map_offset_t vaddr; |
678 | pt_entry_t pte; | |
679 | ppnum_t ppn; | |
680 | pmap_t pvpmap; | |
681 | vm_map_offset_t pvva; | |
682 | uint64_t abstime; | |
683 | } pmap_pagetable_corruption_record_t; | |
684 | ||
685 | extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; | |
686 | extern uint64_t pmap_pagetable_corruption_last_abstime; | |
0a7de745 | 687 | extern thread_call_t pmap_pagetable_corruption_log_call; |
b7266188 A |
688 | extern boolean_t pmap_pagetable_corruption_timeout; |
689 | ||
690 | static inline void | |
0a7de745 A |
691 | pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) |
692 | { | |
b7266188 A |
693 | uint32_t pmap_pagetable_corruption_log_index; |
694 | pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; | |
695 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; | |
696 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; | |
697 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action; | |
698 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap; | |
699 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr; | |
700 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep; | |
701 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; | |
702 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; | |
703 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; | |
704 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); | |
705 | /* Asynchronously log */ | |
706 | thread_call_enter(pmap_pagetable_corruption_log_call); | |
707 | } | |
708 | ||
709 | static inline pmap_pagetable_corruption_action_t | |
0a7de745 A |
710 | pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) |
711 | { | |
712 | pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; | |
713 | pmap_pagetable_corruption_t suppress_reason = PTE_VALID; | |
714 | ppnum_t suppress_ppn = 0; | |
b7266188 | 715 | pt_entry_t cpte = *ptep; |
0a7de745 A |
716 | ppnum_t cpn = pa_index(pte_to_pa(cpte)); |
717 | ppnum_t ppn = *ppnp; | |
718 | pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); | |
719 | pv_rooted_entry_t pv_e = pv_h; | |
720 | uint32_t bitdex; | |
b7266188 | 721 | pmap_t pvpmap = pv_h->pmap; |
39037602 | 722 | vm_map_offset_t pvva = PVE_VA(pv_h); |
d190cdc3 | 723 | vm_map_offset_t pve_flags; |
b7266188 | 724 | boolean_t ppcd = FALSE; |
3e170ce0 | 725 | boolean_t is_ept; |
b7266188 A |
726 | |
727 | /* Ideally, we'd consult the Mach VM here to definitively determine | |
728 | * the nature of the mapping for this address space and address. | |
729 | * As that would be a layering violation in this context, we | |
730 | * use various heuristics to recover from single bit errors, | |
731 | * malformed pagetable entries etc. These are not intended | |
732 | * to be comprehensive. | |
733 | */ | |
734 | ||
735 | /* As a precautionary measure, mark A+D */ | |
736 | pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED); | |
3e170ce0 | 737 | is_ept = is_ept_pmap(pmap); |
b7266188 A |
738 | |
739 | /* | |
740 | * Correct potential single bit errors in either (but not both) element | |
741 | * of the PV | |
742 | */ | |
743 | do { | |
39037602 A |
744 | if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) || |
745 | (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) { | |
d190cdc3 | 746 | pve_flags = PVE_FLAGS(pv_e); |
b7266188 | 747 | pv_e->pmap = pmap; |
d190cdc3 | 748 | pv_h->va_and_flags = vaddr | pve_flags; |
b7266188 A |
749 | suppress_reason = PV_BITFLIP; |
750 | action = PMAP_ACTION_RETRY; | |
751 | goto pmap_cpc_exit; | |
752 | } | |
316670eb | 753 | } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h)); |
b7266188 A |
754 | |
755 | /* Discover root entries with a Hamming | |
756 | * distance of 1 from the supplied | |
757 | * physical page frame. | |
758 | */ | |
759 | for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) { | |
760 | ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); | |
761 | if (IS_MANAGED_PAGE(npn)) { | |
762 | pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); | |
39037602 | 763 | if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) { |
b7266188 A |
764 | suppress_reason = PTE_BITFLIP; |
765 | suppress_ppn = npn; | |
766 | action = PMAP_ACTION_RETRY_RELOCK; | |
767 | UNLOCK_PVH(ppn_to_pai(ppn)); | |
768 | *ppnp = npn; | |
769 | goto pmap_cpc_exit; | |
770 | } | |
771 | } | |
772 | } | |
773 | ||
774 | if (pmap == kernel_pmap) { | |
775 | action = PMAP_ACTION_ASSERT; | |
776 | goto pmap_cpc_exit; | |
777 | } | |
778 | ||
3e170ce0 A |
779 | /* |
780 | * Check for malformed/inconsistent entries. | |
781 | * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0 | |
782 | */ | |
0a7de745 | 783 | if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PAT)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) { |
b7266188 A |
784 | action = PMAP_ACTION_IGNORE; |
785 | suppress_reason = PTE_INVALID_CACHEABILITY; | |
0a7de745 | 786 | } else if (cpte & INTEL_PTE_RSVD) { |
b7266188 A |
787 | action = PMAP_ACTION_IGNORE; |
788 | suppress_reason = PTE_RSVD; | |
0a7de745 | 789 | } else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) { |
b7266188 A |
790 | action = PMAP_ACTION_IGNORE; |
791 | suppress_reason = PTE_SUPERVISOR; | |
792 | } | |
793 | pmap_cpc_exit: | |
794 | PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd)); | |
795 | ||
796 | if (debug_boot_arg && !ppcd) { | |
797 | action = PMAP_ACTION_ASSERT; | |
798 | } | |
799 | ||
800 | if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { | |
801 | action = PMAP_ACTION_ASSERT; | |
802 | pmap_pagetable_corruption_timeout = TRUE; | |
0a7de745 | 803 | } else { |
b7266188 A |
804 | pmap_pagetable_corruption_last_abstime = mach_absolute_time(); |
805 | } | |
806 | pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); | |
807 | return action; | |
808 | } | |
6d2010ae | 809 | |
b7266188 A |
/*
 *	Remove pv list entry.
 *	Called with pv_head_table entry locked.
 *	Returns pv entry to be freed (or NULL).
 *
 *	pmap, vaddr	mapping whose PV entry is to be removed
 *	ppnp		in/out physical page number; may be rewritten by
 *			pmap_classify_pagetable_corruption() before a retry
 *	pte		PTE for the mapping; read for classification and
 *			panic messages
 *	was_altacct	out: IS_ALTACCT_PAGE() of the entry that was removed,
 *			FALSE if none was
 *
 *	If the expected entry is missing, the corruption classifier decides
 *	whether to panic, give up (returning NULL), or retry the lookup.
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t           pmap,
    vm_map_offset_t  vaddr,
    ppnum_t          *ppnp,
    pt_entry_t       *pte,
    boolean_t        *was_altacct)
{
	pv_hashed_entry_t       pvh_e;
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       *pprevh;
	int                     pvhash_idx;
	uint32_t                pv_cnt;
	ppnum_t                 ppn;

	*was_altacct = FALSE;
pmap_pv_remove_retry:
	/* Re-read *ppnp on every pass: the classifier may have corrected it. */
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	/* The page has no root PV entry at all. */
	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE) {
			goto pmap_pv_remove_exit;
		} else if (pac == PMAP_ACTION_ASSERT) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents);
		} else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			/*
			 * The classifier dropped the PVH lock of the old page
			 * and stored a different page number in *ppnp; lock
			 * that page before retrying.
			 */
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		} else if (pac == PMAP_ACTION_RETRY) {
			goto pmap_pv_remove_retry;
		}
	}

	if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
		*was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h);
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				    "empty hash, removing rooted, priors: %d",
				    pmap, vaddr, ppn, pmap_pagetable_corruption_incidents);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			/* Promote the queued entry's identity into the root. */
			pv_h->pmap = pvh_e->pmap;
			pv_h->va_and_flags = pvh_e->va_and_flags;
			/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d",
			    pmap, vaddr, ppn, *pte, pte, pmap_pagetable_corruption_incidents);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		/* Walk the bucket, keeping pprevh on the link that points at pvh_e. */
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    PVE_VA(pvh_e) == vaddr &&
			    pvh_e->ppn == ppn) {
				break;
			}
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		/* Root exists but the expected entry is not on its hash chain. */
		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT) {
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents);
			} else {
				/* Every non-panic outcome leaves this bucket; drop its lock first. */
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					/* PVH lock of the old page was released by the classifier. */
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		*was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e);

		/* Hash-chain length statistics. */
		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt) {
			pmap_pv_hashlist_max = pv_cnt;
		}
		/* Unlink from the hash chain, then from the per-page queue. */
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}
944 | ||
d190cdc3 A |
945 | static inline __attribute__((always_inline)) boolean_t |
946 | pmap_pv_is_altacct( | |
0a7de745 A |
947 | pmap_t pmap, |
948 | vm_map_offset_t vaddr, | |
949 | ppnum_t ppn) | |
d190cdc3 A |
950 | { |
951 | pv_hashed_entry_t pvh_e; | |
0a7de745 | 952 | pv_rooted_entry_t pv_h; |
d190cdc3 | 953 | int pvhash_idx; |
0a7de745 | 954 | boolean_t is_altacct; |
d190cdc3 A |
955 | |
956 | pvh_e = PV_HASHED_ENTRY_NULL; | |
957 | pv_h = pai_to_pvh(ppn_to_pai(ppn)); | |
958 | ||
959 | if (__improbable(pv_h->pmap == PMAP_NULL)) { | |
960 | return FALSE; | |
961 | } | |
962 | ||
963 | if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { | |
964 | /* | |
0a7de745 A |
965 | * Header is the pv_rooted_entry. |
966 | */ | |
d190cdc3 A |
967 | return IS_ALTACCT_PAGE(ppn, pv_h); |
968 | } | |
969 | ||
970 | CHK_NPVHASH(); | |
971 | pvhash_idx = pvhashidx(pmap, vaddr); | |
972 | LOCK_PV_HASH(pvhash_idx); | |
973 | pvh_e = *(pvhash(pvhash_idx)); | |
d190cdc3 A |
974 | while (PV_HASHED_ENTRY_NULL != pvh_e) { |
975 | if (pvh_e->pmap == pmap && | |
976 | PVE_VA(pvh_e) == vaddr && | |
0a7de745 | 977 | pvh_e->ppn == ppn) { |
d190cdc3 | 978 | break; |
0a7de745 | 979 | } |
d190cdc3 A |
980 | pvh_e = pvh_e->nexth; |
981 | } | |
982 | if (PV_HASHED_ENTRY_NULL == pvh_e) { | |
983 | is_altacct = FALSE; | |
984 | } else { | |
985 | is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e); | |
986 | } | |
987 | UNLOCK_PV_HASH(pvhash_idx); | |
988 | ||
989 | return is_altacct; | |
990 | } | |
6d2010ae | 991 | |
0a7de745 | 992 | extern int pt_fake_zone_index; |
/* Credit `bytes` to the tkm_private ledger entry of pmap. */
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
}
998 | ||
/* Debit `bytes` from the tkm_private ledger entry of pmap. */
static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
}
1004 | ||
316670eb A |
/* Credit `bytes` to the tkm_shared ledger entry of pmap. */
static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}
1010 | ||
/* Debit `bytes` from the tkm_shared ledger entry of pmap. */
static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}
1016 | ||
extern boolean_t pmap_initialized;/* Has pmap_init completed? */
/* True only when pmap_initialized is set and pmap_valid_page(x) accepts x. */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))
1019 | ||
0a7de745 A |
1020 | int phys_attribute_test( |
1021 | ppnum_t phys, | |
1022 | int bits); | |
1023 | void phys_attribute_clear( | |
1024 | ppnum_t phys, | |
1025 | int bits, | |
1026 | unsigned int options, | |
1027 | void *arg); | |
6d2010ae A |
1028 | |
//#define PCID_DEBUG 1
/*
 * pmap_pcid_log(): when PCID_DEBUG is non-zero, pass the format string and
 * arguments to both kprintf() and printf(). Otherwise the macro expands to
 * nothing, so its arguments are not evaluated.
 */
#if     PCID_DEBUG
#define pmap_pcid_log(fmt, args...)                                     \
	do {                                                            \
	        kprintf(fmt, ##args);                                   \
	        printf(fmt, ##args);                                    \
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
0a7de745 | 1039 | void pmap_pcid_configure(void); |
6d2010ae | 1040 | |
316670eb A |
1041 | |
1042 | /* | |
1043 | * Atomic 64-bit compare and exchange of a page table entry. | |
1044 | */ | |
0a7de745 A |
1045 | |
1046 | #include <machine/atomic.h> | |
316670eb A |
1047 | static inline boolean_t |
1048 | pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) | |
1049 | { | |
0a7de745 A |
1050 | return __c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)entryp, &old, new, |
1051 | memory_order_acq_rel_smp, memory_order_relaxed); | |
316670eb A |
1052 | } |
1053 | ||
1054 | extern uint32_t pmap_update_clear_pte_count; | |
1055 | ||
0a7de745 A |
1056 | static inline void |
1057 | pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) | |
1058 | { | |
316670eb A |
1059 | pt_entry_t npte, opte; |
1060 | do { | |
1061 | opte = *mptep; | |
1062 | if (__improbable(opte == 0)) { | |
0a7de745 | 1063 | #if DEVELOPMENT || DEBUG |
316670eb | 1064 | pmap_update_clear_pte_count++; |
0a7de745 | 1065 | #endif |
316670eb A |
1066 | break; |
1067 | } | |
1068 | npte = opte & ~(pclear_bits); | |
1069 | npte |= pset_bits; | |
0a7de745 | 1070 | } while (!pmap_cmpx_pte(mptep, opte, npte)); |
316670eb A |
1071 | } |
1072 | ||
6d2010ae A |
1073 | /* |
1074 | * The single pml4 page per pmap is allocated at pmap create time and exists | |
1075 | * for the duration of the pmap. we allocate this page in kernel vm. | |
1076 | * this returns the address of the requested pml4 entry in the top level page. | |
1077 | */ | |
1078 | static inline | |
1079 | pml4_entry_t * | |
1080 | pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) | |
1081 | { | |
316670eb | 1082 | if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && |
0a7de745 A |
1083 | (vaddr < 0xFFFF800000000000ULL))) { |
1084 | return NULL; | |
316670eb A |
1085 | } |
1086 | ||
0a7de745 A |
1087 | #if DEBUG |
1088 | return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]); | |
6d2010ae | 1089 | #else |
0a7de745 | 1090 | return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]; |
6d2010ae A |
1091 | #endif |
1092 | } | |
1093 | ||
5c9f4661 A |
1094 | static inline pml4_entry_t * |
1095 | pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr) | |
1096 | { | |
1097 | if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && | |
0a7de745 A |
1098 | (vaddr < 0xFFFF800000000000ULL))) { |
1099 | return NULL; | |
5c9f4661 A |
1100 | } |
1101 | ||
0a7de745 A |
1102 | #if DEBUG |
1103 | return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]); | |
5c9f4661 | 1104 | #else |
0a7de745 | 1105 | return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]; |
5c9f4661 A |
1106 | #endif |
1107 | } | |
1108 | ||
6d2010ae A |
1109 | /* |
1110 | * Returns address of requested PDPT entry in the physmap. | |
1111 | */ | |
1112 | static inline pdpt_entry_t * | |
1113 | pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) | |
1114 | { | |
0a7de745 A |
1115 | pml4_entry_t newpf; |
1116 | pml4_entry_t *pml4; | |
1117 | boolean_t is_ept; | |
6d2010ae | 1118 | |
6d2010ae | 1119 | pml4 = pmap64_pml4(pmap, vaddr); |
3e170ce0 A |
1120 | is_ept = is_ept_pmap(pmap); |
1121 | ||
1122 | if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) { | |
6d2010ae A |
1123 | newpf = *pml4 & PG_FRAME; |
1124 | return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) | |
0a7de745 | 1125 | [(vaddr >> PDPTSHIFT) & (NPDPTPG - 1)]; |
6d2010ae | 1126 | } |
0a7de745 | 1127 | return NULL; |
6d2010ae A |
1128 | } |
1129 | /* | |
1130 | * Returns the address of the requested PDE entry in the physmap. | |
1131 | */ | |
1132 | static inline pd_entry_t * | |
0a7de745 | 1133 | pmap_pde_internal1(vm_map_offset_t vaddr, boolean_t is_ept, pdpt_entry_t *pdpte) |
6d2010ae | 1134 | { |
0a7de745 A |
1135 | if (*pdpte & PTE_VALID_MASK(is_ept)) { |
1136 | pdpt_entry_t newpf = *pdpte & PG_FRAME; | |
1137 | return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) | |
1138 | [(vaddr >> PDSHIFT) & (NPDPG - 1)]; | |
1139 | } else { | |
1140 | return NULL; | |
1141 | } | |
1142 | } | |
6d2010ae | 1143 | |
0a7de745 A |
1144 | static inline pd_entry_t * |
1145 | pmap_pde_internal0(pmap_t pmap, vm_map_offset_t vaddr, boolean_t is_ept) | |
1146 | { | |
1147 | pdpt_entry_t *pdpt; | |
6d2010ae | 1148 | |
0a7de745 A |
1149 | pdpt = pmap64_pdpt(pmap, vaddr); |
1150 | if (pdpt) { | |
1151 | return pmap_pde_internal1(vaddr, is_ept, pdpt); | |
1152 | } else { | |
1153 | return NULL; | |
6d2010ae | 1154 | } |
6d2010ae A |
1155 | } |
1156 | ||
0a7de745 A |
1157 | |
1158 | static inline pd_entry_t * | |
1159 | pmap_pde(pmap_t pmap, vm_map_offset_t vaddr) | |
6d2010ae | 1160 | { |
0a7de745 A |
1161 | pdpt_entry_t *pdpt; |
1162 | boolean_t is_ept; | |
6d2010ae | 1163 | |
0a7de745 A |
1164 | pdpt = pmap64_pdpt(pmap, vaddr); |
1165 | is_ept = is_ept_pmap(pmap); | |
6d2010ae | 1166 | |
0a7de745 A |
1167 | if (pdpt) { |
1168 | return pmap_pde_internal1(vaddr, is_ept, pdpt); | |
1169 | } else { | |
1170 | return NULL; | |
1171 | } | |
6d2010ae A |
1172 | } |
1173 | ||
1174 | ||
1175 | /* | |
1176 | * return address of mapped pte for vaddr va in pmap pmap. | |
1177 | * | |
1178 | * In case the pde maps a superpage, return the pde, which, in this case | |
1179 | * is the actual page table entry. | |
1180 | */ | |
0a7de745 A |
1181 | |
1182 | ||
1183 | static inline pt_entry_t * | |
1184 | pmap_pte_internal(vm_map_offset_t vaddr, boolean_t is_ept, pd_entry_t *pde) | |
1185 | { | |
1186 | if (*pde & PTE_VALID_MASK(is_ept)) { | |
1187 | if (__improbable(*pde & PTE_PS)) { | |
1188 | return pde; | |
1189 | } | |
1190 | pd_entry_t newpf = *pde & PG_FRAME; | |
1191 | ||
1192 | return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) | |
1193 | [i386_btop(vaddr) & (ppnum_t)(NPTEPG - 1)]; | |
1194 | } else { | |
1195 | return NULL; | |
1196 | } | |
1197 | } | |
1198 | ||
6d2010ae A |
1199 | static inline pt_entry_t * |
1200 | pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) | |
1201 | { | |
0a7de745 | 1202 | pd_entry_t *pde; |
6d2010ae | 1203 | |
0a7de745 | 1204 | boolean_t is_ept; |
6d2010ae | 1205 | |
3e170ce0 A |
1206 | is_ept = is_ept_pmap(pmap); |
1207 | ||
0a7de745 A |
1208 | pde = pmap_pde_internal0(pmap, vaddr, is_ept); |
1209 | ||
1210 | if (pde) { | |
1211 | return pmap_pte_internal(vaddr, is_ept, pde); | |
1212 | } else { | |
1213 | return NULL; | |
6d2010ae | 1214 | } |
6d2010ae | 1215 | } |
0a7de745 A |
1216 | |
1217 | extern void pmap_alias( | |
1218 | vm_offset_t ava, | |
1219 | vm_map_offset_t start, | |
1220 | vm_map_offset_t end, | |
1221 | vm_prot_t prot, | |
1222 | unsigned int options); | |
1223 | ||
/*
 * DPRINTF(): forwards its arguments to kprintf() on DEBUG builds; expands to
 * nothing otherwise, so its arguments are not evaluated.
 */
#if     DEBUG
#define DPRINTF(x...)   kprintf(x)
#else
#define DPRINTF(x...)
#endif
1229 | ||
b0d623f7 | 1230 | #endif /* MACH_KERNEL_PRIVATE */ |
316670eb | 1231 | #endif /* _I386_PMAP_INTERNAL_ */ |