]>
Commit | Line | Data |
---|---|---|
1c79356b | 1 | /* |
b0d623f7 | 2 | * Copyright (c) 2000-2009 Apple Inc. All rights reserved. |
1c79356b | 3 | * |
2d21ac55 | 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
1c79356b | 5 | * |
2d21ac55 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
8f6c56a5 | 14 | * |
2d21ac55 A |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
2d21ac55 A |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
8f6c56a5 | 25 | * |
2d21ac55 | 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
1c79356b | 27 | */ |
1c79356b | 28 | |
1c79356b | 29 | |
316670eb A |
30 | #ifndef _I386_PMAP_INTERNAL_ |
31 | #define _I386_PMAP_INTERNAL_ | |
b0d623f7 | 32 | #ifdef MACH_KERNEL_PRIVATE |
1c79356b | 33 | |
316670eb A |
34 | #include <vm/pmap.h> |
35 | #include <sys/kdebug.h> | |
36 | #include <kern/ledger.h> | |
37 | ||
b0d623f7 A |
38 | /* |
39 | * pmap locking | |
2d21ac55 | 40 | */ |
0b4e3aa0 | 41 | |
b0d623f7 A |
/* Take the simple lock embedded in the given pmap. */
#define PMAP_LOCK(pmap)		{ simple_lock(&(pmap)->lock); }

/* Drop the simple lock embedded in the given pmap. */
#define PMAP_UNLOCK(pmap)	{ simple_unlock(&(pmap)->lock); }

/* Expands to a pmap_flush_tlbs() call with the same three arguments. */
#define PMAP_UPDATE_TLBS(pmap, s, e) pmap_flush_tlbs(pmap, s, e)

/* Non-zero when the INTEL_PTE_WIRED bit is set in pte. */
#define iswired(pte)		((pte) & INTEL_PTE_WIRED)
1c79356b | 54 | |
b0d623f7 A |
/*
 * Tracing hooks.
 *
 * With PMAP_TRACES defined, PMAP_TRACE emits a KERNEL_DEBUG_CONSTANT event
 * only while the pmap_trace flag is non-zero; without it, PMAP_TRACE is a
 * plain alias for KERNEL_DEBUG.
 */
#ifdef	PMAP_TRACES
extern	boolean_t	pmap_trace;
/*
 * The do/while(0) wrapper makes the expansion a single statement, so the
 * inner "if" can never capture an "else" belonging to the call site.
 */
#define PMAP_TRACE(x,a,b,c,d,e)					\
	do {							\
		if (pmap_trace) {				\
			KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);	\
		}						\
	} while (0)
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

/*
 * Unconditional trace point. The expansion ends with its own semicolon;
 * call sites may or may not add another. No line continuation follows the
 * last line, so text added below this macro cannot become part of it.
 */
#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)				\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);
67 | ||
316670eb | 68 | kern_return_t pmap_expand_pml4( |
b0d623f7 | 69 | pmap_t map, |
316670eb A |
70 | vm_map_offset_t v, |
71 | unsigned int options); | |
b0d623f7 | 72 | |
316670eb | 73 | kern_return_t pmap_expand_pdpt( |
b0d623f7 | 74 | pmap_t map, |
316670eb A |
75 | vm_map_offset_t v, |
76 | unsigned int options); | |
b7266188 | 77 | |
6d2010ae A |
78 | void phys_attribute_set( |
79 | ppnum_t phys, | |
80 | int bits); | |
81 | ||
82 | void pmap_set_reference( | |
83 | ppnum_t pn); | |
84 | ||
85 | boolean_t phys_page_exists( | |
86 | ppnum_t pn); | |
87 | ||
88 | void pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t); | |
89 | ||
90 | void | |
91 | pmap_update_cache_attributes_locked(ppnum_t, unsigned); | |
92 | ||
93 | #if CONFIG_YONAH | |
b0d623f7 | 94 | extern boolean_t cpu_64bit; |
6d2010ae A |
95 | #else |
96 | extern const boolean_t cpu_64bit; | |
2d21ac55 | 97 | #endif |
b0d623f7 | 98 | |
b7266188 A |
99 | /* |
100 | * Private data structures. | |
101 | */ | |
102 | ||
103 | /* | |
104 | * For each vm_page_t, there is a list of all currently | |
105 | * valid virtual mappings of that page. An entry is | |
106 | * a pv_rooted_entry_t; the list is the pv_table. | |
107 | * | |
108 | * N.B. with the new combo rooted/hashed scheme it is | |
109 | * only possible to remove individual non-rooted entries | |
110 | * if they are found via the hashed chains as there is no | |
111 | * way to unlink the singly linked hashed entries if navigated to | |
112 | * via the queue list off the rooted entries. Think of it as | |
113 | * hash/walk/pull, keeping track of the prev pointer while walking | |
114 | * the singly linked hash list. All of this is to save memory and | |
115 | * keep both types of pv_entries as small as possible. | |
116 | */ | |
117 | ||
118 | /* | |
119 | ||
120 | PV HASHING Changes - JK 1/2007 | |
121 | ||
122 | Pve's establish physical to virtual mappings. These are used for aliasing of a | |
6d2010ae A |
123 | physical page to (potentially many) virtual addresses within pmaps. In the |
124 | previous implementation the structure of the pv_entries (each 16 bytes in size) was | |
b7266188 A |
125 | |
126 | typedef struct pv_entry { | |
127 | struct pv_entry *next; | |
128 | pmap_t pmap; | |
129 | vm_map_offset_t va; | |
130 | } *pv_entry_t; | |
131 | ||
6d2010ae A |
132 | An initial array of these is created at boot time, one per physical page of |
133 | memory, indexed by the physical page number. Additionally, a pool of entries | |
134 | is created from a pv_zone to be used as needed by pmap_enter() when it is | |
135 | creating new mappings. Originally, we kept this pool around because the code | |
136 | in pmap_enter() was unable to block if it needed an entry and none were | |
137 | available - we'd panic. Some time ago I restructured the pmap_enter() code | |
138 | so that for user pmaps it can block while zalloc'ing a pv structure and restart, | |
139 | removing a panic from the code (in the case of the kernel pmap we cannot block | |
140 | and still panic, so, we keep a separate hot pool for use only on kernel pmaps). | |
141 | The pool has not been removed since there is a large performance gain keeping | |
142 | freed pv's around for reuse and not suffering the overhead of zalloc for every | |
143 | new pv we need. | |
144 | ||
145 | As pmap_enter() created new mappings it linked the new pve's for them off the | |
146 | fixed pv array for that ppn (off the next pointer). These pve's are accessed | |
147 | for several operations, one of them being address space teardown. In that case, | |
148 | we basically do this | |
b7266188 A |
149 | |
150 | for (every page/pte in the space) { | |
151 | calc pve_ptr from the ppn in the pte | |
152 | for (every pv in the list for the ppn) { | |
153 | if (this pv is for this pmap/vaddr) { | |
154 | do housekeeping | |
155 | unlink/free the pv | |
156 | } | |
157 | } | |
158 | } | |
159 | ||
6d2010ae A |
160 | The problem arose when we were running, say 8000 (or even 2000) apache or |
161 | other processes and one or all terminate. The list hanging off each pv array | |
162 | entry could have thousands of entries. We were continuously linearly searching | |
163 | each of these lists as we stepped through the address space we were tearing | |
164 | down. Because of the locks we hold, likely taking a cache miss for each node, | |
165 | and interrupt disabling for MP issues the system became completely unresponsive | |
166 | for many seconds while we did this. | |
167 | ||
168 | Realizing that pve's are accessed in two distinct ways (linearly running the | |
169 | list by ppn for operations like pmap_page_protect and finding and | |
170 | modifying/removing a single pve as part of pmap_enter processing) has led to | |
171 | modifying the pve structures and databases. | |
172 | ||
173 | There are now two types of pve structures. A "rooted" structure which is | |
174 | basically the original structure accessed in an array by ppn, and a "hashed" | |
175 | structure accessed on a hash list via a hash of [pmap, vaddr]. These have been | |
176 | designed with the two goals of minimizing wired memory and making the lookup of | |
177 | a ppn faster. Since a vast majority of pages in the system are not aliased | |
178 | and hence represented by a single pv entry I've kept the rooted entry size as | |
179 | small as possible because there is one of these dedicated for every physical | |
180 | page of memory. The hashed pve's are larger due to the addition of the hash | |
181 | link and the ppn entry needed for matching while running the hash list to find | |
182 | the entry we are looking for. This way, only systems that have lots of | |
183 | aliasing (like 2000+ httpd procs) will pay the extra memory price. Both | |
184 | structures have the same first three fields allowing some simplification in | |
185 | the code. | |
b7266188 A |
186 | |
187 | They have these shapes | |
188 | ||
189 | typedef struct pv_rooted_entry { | |
6d2010ae A |
190 | queue_head_t qlink; |
191 | vm_map_offset_t va; | |
192 | pmap_t pmap; | |
b7266188 A |
193 | } *pv_rooted_entry_t; |
194 | ||
195 | ||
196 | typedef struct pv_hashed_entry { | |
6d2010ae A |
197 | queue_head_t qlink; |
198 | vm_map_offset_t va; | |
199 | pmap_t pmap; | |
200 | ppnum_t ppn; | |
201 | struct pv_hashed_entry *nexth; | |
b7266188 A |
202 | } *pv_hashed_entry_t; |
203 | ||
6d2010ae A |
204 | The main flow difference is that the code is now aware of the rooted entry and |
205 | the hashed entries. Code that runs the pv list still starts with the rooted | |
206 | entry and then continues down the qlink onto the hashed entries. Code that is | |
207 | looking up a specific pv entry first checks the rooted entry and then hashes | |
208 | and runs the hash list for the match. The hash list lengths are much smaller | |
209 | than the original pv lists that contained all aliases for the specific ppn. | |
b7266188 A |
210 | |
211 | */ | |
212 | ||
6d2010ae A |
213 | typedef struct pv_rooted_entry { |
214 | /* first three entries must match pv_hashed_entry_t */ | |
215 | queue_head_t qlink; | |
216 | vm_map_offset_t va; /* virtual address for mapping */ | |
217 | pmap_t pmap; /* pmap where mapping lies */ | |
b7266188 A |
218 | } *pv_rooted_entry_t; |
219 | ||
220 | #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) | |
221 | ||
6d2010ae A |
222 | typedef struct pv_hashed_entry { |
223 | /* first three entries must match pv_rooted_entry_t */ | |
224 | queue_head_t qlink; | |
225 | vm_map_offset_t va; | |
226 | pmap_t pmap; | |
227 | ppnum_t ppn; | |
228 | struct pv_hashed_entry *nexth; | |
b7266188 A |
229 | } *pv_hashed_entry_t; |
230 | ||
231 | #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) | |
232 | ||
//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */

/*
 * CHK_NPVHASH: under PV_DEBUG, panic when npvhash is still zero; otherwise
 * expand to nothing.
 *
 * Both variants take the same (ignored) parameter so that a call site
 * compiles identically whichever variant is active; an empty argument,
 * as in CHK_NPVHASH(), is accepted. The debug variant is wrapped in
 * do/while(0) so that it is a single statement and cannot capture a
 * caller's "else".
 */
#ifdef PV_DEBUG
#define CHK_NPVHASH(x)						\
	do {							\
		if (0 == npvhash)				\
			panic("npvhash uninitialized");		\
	} while (0)
#else
#define CHK_NPVHASH(x)
#endif
239 | ||
240 | #define NPVHASH 4095 /* MUST BE 2^N - 1 */ | |
6d2010ae A |
241 | #define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 |
242 | #define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 | |
243 | #define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 | |
244 | #define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 | |
245 | ||
246 | extern volatile uint32_t mappingrecurse; | |
247 | extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; | |
248 | ||
249 | /* | |
250 | * PV hash locking | |
251 | */ | |
252 | ||
253 | #define LOCK_PV_HASH(hash) lock_hash_hash(hash) | |
254 | #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) | |
255 | extern uint32_t npvhash; | |
256 | extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ | |
257 | extern pv_hashed_entry_t pv_hashed_free_list; | |
258 | extern pv_hashed_entry_t pv_hashed_kern_free_list; | |
259 | decl_simple_lock_data(extern, pv_hashed_free_list_lock) | |
260 | decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) | |
261 | decl_simple_lock_data(extern, pv_hash_table_lock) | |
262 | ||
263 | extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry | |
264 | * structures */ | |
265 | ||
266 | extern uint32_t pv_hashed_free_count; | |
267 | extern uint32_t pv_hashed_kern_free_count; | |
268 | /* | |
269 | * Each entry in the pv_head_table is locked by a bit in the | |
270 | * pv_lock_table. The lock bits are accessed by the address of | |
271 | * the frame they lock. | |
272 | */ | |
273 | #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) | |
274 | #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) | |
275 | extern char *pv_lock_table; /* pointer to array of bits */ | |
276 | extern char *pv_hash_lock_table; | |
277 | extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ | |
278 | ||
279 | extern event_t mapping_replenish_event; | |
280 | ||
281 | static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { | |
316670eb | 282 | pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); |
6d2010ae A |
283 | simple_lock(&pv_hashed_free_list_lock); |
284 | /* If the kernel reserved pool is low, let non-kernel mappings allocate | |
285 | * synchronously, possibly subject to a throttle. | |
286 | */ | |
316670eb | 287 | if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) { |
6d2010ae A |
288 | pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; |
289 | pv_hashed_free_count--; | |
290 | } | |
291 | ||
292 | simple_unlock(&pv_hashed_free_list_lock); | |
293 | ||
316670eb | 294 | if (pv_hashed_free_count <= pv_hashed_low_water_mark) { |
6d2010ae A |
295 | if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) |
296 | thread_wakeup(&mapping_replenish_event); | |
297 | } | |
b7266188 A |
298 | } |
299 | ||
6d2010ae A |
300 | static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { |
301 | simple_lock(&pv_hashed_free_list_lock); | |
302 | pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; | |
303 | pv_hashed_free_list = pvh_eh; | |
304 | pv_hashed_free_count += pv_cnt; | |
305 | simple_unlock(&pv_hashed_free_list_lock); | |
b7266188 A |
306 | } |
307 | ||
6d2010ae A |
308 | extern unsigned pmap_kern_reserve_alloc_stat; |
309 | ||
310 | static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { | |
316670eb | 311 | pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); |
6d2010ae A |
312 | simple_lock(&pv_hashed_kern_free_list_lock); |
313 | ||
314 | if ((*pvh_e = pv_hashed_kern_free_list) != 0) { | |
315 | pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; | |
316 | pv_hashed_kern_free_count--; | |
317 | pmap_kern_reserve_alloc_stat++; | |
318 | } | |
319 | ||
320 | simple_unlock(&pv_hashed_kern_free_list_lock); | |
321 | ||
322 | if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { | |
323 | if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) | |
324 | thread_wakeup(&mapping_replenish_event); | |
325 | } | |
b7266188 A |
326 | } |
327 | ||
6d2010ae A |
328 | static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { |
329 | simple_lock(&pv_hashed_kern_free_list_lock); | |
330 | pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; | |
331 | pv_hashed_kern_free_list = pvh_eh; | |
332 | pv_hashed_kern_free_count += pv_cnt; | |
333 | simple_unlock(&pv_hashed_kern_free_list_lock); | |
334 | } | |
335 | ||
336 | extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; | |
337 | extern event_t pmap_user_pv_throttle_event; | |
338 | ||
339 | static inline void pmap_pv_throttle(__unused pmap_t p) { | |
340 | pmap_assert(p != kernel_pmap); | |
341 | /* Apply throttle on non-kernel mappings */ | |
342 | if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { | |
343 | pmap_pv_throttle_stat++; | |
344 | /* This doesn't need to be strictly accurate, merely a hint | |
345 | * to eliminate the timeout when the reserve is replenished. | |
346 | */ | |
347 | pmap_pv_throttled_waiters++; | |
348 | assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC); | |
349 | thread_block(THREAD_CONTINUE_NULL); | |
350 | } | |
b7266188 A |
351 | } |
352 | ||
/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
/*
 * The argument is parenthesized so that the cast applies to the whole
 * expression: ppn_to_pai(pa >> 12) must shift first and narrow second.
 */
#define ppn_to_pai(ppn)		((int)(ppn))

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

/*
 * True when x is no greater than last_managed_page and its attribute byte
 * has PHYS_MANAGED set. x is evaluated twice, so it must be free of side
 * effects.
 */
#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
370 | ||
371 | /* | |
372 | * Physical page attributes. Copy bits from PTE definition. | |
373 | */ | |
374 | #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ | |
375 | #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ | |
376 | #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ | |
0b4c1975 | 377 | #define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ |
6d2010ae A |
378 | #define PHYS_NCACHE INTEL_PTE_NCACHE |
379 | #define PHYS_PTA INTEL_PTE_PTA | |
380 | #define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) | |
b7266188 | 381 | |
316670eb A |
382 | extern const boolean_t pmap_disable_kheap_nx; |
383 | extern const boolean_t pmap_disable_kstack_nx; | |
384 | ||
385 | #define PMAP_EXPAND_OPTIONS_NONE (0x0) | |
386 | #define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) | |
387 | #define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER) | |
388 | ||
b7266188 A |
389 | /* |
390 | * Amount of virtual memory mapped by one | |
391 | * page-directory entry. | |
392 | */ | |
393 | #define PDE_MAPPED_SIZE (pdetova(1)) | |
394 | ||
395 | ||
396 | /* | |
397 | * Locking and TLB invalidation | |
398 | */ | |
399 | ||
400 | /* | |
401 | * Locking Protocols: (changed 2/2007 JK) | |
402 | * | |
403 | * There are two structures in the pmap module that need locking: | |
404 | * the pmaps themselves, and the per-page pv_lists (which are locked | |
405 | * by locking the pv_lock_table entry that corresponds to the pv_head | |
406 | * for the list in question.) Most routines want to lock a pmap and | |
407 | * then do operations in it that require pv_list locking -- however | |
408 | * pmap_remove_all and pmap_copy_on_write operate on a physical page | |
409 | * basis and want to do the locking in the reverse order, i.e. lock | |
410 | * a pv_list and then go through all the pmaps referenced by that list. | |
411 | * | |
412 | * The system wide pmap lock has been removed. Now, paths take a lock | |
413 | * on the pmap before changing its 'shape' and the reverse order lockers | |
414 | * (coming in by phys ppn) take a lock on the corresponding pv and then | |
415 | * retest to be sure nothing changed during the window before they locked | |
416 | * and can then run up/down the pv lists holding the list lock. This also | |
417 | * lets the pmap layer run (nearly completely) interrupt enabled, unlike | |
418 | * previously. | |
419 | */ | |
420 | ||
421 | /* | |
422 | * PV locking | |
423 | */ | |
424 | ||
/* Disable preemption, then take the PV head bit lock for index. */
#define LOCK_PVH(index)		{ mp_disable_preemption(); lock_pvh_pai(index); }

/* Release the PV head bit lock for index, then re-enable preemption. */
#define UNLOCK_PVH(index)	{ unlock_pvh_pai(index); mp_enable_preemption(); }
b7266188 | 434 | |
b7266188 A |
435 | extern uint64_t pde_mapped_size; |
436 | ||
437 | extern char *pmap_phys_attributes; | |
316670eb | 438 | extern ppnum_t last_managed_page; |
b7266188 | 439 | |
060df5ea A |
440 | extern ppnum_t lowest_lo; |
441 | extern ppnum_t lowest_hi; | |
442 | extern ppnum_t highest_hi; | |
443 | ||
b7266188 A |
444 | /* |
445 | * when spinning through pmap_remove | |
446 | * ensure that we don't spend too much | |
447 | * time with preemption disabled. | |
448 | * I'm setting the current threshold | |
449 | * to 20us | |
450 | */ | |
451 | #define MAX_PREEMPTION_LATENCY_NS 20000 | |
452 | extern uint64_t max_preemption_latency_tsc; | |
453 | ||
454 | /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */ | |
455 | #ifdef DEBUGINTERRUPTS | |
456 | #define pmap_intr_assert() { \ | |
457 | if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ | |
458 | panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \ | |
459 | } | |
460 | #else | |
461 | #define pmap_intr_assert() | |
462 | #endif | |
463 | ||
6d2010ae A |
464 | extern int nx_enabled; |
465 | extern unsigned int inuse_ptepages_count; | |
b7266188 A |
466 | |
467 | static inline uint32_t | |
468 | pvhashidx(pmap_t pmap, vm_map_offset_t va) | |
469 | { | |
470 | return ((uint32_t)(uintptr_t)pmap ^ | |
6d2010ae | 471 | ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & |
b7266188 A |
472 | npvhash; |
473 | } | |
474 | ||
6d2010ae | 475 | |
b7266188 A |
476 | /* |
477 | * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. | |
478 | * properly deals with the anchor. | |
479 | * must be called with the hash locked, does not unlock it | |
480 | */ | |
b7266188 A |
481 | static inline void |
482 | pmap_pvh_unlink(pv_hashed_entry_t pvh) | |
483 | { | |
484 | pv_hashed_entry_t curh; | |
485 | pv_hashed_entry_t *pprevh; | |
486 | int pvhash_idx; | |
487 | ||
488 | CHK_NPVHASH(); | |
489 | pvhash_idx = pvhashidx(pvh->pmap, pvh->va); | |
490 | ||
491 | pprevh = pvhash(pvhash_idx); | |
492 | ||
493 | #if PV_DEBUG | |
494 | if (NULL == *pprevh) | |
495 | panic("pvh_unlink null anchor"); /* JK DEBUG */ | |
496 | #endif | |
497 | curh = *pprevh; | |
498 | ||
499 | while (PV_HASHED_ENTRY_NULL != curh) { | |
500 | if (pvh == curh) | |
501 | break; | |
502 | pprevh = &curh->nexth; | |
503 | curh = curh->nexth; | |
504 | } | |
505 | if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh"); | |
506 | *pprevh = pvh->nexth; | |
507 | return; | |
508 | } | |
509 | ||
510 | static inline void | |
511 | pv_hash_add(pv_hashed_entry_t pvh_e, | |
512 | pv_rooted_entry_t pv_h) | |
513 | { | |
514 | pv_hashed_entry_t *hashp; | |
515 | int pvhash_idx; | |
516 | ||
517 | CHK_NPVHASH(); | |
518 | pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); | |
519 | LOCK_PV_HASH(pvhash_idx); | |
520 | insque(&pvh_e->qlink, &pv_h->qlink); | |
521 | hashp = pvhash(pvhash_idx); | |
522 | #if PV_DEBUG | |
523 | if (NULL==hashp) | |
524 | panic("pv_hash_add(%p) null hash bucket", pvh_e); | |
525 | #endif | |
526 | pvh_e->nexth = *hashp; | |
527 | *hashp = pvh_e; | |
528 | UNLOCK_PV_HASH(pvhash_idx); | |
529 | } | |
530 | ||
531 | static inline void | |
532 | pv_hash_remove(pv_hashed_entry_t pvh_e) | |
533 | { | |
534 | int pvhash_idx; | |
535 | ||
536 | CHK_NPVHASH(); | |
537 | pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); | |
538 | LOCK_PV_HASH(pvhash_idx); | |
539 | remque(&pvh_e->qlink); | |
540 | pmap_pvh_unlink(pvh_e); | |
541 | UNLOCK_PV_HASH(pvhash_idx); | |
6d2010ae | 542 | } |
b7266188 A |
543 | |
544 | static inline boolean_t popcnt1(uint64_t distance) { | |
545 | return ((distance & (distance - 1)) == 0); | |
546 | } | |
547 | ||
548 | /* | |
549 | * Routines to handle suppression of/recovery from some forms of pagetable corruption | |
550 | * incidents observed in the field. These can be either software induced (wild | |
551 | * stores to the mapwindows where applicable, use after free errors | |
552 | * (typically of pages addressed physically), mis-directed DMAs etc., or due | |
553 | * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors, | |
554 | * the recording mechanism is deliberately not MP-safe. The overarching goal is to | |
555 | * still assert on potential software races, but attempt recovery from incidents | |
556 | * identifiable as occurring due to issues beyond the control of the pmap module. | |
557 | * The latter includes single-bit errors and malformed pagetable entries. | |
558 | * We currently limit ourselves to recovery/suppression of one incident per | |
559 | * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident | |
560 | * are logged. | |
561 | * Assertions are not suppressed if kernel debugging is enabled. (DRK 09) | |
562 | */ | |
563 | ||
564 | typedef enum { | |
565 | PTE_VALID = 0x0, | |
566 | PTE_INVALID = 0x1, | |
567 | PTE_RSVD = 0x2, | |
568 | PTE_SUPERVISOR = 0x4, | |
569 | PTE_BITFLIP = 0x8, | |
570 | PV_BITFLIP = 0x10, | |
571 | PTE_INVALID_CACHEABILITY = 0x20 | |
572 | } pmap_pagetable_corruption_t; | |
573 | ||
574 | typedef enum { | |
575 | ROOT_PRESENT = 0, | |
576 | ROOT_ABSENT = 1 | |
577 | } pmap_pv_assertion_t; | |
578 | ||
579 | typedef enum { | |
580 | PMAP_ACTION_IGNORE = 0x0, | |
581 | PMAP_ACTION_ASSERT = 0x1, | |
582 | PMAP_ACTION_RETRY = 0x2, | |
583 | PMAP_ACTION_RETRY_RELOCK = 0x4 | |
584 | } pmap_pagetable_corruption_action_t; | |
585 | ||
586 | #define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) | |
587 | extern uint64_t pmap_pagetable_corruption_interval_abstime; | |
588 | ||
589 | extern uint32_t pmap_pagetable_corruption_incidents; | |
590 | #define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8) | |
591 | typedef struct { | |
592 | pmap_pv_assertion_t incident; | |
593 | pmap_pagetable_corruption_t reason; | |
594 | pmap_pagetable_corruption_action_t action; | |
595 | pmap_t pmap; | |
596 | vm_map_offset_t vaddr; | |
597 | pt_entry_t pte; | |
598 | ppnum_t ppn; | |
599 | pmap_t pvpmap; | |
600 | vm_map_offset_t pvva; | |
601 | uint64_t abstime; | |
602 | } pmap_pagetable_corruption_record_t; | |
603 | ||
604 | extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; | |
605 | extern uint64_t pmap_pagetable_corruption_last_abstime; | |
606 | extern thread_call_t pmap_pagetable_corruption_log_call; | |
607 | extern boolean_t pmap_pagetable_corruption_timeout; | |
608 | ||
609 | static inline void | |
610 | pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) { | |
611 | uint32_t pmap_pagetable_corruption_log_index; | |
612 | pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; | |
613 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; | |
614 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; | |
615 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action; | |
616 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap; | |
617 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr; | |
618 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep; | |
619 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; | |
620 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; | |
621 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; | |
622 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); | |
623 | /* Asynchronously log */ | |
624 | thread_call_enter(pmap_pagetable_corruption_log_call); | |
625 | } | |
626 | ||
627 | static inline pmap_pagetable_corruption_action_t | |
628 | pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { | |
316670eb | 629 | pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; |
b7266188 A |
630 | pmap_pagetable_corruption_t suppress_reason = PTE_VALID; |
631 | ppnum_t suppress_ppn = 0; | |
632 | pt_entry_t cpte = *ptep; | |
633 | ppnum_t cpn = pa_index(pte_to_pa(cpte)); | |
634 | ppnum_t ppn = *ppnp; | |
635 | pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); | |
636 | pv_rooted_entry_t pv_e = pv_h; | |
637 | uint32_t bitdex; | |
638 | pmap_t pvpmap = pv_h->pmap; | |
639 | vm_map_offset_t pvva = pv_h->va; | |
640 | boolean_t ppcd = FALSE; | |
641 | ||
642 | /* Ideally, we'd consult the Mach VM here to definitively determine | |
643 | * the nature of the mapping for this address space and address. | |
644 | * As that would be a layering violation in this context, we | |
645 | * use various heuristics to recover from single bit errors, | |
646 | * malformed pagetable entries etc. These are not intended | |
647 | * to be comprehensive. | |
648 | */ | |
649 | ||
650 | /* As a precautionary measure, mark A+D */ | |
651 | pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED); | |
652 | ||
653 | /* | |
654 | * Correct potential single bit errors in either (but not both) element | |
655 | * of the PV | |
656 | */ | |
657 | do { | |
658 | if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) || | |
659 | (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) { | |
660 | pv_e->pmap = pmap; | |
661 | pv_e->va = vaddr; | |
662 | suppress_reason = PV_BITFLIP; | |
663 | action = PMAP_ACTION_RETRY; | |
664 | goto pmap_cpc_exit; | |
665 | } | |
316670eb | 666 | } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h)); |
b7266188 A |
667 | |
668 | /* Discover root entries with a Hamming | |
669 | * distance of 1 from the supplied | |
670 | * physical page frame. | |
671 | */ | |
672 | for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) { | |
673 | ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); | |
674 | if (IS_MANAGED_PAGE(npn)) { | |
675 | pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); | |
676 | if (npv_h->va == vaddr && npv_h->pmap == pmap) { | |
677 | suppress_reason = PTE_BITFLIP; | |
678 | suppress_ppn = npn; | |
679 | action = PMAP_ACTION_RETRY_RELOCK; | |
680 | UNLOCK_PVH(ppn_to_pai(ppn)); | |
681 | *ppnp = npn; | |
682 | goto pmap_cpc_exit; | |
683 | } | |
684 | } | |
685 | } | |
686 | ||
687 | if (pmap == kernel_pmap) { | |
688 | action = PMAP_ACTION_ASSERT; | |
689 | goto pmap_cpc_exit; | |
690 | } | |
691 | ||
692 | /* Check for malformed/inconsistent entries */ | |
693 | ||
694 | if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) { | |
695 | action = PMAP_ACTION_IGNORE; | |
696 | suppress_reason = PTE_INVALID_CACHEABILITY; | |
697 | } | |
698 | else if (cpte & INTEL_PTE_RSVD) { | |
699 | action = PMAP_ACTION_IGNORE; | |
700 | suppress_reason = PTE_RSVD; | |
701 | } | |
702 | else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) { | |
703 | action = PMAP_ACTION_IGNORE; | |
704 | suppress_reason = PTE_SUPERVISOR; | |
705 | } | |
706 | pmap_cpc_exit: | |
707 | PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd)); | |
708 | ||
709 | if (debug_boot_arg && !ppcd) { | |
710 | action = PMAP_ACTION_ASSERT; | |
711 | } | |
712 | ||
713 | if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { | |
714 | action = PMAP_ACTION_ASSERT; | |
715 | pmap_pagetable_corruption_timeout = TRUE; | |
716 | } | |
717 | else | |
718 | { | |
719 | pmap_pagetable_corruption_last_abstime = mach_absolute_time(); | |
720 | } | |
721 | pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); | |
722 | return action; | |
723 | } | |
6d2010ae | 724 | |
b7266188 A |
/*
 *	Remove pv list entry.
 *	Called with pv_head_table entry locked.
 *	Returns pv entry to be freed (or NULL).
 *
 *	pmap, vaddr:	the mapping whose pv entry is to be removed.
 *	ppnp:		in/out physical page number. When an inconsistency is
 *			detected, pmap_classify_pagetable_corruption() may
 *			store a different page number through this pointer
 *			before a retry is requested.
 *	pte:		pointer to the PTE; in this routine it is only handed
 *			to the corruption classifier and dereferenced for
 *			panic messages.
 *
 *	NULL is returned when the removed mapping lived in the rooted entry
 *	with nothing queued behind it, or when the classifier chose to ignore
 *	an inconsistency.
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	/* Re-read the page number on every pass: a retry may follow a change to *ppnp. */
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	/* The rooted entry is empty: no mapping is recorded for this page at all. */
	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			/*
			 * The classifier's bit-flip path drops the PV head
			 * lock of the old page and stores the replacement
			 * page number in *ppnp, so lock the replacement page
			 * before retrying. It is also marked modified and
			 * referenced (presumably as the conservative choice,
			 * since its real state is unknown -- confirm).
			 */
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		/* queue_next() yielding the head itself means nothing is queued. */
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			/* Promote the queued mapping into the rooted entry. */
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			    pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		/*
		 * Walk the bucket; pprevh trails one link behind so the
		 * matching entry can be spliced out below.
		 */
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		/* Ran off the end of the chain: the mapping is not in the hash. */
		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				/* Every non-panic outcome leaves this path, so drop the bucket lock first. */
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					/* Same relock-and-mark sequence as the empty-root case above. */
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					/* pvh_e is NULL here, so the caller gets nothing to free. */
					goto pmap_pv_remove_exit;
				}
			}
		}

		/* Update hash-walk statistics, then unlink from the hash chain and the ppn queue. */
		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}
853 | ||
6d2010ae A |
854 | |
855 | extern int pt_fake_zone_index; | |
856 | static inline void | |
316670eb | 857 | PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) |
6d2010ae A |
858 | { |
859 | thread_t thr = current_thread(); | |
860 | task_t task; | |
861 | zinfo_usage_t zinfo; | |
862 | ||
316670eb A |
863 | pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); |
864 | ||
6d2010ae A |
865 | if (pt_fake_zone_index != -1 && |
866 | (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) | |
867 | OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc); | |
868 | } | |
869 | ||
870 | static inline void | |
316670eb | 871 | PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) |
6d2010ae A |
872 | { |
873 | thread_t thr = current_thread(); | |
874 | task_t task; | |
875 | zinfo_usage_t zinfo; | |
876 | ||
316670eb A |
877 | pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes); |
878 | ||
6d2010ae A |
879 | if (pt_fake_zone_index != -1 && |
880 | (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) | |
881 | OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free); | |
882 | } | |
883 | ||
316670eb A |
884 | static inline void |
885 | PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes) | |
886 | { | |
887 | pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes); | |
888 | } | |
889 | ||
890 | static inline void | |
891 | PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes) | |
892 | { | |
893 | pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes); | |
894 | } | |
895 | ||
6d2010ae A |
896 | extern boolean_t pmap_initialized;/* Has pmap_init completed? */ |
897 | #define valid_page(x) (pmap_initialized && pmap_valid_page(x)) | |
898 | ||
899 | // XXX | |
900 | #define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? */ | |
901 | // XXX | |
902 | ||
903 | ||
904 | int phys_attribute_test( | |
905 | ppnum_t phys, | |
906 | int bits); | |
907 | void phys_attribute_clear( | |
908 | ppnum_t phys, | |
909 | int bits); | |
910 | ||
911 | //#define PCID_DEBUG 1 | |
912 | #if PCID_DEBUG | |
913 | #define pmap_pcid_log(fmt, args...) \ | |
914 | do { \ | |
915 | kprintf(fmt, ##args); \ | |
916 | printf(fmt, ##args); \ | |
917 | } while(0) | |
918 | #else | |
919 | #define pmap_pcid_log(fmt, args...) | |
920 | #endif | |
921 | void pmap_pcid_configure(void); | |
922 | ||
316670eb A |
923 | |
924 | /* | |
925 | * Atomic 64-bit compare and exchange of a page table entry. | |
926 | */ | |
927 | static inline boolean_t | |
928 | pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) | |
929 | { | |
930 | boolean_t ret; | |
931 | ||
932 | #ifdef __i386__ | |
933 | /* | |
934 | * Load the old value into %edx:%eax | |
935 | * Load the new value into %ecx:%ebx | |
936 | * Compare-exchange-8bytes at address entryp (loaded in %edi) | |
937 | * If the compare succeeds, the new value is stored, return TRUE. | |
938 | * Otherwise, no swap is made, return FALSE. | |
939 | */ | |
940 | asm volatile( | |
941 | " lock; cmpxchg8b (%1) \n\t" | |
942 | " setz %%al \n\t" | |
943 | " movzbl %%al,%0" | |
944 | : "=a" (ret) | |
945 | : "D" (entryp), | |
946 | "a" ((uint32_t)old), | |
947 | "d" ((uint32_t)(old >> 32)), | |
948 | "b" ((uint32_t)new), | |
949 | "c" ((uint32_t)(new >> 32)) | |
950 | : "memory"); | |
951 | #else | |
952 | /* | |
953 | * Load the old value into %rax | |
954 | * Load the new value into another register | |
955 | * Compare-exchange-quad at address entryp | |
956 | * If the compare succeeds, the new value is stored, return TRUE. | |
957 | * Otherwise, no swap is made, return FALSE. | |
958 | */ | |
959 | asm volatile( | |
960 | " lock; cmpxchgq %2,(%3) \n\t" | |
961 | " setz %%al \n\t" | |
962 | " movzbl %%al,%0" | |
963 | : "=a" (ret) | |
964 | : "a" (old), | |
965 | "r" (new), | |
966 | "r" (entryp) | |
967 | : "memory"); | |
968 | #endif | |
969 | return ret; | |
970 | } | |
971 | ||
972 | extern uint32_t pmap_update_clear_pte_count; | |
973 | ||
974 | static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) { | |
975 | pt_entry_t npte, opte; | |
976 | do { | |
977 | opte = *mptep; | |
978 | if (__improbable(opte == 0)) { | |
979 | pmap_update_clear_pte_count++; | |
980 | break; | |
981 | } | |
982 | npte = opte & ~(pclear_bits); | |
983 | npte |= pset_bits; | |
984 | } while (!pmap_cmpx_pte(mptep, opte, npte)); | |
985 | } | |
986 | ||
6d2010ae A |
987 | #if defined(__x86_64__) |
988 | /* | |
989 | * The single pml4 page per pmap is allocated at pmap create time and exists | |
990 | * for the duration of the pmap. we allocate this page in kernel vm. | |
991 | * this returns the address of the requested pml4 entry in the top level page. | |
992 | */ | |
993 | static inline | |
994 | pml4_entry_t * | |
995 | pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) | |
996 | { | |
316670eb A |
997 | if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && |
998 | (vaddr < 0xFFFF800000000000ULL))) { | |
999 | return (NULL); | |
1000 | } | |
1001 | ||
6d2010ae A |
1002 | #if PMAP_ASSERT |
1003 | return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); | |
1004 | #else | |
1005 | return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; | |
1006 | #endif | |
1007 | } | |
1008 | ||
1009 | /* | |
1010 | * Returns address of requested PDPT entry in the physmap. | |
1011 | */ | |
1012 | static inline pdpt_entry_t * | |
1013 | pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) | |
1014 | { | |
1015 | pml4_entry_t newpf; | |
1016 | pml4_entry_t *pml4; | |
1017 | ||
6d2010ae A |
1018 | pml4 = pmap64_pml4(pmap, vaddr); |
1019 | if (pml4 && ((*pml4 & INTEL_PTE_VALID))) { | |
1020 | newpf = *pml4 & PG_FRAME; | |
1021 | return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) | |
1022 | [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; | |
1023 | } | |
1024 | return (NULL); | |
1025 | } | |
1026 | /* | |
1027 | * Returns the address of the requested PDE entry in the physmap. | |
1028 | */ | |
1029 | static inline pd_entry_t * | |
1030 | pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) | |
1031 | { | |
1032 | pdpt_entry_t newpf; | |
1033 | pdpt_entry_t *pdpt; | |
1034 | ||
6d2010ae A |
1035 | pdpt = pmap64_pdpt(pmap, vaddr); |
1036 | ||
1037 | if (pdpt && ((*pdpt & INTEL_PTE_VALID))) { | |
1038 | newpf = *pdpt & PG_FRAME; | |
1039 | return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) | |
1040 | [(vaddr >> PDSHIFT) & (NPDPG-1)]; | |
1041 | } | |
1042 | return (NULL); | |
1043 | } | |
1044 | ||
1045 | static inline pd_entry_t * | |
1046 | pmap_pde(pmap_t m, vm_map_offset_t v) | |
1047 | { | |
1048 | pd_entry_t *pde; | |
1049 | ||
6d2010ae A |
1050 | pde = pmap64_pde(m, v); |
1051 | ||
1052 | return pde; | |
1053 | } | |
1054 | ||
1055 | ||
1056 | /* | |
1057 | * return address of mapped pte for vaddr va in pmap pmap. | |
1058 | * | |
1059 | * In case the pde maps a superpage, return the pde, which, in this case | |
1060 | * is the actual page table entry. | |
1061 | */ | |
1062 | static inline pt_entry_t * | |
1063 | pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) | |
1064 | { | |
1065 | pd_entry_t *pde; | |
1066 | pd_entry_t newpf; | |
1067 | ||
1068 | assert(pmap); | |
316670eb | 1069 | pde = pmap64_pde(pmap, vaddr); |
6d2010ae A |
1070 | |
1071 | if (pde && ((*pde & INTEL_PTE_VALID))) { | |
1072 | if (*pde & INTEL_PTE_PS) | |
1073 | return pde; | |
1074 | newpf = *pde & PG_FRAME; | |
1075 | return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) | |
1076 | [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; | |
1077 | } | |
1078 | return (NULL); | |
1079 | } | |
1080 | #endif | |
316670eb A |
1081 | #if DEBUG |
1082 | #define DPRINTF(x...) kprintf(x) | |
1083 | #else | |
1084 | #define DPRINTF(x...) | |
1085 | #endif | |
1086 | ||
b0d623f7 | 1087 | #endif /* MACH_KERNEL_PRIVATE */ |
316670eb | 1088 | #endif /* _I386_PMAP_INTERNAL_ */ |