/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/debug.h>

#ifdef MACH_KERNEL_PRIVATE

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {               \
        simple_lock(&(pmap)->lock);     \
}

#define PMAP_UNLOCK(pmap) {             \
        simple_unlock(&(pmap)->lock);   \
}


#define PMAP_UPDATE_TLBS(pmap, s, e)    \
        pmap_flush_tlbs(pmap)

#define iswired(pte)    ((pte) & INTEL_PTE_WIRED)

#ifdef PMAP_TRACES
extern boolean_t        pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)                         \
        if (pmap_trace) {                               \
                KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);     \
        }
#else
#define PMAP_TRACE(x,a,b,c,d,e)         KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

void            pmap_expand_pml4(
                        pmap_t          map,
                        vm_map_offset_t v);

void            pmap_expand_pdpt(
                        pmap_t          map,
                        vm_map_offset_t v);
extern void     pmap_flush_tlbs(pmap_t pmap);

#if defined(__x86_64__)
extern const boolean_t cpu_64bit;
#else
extern boolean_t cpu_64bit;
#endif

/*
 * Private data structures.
 */

/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page. An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B. with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains, as there is no
 * way to unlink the singly linked hashed entries when navigated to
 * via the queue list off the rooted entries. Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list. All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings. These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the previous
implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry_t    next;
    pmap_t               pmap;
    vm_map_offset_t      va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of memory,
indexed by the physical page number. Additionally, a pool of entries is created from a
pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
Originally, we kept this pool around because the code in pmap_enter() was unable to
block if it needed an entry and none were available - we'd panic. Some time ago I
restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
a pv structure and restart, removing a panic from the code (in the case of the kernel
pmap we cannot block and still panic, so we keep a separate hot pool for use only on
kernel pmaps). The pool has not been removed since there is a large performance gain
from keeping freed pv's around for reuse, rather than suffering the overhead of zalloc
for every new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the fixed
pv array for that ppn (off the next pointer). These pve's are accessed for several
operations, one of them being address space teardown. In that case, we basically do this

        for (every page/pte in the space) {
                calc pve_ptr from the ppn in the pte
                for (every pv in the list for the ppn) {
                        if (this pv is for this pmap/vaddr) {
                                do housekeeping
                                unlink/free the pv
                        }
                }
        }

The problem arose when we were running, say, 8000 (or even 2000) apache or other processes
and one or all terminate. The list hanging off each pv array entry could have thousands of
entries. We were continuously linearly searching each of these lists as we stepped through
the address space we were tearing down. Because of the locks we hold, likely taking a cache
miss for each node, and interrupt disabling for MP issues, the system became completely
unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
for operations like pmap_page_protect and finding and modifying/removing a single pve as
part of pmap_enter processing) has led to modifying the pve structures and databases.

There are now two types of pve structures. A "rooted" structure, which is basically the
original structure accessed in an array by ppn, and a "hashed" structure accessed on a
hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
pages in the system are not aliased and hence represented by a single pv entry, I've kept
the rooted entry size as small as possible because there is one of these dedicated for
every physical page of memory. The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find the entry we
are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
will pay the extra memory price. Both structures have the same first three fields, allowing
some simplification in the code.

They have these shapes

typedef struct pv_rooted_entry {
        queue_head_t            qlink;
        vm_map_offset_t         va;
        pmap_t                  pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
        queue_head_t            qlink;
        vm_map_offset_t         va;
        pmap_t                  pmap;
        ppnum_t                 ppn;
        struct pv_hashed_entry  *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and the hashed
entries. Code that runs the pv list still starts with the rooted entry and then continues
down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
checks the rooted entry and then hashes and runs the hash list for the match. The hash list
lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.

*/
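
/*
 * Illustrative summary of the lookup path described above (a restatement of
 * the notes, not additional machinery): code looking for the mapping
 * [pmap, va] on a given ppn first checks the rooted entry and only then
 * consults the hash.
 *
 *      pv_h = pai_to_pvh(ppn_to_pai(ppn));     // rooted entry for the page
 *      if (pv_h->pmap == pmap && pv_h->va == va)
 *              the rooted entry is the match;
 *      else
 *              walk *pvhash(pvhashidx(pmap, va)) via ->nexth, matching
 *              on pmap, va and ppn (see pmap_pv_remove() below).
 */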

typedef struct pv_rooted_entry {        /* first three entries must match pv_hashed_entry_t */
        queue_head_t            qlink;
        vm_map_offset_t         va;     /* virtual address for mapping */
        pmap_t                  pmap;   /* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL    ((pv_rooted_entry_t) 0)


typedef struct pv_hashed_entry {        /* first three entries must match pv_rooted_entry_t */
        queue_head_t            qlink;
        vm_map_offset_t         va;
        pmap_t                  pmap;
        ppnum_t                 ppn;
        struct pv_hashed_entry  *nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL    ((pv_hashed_entry_t) 0)

/* #define PV_DEBUG 1   uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH()   if (0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH                         4095    /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK        5000
#define PV_HASHED_KERN_LOW_WATER_MARK   400
#define PV_HASHED_ALLOC_CHUNK           2000
#define PV_HASHED_KERN_ALLOC_CHUNK      200

#define PV_HASHED_ALLOC(pvh_e) {                                                \
        simple_lock(&pv_hashed_free_list_lock);                                 \
        if ((pvh_e = pv_hashed_free_list) != 0) {                               \
                pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next;     \
                pv_hashed_free_count--;                                         \
                if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK)            \
                        if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
                                thread_call_enter(mapping_adjust_call);         \
        }                                                                       \
        simple_unlock(&pv_hashed_free_list_lock);                               \
}

#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {                           \
        simple_lock(&pv_hashed_free_list_lock);                                 \
        pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;                \
        pv_hashed_free_list = pvh_eh;                                           \
        pv_hashed_free_count += pv_cnt;                                         \
        simple_unlock(&pv_hashed_free_list_lock);                               \
}

#define PV_HASHED_KERN_ALLOC(pvh_e) {                                           \
        simple_lock(&pv_hashed_kern_free_list_lock);                            \
        if ((pvh_e = pv_hashed_kern_free_list) != 0) {                          \
                pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
                pv_hashed_kern_free_count--;                                    \
                if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK)  \
                        if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
                                thread_call_enter(mapping_adjust_call);         \
        }                                                                       \
        simple_unlock(&pv_hashed_kern_free_list_lock);                          \
}

#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {                      \
        simple_lock(&pv_hashed_kern_free_list_lock);                            \
        pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;           \
        pv_hashed_kern_free_list = pvh_eh;                                      \
        pv_hashed_kern_free_count += pv_cnt;                                    \
        simple_unlock(&pv_hashed_kern_free_list_lock);                          \
}
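
/*
 * Sketch of the intended allocation pattern (an assumption based on the
 * PV HASHING notes above, not a definition in this header): callers such as
 * pmap_enter() try the free list first and fall back to the zone, blocking
 * only for user pmaps; the kernel pmap uses the "hot" kernel pool.
 *
 *      pv_hashed_entry_t pvh_e;
 *
 *      PV_HASHED_ALLOC(pvh_e);
 *      if (PV_HASHED_ENTRY_NULL == pvh_e) {
 *              if (pmap == kernel_pmap)
 *                      PV_HASHED_KERN_ALLOC(pvh_e);
 *              else
 *                      pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *      }
 */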

/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)            (i386_btop(pa))
#define ppn_to_pai(ppn)         ((int)ppn)

#define pai_to_pvh(pai)         (&pv_head_table[pai])
#define lock_pvh_pai(pai)       bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)     bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)             (&pv_hash_table[idx])

#define lock_hash_hash(hash)    bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)  bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)                              \
        ((unsigned int)(x) <= last_managed_page &&      \
         (pmap_phys_attributes[x] & PHYS_MANAGED))

/*
 * Physical page attributes. Copy bits from PTE definition.
 */
#define PHYS_MODIFIED   INTEL_PTE_MOD   /* page modified */
#define PHYS_REFERENCED INTEL_PTE_REF   /* page referenced */
#define PHYS_MANAGED    INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT  INTEL_PTE_USER  /* no need to encrypt this page in the hibernation image */

/*
 * Amount of virtual memory mapped by one
 * page-directory entry.
 */
#define PDE_MAPPED_SIZE         (pdetova(1))


/*
 * Locking and TLB invalidation
 */

/*
 * Locking Protocols: (changed 2/2007 JK)
 *
 * There are two structures in the pmap module that need locking:
 * the pmaps themselves, and the per-page pv_lists (which are locked
 * by locking the pv_lock_table entry that corresponds to the pv_head
 * for the list in question.) Most routines want to lock a pmap and
 * then do operations in it that require pv_list locking -- however
 * pmap_remove_all and pmap_copy_on_write operate on a physical page
 * basis and want to do the locking in the reverse order, i.e. lock
 * a pv_list and then go through all the pmaps referenced by that list.
 *
 * The system wide pmap lock has been removed. Now, paths take a lock
 * on the pmap before changing its 'shape' and the reverse order lockers
 * (coming in by phys ppn) take a lock on the corresponding pv and then
 * retest to be sure nothing changed during the window before they locked
 * and can then run up/down the pv lists holding the list lock. This also
 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
 * previously.
 */
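
/*
 * A minimal sketch of the two lock orders just described (assumed usage,
 * included only to make the protocol concrete; LOCK_PVH/UNLOCK_PVH are
 * defined below):
 *
 *   pmap-first path (e.g. pmap_enter):
 *      PMAP_LOCK(pmap);
 *      LOCK_PVH(pai);   ... edit the pv list ...   UNLOCK_PVH(pai);
 *      PMAP_UNLOCK(pmap);
 *
 *   physical-page-first path (e.g. pmap_page_protect):
 *      LOCK_PVH(pai);
 *      retest that the mapping is still what we expect (it may have changed
 *      before the lock was taken), then run up/down the pv list;
 *      UNLOCK_PVH(pai);
 */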

/*
 * PV locking
 */

#define LOCK_PVH(index) {               \
        mp_disable_preemption();        \
        lock_pvh_pai(index);            \
}

#define UNLOCK_PVH(index) {             \
        unlock_pvh_pai(index);          \
        mp_enable_preemption();         \
}
/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)      lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)    unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t        *pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t        pv_hashed_free_list;
extern pv_hashed_entry_t        pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t           pv_hashed_list_zone;    /* zone of pv_hashed_entry structures */

extern int              pv_hashed_free_count;
extern int              pv_hashed_kern_free_count;
#define pv_lock_table_size(n)           (((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)      (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char             *pv_lock_table;         /* pointer to array of bits */

extern char             *pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;         /* array of entries, one per page */
extern uint64_t pde_mapped_size;

extern char             *pmap_phys_attributes;
extern unsigned int     last_managed_page;

/*
 * When spinning through pmap_remove, ensure that we don't spend too much
 * time with preemption disabled. I'm setting the current threshold to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;
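
/*
 * Sketch of how the threshold is consumed (an assumption about the pmap.c
 * side, shown for context only): long-running loops such as pmap_remove()
 * compare TSC deltas against max_preemption_latency_tsc and briefly drop
 * and retake the pmap lock so preemption is not held off for much more
 * than ~20us.
 *
 *      uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;
 *      ...
 *      if (rdtsc64() > deadline) {
 *              PMAP_UNLOCK(pmap);
 *              PMAP_LOCK(pmap);
 *              deadline = rdtsc64() + max_preemption_latency_tsc;
 *      }
 */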

/* #define DEBUGINTERRUPTS 1   uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {                                                    \
        if (processor_avail_count > 1 && !ml_get_interrupts_enabled())          \
                panic("pmap interrupt assert %s, %d", __FILE__, __LINE__);      \
}
#else
#define pmap_intr_assert()
#endif

extern int              nx_enabled;
extern unsigned int     inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
        return ((uint32_t)(uintptr_t)pmap ^
                ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
               npvhash;
}
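
/*
 * Usage sketch (mirrors pv_hash_add()/pv_hash_remove() below): the index
 * selects a bucket in pv_hash_table, which must only be dereferenced with
 * the corresponding hash-chain lock held.
 *
 *      int idx = pvhashidx(pmap, va);
 *
 *      LOCK_PV_HASH(idx);
 *      pv_hashed_entry_t *bucket = pvhash(idx);
 *      ... link/unlink entries via ->nexth ...
 *      UNLOCK_PV_HASH(idx);
 */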

/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */

static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
        pv_hashed_entry_t       curh;
        pv_hashed_entry_t       *pprevh;
        int                     pvhash_idx;

        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

        pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
        if (NULL == *pprevh)
                panic("pvh_unlink null anchor");        /* JK DEBUG */
#endif
        curh = *pprevh;

        while (PV_HASHED_ENTRY_NULL != curh) {
                if (pvh == curh)
                        break;
                pprevh = &curh->nexth;
                curh = curh->nexth;
        }
        if (PV_HASHED_ENTRY_NULL == curh)
                panic("pmap_pvh_unlink no pvh");
        *pprevh = pvh->nexth;
        return;
}

static inline void
pv_hash_add(pv_hashed_entry_t   pvh_e,
            pv_rooted_entry_t   pv_h)
{
        pv_hashed_entry_t       *hashp;
        int                     pvhash_idx;

        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
        LOCK_PV_HASH(pvhash_idx);
        insque(&pvh_e->qlink, &pv_h->qlink);
        hashp = pvhash(pvhash_idx);
#if PV_DEBUG
        if (NULL == hashp)
                panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
        pvh_e->nexth = *hashp;
        *hashp = pvh_e;
        UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
        int pvhash_idx;

        CHK_NPVHASH();
        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
        LOCK_PV_HASH(pvhash_idx);
        remque(&pvh_e->qlink);
        pmap_pvh_unlink(pvh_e);
        UNLOCK_PV_HASH(pvhash_idx);
}

/*
 * TRUE if the argument has at most one bit set; used to detect values
 * (typically XOR differences) that are within a single bit flip of zero.
 */
static inline boolean_t
popcnt1(uint64_t distance)
{
        return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field. These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
        PTE_VALID               = 0x0,
        PTE_INVALID             = 0x1,
        PTE_RSVD                = 0x2,
        PTE_SUPERVISOR          = 0x4,
        PTE_BITFLIP             = 0x8,
        PV_BITFLIP              = 0x10,
        PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
        ROOT_PRESENT = 0,
        ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
        PMAP_ACTION_IGNORE      = 0x0,
        PMAP_ACTION_ASSERT      = 0x1,
        PMAP_ACTION_RETRY       = 0x2,
        PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
        pmap_pv_assertion_t             incident;
        pmap_pagetable_corruption_t     reason;
        pmap_pagetable_corruption_action_t action;
        pmap_t                          pmap;
        vm_map_offset_t                 vaddr;
        pt_entry_t                      pte;
        ppnum_t                         ppn;
        pmap_t                          pvpmap;
        vm_map_offset_t                 pvva;
        uint64_t                        abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t    pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason,
    pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep,
    ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva)
{
        uint32_t pmap_pagetable_corruption_log_index;

        pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
        /* Asynchronously log */
        thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep,
    pmap_pv_assertion_t incident)
{
        pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
        pmap_pagetable_corruption_t    suppress_reason = PTE_VALID;
        ppnum_t                 suppress_ppn = 0;
        pt_entry_t              cpte = *ptep;
        ppnum_t                 cpn = pa_index(pte_to_pa(cpte));
        ppnum_t                 ppn = *ppnp;
        pv_rooted_entry_t       pv_h = pai_to_pvh(ppn_to_pai(ppn));
        pv_rooted_entry_t       pv_e = pv_h;
        uint32_t                bitdex;
        pmap_t                  pvpmap = pv_h->pmap;
        vm_map_offset_t         pvva = pv_h->va;
        boolean_t               ppcd = FALSE;

        /* Ideally, we'd consult the Mach VM here to definitively determine
         * the nature of the mapping for this address space and address.
         * As that would be a layering violation in this context, we
         * use various heuristics to recover from single bit errors,
         * malformed pagetable entries etc. These are not intended
         * to be comprehensive.
         */

        /* As a precautionary measure, mark A+D */
        pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

        /*
         * Correct potential single bit errors in either (but not both) element
         * of the PV
         */
        do {
                if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
                    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
                        pv_e->pmap = pmap;
                        pv_e->va = vaddr;
                        suppress_reason = PV_BITFLIP;
                        action = PMAP_ACTION_RETRY;
                        goto pmap_cpc_exit;
                }
        } while ((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);

        /* Discover root entries with a Hamming
         * distance of 1 from the supplied
         * physical page frame.
         */
        for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
                ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
                if (IS_MANAGED_PAGE(npn)) {
                        pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
                        if (npv_h->va == vaddr && npv_h->pmap == pmap) {
                                suppress_reason = PTE_BITFLIP;
                                suppress_ppn = npn;
                                action = PMAP_ACTION_RETRY_RELOCK;
                                UNLOCK_PVH(ppn_to_pai(ppn));
                                *ppnp = npn;
                                goto pmap_cpc_exit;
                        }
                }
        }

        if (pmap == kernel_pmap) {
                action = PMAP_ACTION_ASSERT;
                goto pmap_cpc_exit;
        }

        /* Check for malformed/inconsistent entries */

        if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
                action = PMAP_ACTION_IGNORE;
                suppress_reason = PTE_INVALID_CACHEABILITY;
        }
        else if (cpte & INTEL_PTE_RSVD) {
                action = PMAP_ACTION_IGNORE;
                suppress_reason = PTE_RSVD;
        }
        else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
                action = PMAP_ACTION_IGNORE;
                suppress_reason = PTE_SUPERVISOR;
        }
pmap_cpc_exit:
        PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

        if (debug_boot_arg && !ppcd) {
                action = PMAP_ACTION_ASSERT;
        }

        if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
                action = PMAP_ACTION_ASSERT;
                pmap_pagetable_corruption_timeout = TRUE;
        }
        else {
                pmap_pagetable_corruption_last_abstime = mach_absolute_time();
        }
        pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
        return action;
}
/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */

static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t           pmap,
               vm_map_offset_t  vaddr,
               ppnum_t          *ppnp,
               pt_entry_t       *pte)
{
        pv_hashed_entry_t       pvh_e;
        pv_rooted_entry_t       pv_h;
        pv_hashed_entry_t       *pprevh;
        int                     pvhash_idx;
        uint32_t                pv_cnt;
        ppnum_t                 ppn;

pmap_pv_remove_retry:
        ppn = *ppnp;
        pvh_e = PV_HASHED_ENTRY_NULL;
        pv_h = pai_to_pvh(ppn_to_pai(ppn));

        if (pv_h->pmap == PMAP_NULL) {
                pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
                if (pac == PMAP_ACTION_IGNORE)
                        goto pmap_pv_remove_exit;
                else if (pac == PMAP_ACTION_ASSERT)
                        panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
                else if (pac == PMAP_ACTION_RETRY_RELOCK) {
                        LOCK_PVH(ppn_to_pai(*ppnp));
                        pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
                        goto pmap_pv_remove_retry;
                }
                else if (pac == PMAP_ACTION_RETRY)
                        goto pmap_pv_remove_retry;
        }

        if (pv_h->va == vaddr && pv_h->pmap == pmap) {
                /*
                 * Header is the pv_rooted_entry.
                 * We can't free that. If there is a queued
                 * entry after this one we remove that
                 * from the ppn queue, we remove it from the hash chain
                 * and copy it to the rooted entry. Then free it instead.
                 */
                pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
                if (pv_h != (pv_rooted_entry_t) pvh_e) {
                        /*
                         * Entry queued to root, remove this from hash
                         * and install as new root.
                         */
                        CHK_NPVHASH();
                        pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
                        LOCK_PV_HASH(pvhash_idx);
                        remque(&pvh_e->qlink);
                        pprevh = pvhash(pvhash_idx);
                        if (PV_HASHED_ENTRY_NULL == *pprevh) {
                                panic("pmap_pv_remove(%p,0x%llx,0x%x): "
                                      "empty hash, removing rooted",
                                      pmap, vaddr, ppn);
                        }
                        pmap_pvh_unlink(pvh_e);
                        UNLOCK_PV_HASH(pvhash_idx);
                        pv_h->pmap = pvh_e->pmap;
                        pv_h->va = pvh_e->va;   /* dispose of pvh_e */
                } else {
                        /* none queued after rooted */
                        pv_h->pmap = PMAP_NULL;
                        pvh_e = PV_HASHED_ENTRY_NULL;
                }
        } else {
                /*
                 * not removing rooted pv. find it on hash chain, remove from
                 * ppn queue and hash chain and free it
                 */
                CHK_NPVHASH();
                pvhash_idx = pvhashidx(pmap, vaddr);
                LOCK_PV_HASH(pvhash_idx);
                pprevh = pvhash(pvhash_idx);
                if (PV_HASHED_ENTRY_NULL == *pprevh) {
                        panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn);
                }
                pvh_e = *pprevh;
                pmap_pv_hashlist_walks++;
                pv_cnt = 0;
                while (PV_HASHED_ENTRY_NULL != pvh_e) {
                        pv_cnt++;
                        if (pvh_e->pmap == pmap &&
                            pvh_e->va == vaddr &&
                            pvh_e->ppn == ppn)
                                break;
                        pprevh = &pvh_e->nexth;
                        pvh_e = pvh_e->nexth;
                }
                if (PV_HASHED_ENTRY_NULL == pvh_e) {
                        pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

                        if (pac == PMAP_ACTION_ASSERT)
                                panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
                        else {
                                UNLOCK_PV_HASH(pvhash_idx);
                                if (pac == PMAP_ACTION_RETRY_RELOCK) {
                                        LOCK_PVH(ppn_to_pai(*ppnp));
                                        pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
                                        goto pmap_pv_remove_retry;
                                }
                                else if (pac == PMAP_ACTION_RETRY) {
                                        goto pmap_pv_remove_retry;
                                }
                                else if (pac == PMAP_ACTION_IGNORE) {
                                        goto pmap_pv_remove_exit;
                                }
                        }
                }
                pmap_pv_hashlist_cnts += pv_cnt;
                if (pmap_pv_hashlist_max < pv_cnt)
                        pmap_pv_hashlist_max = pv_cnt;
                *pprevh = pvh_e->nexth;
                remque(&pvh_e->qlink);
                UNLOCK_PV_HASH(pvhash_idx);
        }
pmap_pv_remove_exit:
        return pvh_e;
}

#endif /* MACH_KERNEL_PRIVATE */