git.saurik.com Git - apple/xnu.git/blob - osfmk/x86_64/pmap.c
ce64b82dbf1486c193fa2f6e54166b4de75cc8d7
1 /*
2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/queue.h>
102 #include <kern/ledger.h>
103 #include <kern/mach_param.h>
104
105 #include <kern/kalloc.h>
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_kern.h>
111 #include <mach/vm_param.h>
112 #include <mach/vm_prot.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115
116 #include <mach/machine/vm_param.h>
117 #include <machine/thread.h>
118
119 #include <kern/misc_protos.h> /* prototyping */
120 #include <i386/misc_protos.h>
121 #include <i386/i386_lowmem.h>
122 #include <x86_64/lowglobals.h>
123
124 #include <i386/cpuid.h>
125 #include <i386/cpu_data.h>
126 #include <i386/cpu_number.h>
127 #include <i386/machine_cpu.h>
128 #include <i386/seg.h>
129 #include <i386/serial_io.h>
130 #include <i386/cpu_capabilities.h>
131 #include <i386/machine_routines.h>
132 #include <i386/proc_reg.h>
133 #include <i386/tsc.h>
134 #include <i386/pmap_internal.h>
135 #include <i386/pmap_pcid.h>
136 #if CONFIG_VMX
137 #include <i386/vmx/vmx_cpu.h>
138 #endif
139
140 #include <vm/vm_protos.h>
141 #include <san/kasan.h>
142
143 #include <i386/mp.h>
144 #include <i386/mp_desc.h>
145 #include <libkern/kernel_mach_header.h>
146
147 #include <pexpert/i386/efi.h>
148
149 #if MACH_ASSERT
150 int pmap_stats_assert = 1;
151 #endif /* MACH_ASSERT */
152
153 #ifdef IWANTTODEBUG
154 #undef DEBUG
155 #define DEBUG 1
156 #define POSTCODE_DELAY 1
157 #include <i386/postcode.h>
158 #endif /* IWANTTODEBUG */
159
160 #ifdef PMAP_DEBUG
161 #define DBG(x...) kprintf("DBG: " x)
162 #else
163 #define DBG(x...)
164 #endif
165 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
166 * in the trampolines for kernel/user boundary TLB coherency.
167 */
168 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
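/*
 * The declaration above uses the negative-array-size idiom: if the layout
 * condition is false, the array size evaluates to -1 and the build fails.
 * A minimal standalone sketch of the same trick (hypothetical struct, not
 * the real cpu_data_t layout):
 *
 *     #include <stddef.h>
 *     struct demo { long a; long b; };
 *     // Compiles only if 'b' immediately follows 'a'.
 *     char demo_layout_assert[
 *         (offsetof(struct demo, b) - offsetof(struct demo, a)) == sizeof(long)
 *             ? 1 : -1];
 */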
169 boolean_t pmap_trace = FALSE;
170
171 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
172
173 int nx_enabled = 1; /* enable no-execute protection -- set during boot */
174
175 #if DEBUG || DEVELOPMENT
176 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
177 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
178 #else /* DEBUG || DEVELOPMENT */
179 const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
180 const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
181 #endif /* DEBUG || DEVELOPMENT */
182
183 const boolean_t cpu_64bit = TRUE; /* Mais oui! */
184
185 uint64_t max_preemption_latency_tsc = 0;
186
187 pv_hashed_entry_t *pv_hash_table; /* hash lists */
188
189 uint32_t npvhashmask = 0, npvhashbuckets = 0;
190
191 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
192 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
193 decl_simple_lock_data(,pv_hashed_free_list_lock)
194 decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
195 decl_simple_lock_data(,pv_hash_table_lock)
196
197 decl_simple_lock_data(,phys_backup_lock)
198
199 zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
200
201 /*
202 * First and last physical addresses that we maintain any information
203 * for. Initialized to zero so that pmap operations done before
204 * pmap_init won't touch any non-existent structures.
205 */
206 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
207
208 static struct vm_object kptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
209 static struct vm_object kpml4obj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
210 static struct vm_object kpdptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
211
212 /*
213 * Array of physical page attributes for managed pages.
214 * One byte per physical page.
215 */
216 char *pmap_phys_attributes;
217 ppnum_t last_managed_page = 0;
218
219 /*
220 * Amount of virtual memory mapped by one
221 * page-directory entry.
222 */
223
224 uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
225
226 unsigned pmap_memory_region_count;
227 unsigned pmap_memory_region_current;
228
229 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
230
231 /*
232 * Other useful macros.
233 */
234 #define current_pmap() (vm_map_pmap(current_thread()->map))
235
236 struct pmap kernel_pmap_store;
237 pmap_t kernel_pmap;
238
239 struct zone *pmap_zone; /* zone of pmap structures */
240
241 struct zone *pmap_anchor_zone;
242 struct zone *pmap_uanchor_zone;
243 int pmap_debug = 0; /* flag for debugging prints */
244
245 unsigned int inuse_ptepages_count = 0;
246 long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
247 unsigned int bootstrap_wired_pages = 0;
248 int pt_fake_zone_index = -1;
249
250 extern long NMIPI_acks;
251
252 boolean_t kernel_text_ps_4K = TRUE;
253 boolean_t wpkernel = TRUE;
254
255 extern char end;
256
257 static int nkpt;
258
259 pt_entry_t *DMAP1, *DMAP2;
260 caddr_t DADDR1;
261 caddr_t DADDR2;
262
263 boolean_t pmap_disable_kheap_nx = FALSE;
264 boolean_t pmap_disable_kstack_nx = FALSE;
265
266 extern long __stack_chk_guard[];
267
268 static uint64_t pmap_eptp_flags = 0;
269 boolean_t pmap_ept_support_ad = FALSE;
270
271
272 /*
273 * Map memory at initialization. The physical addresses being
274 * mapped are not managed and are never unmapped.
275 *
276 * For now, VM is already on, we only need to map the
277 * specified memory.
278 */
279 vm_offset_t
280 pmap_map(
281 vm_offset_t virt,
282 vm_map_offset_t start_addr,
283 vm_map_offset_t end_addr,
284 vm_prot_t prot,
285 unsigned int flags)
286 {
287 kern_return_t kr;
288 int ps;
289
290 ps = PAGE_SIZE;
291 while (start_addr < end_addr) {
292 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
293 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
294
295 if (kr != KERN_SUCCESS) {
296 panic("%s: failed pmap_enter, "
297 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
298 __FUNCTION__,
299 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
300 }
301
302 virt += ps;
303 start_addr += ps;
304 }
305 return(virt);
306 }
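/*
 * A typical early-boot use is to lay down a linear kernel mapping over a
 * physical range and advance the VA cursor using the return value. A hedged
 * sketch (hypothetical addresses and cache flags, not an actual call site):
 *
 *     vm_offset_t va = region_kernel_va;         // hypothetical VA cursor
 *     va = pmap_map(va, phys_start, phys_end,    // maps [phys_start, phys_end)
 *                   VM_PROT_READ | VM_PROT_WRITE,
 *                   VM_WIMG_USE_DEFAULT);        // default cacheability
 */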
307
308 extern char *first_avail;
309 extern vm_offset_t virtual_avail, virtual_end;
310 extern pmap_paddr_t avail_start, avail_end;
311 extern vm_offset_t sHIB;
312 extern vm_offset_t eHIB;
313 extern vm_offset_t stext;
314 extern vm_offset_t etext;
315 extern vm_offset_t sdata, edata;
316 extern vm_offset_t sconst, econst;
317
318 extern void *KPTphys;
319
320 boolean_t pmap_smep_enabled = FALSE;
321 boolean_t pmap_smap_enabled = FALSE;
322
323 void
324 pmap_cpu_init(void)
325 {
326 cpu_data_t *cdp = current_cpu_datap();
327
328 set_cr4(get_cr4() | CR4_PGE);
329
330 /*
331 * Initialize the per-cpu, TLB-related fields.
332 */
333 cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
334 cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
335 cdp->cpu_tlb_invalid = FALSE;
336 cdp->cpu_task_map = TASK_MAP_64BIT;
337
338 pmap_pcid_configure();
339 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
340 pmap_smep_enabled = TRUE;
341 #if DEVELOPMENT || DEBUG
342 boolean_t nsmep;
343 if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
344 pmap_smep_enabled = FALSE;
345 }
346 #endif
347 if (pmap_smep_enabled) {
348 set_cr4(get_cr4() | CR4_SMEP);
349 }
350
351 }
352 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
353 pmap_smap_enabled = TRUE;
354 #if DEVELOPMENT || DEBUG
355 boolean_t nsmap;
356 if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
357 pmap_smap_enabled = FALSE;
358 }
359 #endif
360 if (pmap_smap_enabled) {
361 set_cr4(get_cr4() | CR4_SMAP);
362 }
363 }
364
365 #if !MONOTONIC
366 if (cdp->cpu_fixed_pmcs_enabled) {
367 boolean_t enable = TRUE;
368 cpu_pmc_control(&enable);
369 }
370 #endif /* !MONOTONIC */
371 }
372
373 static uint32_t pmap_scale_shift(void) {
374 uint32_t scale = 0;
375
376 if (sane_size <= 8*GB) {
377 scale = (uint32_t)(sane_size / (2 * GB));
378 } else if (sane_size <= 32*GB) {
379 scale = 4 + (uint32_t)((sane_size - (8 * GB))/ (4 * GB));
380 } else {
381 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB))/ (8 * GB)));
382 }
383 return scale;
384 }
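/*
 * pmap_scale_shift() sizes the PV hash with physical memory: when the
 * "npvhash" boot-arg is absent, pmap_bootstrap() computes
 * npvhashmask = (NPVHASHBUCKETS << pmap_scale_shift()) - 1.
 * Worked examples of the arithmetic above (illustrative, not measured):
 *
 *     sane_size =  4 GB -> scale = 4 / 2                      = 2
 *     sane_size = 16 GB -> scale = 4 + (16 - 8) / 4           = 6
 *     sane_size = 64 GB -> scale = 10 + MIN(4, (64 - 32) / 8) = 14
 */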
385
386 /*
387 * Bootstrap the system enough to run with virtual memory.
388 * Map the kernel's code and data, and allocate the system page table.
389 * Called with mapping OFF. Page_size must already be set.
390 */
391
392 void
393 pmap_bootstrap(
394 __unused vm_offset_t load_start,
395 __unused boolean_t IA32e)
396 {
397 #if NCOPY_WINDOWS > 0
398 vm_offset_t va;
399 int i;
400 #endif
401 assert(IA32e);
402
403 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
404 * known to VM */
405 /*
406 * The kernel's pmap is statically allocated so we don't
407 * have to use pmap_create, which is unlikely to work
408 * correctly at this part of the boot sequence.
409 */
410
411 kernel_pmap = &kernel_pmap_store;
412 kernel_pmap->ref_count = 1;
413 kernel_pmap->nx_enabled = TRUE;
414 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
415 kernel_pmap->pm_obj = (vm_object_t) NULL;
416 kernel_pmap->pm_pml4 = IdlePML4;
417 kernel_pmap->pm_upml4 = IdlePML4;
418 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
419 kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
420 kernel_pmap->pm_eptp = 0;
421
422 pmap_pcid_initialize_kernel(kernel_pmap);
423
424 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
425
426 nkpt = NKPT;
427 OSAddAtomic(NKPT, &inuse_ptepages_count);
428 OSAddAtomic64(NKPT, &alloc_ptepages_count);
429 bootstrap_wired_pages = NKPT;
430
431 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
432 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
433
434 #if NCOPY_WINDOWS > 0
435 /*
436 * Reserve some special page table entries/VA space for temporary
437 * mapping of pages.
438 */
439 #define SYSMAP(c, p, v, n) \
440 v = (c)va; va += ((n)*INTEL_PGBYTES);
441
442 va = virtual_avail;
443
444 for (i=0; i<PMAP_NWINDOWS; i++) {
445 #if 1
446 kprintf("trying to do SYSMAP idx %d %p\n", i,
447 current_cpu_datap());
448 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
449 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
450 kprintf("two stuff %p %p\n",
451 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
452 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
453 #endif
454 SYSMAP(caddr_t,
455 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
456 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
457 1);
458 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
459 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
460 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
461 }
462
463 /* DMAP use for debugger */
464 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
465 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
466
467 virtual_avail = va;
468 #endif
469 if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof (npvhashmask))) {
470 npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
471
472 }
473
474 npvhashbuckets = npvhashmask + 1;
475
476 if (0 != ((npvhashbuckets) & npvhashmask)) {
477 panic("invalid hash %d, must be ((2^N)-1), "
478 "using default %d\n", npvhashmask, NPVHASHMASK);
479 }
480
481 simple_lock_init(&kernel_pmap->lock, 0);
482 simple_lock_init(&pv_hashed_free_list_lock, 0);
483 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
484 simple_lock_init(&pv_hash_table_lock,0);
485 simple_lock_init(&phys_backup_lock, 0);
486
487 pmap_cpu_init();
488
489 if (pmap_pcid_ncpus)
490 printf("PMAP: PCID enabled\n");
491
492 if (pmap_smep_enabled)
493 printf("PMAP: Supervisor Mode Execute Protection enabled\n");
494 if (pmap_smap_enabled)
495 printf("PMAP: Supervisor Mode Access Protection enabled\n");
496
497 #if DEBUG
498 printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
499 printf("early_random(): 0x%qx\n", early_random());
500 #endif
501 #if DEVELOPMENT || DEBUG
502 boolean_t ptmp;
503 /* Check if the user has requested disabling stack or heap no-execute
504 * enforcement. These are "const" variables; that qualifier is cast away
505 * when altering them. The TEXT/DATA const sections are marked
506 * write protected later in the kernel startup sequence, so altering
507 * them is possible at this point, in pmap_bootstrap().
508 */
509 if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
510 boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
511 *pdknxp = TRUE;
512 }
513
514 if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
515 boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
516 *pdknhp = TRUE;
517 }
518 #endif /* DEVELOPMENT || DEBUG */
519
520 boot_args *args = (boot_args *)PE_state.bootArgs;
521 if (args->efiMode == kBootArgsEfiMode32) {
522 printf("EFI32: kernel virtual space limited to 4GB\n");
523 virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
524 }
525 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
526 (long)KERNEL_BASE, (long)virtual_end);
527 kprintf("Available physical space from 0x%llx to 0x%llx\n",
528 avail_start, avail_end);
529
530 /*
531 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
532 * in the DEBUG kernel) to force the kernel to switch to its own map
533 * (and cr3) when control is in kernelspace. The kernel's map does not
534 * include (i.e. share) userspace so wild references will cause
535 * a panic. Only copyin and copyout are exempt from this.
536 */
537 (void) PE_parse_boot_argn("-no_shared_cr3",
538 &no_shared_cr3, sizeof (no_shared_cr3));
539 if (no_shared_cr3)
540 kprintf("Kernel not sharing user map\n");
541
542 #ifdef PMAP_TRACES
543 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
544 kprintf("Kernel traces for pmap operations enabled\n");
545 }
546 #endif /* PMAP_TRACES */
547
548 #if MACH_ASSERT
549 PE_parse_boot_argn("pmap_stats_assert",
550 &pmap_stats_assert,
551 sizeof (pmap_stats_assert));
552 #endif /* MACH_ASSERT */
553 }
554
555 void
556 pmap_virtual_space(
557 vm_offset_t *startp,
558 vm_offset_t *endp)
559 {
560 *startp = virtual_avail;
561 *endp = virtual_end;
562 }
563
564
565
566
567 #if HIBERNATION
568
569 #include <IOKit/IOHibernatePrivate.h>
570
571 int32_t pmap_npages;
572 int32_t pmap_teardown_last_valid_compact_indx = -1;
573
574
575 void hibernate_rebuild_pmap_structs(void);
576 void hibernate_teardown_pmap_structs(addr64_t *, addr64_t *);
577 void pmap_pack_index(uint32_t);
578 int32_t pmap_unpack_index(pv_rooted_entry_t);
579
580
581 int32_t
582 pmap_unpack_index(pv_rooted_entry_t pv_h)
583 {
584 int32_t indx = 0;
585
586 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
587 indx = indx << 16;
588 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
589
590 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
591 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
592
593 return (indx);
594 }
595
596
597 void
598 pmap_pack_index(uint32_t indx)
599 {
600 pv_rooted_entry_t pv_h;
601
602 pv_h = &pv_head_table[indx];
603
604 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
605 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
606
607 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
608 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
609 }
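/*
 * pmap_pack_index()/pmap_unpack_index() stash a 32-bit pv_head_table index in
 * the otherwise-unused top 16 bits of the entry's two 64-bit queue pointers
 * (upper half of the index into qlink.next, lower half into qlink.prev);
 * unpacking restores those top bits to all-ones. A minimal standalone sketch
 * of the same round trip (plain uint64_t fields, not the real pv_rooted_entry):
 *
 *     #include <stdint.h>
 *
 *     static void pack(uint64_t *next, uint64_t *prev, uint32_t indx)
 *     {
 *         *next = (*next & ~(0xffffULL << 48)) | ((uint64_t)(indx >> 16) << 48);
 *         *prev = (*prev & ~(0xffffULL << 48)) | ((uint64_t)(indx & 0xffff) << 48);
 *     }
 *
 *     static uint32_t unpack(uint64_t *next, uint64_t *prev)
 *     {
 *         uint32_t indx = ((uint32_t)(*next >> 48) << 16) | (uint32_t)(*prev >> 48);
 *         *next |= 0xffffULL << 48;    // restore canonical high bits
 *         *prev |= 0xffffULL << 48;
 *         return indx;
 *     }
 *
 *     // pack(&n, &p, 0x12345678); unpack(&n, &p) == 0x12345678
 */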
610
611
612 void
613 hibernate_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
614 {
615 int32_t i;
616 int32_t compact_target_indx;
617
618 compact_target_indx = 0;
619
620 for (i = 0; i < pmap_npages; i++) {
621 if (pv_head_table[i].pmap == PMAP_NULL) {
622
623 if (pv_head_table[compact_target_indx].pmap != PMAP_NULL)
624 compact_target_indx = i;
625 } else {
626 pmap_pack_index((uint32_t)i);
627
628 if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
629 /*
630 * we've got a hole to fill, so
631 * move this pv_rooted_entry_t to its new home
632 */
633 pv_head_table[compact_target_indx] = pv_head_table[i];
634 pv_head_table[i].pmap = PMAP_NULL;
635
636 pmap_teardown_last_valid_compact_indx = compact_target_indx;
637 compact_target_indx++;
638 } else
639 pmap_teardown_last_valid_compact_indx = i;
640 }
641 }
642 *unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx+1];
643 *unneeded_end = (addr64_t)&pv_head_table[pmap_npages-1];
644
645 HIBLOG("hibernate_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
646 }
647
648
649 void
650 hibernate_rebuild_pmap_structs(void)
651 {
652 int32_t cindx, eindx, rindx = 0;
653 pv_rooted_entry_t pv_h;
654
655 eindx = (int32_t)pmap_npages;
656
657 for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
658
659 pv_h = &pv_head_table[cindx];
660
661 rindx = pmap_unpack_index(pv_h);
662 assert(rindx < pmap_npages);
663
664 if (rindx != cindx) {
665 /*
666 * this pv_rooted_entry_t was moved by hibernate_teardown_pmap_structs,
667 * so move it back to its real location
668 */
669 pv_head_table[rindx] = pv_head_table[cindx];
670 }
671 if (rindx+1 != eindx) {
672 /*
673 * the 'hole' between this pv_rooted_entry_t and the previous
674 * pv_rooted_entry_t we moved needs to be initialized as
675 * a range of zeroed pv_rooted_entry_t's
676 */
677 bzero((char *)&pv_head_table[rindx+1], (eindx - rindx - 1) * sizeof (struct pv_rooted_entry));
678 }
679 eindx = rindx;
680 }
681 if (rindx)
682 bzero ((char *)&pv_head_table[0], rindx * sizeof (struct pv_rooted_entry));
683
684 HIBLOG("hibernate_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
685 }
686
687 #endif
688
689 /*
690 * Initialize the pmap module.
691 * Called by vm_init, to initialize any structures that the pmap
692 * system needs to map virtual memory.
693 */
694 void
695 pmap_init(void)
696 {
697 long npages;
698 vm_offset_t addr;
699 vm_size_t s, vsize;
700 vm_map_offset_t vaddr;
701 ppnum_t ppn;
702
703
704 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
705 _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);
706
707 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
708 _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);
709
710 kernel_pmap->pm_obj = &kptobj_object_store;
711 _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);
712
713 /*
714 * Allocate memory for the pv_head_table and its lock bits,
715 * the modify bit array, and the pte_page table.
716 */
717
718 /*
719 * zero bias all these arrays now instead of off avail_start
720 * so we cover all memory
721 */
722
723 npages = i386_btop(avail_end);
724 #if HIBERNATION
725 pmap_npages = (uint32_t)npages;
726 #endif
727 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
728 + (sizeof (struct pv_hashed_entry_t *) * (npvhashbuckets))
729 + pv_lock_table_size(npages)
730 + pv_hash_lock_table_size((npvhashbuckets))
731 + npages);
732 s = round_page(s);
733 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
734 KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PMAP)
735 != KERN_SUCCESS)
736 panic("pmap_init");
737
738 memset((char *)addr, 0, s);
739
740 vaddr = addr;
741 vsize = s;
742
743 #if PV_DEBUG
744 if (0 == npvhashmask) panic("npvhashmask not initialized");
745 #endif
746
747 /*
748 * Allocate the structures first to preserve word-alignment.
749 */
750 pv_head_table = (pv_rooted_entry_t) addr;
751 addr = (vm_offset_t) (pv_head_table + npages);
752
753 pv_hash_table = (pv_hashed_entry_t *)addr;
754 addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));
755
756 pv_lock_table = (char *) addr;
757 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
758
759 pv_hash_lock_table = (char *) addr;
760 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));
761
762 pmap_phys_attributes = (char *) addr;
763
764 ppnum_t last_pn = i386_btop(avail_end);
765 unsigned int i;
766 pmap_memory_region_t *pmptr = pmap_memory_regions;
767 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
768 if (pmptr->type != kEfiConventionalMemory)
769 continue;
770 ppnum_t pn;
771 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
772 if (pn < last_pn) {
773 pmap_phys_attributes[pn] |= PHYS_MANAGED;
774
775 if (pn > last_managed_page)
776 last_managed_page = pn;
777
778 if (pn >= lowest_hi && pn <= highest_hi)
779 pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
780 }
781 }
782 }
783 while (vsize) {
784 ppn = pmap_find_phys(kernel_pmap, vaddr);
785
786 pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;
787
788 vaddr += PAGE_SIZE;
789 vsize -= PAGE_SIZE;
790 }
791 /*
792 * Create the zone of physical maps,
793 * and of the physical-to-virtual entries.
794 */
795 s = (vm_size_t) sizeof(struct pmap);
796 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
797 zone_change(pmap_zone, Z_NOENCRYPT, TRUE);
798
799 pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors");
800 zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE);
801
802 /* The anchor is required to be page aligned. Zone debugging adds
803 * padding which may violate that requirement. Tell the zone
804 * subsystem that alignment is required.
805 */
806
807 zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
808 /* TODO: possible general optimisation: pre-allocate commonly created
809 * level 3/2 pagetables via zones.
810 */
811 pmap_uanchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable user anchors");
812 zone_change(pmap_uanchor_zone, Z_NOENCRYPT, TRUE);
813
814 /* The anchor is required to be page aligned. Zone debugging adds
815 * padding which may violate that requirement. Tell the zone
816 * subsystem that alignment is required.
817 */
818
819 zone_change(pmap_uanchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
820
821 s = (vm_size_t) sizeof(struct pv_hashed_entry);
822 pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */,
823 4096 * 3 /* LCM x86_64*/, "pv_list");
824 zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);
825 zone_change(pv_hashed_list_zone, Z_GZALLOC_EXEMPT, TRUE);
826
827 /* create pv entries for kernel pages mapped by low level
828 startup code. these have to exist so we can pmap_remove()
829 e.g. kext pages from the middle of our addr space */
830
831 vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
832 for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) {
833 pv_rooted_entry_t pv_e;
834
835 pv_e = pai_to_pvh(ppn);
836 pv_e->va_and_flags = vaddr;
837 vaddr += PAGE_SIZE;
838 pv_e->pmap = kernel_pmap;
839 queue_init(&pv_e->qlink);
840 }
841 pmap_initialized = TRUE;
842
843 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
844
845 /*
846 * Ensure the kernel's PML4 entry exists for the basement
847 * before this is shared with any user.
848 */
849 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
850
851 #if CONFIG_VMX
852 pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
853 pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
854 #endif /* CONFIG_VMX */
855 }
856
857 static
858 void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) {
859 uint64_t ev = sv + nxrosz, cv = sv;
860 pd_entry_t *pdep;
861 pt_entry_t *ptep = NULL;
862
863 assert(!is_ept_pmap(npmap));
864
865 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
866
867 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
868 uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
869
870 if (*pdep & INTEL_PTE_PS) {
871 if (NX)
872 *pdep |= INTEL_PTE_NX;
873 if (ro)
874 *pdep &= ~INTEL_PTE_WRITE;
875 cv += NBPD;
876 cv &= ~((uint64_t) PDEMASK);
877 pdep = pmap_pde(npmap, cv);
878 continue;
879 }
880
881 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
882 if (NX)
883 *ptep |= INTEL_PTE_NX;
884 if (ro)
885 *ptep &= ~INTEL_PTE_WRITE;
886 cv += NBPT;
887 ptep = pmap_pte(npmap, cv);
888 }
889 }
890 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
891 }
892
893 /*
894 * Called once VM is fully initialized so that we can release unused
895 * sections of low memory to the general pool.
896 * Also complete the set-up of identity-mapped sections of the kernel:
897 * 1) write-protect kernel text
898 * 2) map kernel text using large pages if possible
899 * 3) read and write-protect page zero (for K32)
900 * 4) map the global page at the appropriate virtual address.
901 *
902 * Use of large pages
903 * ------------------
904 * To effectively map and write-protect all kernel text pages, the text
905 * must be 2M-aligned at the base, and the data section above must also be
906 * 2M-aligned. That is, there's padding below and above. This is achieved
907 * through linker directives. Large pages are used only if this alignment
908 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
909 * memory layout is:
910 *
911 * : :
912 * | __DATA |
913 * sdata: ================== 2Meg
914 * | |
915 * | zero-padding |
916 * | |
917 * etext: ------------------
918 * | |
919 * : :
920 * | |
921 * | __TEXT |
922 * | |
923 * : :
924 * | |
925 * stext: ================== 2Meg
926 * | |
927 * | zero-padding |
928 * | |
929 * eHIB: ------------------
930 * | __HIB |
931 * : :
932 *
933 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
934 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
935 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
936 * The now unused level-1 PTE pages are also freed.
937 */
938 extern ppnum_t vm_kernel_base_page;
939 static uint32_t constptes = 0, dataptes = 0;
940
941 void pmap_lowmem_finalize(void) {
942 spl_t spl;
943 int i;
944
945 /*
946 * Update wired memory statistics for early boot pages
947 */
948 PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
949
950 /*
951 * Free pages in pmap regions below the base:
952 * rdar://6332712
953 * We can't free all the pages to VM that EFI reports available.
954 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
955 * There's also a size miscalculation here: pend is one page less
956 * than it should be but this is not fixed to be backwards
957 * compatible.
958 * This is important for KASLR because up to 256*2MB = 512MB of space
959 * has to be released to VM.
960 */
961 for (i = 0;
962 pmap_memory_regions[i].end < vm_kernel_base_page;
963 i++) {
964 vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
965 vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end+1);
966
967 DBG("pmap region %d [%p..[%p\n",
968 i, (void *) pbase, (void *) pend);
969
970 if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED)
971 continue;
972 /*
973 * rdar://6332712
974 * Adjust limits not to free pages in range 0xc0000-0xff000.
975 */
976 if (pbase >= 0xc0000 && pend <= 0x100000)
977 continue;
978 if (pbase < 0xc0000 && pend > 0x100000) {
979 /* page range entirely within region, free lower part */
980 DBG("- ml_static_mfree(%p,%p)\n",
981 (void *) ml_static_ptovirt(pbase),
982 (void *) (0xc0000-pbase));
983 ml_static_mfree(ml_static_ptovirt(pbase),0xc0000-pbase);
984 pbase = 0x100000;
985 }
986 if (pbase < 0xc0000)
987 pend = MIN(pend, 0xc0000);
988 if (pend > 0x100000)
989 pbase = MAX(pbase, 0x100000);
990 DBG("- ml_static_mfree(%p,%p)\n",
991 (void *) ml_static_ptovirt(pbase),
992 (void *) (pend - pbase));
993 ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
994 }
995
996 /* A final pass to get rid of all initial identity mappings to
997 * low pages.
998 */
999 DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
1000
1001 /*
1002 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
1003 * Non-boot-cpu GDT aliases will be remapped later as needed.
1004 */
1005 pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
1006
1007 /*
1008 * If text and data are both 2MB-aligned,
1009 * we can map text with large-pages,
1010 * unless the -kernel_text_ps_4K boot-arg overrides.
1011 */
1012 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
1013 kprintf("Kernel text is 2MB aligned");
1014 kernel_text_ps_4K = FALSE;
1015 if (PE_parse_boot_argn("-kernel_text_ps_4K",
1016 &kernel_text_ps_4K,
1017 sizeof (kernel_text_ps_4K)))
1018 kprintf(" but will be mapped with 4K pages\n");
1019 else
1020 kprintf(" and will be mapped with 2M pages\n");
1021 }
1022
1023 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel));
1024 if (wpkernel)
1025 kprintf("Kernel text %p-%p to be write-protected\n",
1026 (void *) stext, (void *) etext);
1027
1028 spl = splhigh();
1029
1030 /*
1031 * Scan over text if mappings are to be changed:
1032 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
1033 * - Change to large-pages if possible and not overridden.
1034 */
1035 if (kernel_text_ps_4K && wpkernel) {
1036 vm_offset_t myva;
1037 for (myva = stext; myva < etext; myva += PAGE_SIZE) {
1038 pt_entry_t *ptep;
1039
1040 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1041 if (ptep)
1042 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE);
1043 }
1044 }
1045
1046 if (!kernel_text_ps_4K) {
1047 vm_offset_t myva;
1048
1049 /*
1050 * Release zero-filled page padding used for 2M-alignment.
1051 */
1052 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1053 (void *) eHIB, (void *) (stext - eHIB));
1054 ml_static_mfree(eHIB, stext - eHIB);
1055 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1056 (void *) etext, (void *) (sdata - etext));
1057 ml_static_mfree(etext, sdata - etext);
1058
1059 /*
1060 * Coalesce text pages into large pages.
1061 */
1062 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
1063 pt_entry_t *ptep;
1064 vm_offset_t pte_phys;
1065 pt_entry_t *pdep;
1066 pt_entry_t pde;
1067
1068 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
1069 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1070 DBG("myva: %p pdep: %p ptep: %p\n",
1071 (void *) myva, (void *) pdep, (void *) ptep);
1072 if ((*ptep & INTEL_PTE_VALID) == 0)
1073 continue;
1074 pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
1075 pde = *pdep & PTMASK; /* page attributes from pde */
1076 pde |= INTEL_PTE_PS; /* make it a 2M entry */
1077 pde |= pte_phys; /* take page frame from pte */
1078
1079 if (wpkernel)
1080 pde &= ~INTEL_PTE_WRITE;
1081 DBG("pmap_store_pte(%p,0x%llx)\n",
1082 (void *)pdep, pde);
1083 pmap_store_pte(pdep, pde);
1084
1085 /*
1086 * Free the now-unused level-1 pte.
1087 * Note: ptep is a virtual address to the pte in the
1088 * recursive map. We can't use this address to free
1089 * the page. Instead we need to compute its address
1090 * in the Idle PTEs in "low memory".
1091 */
1092 vm_offset_t vm_ptep = (vm_offset_t) KPTphys
1093 + (pte_phys >> PTPGSHIFT);
1094 DBG("ml_static_mfree(%p,0x%x) for pte\n",
1095 (void *) vm_ptep, PAGE_SIZE);
1096 ml_static_mfree(vm_ptep, PAGE_SIZE);
1097 }
1098
1099 /* Change variable read by sysctl machdep.pmap */
1100 pmap_kernel_text_ps = I386_LPGBYTES;
1101 }
1102
1103 boolean_t doconstro = TRUE;
1104 #if DEVELOPMENT || DEBUG
1105 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1106 #endif
1107 if (doconstro) {
1108 if (sconst & PAGE_MASK) {
1109 panic("CONST segment misaligned 0x%lx 0x%lx\n",
1110 sconst, econst);
1111 }
1112 kprintf("Marking const DATA read-only\n");
1113 }
1114
1115 vm_offset_t dva;
1116
1117 for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
1118 assert(((sdata | edata) & PAGE_MASK) == 0);
1119 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1120
1121 dpte = *dptep;
1122 assert((dpte & INTEL_PTE_VALID));
1123 dpte |= INTEL_PTE_NX;
1124 pmap_store_pte(dptep, dpte);
1125 dataptes++;
1126 }
1127 assert(dataptes > 0);
1128
1129 for (dva = sconst; dva < econst; dva += I386_PGBYTES) {
1130 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1131
1132 dpte = *dptep;
1133
1134 assert((dpte & INTEL_PTE_VALID));
1135 dpte |= INTEL_PTE_NX;
1136 dpte &= ~INTEL_PTE_WRITE;
1137 constptes++;
1138 pmap_store_pte(dptep, dpte);
1139 }
1140
1141 assert(constptes > 0);
1142
1143 kernel_segment_command_t * seg;
1144 kernel_section_t * sec;
1145
1146 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
1147 if (!strcmp(seg->segname, "__TEXT") ||
1148 !strcmp(seg->segname, "__DATA")) {
1149 continue;
1150 }
1151 //XXX
1152 if (!strcmp(seg->segname, "__KLD")) {
1153 continue;
1154 }
1155 if (!strcmp(seg->segname, "__HIB")) {
1156 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
1157 if (sec->addr & PAGE_MASK)
1158 panic("__HIB segment's sections misaligned");
1159 if (!strcmp(sec->sectname, "__text")) {
1160 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
1161 } else {
1162 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
1163 }
1164 }
1165 } else {
1166 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
1167 }
1168 }
1169
1170 /*
1171 * If we're debugging, map the low global vector page at the fixed
1172 * virtual address. Otherwise, remove the mapping for this.
1173 */
1174 if (debug_boot_arg) {
1175 pt_entry_t *pte = NULL;
1176 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS)))
1177 panic("lowmem pte");
1178 /* make sure it is defined on page boundary */
1179 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
1180 pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)
1181 | INTEL_PTE_REF
1182 | INTEL_PTE_MOD
1183 | INTEL_PTE_WIRED
1184 | INTEL_PTE_VALID
1185 | INTEL_PTE_WRITE
1186 | INTEL_PTE_NX);
1187 } else {
1188 pmap_remove(kernel_pmap,
1189 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
1190 }
1191
1192 splx(spl);
1193 if (pmap_pcid_ncpus)
1194 tlb_flush_global();
1195 else
1196 flush_tlb_raw();
1197 }
1198
1199 /*
1200 * this function is only used for debugging from the vm layer
1201 */
1202 boolean_t
1203 pmap_verify_free(
1204 ppnum_t pn)
1205 {
1206 pv_rooted_entry_t pv_h;
1207 int pai;
1208 boolean_t result;
1209
1210 assert(pn != vm_page_fictitious_addr);
1211
1212 if (!pmap_initialized)
1213 return(TRUE);
1214
1215 if (pn == vm_page_guard_addr)
1216 return TRUE;
1217
1218 pai = ppn_to_pai(pn);
1219 if (!IS_MANAGED_PAGE(pai))
1220 return(FALSE);
1221 pv_h = pai_to_pvh(pn);
1222 result = (pv_h->pmap == PMAP_NULL);
1223 return(result);
1224 }
1225
1226 boolean_t
1227 pmap_is_empty(
1228 pmap_t pmap,
1229 vm_map_offset_t va_start,
1230 vm_map_offset_t va_end)
1231 {
1232 vm_map_offset_t offset;
1233 ppnum_t phys_page;
1234
1235 if (pmap == PMAP_NULL) {
1236 return TRUE;
1237 }
1238
1239 /*
1240 * Check the resident page count
1241 * - if it's zero, the pmap is completely empty.
1242 * This short-circuit test prevents a virtual address scan which is
1243 * painfully slow for 64-bit spaces.
1244 * This assumes the count is correct;
1245 * the debug kernel ought to verify it, perhaps by a page-table walk.
1246 */
1247 if (pmap->stats.resident_count == 0)
1248 return TRUE;
1249
1250 for (offset = va_start;
1251 offset < va_end;
1252 offset += PAGE_SIZE_64) {
1253 phys_page = pmap_find_phys(pmap, offset);
1254 if (phys_page) {
1255 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1256 "page %d at 0x%llx\n",
1257 pmap, va_start, va_end, phys_page, offset);
1258 return FALSE;
1259 }
1260 }
1261
1262 return TRUE;
1263 }
1264
1265 void
1266 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1267 {
1268 pmap_t p;
1269
1270 if ((ept_pmap == NULL) || (eptp == NULL)) {
1271 return;
1272 }
1273
1274 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1275 if (p == PMAP_NULL) {
1276 *ept_pmap = NULL;
1277 *eptp = NULL;
1278 return;
1279 }
1280
1281 assert(is_ept_pmap(p));
1282
1283 *ept_pmap = (void*)p;
1284 *eptp = (void*)(p->pm_eptp);
1285 return;
1286 }
1287
1288 /*
1289 * Create and return a physical map.
1290 *
1291 * If the size specified for the map
1292 * is zero, the map is an actual physical
1293 * map, and may be referenced by the
1294 * hardware.
1295 *
1296 * If the size specified is non-zero,
1297 * the map will be used in software only, and
1298 * is bounded by that size.
1299 */
1300
1301 pmap_t
1302 pmap_create_options(
1303 ledger_t ledger,
1304 vm_map_size_t sz,
1305 int flags)
1306 {
1307 pmap_t p;
1308 vm_size_t size;
1309 pml4_entry_t *pml4;
1310 pml4_entry_t *kpml4;
1311
1312 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);
1313
1314 size = (vm_size_t) sz;
1315
1316 /*
1317 * A software use-only map doesn't even need a map.
1318 */
1319
1320 if (size != 0) {
1321 return(PMAP_NULL);
1322 }
1323
1324 /*
1325 * Return error when unrecognized flags are passed.
1326 */
1327 if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
1328 return(PMAP_NULL);
1329 }
1330
1331 p = (pmap_t) zalloc(pmap_zone);
1332 if (PMAP_NULL == p)
1333 panic("pmap_create zalloc");
1334
1335 /* Zero all fields */
1336 bzero(p, sizeof(*p));
1337 /* init counts now since we'll be bumping some */
1338 simple_lock_init(&p->lock, 0);
1339 bzero(&p->stats, sizeof (p->stats));
1340
1341 p->ref_count = 1;
1342 p->nx_enabled = 1;
1343 p->pm_shared = FALSE;
1344 ledger_reference(ledger);
1345 p->ledger = ledger;
1346
1347 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1348
1349 p->pagezero_accessible = FALSE;
1350
1351 if (pmap_pcid_ncpus) {
1352 pmap_pcid_initialize(p);
1353 }
1354
1355 p->pm_pml4 = zalloc(pmap_anchor_zone);
1356 p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT
1357
1358 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1359 pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);
1360
1361 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1362 memset((char *)p->pm_upml4, 0, PAGE_SIZE);
1363
1364 if (flags & PMAP_CREATE_EPT) {
1365 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1366 p->pm_cr3 = 0;
1367 } else {
1368 p->pm_eptp = 0;
1369 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1370 p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
1371 }
1372
1373 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1374
1375 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) * PAGE_SIZE);
1376 if (NULL == p->pm_obj_pml4)
1377 panic("pmap_create pdpt obj");
1378
1379 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) * PAGE_SIZE);
1380 if (NULL == p->pm_obj_pdpt)
1381 panic("pmap_create pdpt obj");
1382
1383 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) * PAGE_SIZE);
1384 if (NULL == p->pm_obj)
1385 panic("pmap_create pte obj");
1386
1387 if (!(flags & PMAP_CREATE_EPT)) {
1388 /* All host pmaps share the kernel's pml4 */
1389 pml4 = pmap64_pml4(p, 0ULL);
1390 kpml4 = kernel_pmap->pm_pml4;
1391 pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX];
1392 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1393 pml4[KERNEL_PHYSMAP_PML4_INDEX] = kpml4[KERNEL_PHYSMAP_PML4_INDEX];
1394 pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1395 #if KASAN
1396 pml4[KERNEL_KASAN_PML4_INDEX0] = kpml4[KERNEL_KASAN_PML4_INDEX0];
1397 pml4[KERNEL_KASAN_PML4_INDEX1] = kpml4[KERNEL_KASAN_PML4_INDEX1];
1398 #endif
1399 pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
1400 pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1401 }
1402
1403 #if MACH_ASSERT
1404 p->pmap_pid = 0;
1405 strlcpy(p->pmap_procname, "<nil>", sizeof (p->pmap_procname));
1406 #endif /* MACH_ASSERT */
1407
1408 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1409 VM_KERNEL_ADDRHIDE(p));
1410
1411 return(p);
1412 }
1413
1414 pmap_t
1415 pmap_create(
1416 ledger_t ledger,
1417 vm_map_size_t sz,
1418 boolean_t is_64bit)
1419 {
1420 return pmap_create_options(ledger, sz, ((is_64bit) ? PMAP_CREATE_64BIT : 0));
1421 }
1422
1423 /*
1424 * We maintain stats and ledgers so that a task's physical footprint is:
1425 * phys_footprint = ((internal - alternate_accounting)
1426 * + (internal_compressed - alternate_accounting_compressed)
1427 * + iokit_mapped
1428 * + purgeable_nonvolatile
1429 * + purgeable_nonvolatile_compressed
1430 * + page_table)
1431 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1432 */
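/*
 * Worked example with hypothetical ledger balances, in pages:
 *     internal = 1000, alternate_accounting = 100,
 *     internal_compressed = 200, alternate_accounting_compressed = 0,
 *     iokit_mapped = 50, purgeable_nonvolatile = 25,
 *     purgeable_nonvolatile_compressed = 5, page_table = 20
 * gives
 *     phys_footprint = (1000 - 100) + (200 - 0) + 50 + 25 + 5 + 20 = 1200 pages.
 * The drift counters below accumulate, per ledger, how far a pmap's balances
 * were over or under when pmap_check_ledgers() examined them.
 */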
1433
1434 #if MACH_ASSERT
1435 struct {
1436 uint64_t num_pmaps_checked;
1437
1438 int phys_footprint_over;
1439 ledger_amount_t phys_footprint_over_total;
1440 ledger_amount_t phys_footprint_over_max;
1441 int phys_footprint_under;
1442 ledger_amount_t phys_footprint_under_total;
1443 ledger_amount_t phys_footprint_under_max;
1444
1445 int internal_over;
1446 ledger_amount_t internal_over_total;
1447 ledger_amount_t internal_over_max;
1448 int internal_under;
1449 ledger_amount_t internal_under_total;
1450 ledger_amount_t internal_under_max;
1451
1452 int internal_compressed_over;
1453 ledger_amount_t internal_compressed_over_total;
1454 ledger_amount_t internal_compressed_over_max;
1455 int internal_compressed_under;
1456 ledger_amount_t internal_compressed_under_total;
1457 ledger_amount_t internal_compressed_under_max;
1458
1459 int iokit_mapped_over;
1460 ledger_amount_t iokit_mapped_over_total;
1461 ledger_amount_t iokit_mapped_over_max;
1462 int iokit_mapped_under;
1463 ledger_amount_t iokit_mapped_under_total;
1464 ledger_amount_t iokit_mapped_under_max;
1465
1466 int alternate_accounting_over;
1467 ledger_amount_t alternate_accounting_over_total;
1468 ledger_amount_t alternate_accounting_over_max;
1469 int alternate_accounting_under;
1470 ledger_amount_t alternate_accounting_under_total;
1471 ledger_amount_t alternate_accounting_under_max;
1472
1473 int alternate_accounting_compressed_over;
1474 ledger_amount_t alternate_accounting_compressed_over_total;
1475 ledger_amount_t alternate_accounting_compressed_over_max;
1476 int alternate_accounting_compressed_under;
1477 ledger_amount_t alternate_accounting_compressed_under_total;
1478 ledger_amount_t alternate_accounting_compressed_under_max;
1479
1480 int page_table_over;
1481 ledger_amount_t page_table_over_total;
1482 ledger_amount_t page_table_over_max;
1483 int page_table_under;
1484 ledger_amount_t page_table_under_total;
1485 ledger_amount_t page_table_under_max;
1486
1487 int purgeable_volatile_over;
1488 ledger_amount_t purgeable_volatile_over_total;
1489 ledger_amount_t purgeable_volatile_over_max;
1490 int purgeable_volatile_under;
1491 ledger_amount_t purgeable_volatile_under_total;
1492 ledger_amount_t purgeable_volatile_under_max;
1493
1494 int purgeable_nonvolatile_over;
1495 ledger_amount_t purgeable_nonvolatile_over_total;
1496 ledger_amount_t purgeable_nonvolatile_over_max;
1497 int purgeable_nonvolatile_under;
1498 ledger_amount_t purgeable_nonvolatile_under_total;
1499 ledger_amount_t purgeable_nonvolatile_under_max;
1500
1501 int purgeable_volatile_compressed_over;
1502 ledger_amount_t purgeable_volatile_compressed_over_total;
1503 ledger_amount_t purgeable_volatile_compressed_over_max;
1504 int purgeable_volatile_compressed_under;
1505 ledger_amount_t purgeable_volatile_compressed_under_total;
1506 ledger_amount_t purgeable_volatile_compressed_under_max;
1507
1508 int purgeable_nonvolatile_compressed_over;
1509 ledger_amount_t purgeable_nonvolatile_compressed_over_total;
1510 ledger_amount_t purgeable_nonvolatile_compressed_over_max;
1511 int purgeable_nonvolatile_compressed_under;
1512 ledger_amount_t purgeable_nonvolatile_compressed_under_total;
1513 ledger_amount_t purgeable_nonvolatile_compressed_under_max;
1514 } pmap_ledgers_drift;
1515 static void pmap_check_ledgers(pmap_t pmap);
1516 #else /* MACH_ASSERT */
1517 static inline void pmap_check_ledgers(__unused pmap_t pmap) {}
1518 #endif /* MACH_ASSERT */
1519
1520 /*
1521 * Retire the given physical map from service.
1522 * Should only be called if the map contains
1523 * no valid mappings.
1524 */
1525 extern int vm_wired_objects_page_count;
1526
1527 void
1528 pmap_destroy(pmap_t p)
1529 {
1530 int c;
1531
1532 if (p == PMAP_NULL)
1533 return;
1534
1535 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1536 VM_KERNEL_ADDRHIDE(p));
1537
1538 PMAP_LOCK(p);
1539
1540 c = --p->ref_count;
1541
1542 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1543
1544 if (c == 0) {
1545 /*
1546 * If some cpu is not using the physical pmap pointer that it
1547 * is supposed to be (see set_dirbase), we might be using the
1548 * pmap that is being destroyed! Make sure we are
1549 * physically on the right pmap:
1550 */
1551 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1552 if (pmap_pcid_ncpus)
1553 pmap_destroy_pcid_sync(p);
1554 }
1555
1556 PMAP_UNLOCK(p);
1557
1558 if (c != 0) {
1559 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1560 pmap_assert(p == kernel_pmap);
1561 return; /* still in use */
1562 }
1563
1564 /*
1565 * Free the memory maps, then the
1566 * pmap structure.
1567 */
1568 int inuse_ptepages = 0;
1569
1570 zfree(pmap_anchor_zone, p->pm_pml4);
1571 zfree(pmap_uanchor_zone, p->pm_upml4);
1572
1573 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1574 vm_object_deallocate(p->pm_obj_pml4);
1575
1576 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1577 vm_object_deallocate(p->pm_obj_pdpt);
1578
1579 inuse_ptepages += p->pm_obj->resident_page_count;
1580 vm_object_deallocate(p->pm_obj);
1581
1582 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1583 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1584
1585 pmap_check_ledgers(p);
1586 ledger_dereference(p->ledger);
1587 zfree(pmap_zone, p);
1588
1589 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1590 }
1591
1592 /*
1593 * Add a reference to the specified pmap.
1594 */
1595
1596 void
1597 pmap_reference(pmap_t p)
1598 {
1599 if (p != PMAP_NULL) {
1600 PMAP_LOCK(p);
1601 p->ref_count++;
1602 PMAP_UNLOCK(p);
1603 }
1604 }
1605
1606 /*
1607 * Remove phys addr if mapped in specified map
1608 *
1609 */
1610 void
1611 pmap_remove_some_phys(
1612 __unused pmap_t map,
1613 __unused ppnum_t pn)
1614 {
1615
1616 /* Implement to support working set code */
1617
1618 }
1619
1620
1621 void
1622 pmap_protect(
1623 pmap_t map,
1624 vm_map_offset_t sva,
1625 vm_map_offset_t eva,
1626 vm_prot_t prot)
1627 {
1628 pmap_protect_options(map, sva, eva, prot, 0, NULL);
1629 }
1630
1631
1632 /*
1633 * Set the physical protection on the
1634 * specified range of this map as requested.
1635 * Will not increase permissions.
1636 */
1637 void
1638 pmap_protect_options(
1639 pmap_t map,
1640 vm_map_offset_t sva,
1641 vm_map_offset_t eva,
1642 vm_prot_t prot,
1643 unsigned int options,
1644 void *arg)
1645 {
1646 pt_entry_t *pde;
1647 pt_entry_t *spte, *epte;
1648 vm_map_offset_t lva;
1649 vm_map_offset_t orig_sva;
1650 boolean_t set_NX;
1651 int num_found = 0;
1652 boolean_t is_ept;
1653
1654 pmap_intr_assert();
1655
1656 if (map == PMAP_NULL)
1657 return;
1658
1659 if (prot == VM_PROT_NONE) {
1660 pmap_remove_options(map, sva, eva, options);
1661 return;
1662 }
1663
1664 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1665 VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
1666 VM_KERNEL_ADDRHIDE(eva));
1667
1668 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
1669 set_NX = FALSE;
1670 else
1671 set_NX = TRUE;
1672
1673 is_ept = is_ept_pmap(map);
1674
1675
1676 PMAP_LOCK(map);
1677
1678 orig_sva = sva;
1679 while (sva < eva) {
1680 lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
1681 if (lva > eva)
1682 lva = eva;
1683 pde = pmap_pde(map, sva);
1684 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1685 if (*pde & PTE_PS) {
1686 /* superpage */
1687 spte = pde;
1688 epte = spte+1; /* excluded */
1689 } else {
1690 spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
1691 spte = &spte[ptenum(sva)];
1692 epte = &spte[intel_btop(lva - sva)];
1693 }
1694
1695 for (; spte < epte; spte++) {
1696 if (!(*spte & PTE_VALID_MASK(is_ept)))
1697 continue;
1698
1699 if (is_ept) {
1700 if (prot & VM_PROT_READ)
1701 pmap_update_pte(spte, 0, PTE_READ(is_ept));
1702 else
1703 pmap_update_pte(spte, PTE_READ(is_ept), 0);
1704 }
1705 if (prot & VM_PROT_WRITE)
1706 pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
1707 else
1708 pmap_update_pte(spte, PTE_WRITE(is_ept), 0);
1709
1710 if (set_NX) {
1711 if (!is_ept)
1712 pmap_update_pte(spte, 0, INTEL_PTE_NX);
1713 else
1714 pmap_update_pte(spte, INTEL_EPT_EX, 0);
1715 } else {
1716 if (!is_ept)
1717 pmap_update_pte(spte, INTEL_PTE_NX, 0);
1718 else
1719 pmap_update_pte(spte, 0, INTEL_EPT_EX);
1720 }
1721 num_found++;
1722 }
1723 }
1724 sva = lva;
1725 }
1726 if (num_found) {
1727 if (options & PMAP_OPTIONS_NOFLUSH)
1728 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1729 else
1730 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1731 }
1732 PMAP_UNLOCK(map);
1733
1734 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
1735
1736 }
1737
1738 /* Map a (possibly) autogenned block */
1739 kern_return_t
1740 pmap_map_block(
1741 pmap_t pmap,
1742 addr64_t va,
1743 ppnum_t pa,
1744 uint32_t size,
1745 vm_prot_t prot,
1746 int attr,
1747 __unused unsigned int flags)
1748 {
1749 kern_return_t kr;
1750 addr64_t original_va = va;
1751 uint32_t page;
1752 int cur_page_size;
1753
1754 if (attr & VM_MEM_SUPERPAGE)
1755 cur_page_size = SUPERPAGE_SIZE;
1756 else
1757 cur_page_size = PAGE_SIZE;
1758
1759 for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) {
1760 kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
1761
1762 if (kr != KERN_SUCCESS) {
1763 /*
1764 * This will panic for now, as it is unclear that
1765 * removing the mappings is correct.
1766 */
1767 panic("%s: failed pmap_enter, "
1768 "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
1769 __FUNCTION__,
1770 pmap, va, pa, size, prot, flags);
1771
1772 pmap_remove(pmap, original_va, va - original_va);
1773 return kr;
1774 }
1775
1776 va += cur_page_size;
1777 pa+=cur_page_size/PAGE_SIZE;
1778 }
1779
1780 return KERN_SUCCESS;
1781 }
1782
1783 kern_return_t
1784 pmap_expand_pml4(
1785 pmap_t map,
1786 vm_map_offset_t vaddr,
1787 unsigned int options)
1788 {
1789 vm_page_t m;
1790 pmap_paddr_t pa;
1791 uint64_t i;
1792 ppnum_t pn;
1793 pml4_entry_t *pml4p;
1794 boolean_t is_ept = is_ept_pmap(map);
1795
1796 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
1797
1798 /* With the exception of the kext "basement", the kernel's level 4
1799 * pagetables must not be dynamically expanded.
1800 */
1801 assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
1802 /*
1803 * Allocate a VM page for the pml4 page
1804 */
1805 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1806 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1807 return KERN_RESOURCE_SHORTAGE;
1808 VM_PAGE_WAIT();
1809 }
1810 /*
1811 * put the page into the pmap's obj list so it
1812 * can be found later.
1813 */
1814 pn = VM_PAGE_GET_PHYS_PAGE(m);
1815 pa = i386_ptob(pn);
1816 i = pml4idx(map, vaddr);
1817
1818 /*
1819 * Zero the page.
1820 */
1821 pmap_zero_page(pn);
1822
1823 vm_page_lockspin_queues();
1824 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1825 vm_page_unlock_queues();
1826
1827 OSAddAtomic(1, &inuse_ptepages_count);
1828 OSAddAtomic64(1, &alloc_ptepages_count);
1829 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1830
1831 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1832 vm_object_lock(map->pm_obj_pml4);
1833
1834 PMAP_LOCK(map);
1835 /*
1836 * See if someone else expanded us first
1837 */
1838 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
1839 PMAP_UNLOCK(map);
1840 vm_object_unlock(map->pm_obj_pml4);
1841
1842 VM_PAGE_FREE(m);
1843
1844 OSAddAtomic(-1, &inuse_ptepages_count);
1845 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1846 return KERN_SUCCESS;
1847 }
1848
1849 #if 0 /* DEBUG */
1850 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
1851 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1852 map, map->pm_obj_pml4, vaddr, i);
1853 }
1854 #endif
1855 vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1856 vm_object_unlock(map->pm_obj_pml4);
1857
1858 /*
1859 * Set the page directory entry for this page table.
1860 */
1861 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
1862
1863 pmap_store_pte(pml4p, pa_to_pte(pa)
1864 | PTE_READ(is_ept)
1865 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1866 | PTE_WRITE(is_ept));
1867 pml4_entry_t *upml4p;
1868
1869 upml4p = pmap64_user_pml4(map, vaddr);
1870 pmap_store_pte(upml4p, pa_to_pte(pa)
1871 | PTE_READ(is_ept)
1872 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1873 | PTE_WRITE(is_ept));
1874
1875 PMAP_UNLOCK(map);
1876
1877 return KERN_SUCCESS;
1878 }
1879
1880 kern_return_t
1881 pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
1882 {
1883 vm_page_t m;
1884 pmap_paddr_t pa;
1885 uint64_t i;
1886 ppnum_t pn;
1887 pdpt_entry_t *pdptp;
1888 boolean_t is_ept = is_ept_pmap(map);
1889
1890 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
1891
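/*
 * Make sure the PML4 slot covering this address is populated before
 * allocating and installing a PDPT page beneath it.
 */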
1892 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
1893 kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
1894 if (pep4kr != KERN_SUCCESS)
1895 return pep4kr;
1896 }
1897
1898 /*
1899 * Allocate a VM page for the pdpt page
1900 */
1901 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1902 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1903 return KERN_RESOURCE_SHORTAGE;
1904 VM_PAGE_WAIT();
1905 }
1906
1907 /*
1908 * put the page into the pmap's obj list so it
1909 * can be found later.
1910 */
1911 pn = VM_PAGE_GET_PHYS_PAGE(m);
1912 pa = i386_ptob(pn);
1913 i = pdptidx(map, vaddr);
1914
1915 /*
1916 * Zero the page.
1917 */
1918 pmap_zero_page(pn);
1919
1920 vm_page_lockspin_queues();
1921 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1922 vm_page_unlock_queues();
1923
1924 OSAddAtomic(1, &inuse_ptepages_count);
1925 OSAddAtomic64(1, &alloc_ptepages_count);
1926 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1927
1928 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1929 vm_object_lock(map->pm_obj_pdpt);
1930
1931 PMAP_LOCK(map);
1932 /*
1933 * See if someone else expanded us first
1934 */
1935 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
1936 PMAP_UNLOCK(map);
1937 vm_object_unlock(map->pm_obj_pdpt);
1938
1939 VM_PAGE_FREE(m);
1940
1941 OSAddAtomic(-1, &inuse_ptepages_count);
1942 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1943 return KERN_SUCCESS;
1944 }
1945
1946 #if 0 /* DEBUG */
1947 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
1948 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1949 map, map->pm_obj_pdpt, vaddr, i);
1950 }
1951 #endif
1952 vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1953 vm_object_unlock(map->pm_obj_pdpt);
1954
1955 /*
1956 * Set the page directory entry for this page table.
1957 */
1958 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
1959
1960 pmap_store_pte(pdptp, pa_to_pte(pa)
1961 | PTE_READ(is_ept)
1962 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1963 | PTE_WRITE(is_ept));
1964
1965 PMAP_UNLOCK(map);
1966
1967 return KERN_SUCCESS;
1968
1969 }
1970
1971
1972
1973 /*
1974 * Routine: pmap_expand
1975 *
1976 * Expands a pmap to be able to map the specified virtual address.
1977 *
1978 * Allocates a new page-table page, wires it, inserts it into the
1979 * pmap's page-table object, and links it into the paging hierarchy,
1980 * expanding the PML4 and PDPT levels first when necessary.
1981 *
1982 * Must be called with the pmap system and the pmap unlocked,
1983 * since the page allocation may block.
1984 * Thus it must be called in a loop that checks whether the map
1985 * has been expanded enough.
1986 * (We won't loop forever, since page tables aren't shrunk.)
1987 */
1988 kern_return_t
1989 pmap_expand(
1990 pmap_t map,
1991 vm_map_offset_t vaddr,
1992 unsigned int options)
1993 {
1994 pt_entry_t *pdp;
1995 vm_page_t m;
1996 pmap_paddr_t pa;
1997 uint64_t i;
1998 ppnum_t pn;
1999 boolean_t is_ept = is_ept_pmap(map);
2000
2001
2002 /*
2003 * For the kernel, the virtual address must be in or above the basement,
2004 * which is for kexts and lies in the 512GB immediately below the kernel.
2005 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
2006 */
2007 if (__improbable(map == kernel_pmap &&
2008 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
2009 if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
2010 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
2011 }
2012 }
2013
2014
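/*
 * Populate the upper levels of the hierarchy first; pmap_expand_pdpt()
 * will in turn call pmap_expand_pml4() if the PML4 slot is empty.
 */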
2015 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
2016 assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
2017 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
2018 if (pepkr != KERN_SUCCESS)
2019 return pepkr;
2020 }
2021
2022 /*
2023 * Allocate a VM page for the pde entries.
2024 */
2025 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
2026 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
2027 return KERN_RESOURCE_SHORTAGE;
2028 VM_PAGE_WAIT();
2029 }
2030
2031 /*
2032 * put the page into the pmap's obj list so it
2033 * can be found later.
2034 */
2035 pn = VM_PAGE_GET_PHYS_PAGE(m);
2036 pa = i386_ptob(pn);
2037 i = pdeidx(map, vaddr);
2038
2039 /*
2040 * Zero the page.
2041 */
2042 pmap_zero_page(pn);
2043
2044 vm_page_lockspin_queues();
2045 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
2046 vm_page_unlock_queues();
2047
2048 OSAddAtomic(1, &inuse_ptepages_count);
2049 OSAddAtomic64(1, &alloc_ptepages_count);
2050 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
2051
2052 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2053 vm_object_lock(map->pm_obj);
2054
2055 PMAP_LOCK(map);
2056
2057 /*
2058 * See if someone else expanded us first
2059 */
2060 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2061 PMAP_UNLOCK(map);
2062 vm_object_unlock(map->pm_obj);
2063
2064 VM_PAGE_FREE(m);
2065
2066 OSAddAtomic(-1, &inuse_ptepages_count); /* TODO: replace all with inlines */
2067 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2068 return KERN_SUCCESS;
2069 }
2070
2071 #if 0 /* DEBUG */
2072 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
2073 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2074 map, map->pm_obj, vaddr, i);
2075 }
2076 #endif
2077 vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2078 vm_object_unlock(map->pm_obj);
2079
2080 /*
2081 * Set the page directory entry for this page table.
2082 */
2083 pdp = pmap_pde(map, vaddr);
2084 pmap_store_pte(pdp, pa_to_pte(pa)
2085 | PTE_READ(is_ept)
2086 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2087 | PTE_WRITE(is_ept));
2088
2089 PMAP_UNLOCK(map);
2090
2091 return KERN_SUCCESS;
2092 }
2093
2094 /* On K64 machines with more than 32GB of memory, pmap_steal_memory
2095 * will allocate past the 1GB of pre-expanded virtual kernel area. This
2096 * function allocates all the page tables using memory from the same pool
2097 * that pmap_steal_memory uses, rather than calling vm_page_grab (which
2098 * isn't available yet). */
2099 void
2100 pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr)
2101 {
2102 ppnum_t pn;
2103 pt_entry_t *pte;
2104 boolean_t is_ept = is_ept_pmap(pmap);
2105
2106 PMAP_LOCK(pmap);
2107
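/*
 * No PDPT for this address yet: allocate one and hook it into the
 * PML4 (and into the user copy of the PML4).
 */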
2108 if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2109 if (!pmap_next_page_hi(&pn))
2110 panic("pmap_pre_expand");
2111
2112 pmap_zero_page(pn);
2113
2114 pte = pmap64_pml4(pmap, vaddr);
2115
2116 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2117 | PTE_READ(is_ept)
2118 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2119 | PTE_WRITE(is_ept));
2120
2121 pte = pmap64_user_pml4(pmap, vaddr);
2122
2123 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2124 | PTE_READ(is_ept)
2125 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2126 | PTE_WRITE(is_ept));
2127
2128 }
2129
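/*
 * No page directory for this address yet: allocate one and hook it
 * into the PDPT.
 */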
2130 if (pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2131 if (!pmap_next_page_hi(&pn))
2132 panic("pmap_pre_expand");
2133
2134 pmap_zero_page(pn);
2135
2136 pte = pmap64_pdpt(pmap, vaddr);
2137
2138 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2139 | PTE_READ(is_ept)
2140 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2141 | PTE_WRITE(is_ept));
2142 }
2143
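/*
 * No page table for this address yet: allocate one and hook it into
 * the page directory.
 */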
2144 if (pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
2145 if (!pmap_next_page_hi(&pn))
2146 panic("pmap_pre_expand");
2147
2148 pmap_zero_page(pn);
2149
2150 pte = pmap64_pde(pmap, vaddr);
2151
2152 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2153 | PTE_READ(is_ept)
2154 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2155 | PTE_WRITE(is_ept));
2156 }
2157
2158 PMAP_UNLOCK(pmap);
2159 }
2160
2161 /*
2162 * pmap_sync_page_data_phys(ppnum_t pa)
2163 *
2164 * Invalidates all of the instruction cache on a physical page and
2165 * pushes any dirty data from the data cache for the same physical page.
2166 * Not required on x86, where the instruction and data caches are coherent.
2167 */
2168 void
2169 pmap_sync_page_data_phys(__unused ppnum_t pa)
2170 {
2171 return;
2172 }
2173
2174 /*
2175 * pmap_sync_page_attributes_phys(ppnum_t pa)
2176 *
2177 * Write back and invalidate all cachelines on a physical page.
2178 */
2179 void
2180 pmap_sync_page_attributes_phys(ppnum_t pa)
2181 {
2182 cache_flush_page_phys(pa);
2183 }
2184
2185
2186
2187 #ifdef CURRENTLY_UNUSED_AND_UNTESTED
2188
2189 int collect_ref;
2190 int collect_unref;
2191
2192 /*
2193 * Routine: pmap_collect
2194 * Function:
2195 * Garbage collects the physical map system for
2196 * pages which are no longer used.
2197 * Success need not be guaranteed -- that is, there
2198 * may well be pages which are not referenced, but
2199 * others may be collected.
2200 * Usage:
2201 * Called by the pageout daemon when pages are scarce.
2202 */
2203 void
2204 pmap_collect(
2205 pmap_t p)
2206 {
2207 pt_entry_t *pdp, *ptp;
2208 pt_entry_t *eptp;
2209 int wired;
2210 boolean_t is_ept;
2211
2212 if (p == PMAP_NULL)
2213 return;
2214
2215 if (p == kernel_pmap)
2216 return;
2217
2218 is_ept = is_ept_pmap(p);
2219
2220 /*
2221 * Garbage collect map.
2222 */
2223 PMAP_LOCK(p);
2224
2225 for (pdp = (pt_entry_t *)p->dirbase;
2226 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
2227 pdp++)
2228 {
2229 if (*pdp & PTE_VALID_MASK(is_ept)) {
2230 if (*pdp & PTE_REF(is_ept)) {
2231 pmap_store_pte(pdp, *pdp & ~PTE_REF(is_ept));
2232 collect_ref++;
2233 } else {
2234 collect_unref++;
2235 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
2236 eptp = ptp + NPTEPG;
2237
2238 /*
2239 * If the pte page has any wired mappings, we cannot
2240 * free it.
2241 */
2242 wired = 0;
2243 {
2244 pt_entry_t *ptep;
2245 for (ptep = ptp; ptep < eptp; ptep++) {
2246 if (iswired(*ptep)) {
2247 wired = 1;
2248 break;
2249 }
2250 }
2251 }
2252 if (!wired) {
2253 /*
2254 * Remove the virtual addresses mapped by this pte page.
2255 */
2256 pmap_remove_range(p,
2257 pdetova(pdp - (pt_entry_t *)p->dirbase),
2258 ptp,
2259 eptp);
2260
2261 /*
2262 * Invalidate the page directory pointer.
2263 */
2264 pmap_store_pte(pdp, 0x0);
2265
2266 PMAP_UNLOCK(p);
2267
2268 /*
2269 * And free the pte page itself.
2270 */
2271 {
2272 vm_page_t m;
2273
2274 vm_object_lock(p->pm_obj);
2275
2276 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]) * PAGE_SIZE);
2277 if (m == VM_PAGE_NULL)
2278 panic("pmap_collect: pte page not in object");
2279
2280 vm_object_unlock(p->pm_obj);
2281
2282 VM_PAGE_FREE(m);
2283
2284 OSAddAtomic(-1, &inuse_ptepages_count);
2285 PMAP_ZINFO_PFREE(p, PAGE_SIZE);
2286 }
2287
2288 PMAP_LOCK(p);
2289 }
2290 }
2291 }
2292 }
2293
2294 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
2295 PMAP_UNLOCK(p);
2296 return;
2297 }
2298 #endif
2299
2300
2301 void
2302 pmap_copy_page(ppnum_t src, ppnum_t dst)
2303 {
2304 bcopy_phys((addr64_t)i386_ptob(src),
2305 (addr64_t)i386_ptob(dst),
2306 PAGE_SIZE);
2307 }
2308
2309
2310 /*
2311 * Routine: pmap_pageable
2312 * Function:
2313 * Make the specified pages (by pmap, offset)
2314 * pageable (or not) as requested.
2315 *
2316 * A page which is not pageable may not take
2317 * a fault; therefore, its page table entry
2318 * must remain valid for the duration.
2319 *
2320 * This routine is merely advisory; pmap_enter
2321 * will specify that these pages are to be wired
2322 * down (or not) as appropriate.
2323 */
2324 void
2325 pmap_pageable(
2326 __unused pmap_t pmap,
2327 __unused vm_map_offset_t start_addr,
2328 __unused vm_map_offset_t end_addr,
2329 __unused boolean_t pageable)
2330 {
2331 #ifdef lint
2332 pmap++; start_addr++; end_addr++; pageable++;
2333 #endif /* lint */
2334 }
2335
2336 void
2337 invalidate_icache(__unused vm_offset_t addr,
2338 __unused unsigned cnt,
2339 __unused int phys)
2340 {
2341 return;
2342 }
2343
2344 void
2345 flush_dcache(__unused vm_offset_t addr,
2346 __unused unsigned count,
2347 __unused int phys)
2348 {
2349 return;
2350 }
2351
2352 #if CONFIG_DTRACE
2353 /*
2354 * Constrain DTrace copyin/copyout actions
2355 */
2356 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2357 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2358
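/*
 * Allow DTrace copyin/copyout only when the current user address space
 * is actually live: not on the kernel map, and with the active CR3
 * matching the thread's pmap (or the kernel pmap when CR3 is not
 * shared between user and kernel).
 */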
2359 kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
2360 {
2361 thread_t thread = current_thread();
2362 uint64_t ccr3;
2363 if (current_map() == kernel_map)
2364 return KERN_FAILURE;
2365 else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE))
2366 return KERN_FAILURE;
2367 else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3))
2368 return KERN_FAILURE;
2369 else
2370 return KERN_SUCCESS;
2371 }
2372
2373 kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
2374 {
2375 return KERN_SUCCESS;
2376 }
2377 #endif /* CONFIG_DTRACE */
2378
2379 #include <mach_vm_debug.h>
2380 #if MACH_VM_DEBUG
2381 #include <vm/vm_debug.h>
2382
2383 int
2384 pmap_list_resident_pages(
2385 __unused pmap_t pmap,
2386 __unused vm_offset_t *listp,
2387 __unused int space)
2388 {
2389 return 0;
2390 }
2391 #endif /* MACH_VM_DEBUG */
2392
2393
2394 #if CONFIG_COREDUMP
2395 /* temporary workaround */
2396 boolean_t
2397 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
2398 {
2399 #if 0
2400 pt_entry_t *ptep;
2401
2402 ptep = pmap_pte(map->pmap, va);
2403 if (0 == ptep)
2404 return FALSE;
2405 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
2406 #else
2407 return TRUE;
2408 #endif
2409 }
2410 #endif
2411
2412 boolean_t
2413 phys_page_exists(ppnum_t pn)
2414 {
2415 assert(pn != vm_page_fictitious_addr);
2416
2417 if (!pmap_initialized)
2418 return TRUE;
2419
2420 if (pn == vm_page_guard_addr)
2421 return FALSE;
2422
2423 if (!IS_MANAGED_PAGE(ppn_to_pai(pn)))
2424 return FALSE;
2425
2426 return TRUE;
2427 }
2428
2429
2430
2431 void
2432 pmap_switch(pmap_t tpmap)
2433 {
2434 spl_t s;
2435
2436 s = splhigh(); /* Make sure interruptions are disabled */
2437 set_dirbase(tpmap, current_thread(), cpu_number());
2438 splx(s);
2439 }
2440
2441
2442 /*
2443 * disable no-execute capability on
2444 * the specified pmap
2445 */
2446 void
2447 pmap_disable_NX(pmap_t pmap)
2448 {
2449 pmap->nx_enabled = 0;
2450 }
2451
2452 void
2453 pt_fake_zone_init(int zone_index)
2454 {
2455 pt_fake_zone_index = zone_index;
2456 }
2457
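/*
 * Report page-table page consumption as a pseudo ("fake") zone so that
 * it appears alongside real zones in zone statistics, even though these
 * pages are not allocated from a zone.
 */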
2458 void
2459 pt_fake_zone_info(
2460 int *count,
2461 vm_size_t *cur_size,
2462 vm_size_t *max_size,
2463 vm_size_t *elem_size,
2464 vm_size_t *alloc_size,
2465 uint64_t *sum_size,
2466 int *collectable,
2467 int *exhaustable,
2468 int *caller_acct)
2469 {
2470 *count = inuse_ptepages_count;
2471 *cur_size = PAGE_SIZE * inuse_ptepages_count;
2472 *max_size = PAGE_SIZE * (inuse_ptepages_count +
2473 vm_page_inactive_count +
2474 vm_page_active_count +
2475 vm_page_free_count);
2476 *elem_size = PAGE_SIZE;
2477 *alloc_size = PAGE_SIZE;
2478 *sum_size = alloc_ptepages_count * PAGE_SIZE;
2479
2480 *collectable = 1;
2481 *exhaustable = 0;
2482 *caller_acct = 1;
2483 }
2484
2485
2486 void
2487 pmap_flush_context_init(pmap_flush_context *pfc)
2488 {
2489 pfc->pfc_cpus = 0;
2490 pfc->pfc_invalid_global = 0;
2491 }
2492
2493 extern uint64_t TLBTimeOut;
2494 void
2495 pmap_flush(
2496 pmap_flush_context *pfc)
2497 {
2498 unsigned int my_cpu;
2499 unsigned int cpu;
2500 unsigned int cpu_bit;
2501 cpumask_t cpus_to_respond = 0;
2502 cpumask_t cpus_to_signal = 0;
2503 cpumask_t cpus_signaled = 0;
2504 boolean_t flush_self = FALSE;
2505 uint64_t deadline;
2506
2507 mp_disable_preemption();
2508
2509 my_cpu = cpu_number();
2510 cpus_to_signal = pfc->pfc_cpus;
2511
2512 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
2513 NULL, cpus_to_signal);
2514
2515 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
2516
2517 if (cpus_to_signal & cpu_bit) {
2518
2519 cpus_to_signal &= ~cpu_bit;
2520
2521 if (!cpu_datap(cpu)->cpu_running)
2522 continue;
2523
2524 if (pfc->pfc_invalid_global & cpu_bit)
2525 cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
2526 else
2527 cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
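/*
 * Make the store to the invalidation flag visible before sampling
 * whether the target CPU's CR3 is active below; see the ordering
 * comment in pmap_flush_tlbs().
 */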
2528 mfence();
2529
2530 if (cpu == my_cpu) {
2531 flush_self = TRUE;
2532 continue;
2533 }
2534 if (CPU_CR3_IS_ACTIVE(cpu)) {
2535 cpus_to_respond |= cpu_bit;
2536 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2537 }
2538 }
2539 }
2540 cpus_signaled = cpus_to_respond;
2541
2542 /*
2543 * Flush local tlb if required.
2544 * Do this now to overlap with other processors responding.
2545 */
2546 if (flush_self && cpu_datap(my_cpu)->cpu_tlb_invalid != FALSE)
2547 process_pmap_updates();
2548
2549 if (cpus_to_respond) {
2550
2551 deadline = mach_absolute_time() +
2552 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2553 boolean_t is_timeout_traced = FALSE;
2554
2555 /*
2556 * Wait for those other cpus to acknowledge
2557 */
2558 while (cpus_to_respond != 0) {
2559 long orig_acks = 0;
2560
2561 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2562 /* Consider checking local/global invalidity
2563 * as appropriate in the PCID case.
2564 */
2565 if ((cpus_to_respond & cpu_bit) != 0) {
2566 if (!cpu_datap(cpu)->cpu_running ||
2567 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
2568 !CPU_CR3_IS_ACTIVE(cpu)) {
2569 cpus_to_respond &= ~cpu_bit;
2570 }
2571 cpu_pause();
2572 }
2573 if (cpus_to_respond == 0)
2574 break;
2575 }
2576 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2577 if (machine_timeout_suspended())
2578 continue;
2579 if (TLBTimeOut == 0) {
2580 if (is_timeout_traced)
2581 continue;
2582
2583 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2584 NULL, cpus_to_signal, cpus_to_respond);
2585
2586 is_timeout_traced = TRUE;
2587 continue;
2588 }
2589 orig_acks = NMIPI_acks;
2590 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
2591 panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
2592 cpus_to_respond, orig_acks, NMIPI_acks, deadline);
2593 }
2594 }
2595 }
2596
2597 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
2598 NULL, cpus_signaled, flush_self);
2599
2600 mp_enable_preemption();
2601 }
2602
2603
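/*
 * Issue a single-context INVEPT: invalidate cached guest-physical
 * translations derived from the given EPT pointer.
 */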
2604 static void
2605 invept(void *eptp)
2606 {
2607 struct {
2608 uint64_t eptp;
2609 uint64_t reserved;
2610 } __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};
2611
2612 __asm__ volatile("invept (%%rax), %%rcx"
2613 : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
2614 : "cc", "memory");
2615 }
2616
2617 /*
2618 * Called with pmap locked, we:
2619 * - scan through per-cpu data to see which other cpus need to flush
2620 * - send an IPI to each non-idle cpu to be flushed
2621 * - wait for all to signal back that they are inactive or we see that
2622 * they are at a safe point (idle).
2623 * - flush the local tlb if active for this pmap
2624 * - return ... the caller will unlock the pmap
2625 */
2626
2627 void
2628 pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2629 {
2630 unsigned int cpu;
2631 unsigned int cpu_bit;
2632 cpumask_t cpus_to_signal = 0;
2633 unsigned int my_cpu = cpu_number();
2634 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2635 boolean_t flush_self = FALSE;
2636 uint64_t deadline;
2637 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
2638 boolean_t need_global_flush = FALSE;
2639 uint32_t event_code;
2640 vm_map_offset_t event_startv, event_endv;
2641 boolean_t is_ept = is_ept_pmap(pmap);
2642
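/*
 * On multiprocessor systems we must be entered with interrupts enabled,
 * so that TLB-shootdown IPIs from other CPUs can still be serviced while
 * we wait for acknowledgements, and with preemption disabled so that
 * cpu_number() remains stable.
 */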
2643 assert((processor_avail_count < 2) ||
2644 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2645
2646 if (pmap == kernel_pmap) {
2647 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2648 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2649 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2650 } else if (is_ept) {
2651 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2652 event_startv = startv;
2653 event_endv = endv;
2654 } else {
2655 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2656 event_startv = startv;
2657 event_endv = endv;
2658 }
2659
2660 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
2661 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
2662 event_startv, event_endv);
2663
2664 if (is_ept) {
2665 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2666 goto out;
2667 }
2668
2669 /*
2670 * Scan other cpus for matching active or task CR3.
2671 * For idle cpus (with no active map) we mark them invalid but
2672 * don't signal -- they'll check as they go busy.
2673 */
2674 if (pmap_pcid_ncpus) {
2675 if (pmap_is_shared)
2676 need_global_flush = TRUE;
2677 pmap_pcid_invalidate_all_cpus(pmap);
2678 mfence();
2679 }
2680 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2681 if (!cpu_datap(cpu)->cpu_running)
2682 continue;
2683 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2684 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2685 /* Recall that the shadowed task CR3 is pre-composed. */
2686 if ((pmap_cr3 == cpu_task_cr3) ||
2687 (pmap_cr3 == cpu_active_cr3) ||
2688 (pmap_is_shared)) {
2689
2690 if (options & PMAP_DELAY_TLB_FLUSH) {
2691 if (need_global_flush == TRUE)
2692 pfc->pfc_invalid_global |= cpu_bit;
2693 pfc->pfc_cpus |= cpu_bit;
2694
2695 continue;
2696 }
2697 if (cpu == my_cpu) {
2698 flush_self = TRUE;
2699 continue;
2700 }
2701 if (need_global_flush == TRUE)
2702 cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
2703 else
2704 cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
2705 mfence();
2706
2707 /*
2708 * We don't need to signal processors which will flush
2709 * lazily at the idle state or kernel boundary.
2710 * For example, if we're invalidating the kernel pmap,
2711 * processors currently in userspace don't need to flush
2712 * their TLBs until the next time they enter the kernel.
2713 * Alterations to the address space of a task active
2714 * on a remote processor result in a signal, to
2715 * account for copy operations. (There may be room
2716 * for optimization in such cases).
2717 * The order of the loads below with respect
2718 * to the store to the "cpu_tlb_invalid" field above
2719 * is important--hence the barrier.
2720 */
2721 if (CPU_CR3_IS_ACTIVE(cpu) &&
2722 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
2723 pmap->pm_shared ||
2724 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
2725 cpus_to_signal |= cpu_bit;
2726 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2727 }
2728 }
2729 }
2730 if ((options & PMAP_DELAY_TLB_FLUSH))
2731 goto out;
2732
2733 /*
2734 * Flush local tlb if required.
2735 * Do this now to overlap with other processors responding.
2736 */
2737 if (flush_self) {
2738 if (pmap_pcid_ncpus) {
2739 pmap_pcid_validate_cpu(pmap, my_cpu);
2740 if (pmap_is_shared)
2741 tlb_flush_global();
2742 else
2743 flush_tlb_raw();
2744 }
2745 else
2746 flush_tlb_raw();
2747 }
2748
2749 if (cpus_to_signal) {
2750 cpumask_t cpus_to_respond = cpus_to_signal;
2751
2752 deadline = mach_absolute_time() +
2753 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2754 boolean_t is_timeout_traced = FALSE;
2755
2756 /*
2757 * Wait for those other cpus to acknowledge
2758 */
2759 while (cpus_to_respond != 0) {
2760 long orig_acks = 0;
2761
2762 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2763 /* Consider checking local/global invalidity
2764 * as appropriate in the PCID case.
2765 */
2766 if ((cpus_to_respond & cpu_bit) != 0) {
2767 if (!cpu_datap(cpu)->cpu_running ||
2768 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
2769 !CPU_CR3_IS_ACTIVE(cpu)) {
2770 cpus_to_respond &= ~cpu_bit;
2771 }
2772 cpu_pause();
2773 }
2774 if (cpus_to_respond == 0)
2775 break;
2776 }
2777 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2778 if (machine_timeout_suspended())
2779 continue;
2780 if (TLBTimeOut == 0) {
2781 /* cut tracepoint but don't panic */
2782 if (is_timeout_traced)
2783 continue;
2784
2785 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2786 VM_KERNEL_UNSLIDE_OR_PERM(pmap),
2787 cpus_to_signal,
2788 cpus_to_respond);
2789
2790 is_timeout_traced = TRUE;
2791 continue;
2792 }
2793 orig_acks = NMIPI_acks;
2794
2795 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
2796 panic("TLB invalidation IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
2797 cpus_to_respond, orig_acks, NMIPI_acks, deadline);
2798 }
2799 }
2800 }
2801
2802 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
2803 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
2804 }
2805
2806 out:
2807 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
2808 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
2809 event_startv, event_endv);
2810
2811 }
2812
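/*
 * Perform whatever TLB invalidation this CPU has been flagged for.
 * Called from the TLB-flush interrupt path and when a CPU flushes
 * itself on behalf of a delayed flush.
 */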
2813 void
2814 process_pmap_updates(void)
2815 {
2816 int ccpu = cpu_number();
2817 pmap_assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
2818 if (pmap_pcid_ncpus) {
2819 pmap_pcid_validate_current();
2820 cpu_datap(ccpu)->cpu_tlb_invalid = FALSE;
2821 tlb_flush_global();
2822 } else {
2823 current_cpu_datap()->cpu_tlb_invalid = FALSE;
2824 flush_tlb_raw();
2825 }
2826
2827 mfence();
2828 }
2829
2830 void
2831 pmap_update_interrupt(void)
2832 {
2833 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
2834
2835 if (current_cpu_datap()->cpu_tlb_invalid)
2836 process_pmap_updates();
2837
2838 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
2839 }
2840
2841 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
2842 /* Scan the kernel pmap for W+X PTEs, scan the kernel VM map for W+X map entries,
2843 * and identify ranges with mismatched VM permissions and PTE permissions.
2844 */
2845 kern_return_t
2846 pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev) {
2847 vm_offset_t cv = sv;
2848 kern_return_t rv = KERN_SUCCESS;
2849 uint64_t skip4 = 0, skip2 = 0;
2850
2851 assert(!is_ept_pmap(ipmap));
2852
2853 sv &= ~PAGE_MASK_64;
2854 ev &= ~PAGE_MASK_64;
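/*
 * First pass: walk the pmap a page at a time, skipping wholly unmapped
 * PML4- and PD-sized regions, and flag any valid PTE that is both
 * writable and executable.
 */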
2855 while (cv < ev) {
2856 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
2857 (cv < 0xFFFF800000000000ULL))) {
2858 cv = 0xFFFF800000000000ULL;
2859 }
2860 /* Potential inconsistencies from not holding pmap lock
2861 * but harmless for the moment.
2862 */
2863 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
2864 if ((cv + NBPML4) > cv)
2865 cv += NBPML4;
2866 else
2867 break;
2868 skip4++;
2869 continue;
2870 }
2871 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
2872 if ((cv + NBPD) > cv)
2873 cv += NBPD;
2874 else
2875 break;
2876 skip2++;
2877 continue;
2878 }
2879
2880 pt_entry_t *ptep = pmap_pte(ipmap, cv);
2881 if (ptep && (*ptep & INTEL_PTE_VALID)) {
2882 if (*ptep & INTEL_PTE_WRITE) {
2883 if (!(*ptep & INTEL_PTE_NX)) {
2884 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap64_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
2885 rv = KERN_FAILURE;
2886 }
2887 }
2888 }
2889 cv += PAGE_SIZE;
2890 }
2891 kprintf("Completed pmap scan\n");
2892 cv = sv;
2893
2894 struct vm_region_submap_info_64 vbr;
2895 mach_msg_type_number_t vbrcount = 0;
2896 mach_vm_size_t vmsize;
2897 vm_prot_t prot;
2898 uint32_t nesting_depth = 0;
2899 kern_return_t kret;
2900
2901 while (cv < ev) {
2902
2903 for (;;) {
2904 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
2905 if ((kret = mach_vm_region_recurse(ivmmap,
2906 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
2907 (vm_region_recurse_info_t)&vbr,
2908 &vbrcount)) != KERN_SUCCESS) {
2909 break;
2910 }
2911
2912 if (vbr.is_submap) {
2913 nesting_depth++;
2914 continue;
2915 } else {
2916 break;
2917 }
2918 }
2919
2920 if (kret != KERN_SUCCESS)
2921 break;
2922
2923 prot = vbr.protection;
2924
2925 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2926 kprintf("W+X map entry at address 0x%lx\n", cv);
2927 rv = KERN_FAILURE;
2928 }
2929
2930 if (prot) {
2931 vm_offset_t pcv;
2932 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
2933 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
2934 vm_prot_t tprot;
2935
2936 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID))
2937 continue;
2938 tprot = VM_PROT_READ;
2939 if (*ptep & INTEL_PTE_WRITE)
2940 tprot |= VM_PROT_WRITE;
2941 if ((*ptep & INTEL_PTE_NX) == 0)
2942 tprot |= VM_PROT_EXECUTE;
2943 if (tprot != prot) {
2944 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
2945 rv = KERN_FAILURE;
2946 }
2947 }
2948 }
2949 cv += vmsize;
2950 }
2951 return rv;
2952 }
2953
2954 #if MACH_ASSERT
2955 extern int pmap_ledgers_panic;
2956 static void
2957 pmap_check_ledgers(
2958 pmap_t pmap)
2959 {
2960 ledger_amount_t bal;
2961 int pid;
2962 char *procname;
2963 boolean_t do_panic;
2964
2965 if (pmap->pmap_pid == 0) {
2966 /*
2967 * This pmap was not or is no longer fully associated
2968 * with a task (e.g. the old pmap after a fork()/exec() or
2969 * spawn()). Its "ledger" still points at a task that is
2970 * now using a different (and active) address space, so
2971 * we can't check that all the pmap ledgers are balanced here.
2972 *
2973 * If the "pid" is set, that means that we went through
2974 * pmap_set_process() in task_terminate_internal(), so
2975 * this task's ledger should not have been re-used and
2976 * all the pmap ledgers should be back to 0.
2977 */
2978 return;
2979 }
2980
2981 do_panic = FALSE;
2982 pid = pmap->pmap_pid;
2983 procname = pmap->pmap_procname;
2984
2985 pmap_ledgers_drift.num_pmaps_checked++;
2986
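/*
 * A non-zero balance in any of the per-pmap ledger entries below means
 * the ledger has drifted; record the drift statistics and report it,
 * panicking at the end if pmap_ledgers_panic is set.
 */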
2987 ledger_get_balance(pmap->ledger,
2988 task_ledgers.phys_footprint,
2989 &bal);
2990 if (bal != 0) {
2991 do_panic = TRUE;
2992 printf("LEDGER BALANCE proc %d (%s) "
2993 "\"phys_footprint\" = %lld\n",
2994 pid, procname, bal);
2995 if (bal > 0) {
2996 pmap_ledgers_drift.phys_footprint_over++;
2997 pmap_ledgers_drift.phys_footprint_over_total += bal;
2998 if (bal > pmap_ledgers_drift.phys_footprint_over_max) {
2999 pmap_ledgers_drift.phys_footprint_over_max = bal;
3000 }
3001 } else {
3002 pmap_ledgers_drift.phys_footprint_under++;
3003 pmap_ledgers_drift.phys_footprint_under_total += bal;
3004 if (bal < pmap_ledgers_drift.phys_footprint_under_max) {
3005 pmap_ledgers_drift.phys_footprint_under_max = bal;
3006 }
3007 }
3008 }
3009 ledger_get_balance(pmap->ledger,
3010 task_ledgers.internal,
3011 &bal);
3012 if (bal != 0) {
3013 do_panic = TRUE;
3014 printf("LEDGER BALANCE proc %d (%s) "
3015 "\"internal\" = %lld\n",
3016 pid, procname, bal);
3017 if (bal > 0) {
3018 pmap_ledgers_drift.internal_over++;
3019 pmap_ledgers_drift.internal_over_total += bal;
3020 if (bal > pmap_ledgers_drift.internal_over_max) {
3021 pmap_ledgers_drift.internal_over_max = bal;
3022 }
3023 } else {
3024 pmap_ledgers_drift.internal_under++;
3025 pmap_ledgers_drift.internal_under_total += bal;
3026 if (bal < pmap_ledgers_drift.internal_under_max) {
3027 pmap_ledgers_drift.internal_under_max = bal;
3028 }
3029 }
3030 }
3031 ledger_get_balance(pmap->ledger,
3032 task_ledgers.internal_compressed,
3033 &bal);
3034 if (bal != 0) {
3035 do_panic = TRUE;
3036 printf("LEDGER BALANCE proc %d (%s) "
3037 "\"internal_compressed\" = %lld\n",
3038 pid, procname, bal);
3039 if (bal > 0) {
3040 pmap_ledgers_drift.internal_compressed_over++;
3041 pmap_ledgers_drift.internal_compressed_over_total += bal;
3042 if (bal > pmap_ledgers_drift.internal_compressed_over_max) {
3043 pmap_ledgers_drift.internal_compressed_over_max = bal;
3044 }
3045 } else {
3046 pmap_ledgers_drift.internal_compressed_under++;
3047 pmap_ledgers_drift.internal_compressed_under_total += bal;
3048 if (bal < pmap_ledgers_drift.internal_compressed_under_max) {
3049 pmap_ledgers_drift.internal_compressed_under_max = bal;
3050 }
3051 }
3052 }
3053 ledger_get_balance(pmap->ledger,
3054 task_ledgers.iokit_mapped,
3055 &bal);
3056 if (bal != 0) {
3057 do_panic = TRUE;
3058 printf("LEDGER BALANCE proc %d (%s) "
3059 "\"iokit_mapped\" = %lld\n",
3060 pid, procname, bal);
3061 if (bal > 0) {
3062 pmap_ledgers_drift.iokit_mapped_over++;
3063 pmap_ledgers_drift.iokit_mapped_over_total += bal;
3064 if (bal > pmap_ledgers_drift.iokit_mapped_over_max) {
3065 pmap_ledgers_drift.iokit_mapped_over_max = bal;
3066 }
3067 } else {
3068 pmap_ledgers_drift.iokit_mapped_under++;
3069 pmap_ledgers_drift.iokit_mapped_under_total += bal;
3070 if (bal < pmap_ledgers_drift.iokit_mapped_under_max) {
3071 pmap_ledgers_drift.iokit_mapped_under_max = bal;
3072 }
3073 }
3074 }
3075 ledger_get_balance(pmap->ledger,
3076 task_ledgers.alternate_accounting,
3077 &bal);
3078 if (bal != 0) {
3079 do_panic = TRUE;
3080 printf("LEDGER BALANCE proc %d (%s) "
3081 "\"alternate_accounting\" = %lld\n",
3082 pid, procname, bal);
3083 if (bal > 0) {
3084 pmap_ledgers_drift.alternate_accounting_over++;
3085 pmap_ledgers_drift.alternate_accounting_over_total += bal;
3086 if (bal > pmap_ledgers_drift.alternate_accounting_over_max) {
3087 pmap_ledgers_drift.alternate_accounting_over_max = bal;
3088 }
3089 } else {
3090 pmap_ledgers_drift.alternate_accounting_under++;
3091 pmap_ledgers_drift.alternate_accounting_under_total += bal;
3092 if (bal < pmap_ledgers_drift.alternate_accounting_under_max) {
3093 pmap_ledgers_drift.alternate_accounting_under_max = bal;
3094 }
3095 }
3096 }
3097 ledger_get_balance(pmap->ledger,
3098 task_ledgers.alternate_accounting_compressed,
3099 &bal);
3100 if (bal != 0) {
3101 do_panic = TRUE;
3102 printf("LEDGER BALANCE proc %d (%s) "
3103 "\"alternate_accounting_compressed\" = %lld\n",
3104 pid, procname, bal);
3105 if (bal > 0) {
3106 pmap_ledgers_drift.alternate_accounting_compressed_over++;
3107 pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal;
3108 if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) {
3109 pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal;
3110 }
3111 } else {
3112 pmap_ledgers_drift.alternate_accounting_compressed_under++;
3113 pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal;
3114 if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) {
3115 pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal;
3116 }
3117 }
3118 }
3119 ledger_get_balance(pmap->ledger,
3120 task_ledgers.page_table,
3121 &bal);
3122 if (bal != 0) {
3123 do_panic = TRUE;
3124 printf("LEDGER BALANCE proc %d (%s) "
3125 "\"page_table\" = %lld\n",
3126 pid, procname, bal);
3127 if (bal > 0) {
3128 pmap_ledgers_drift.page_table_over++;
3129 pmap_ledgers_drift.page_table_over_total += bal;
3130 if (bal > pmap_ledgers_drift.page_table_over_max) {
3131 pmap_ledgers_drift.page_table_over_max = bal;
3132 }
3133 } else {
3134 pmap_ledgers_drift.page_table_under++;
3135 pmap_ledgers_drift.page_table_under_total += bal;
3136 if (bal < pmap_ledgers_drift.page_table_under_max) {
3137 pmap_ledgers_drift.page_table_under_max = bal;
3138 }
3139 }
3140 }
3141 ledger_get_balance(pmap->ledger,
3142 task_ledgers.purgeable_volatile,
3143 &bal);
3144 if (bal != 0) {
3145 do_panic = TRUE;
3146 printf("LEDGER BALANCE proc %d (%s) "
3147 "\"purgeable_volatile\" = %lld\n",
3148 pid, procname, bal);
3149 if (bal > 0) {
3150 pmap_ledgers_drift.purgeable_volatile_over++;
3151 pmap_ledgers_drift.purgeable_volatile_over_total += bal;
3152 if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) {
3153 pmap_ledgers_drift.purgeable_volatile_over_max = bal;
3154 }
3155 } else {
3156 pmap_ledgers_drift.purgeable_volatile_under++;
3157 pmap_ledgers_drift.purgeable_volatile_under_total += bal;
3158 if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) {
3159 pmap_ledgers_drift.purgeable_volatile_under_max = bal;
3160 }
3161 }
3162 }
3163 ledger_get_balance(pmap->ledger,
3164 task_ledgers.purgeable_nonvolatile,
3165 &bal);
3166 if (bal != 0) {
3167 do_panic = TRUE;
3168 printf("LEDGER BALANCE proc %d (%s) "
3169 "\"purgeable_nonvolatile\" = %lld\n",
3170 pid, procname, bal);
3171 if (bal > 0) {
3172 pmap_ledgers_drift.purgeable_nonvolatile_over++;
3173 pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal;
3174 if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) {
3175 pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal;
3176 }
3177 } else {
3178 pmap_ledgers_drift.purgeable_nonvolatile_under++;
3179 pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal;
3180 if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) {
3181 pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal;
3182 }
3183 }
3184 }
3185 ledger_get_balance(pmap->ledger,
3186 task_ledgers.purgeable_volatile_compressed,
3187 &bal);
3188 if (bal != 0) {
3189 do_panic = TRUE;
3190 printf("LEDGER BALANCE proc %d (%s) "
3191 "\"purgeable_volatile_compressed\" = %lld\n",
3192 pid, procname, bal);
3193 if (bal > 0) {
3194 pmap_ledgers_drift.purgeable_volatile_compressed_over++;
3195 pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal;
3196 if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) {
3197 pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal;
3198 }
3199 } else {
3200 pmap_ledgers_drift.purgeable_volatile_compressed_under++;
3201 pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal;
3202 if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) {
3203 pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal;
3204 }
3205 }
3206 }
3207 ledger_get_balance(pmap->ledger,
3208 task_ledgers.purgeable_nonvolatile_compressed,
3209 &bal);
3210 if (bal != 0) {
3211 do_panic = TRUE;
3212 printf("LEDGER BALANCE proc %d (%s) "
3213 "\"purgeable_nonvolatile_compressed\" = %lld\n",
3214 pid, procname, bal);
3215 if (bal > 0) {
3216 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++;
3217 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal;
3218 if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) {
3219 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal;
3220 }
3221 } else {
3222 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++;
3223 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal;
3224 if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) {
3225 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal;
3226 }
3227 }
3228 }
3229
3230 if (do_panic) {
3231 if (pmap_ledgers_panic) {
3232 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
3233 pmap, pid, procname);
3234 } else {
3235 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
3236 pmap, pid, procname);
3237 }
3238 }
3239
3240 if (pmap->stats.resident_count != 0 ||
3241 #if 35156815
3242 /*
3243 * "wired_count" is unfortunately a bit inaccurate, so let's
3244 * tolerate some slight deviation to limit the amount of
3245 * somewhat-spurious assertion failures.
3246 */
3247 pmap->stats.wired_count > 10 ||
3248 #else /* 35156815 */
3249 pmap->stats.wired_count != 0 ||
3250 #endif /* 35156815 */
3251 pmap->stats.device != 0 ||
3252 pmap->stats.internal != 0 ||
3253 pmap->stats.external != 0 ||
3254 pmap->stats.reusable != 0 ||
3255 pmap->stats.compressed != 0) {
3256 if (pmap_stats_assert) {
3257 panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3258 pmap, pid, procname,
3259 pmap->stats.resident_count,
3260 pmap->stats.wired_count,
3261 pmap->stats.device,
3262 pmap->stats.internal,
3263 pmap->stats.external,
3264 pmap->stats.reusable,
3265 pmap->stats.compressed);
3266 } else {
3267 printf("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld\n",
3268 pmap, pid, procname,
3269 pmap->stats.resident_count,
3270 pmap->stats.wired_count,
3271 pmap->stats.device,
3272 pmap->stats.internal,
3273 pmap->stats.external,
3274 pmap->stats.reusable,
3275 pmap->stats.compressed);
3276 }
3277 }
3278 }
3279
3280 void
3281 pmap_set_process(
3282 pmap_t pmap,
3283 int pid,
3284 char *procname)
3285 {
3286 if (pmap == NULL)
3287 return;
3288
3289 pmap->pmap_pid = pid;
3290 strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname));
3291 }
3292 #endif /* MACH_ASSERT */
3293
3294
3295 #if DEVELOPMENT || DEBUG
3296 int pmap_pagezero_mitigation = 1;
3297 #endif
3298
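/*
 * Note whether page zero may be accessible in this address space (only
 * when SMAP is not enabled and the map extends below 0x1000), and update
 * the per-cpu flag if this pmap is currently active.
 */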
3299 void pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound) {
3300 #if DEVELOPMENT || DEBUG
3301 if (pmap_pagezero_mitigation == 0) {
3302 lpmap->pagezero_accessible = FALSE;
3303 return;
3304 }
3305 #endif
3306 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3307 if (lpmap == current_pmap()) {
3308 mp_disable_preemption();
3309 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3310 mp_enable_preemption();
3311 }
3312 }
3313
3314 void pmap_verify_noncacheable(uintptr_t vaddr) {
3315 pt_entry_t *ptep = NULL;
3316 ptep = pmap_pte(kernel_pmap, vaddr);
3317 if (ptep == NULL) {
3318 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3319 }
3320 /* Non-cacheable OK */
3321 if (*ptep & (INTEL_PTE_NCACHE))
3322 return;
3323 /* Write-combined OK */
3324 if (*ptep & (INTEL_PTE_PTA))
3325 return;
3326 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3327 }