[apple/xnu.git] / osfmk / x86_64 / pmap.c
1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/queue.h>
102 #include <kern/ledger.h>
103 #include <kern/mach_param.h>
104
105 #include <kern/kalloc.h>
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_kern.h>
111 #include <mach/vm_param.h>
112 #include <mach/vm_prot.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115
116 #include <mach/machine/vm_param.h>
117 #include <machine/thread.h>
118
119 #include <kern/misc_protos.h> /* prototyping */
120 #include <i386/misc_protos.h>
121 #include <i386/i386_lowmem.h>
122 #include <x86_64/lowglobals.h>
123
124 #include <i386/cpuid.h>
125 #include <i386/cpu_data.h>
126 #include <i386/cpu_number.h>
127 #include <i386/machine_cpu.h>
128 #include <i386/seg.h>
129 #include <i386/serial_io.h>
130 #include <i386/cpu_capabilities.h>
131 #include <i386/machine_routines.h>
132 #include <i386/proc_reg.h>
133 #include <i386/tsc.h>
134 #include <i386/pmap_internal.h>
135 #include <i386/pmap_pcid.h>
136 #if CONFIG_VMX
137 #include <i386/vmx/vmx_cpu.h>
138 #endif
139
140 #include <vm/vm_protos.h>
141 #include <san/kasan.h>
142
143 #include <i386/mp.h>
144 #include <i386/mp_desc.h>
145 #include <libkern/kernel_mach_header.h>
146
147 #include <pexpert/i386/efi.h>
148 #include <libkern/section_keywords.h>
149 #if MACH_ASSERT
150 int pmap_stats_assert = 1;
151 #endif /* MACH_ASSERT */
152
153 #ifdef IWANTTODEBUG
154 #undef DEBUG
155 #define DEBUG 1
156 #define POSTCODE_DELAY 1
157 #include <i386/postcode.h>
158 #endif /* IWANTTODEBUG */
159
160 #ifdef PMAP_DEBUG
161 #define DBG(x...) kprintf("DBG: " x)
162 #else
163 #define DBG(x...)
164 #endif
165 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
166 * in the trampolines for kernel/user boundary TLB coherency.
167 */
168 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
169 boolean_t pmap_trace = FALSE;
170
171 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
172
173 #if DEVELOPMENT || DEBUG
174 int nx_enabled = 1; /* enable no-execute protection -- set during boot */
175 #else
176 const int nx_enabled = 1;
177 #endif
178
179 #if DEBUG || DEVELOPMENT
180 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
181 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
182 #else /* DEBUG || DEVELOPMENT */
183 const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
184 const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
185 #endif /* DEBUG || DEVELOPMENT */
186
187 uint64_t max_preemption_latency_tsc = 0;
188
189 pv_hashed_entry_t *pv_hash_table; /* hash lists */
190
191 uint32_t npvhashmask = 0, npvhashbuckets = 0;
192
193 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
194 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
195 decl_simple_lock_data(, pv_hashed_free_list_lock);
196 decl_simple_lock_data(, pv_hashed_kern_free_list_lock);
197 decl_simple_lock_data(, pv_hash_table_lock);
198
199 decl_simple_lock_data(, phys_backup_lock);
200
201 zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
202
203 /*
204 * First and last physical addresses that we maintain any information
205 * for. Initialized to zero so that pmap operations done before
206 * pmap_init won't touch any non-existent structures.
207 */
208 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
209
210 static struct vm_object kptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
211 static struct vm_object kpml4obj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
212 static struct vm_object kpdptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
213
214 /*
215  * Array of physical page attributes for managed pages.
216 * One byte per physical page.
217 */
218 char *pmap_phys_attributes;
219 ppnum_t last_managed_page = 0;
220
221 unsigned pmap_memory_region_count;
222 unsigned pmap_memory_region_current;
223
224 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
225
226 /*
227 * Other useful macros.
228 */
229 #define current_pmap() (vm_map_pmap(current_thread()->map))
230
231 struct pmap kernel_pmap_store;
232 SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = NULL;
233
234 struct zone *pmap_zone; /* zone of pmap structures */
235
236 struct zone *pmap_anchor_zone;
237 struct zone *pmap_uanchor_zone;
238 int pmap_debug = 0; /* flag for debugging prints */
239
240 unsigned int inuse_ptepages_count = 0;
241 long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
242 unsigned int bootstrap_wired_pages = 0;
243 int pt_fake_zone_index = -1;
244
245 extern long NMIPI_acks;
246
247 SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE;
248
249 extern char end;
250
251 static int nkpt;
252
253 #if DEVELOPMENT || DEBUG
254 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE;
255 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE;
256 SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE;
257 #else
258 const boolean_t wpkernel = TRUE;
259 #endif
260
261 extern long __stack_chk_guard[];
262
263 static uint64_t pmap_eptp_flags = 0;
264 boolean_t pmap_ept_support_ad = FALSE;
265
266 static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
267 /*
268 * Map memory at initialization. The physical addresses being
269 * mapped are not managed and are never unmapped.
270 *
271 * For now, VM is already on, we only need to map the
272 * specified memory.
273 */
274 vm_offset_t
275 pmap_map(
276 vm_offset_t virt,
277 vm_map_offset_t start_addr,
278 vm_map_offset_t end_addr,
279 vm_prot_t prot,
280 unsigned int flags)
281 {
282 kern_return_t kr;
283 int ps;
284
285 ps = PAGE_SIZE;
286 while (start_addr < end_addr) {
287 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
288 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
289
290 if (kr != KERN_SUCCESS) {
291 panic("%s: failed pmap_enter, "
292 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
293 __FUNCTION__,
294 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
295 }
296
297 virt += ps;
298 start_addr += ps;
299 }
300 return virt;
301 }
302
303 extern char *first_avail;
304 extern vm_offset_t virtual_avail, virtual_end;
305 extern pmap_paddr_t avail_start, avail_end;
306 extern vm_offset_t sHIB;
307 extern vm_offset_t eHIB;
308 extern vm_offset_t stext;
309 extern vm_offset_t etext;
310 extern vm_offset_t sdata, edata;
311 extern vm_offset_t sconst, econst;
312
313 extern void *KPTphys;
314
315 boolean_t pmap_smep_enabled = FALSE;
316 boolean_t pmap_smap_enabled = FALSE;
317
318 void
319 pmap_cpu_init(void)
320 {
321 cpu_data_t *cdp = current_cpu_datap();
322
323 set_cr4(get_cr4() | CR4_PGE);
324
325 /*
326 * Initialize the per-cpu, TLB-related fields.
327 */
328 cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
329 cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
330 cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
331 cdp->cpu_tlb_invalid = 0;
332 cdp->cpu_task_map = TASK_MAP_64BIT;
333
334 pmap_pcid_configure();
335 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
336 pmap_smep_enabled = TRUE;
337 #if DEVELOPMENT || DEBUG
338 boolean_t nsmep;
339 if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
340 pmap_smep_enabled = FALSE;
341 }
342 #endif
343 if (pmap_smep_enabled) {
344 set_cr4(get_cr4() | CR4_SMEP);
345 }
346 }
347 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
348 pmap_smap_enabled = TRUE;
349 #if DEVELOPMENT || DEBUG
350 boolean_t nsmap;
351 if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
352 pmap_smap_enabled = FALSE;
353 }
354 #endif
355 if (pmap_smap_enabled) {
356 set_cr4(get_cr4() | CR4_SMAP);
357 }
358 }
359
360 #if !MONOTONIC
361 if (cdp->cpu_fixed_pmcs_enabled) {
362 boolean_t enable = TRUE;
363 cpu_pmc_control(&enable);
364 }
365 #endif /* !MONOTONIC */
366 }
367
368 static uint32_t
369 pmap_scale_shift(void)
370 {
371 uint32_t scale = 0;
372
373 if (sane_size <= 8 * GB) {
374 scale = (uint32_t)(sane_size / (2 * GB));
375 } else if (sane_size <= 32 * GB) {
376 scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
377 } else {
378 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
379 }
380 return scale;
381 }
382
383 lck_grp_t pmap_lck_grp;
384 lck_grp_attr_t pmap_lck_grp_attr;
385 lck_attr_t pmap_lck_rw_attr;
386
387 /*
388 * Bootstrap the system enough to run with virtual memory.
389 * Map the kernel's code and data, and allocate the system page table.
390 * Called with mapping OFF. Page_size must already be set.
391 */
392
393 void
394 pmap_bootstrap(
395 __unused vm_offset_t load_start,
396 __unused boolean_t IA32e)
397 {
398 #if NCOPY_WINDOWS > 0
399 vm_offset_t va;
400 int i;
401 #endif
402 assert(IA32e);
403
404 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
405 * known to VM */
406 /*
407 * The kernel's pmap is statically allocated so we don't
408 * have to use pmap_create, which is unlikely to work
409 * correctly at this part of the boot sequence.
410 */
411
412 kernel_pmap = &kernel_pmap_store;
413 os_ref_init(&kernel_pmap->ref_count, NULL);
414 #if DEVELOPMENT || DEBUG
415 kernel_pmap->nx_enabled = TRUE;
416 #endif
417 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
418 kernel_pmap->pm_obj = (vm_object_t) NULL;
419 kernel_pmap->pm_pml4 = IdlePML4;
420 kernel_pmap->pm_upml4 = IdlePML4;
421 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
422 kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
423 kernel_pmap->pm_eptp = 0;
424
425 pmap_pcid_initialize_kernel(kernel_pmap);
426
427 current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
428
429 nkpt = NKPT;
430 OSAddAtomic(NKPT, &inuse_ptepages_count);
431 OSAddAtomic64(NKPT, &alloc_ptepages_count);
432 bootstrap_wired_pages = NKPT;
433
434 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
435 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
436
437 #if NCOPY_WINDOWS > 0
438 /*
439 * Reserve some special page table entries/VA space for temporary
440 * mapping of pages.
441 */
442 #define SYSMAP(c, p, v, n) \
443 v = (c)va; va += ((n)*INTEL_PGBYTES);
444
445 va = virtual_avail;
446
447 for (i = 0; i < PMAP_NWINDOWS; i++) {
448 #if 1
449 kprintf("trying to do SYSMAP idx %d %p\n", i,
450 current_cpu_datap());
451 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
452 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
453 kprintf("two stuff %p %p\n",
454 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
455 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
456 #endif
457 SYSMAP(caddr_t,
458 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
459 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
460 1);
461 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
462 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
463 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
464 }
465
466
467 virtual_avail = va;
468 #endif
469 if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
470 npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
471 }
472
473 npvhashbuckets = npvhashmask + 1;
474
475 if (0 != ((npvhashbuckets) & npvhashmask)) {
476 		panic("invalid hash %d, must be ((2^N)-1), "
477 		    "default is %d\n", npvhashmask, NPVHASHMASK);
478 }
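	/*
	 * Worked example (illustrative, assuming NPVHASHBUCKETS is 4096):
	 * with 16GB of physical memory pmap_scale_shift() returns
	 * 4 + (16GB - 8GB) / 4GB = 6, so the default npvhashmask is
	 * (4096 << 6) - 1 = 0x3ffff and npvhashbuckets is 0x40000,
	 * satisfying the power-of-two-minus-one check above.
	 */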
479
480 lck_grp_attr_setdefault(&pmap_lck_grp_attr);
481 lck_grp_init(&pmap_lck_grp, "pmap", &pmap_lck_grp_attr);
482
483 lck_attr_setdefault(&pmap_lck_rw_attr);
484 lck_attr_cleardebug(&pmap_lck_rw_attr);
485
486 lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
487 kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;
488
489 simple_lock_init(&pv_hashed_free_list_lock, 0);
490 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
491 simple_lock_init(&pv_hash_table_lock, 0);
492 simple_lock_init(&phys_backup_lock, 0);
493
494 pmap_cpu_init();
495
496 if (pmap_pcid_ncpus) {
497 printf("PMAP: PCID enabled\n");
498 }
499
500 if (pmap_smep_enabled) {
501 printf("PMAP: Supervisor Mode Execute Protection enabled\n");
502 }
503 if (pmap_smap_enabled) {
504 printf("PMAP: Supervisor Mode Access Protection enabled\n");
505 }
506
507 #if DEBUG
508 printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
509 printf("early_random(): 0x%qx\n", early_random());
510 #endif
511 #if DEVELOPMENT || DEBUG
512 boolean_t ptmp;
513 /* Check if the user has requested disabling stack or heap no-execute
514 * enforcement. These are "const" variables; that qualifier is cast away
515 * when altering them. The TEXT/DATA const sections are marked
516 * write protected later in the kernel startup sequence, so altering
517 * them is possible at this point, in pmap_bootstrap().
518 */
519 if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
520 boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
521 *pdknxp = TRUE;
522 }
523
524 if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
525 boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
526 *pdknhp = TRUE;
527 }
528 #endif /* DEVELOPMENT || DEBUG */
529
530 boot_args *args = (boot_args *)PE_state.bootArgs;
531 if (args->efiMode == kBootArgsEfiMode32) {
532 printf("EFI32: kernel virtual space limited to 4GB\n");
533 virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
534 }
535 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
536 (long)KERNEL_BASE, (long)virtual_end);
537 kprintf("Available physical space from 0x%llx to 0x%llx\n",
538 avail_start, avail_end);
539
540 /*
541 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
542 * in the DEBUG kernel) to force the kernel to switch to its own map
543 * (and cr3) when control is in kernelspace. The kernel's map does not
544 * include (i.e. share) userspace so wild references will cause
545 * a panic. Only copyin and copyout are exempt from this.
546 */
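	/*
	 * Illustrative only (not part of this file's logic): on a typical
	 * system the boot-arg can be supplied from user space with something
	 * like
	 *     sudo nvram boot-args="-no_shared_cr3"
	 * or passed on the kernel command line by the boot loader.
	 */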
547 (void) PE_parse_boot_argn("-no_shared_cr3",
548 &no_shared_cr3, sizeof(no_shared_cr3));
549 if (no_shared_cr3) {
550 kprintf("Kernel not sharing user map\n");
551 }
552
553 #ifdef PMAP_TRACES
554 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
555 kprintf("Kernel traces for pmap operations enabled\n");
556 }
557 #endif /* PMAP_TRACES */
558
559 #if MACH_ASSERT
560 PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
561 PE_parse_boot_argn("pmap_stats_assert",
562 &pmap_stats_assert,
563 sizeof(pmap_stats_assert));
564 #endif /* MACH_ASSERT */
565 }
566
567 void
568 pmap_virtual_space(
569 vm_offset_t *startp,
570 vm_offset_t *endp)
571 {
572 *startp = virtual_avail;
573 *endp = virtual_end;
574 }
575
576
577
578
579 #if HIBERNATION
580
581 #include <IOKit/IOHibernatePrivate.h>
582
583 int32_t pmap_npages;
584 int32_t pmap_teardown_last_valid_compact_indx = -1;
585
586
587 void hibernate_rebuild_pmap_structs(void);
588 void hibernate_teardown_pmap_structs(addr64_t *, addr64_t *);
589 void pmap_pack_index(uint32_t);
590 int32_t pmap_unpack_index(pv_rooted_entry_t);
591
592
593 int32_t
594 pmap_unpack_index(pv_rooted_entry_t pv_h)
595 {
596 int32_t indx = 0;
597
598 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
599 indx = indx << 16;
600 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
601
602 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
603 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
604
605 return indx;
606 }
607
608
609 void
610 pmap_pack_index(uint32_t indx)
611 {
612 pv_rooted_entry_t pv_h;
613
614 pv_h = &pv_head_table[indx];
615
616 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
617 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
618
619 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
620 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
621 }
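/*
 * Illustrative sketch of the packing scheme above: the 32-bit array index
 * is stashed in the top 16 bits of the two queue links (always 0xffff for
 * canonical kernel pointers, so they can be borrowed during teardown).
 * For indx = 0x00012345:
 *
 *     qlink.next bits 63..48 <- 0x0001   (indx >> 16)
 *     qlink.prev bits 63..48 <- 0x2345   (indx & 0xffff)
 *
 * pmap_unpack_index() reassembles the index from those bits and then sets
 * them back to 0xffff, restoring the canonical pointer form of the links.
 */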
622
623
624 void
625 hibernate_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
626 {
627 int32_t i;
628 int32_t compact_target_indx;
629
630 compact_target_indx = 0;
631
632 for (i = 0; i < pmap_npages; i++) {
633 if (pv_head_table[i].pmap == PMAP_NULL) {
634 if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
635 compact_target_indx = i;
636 }
637 } else {
638 pmap_pack_index((uint32_t)i);
639
640 if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
641 /*
642 * we've got a hole to fill, so
643 				 * move this pv_rooted_entry_t to its new home
644 */
645 pv_head_table[compact_target_indx] = pv_head_table[i];
646 pv_head_table[i].pmap = PMAP_NULL;
647
648 pmap_teardown_last_valid_compact_indx = compact_target_indx;
649 compact_target_indx++;
650 } else {
651 pmap_teardown_last_valid_compact_indx = i;
652 }
653 }
654 }
655 *unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
656 *unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];
657
658 HIBLOG("hibernate_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
659 }
660
661
662 void
663 hibernate_rebuild_pmap_structs(void)
664 {
665 int32_t cindx, eindx, rindx = 0;
666 pv_rooted_entry_t pv_h;
667
668 eindx = (int32_t)pmap_npages;
669
670 for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
671 pv_h = &pv_head_table[cindx];
672
673 rindx = pmap_unpack_index(pv_h);
674 assert(rindx < pmap_npages);
675
676 if (rindx != cindx) {
677 /*
678 * this pv_rooted_entry_t was moved by hibernate_teardown_pmap_structs,
679 * so move it back to its real location
680 */
681 pv_head_table[rindx] = pv_head_table[cindx];
682 }
683 if (rindx + 1 != eindx) {
684 /*
685 			 * the 'hole' between this pv_rooted_entry_t and the previous
686 			 * pv_rooted_entry_t we moved needs to be initialized as
687 			 * a range of zeroed pv_rooted_entry_t's
688 */
689 bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
690 }
691 eindx = rindx;
692 }
693 if (rindx) {
694 bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
695 }
696
697 HIBLOG("hibernate_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
698 }
699
700 #endif
701
702 /*
703 * Create pv entries for kernel pages mapped by early startup code.
704 * These have to exist so we can ml_static_mfree() them later.
705 */
706 static void
707 pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
708 {
709 ppnum_t ppn;
710 pv_rooted_entry_t pv_h;
711 uint32_t pgsz;
712
713 start_va = round_page(start_va);
714 end_va = trunc_page(end_va);
715 while (start_va < end_va) {
716 pgsz = PAGE_SIZE;
717 ppn = pmap_find_phys(kernel_pmap, start_va);
718 if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
719 pv_h = pai_to_pvh(ppn);
720 assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */
721 assert(pv_h->pmap == 0);
722 pv_h->va_and_flags = start_va;
723 pv_h->pmap = kernel_pmap;
724 queue_init(&pv_h->qlink);
725 if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
726 pgsz = I386_LPGBYTES;
727 }
728 }
729 start_va += pgsz;
730 }
731 }
732
733 /*
734 * Initialize the pmap module.
735 * Called by vm_init, to initialize any structures that the pmap
736 * system needs to map virtual memory.
737 */
738 void
739 pmap_init(void)
740 {
741 long npages;
742 vm_offset_t addr;
743 vm_size_t s, vsize;
744 vm_map_offset_t vaddr;
745 ppnum_t ppn;
746
747
748 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
749 _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);
750
751 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
752 _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);
753
754 kernel_pmap->pm_obj = &kptobj_object_store;
755 _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);
756
757 /*
758 * Allocate memory for the pv_head_table and its lock bits,
759 * the modify bit array, and the pte_page table.
760 */
761
762 /*
763 * zero bias all these arrays now instead of off avail_start
764 * so we cover all memory
765 */
766
767 npages = i386_btop(avail_end);
768 #if HIBERNATION
769 pmap_npages = (uint32_t)npages;
770 #endif
771 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
772 + (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets))
773 + pv_lock_table_size(npages)
774 + pv_hash_lock_table_size((npvhashbuckets))
775 + npages);
776 s = round_page(s);
777 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
778 KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PMAP)
779 != KERN_SUCCESS) {
780 panic("pmap_init");
781 }
782
783 memset((char *)addr, 0, s);
784
785 vaddr = addr;
786 vsize = s;
787
788 #if PV_DEBUG
789 if (0 == npvhashmask) {
790 panic("npvhashmask not initialized");
791 }
792 #endif
793
794 /*
795 * Allocate the structures first to preserve word-alignment.
796 */
797 pv_head_table = (pv_rooted_entry_t) addr;
798 addr = (vm_offset_t) (pv_head_table + npages);
799
800 pv_hash_table = (pv_hashed_entry_t *)addr;
801 addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));
802
803 pv_lock_table = (char *) addr;
804 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
805
806 pv_hash_lock_table = (char *) addr;
807 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));
808
809 pmap_phys_attributes = (char *) addr;
810
811 ppnum_t last_pn = i386_btop(avail_end);
812 unsigned int i;
813 pmap_memory_region_t *pmptr = pmap_memory_regions;
814 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
815 if (pmptr->type != kEfiConventionalMemory) {
816 continue;
817 }
818 ppnum_t pn;
819 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
820 if (pn < last_pn) {
821 pmap_phys_attributes[pn] |= PHYS_MANAGED;
822
823 if (pn > last_managed_page) {
824 last_managed_page = pn;
825 }
826
827 if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
828 (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
829 pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
830 }
831 }
832 }
833 }
834 while (vsize) {
835 ppn = pmap_find_phys(kernel_pmap, vaddr);
836
837 pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;
838
839 vaddr += PAGE_SIZE;
840 vsize -= PAGE_SIZE;
841 }
842 /*
843 * Create the zone of physical maps,
844 * and of the physical-to-virtual entries.
845 */
846 s = (vm_size_t) sizeof(struct pmap);
847 pmap_zone = zinit(s, 400 * s, 4096, "pmap"); /* XXX */
848 zone_change(pmap_zone, Z_NOENCRYPT, TRUE);
849
850 pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors");
851 zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE);
852
853 /* The anchor is required to be page aligned. Zone debugging adds
854 * padding which may violate that requirement. Tell the zone
855 * subsystem that alignment is required.
856 */
857
858 zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
859 	/* TODO: possible general optimisation: pre-allocate commonly created
860 	 * level 3/2 pagetables via zones
861 */
862 pmap_uanchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable user anchors");
863 zone_change(pmap_uanchor_zone, Z_NOENCRYPT, TRUE);
864
865 /* The anchor is required to be page aligned. Zone debugging adds
866 * padding which may violate that requirement. Tell the zone
867 * subsystem that alignment is required.
868 */
869
870 zone_change(pmap_uanchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
871
872 s = (vm_size_t) sizeof(struct pv_hashed_entry);
873 pv_hashed_list_zone = zinit(s, 10000 * s /* Expandable zone */,
874 4096 * 3 /* LCM x86_64*/, "pv_list");
875 zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);
876 zone_change(pv_hashed_list_zone, Z_GZALLOC_EXEMPT, TRUE);
877
878 /*
879 * Create pv entries for kernel pages that might get pmap_remove()ed.
880 *
881 * - very low pages that were identity mapped.
882 * - vm_pages[] entries that might be unused and reclaimed.
883 */
884 assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
885 pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
886 pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);
887
888 pmap_initialized = TRUE;
889
890 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
891
892 /*
893 * Ensure the kernel's PML4 entry exists for the basement
894 * before this is shared with any user.
895 */
896 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
897
898 #if CONFIG_VMX
899 pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
900 pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
901 #endif /* CONFIG_VMX */
902 }
903
904 static
905 void
906 pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
907 {
908 uint64_t ev = sv + nxrosz, cv = sv;
909 pd_entry_t *pdep;
910 pt_entry_t *ptep = NULL;
911
912 assert(!is_ept_pmap(npmap));
913
914 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
915
916 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
917 uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
918
919 if (*pdep & INTEL_PTE_PS) {
920 if (NX) {
921 *pdep |= INTEL_PTE_NX;
922 }
923 if (ro) {
924 *pdep &= ~INTEL_PTE_WRITE;
925 }
926 cv += NBPD;
927 cv &= ~((uint64_t) PDEMASK);
928 pdep = pmap_pde(npmap, cv);
929 continue;
930 }
931
932 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
933 if (NX) {
934 *ptep |= INTEL_PTE_NX;
935 }
936 if (ro) {
937 *ptep &= ~INTEL_PTE_WRITE;
938 }
939 cv += NBPT;
940 ptep = pmap_pte(npmap, cv);
941 }
942 }
943 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
944 }
945
946 /*
947 * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
948 * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
949 * so we can free it using its address in that array.
950 */
951 static void
952 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
953 {
954 ppnum_t KPTphys_ppn;
955 vm_offset_t offset;
956
957 KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
958 assert(ppn >= KPTphys_ppn);
959 assert(ppn + cnt <= KPTphys_ppn + NKPT);
960 offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
961 ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
962 }
963
964 /*
965 * Called once VM is fully initialized so that we can release unused
966 * sections of low memory to the general pool.
967 * Also complete the set-up of identity-mapped sections of the kernel:
968 * 1) write-protect kernel text
969 * 2) map kernel text using large pages if possible
970 * 3) read and write-protect page zero (for K32)
971 * 4) map the global page at the appropriate virtual address.
972 *
973 * Use of large pages
974 * ------------------
975 * To effectively map and write-protect all kernel text pages, the text
976 * must be 2M-aligned at the base, and the data section above must also be
977 * 2M-aligned. That is, there's padding below and above. This is achieved
978 * through linker directives. Large pages are used only if this alignment
979  * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
980 * memory layout is:
981 *
982 * : :
983 * | __DATA |
984 * sdata: ================== 2Meg
985 * | |
986 * | zero-padding |
987 * | |
988 * etext: ------------------
989 * | |
990 * : :
991 * | |
992 * | __TEXT |
993 * | |
994 * : :
995 * | |
996 * stext: ================== 2Meg
997 * | |
998 * | zero-padding |
999 * | |
1000 * eHIB: ------------------
1001 * | __HIB |
1002 * : :
1003 *
1004 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1005 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1006 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1007 * The now unused level-1 PTE pages are also freed.
1008 */
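/*
 * Example of the alignment requirement described above (addresses are
 * illustrative, assuming I386_LPGMASK is the 2MB offset mask 0x1fffff):
 * a text base of 0xffffff8000400000 has its low 21 bits clear, so
 * (stext & I386_LPGMASK) == 0 and the large-page path can be taken; a base
 * of 0xffffff8000401000 would force the 4K mapping path instead.
 */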
1009 extern ppnum_t vm_kernel_base_page;
1010 static uint32_t dataptes = 0;
1011
1012 void
1013 pmap_lowmem_finalize(void)
1014 {
1015 spl_t spl;
1016 int i;
1017
1018 /*
1019 * Update wired memory statistics for early boot pages
1020 */
1021 PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
1022
1023 /*
1024 * Free pages in pmap regions below the base:
1025 * rdar://6332712
1026 * We can't free all the pages to VM that EFI reports available.
1027 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
1028 * There's also a size miscalculation here: pend is one page less
1029 * than it should be but this is not fixed to be backwards
1030 * compatible.
1031 * This is important for KASLR because up to 256*2MB = 512MB of space
1032 	 * has to be released to VM.
1033 */
1034 for (i = 0;
1035 pmap_memory_regions[i].end < vm_kernel_base_page;
1036 i++) {
1037 vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
1038 vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end + 1);
1039
1040 DBG("pmap region %d [%p..[%p\n",
1041 i, (void *) pbase, (void *) pend);
1042
1043 if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
1044 continue;
1045 }
1046 /*
1047 * rdar://6332712
1048 * Adjust limits not to free pages in range 0xc0000-0xff000.
1049 */
1050 if (pbase >= 0xc0000 && pend <= 0x100000) {
1051 continue;
1052 }
1053 if (pbase < 0xc0000 && pend > 0x100000) {
1054 			/* reserved range 0xc0000-0x100000 lies entirely within this region: free the part below it */
1055 DBG("- ml_static_mfree(%p,%p)\n",
1056 (void *) ml_static_ptovirt(pbase),
1057 (void *) (0xc0000 - pbase));
1058 ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
1059 pbase = 0x100000;
1060 }
1061 if (pbase < 0xc0000) {
1062 pend = MIN(pend, 0xc0000);
1063 }
1064 if (pend > 0x100000) {
1065 pbase = MAX(pbase, 0x100000);
1066 }
1067 DBG("- ml_static_mfree(%p,%p)\n",
1068 (void *) ml_static_ptovirt(pbase),
1069 (void *) (pend - pbase));
1070 ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
1071 }
1072
1073 /* A final pass to get rid of all initial identity mappings to
1074 * low pages.
1075 */
1076 DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
1077
1078 /*
1079 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
1080 * Non-boot-cpu GDT aliases will be remapped later as needed.
1081 */
1082 pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
1083
1084 /*
1085 * Release any memory for early boot 4K page table pages that got replaced
1086 * with large page mappings for vm_pages[]. We know this memory is part of
1087 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
1088 * it using that address.
1089 */
1090 pmap_free_early_PT(released_PT_ppn, released_PT_cnt);
1091
1092 /*
1093 * If text and data are both 2MB-aligned,
1094 * we can map text with large-pages,
1095 * unless the -kernel_text_ps_4K boot-arg overrides.
1096 */
1097 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
1098 kprintf("Kernel text is 2MB aligned");
1099 kernel_text_ps_4K = FALSE;
1100 if (PE_parse_boot_argn("-kernel_text_ps_4K",
1101 &kernel_text_ps_4K,
1102 sizeof(kernel_text_ps_4K))) {
1103 kprintf(" but will be mapped with 4K pages\n");
1104 } else {
1105 kprintf(" and will be mapped with 2M pages\n");
1106 }
1107 }
1108 #if DEVELOPMENT || DEBUG
1109 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
1110 #endif
1111 if (wpkernel) {
1112 kprintf("Kernel text %p-%p to be write-protected\n",
1113 (void *) stext, (void *) etext);
1114 }
1115
1116 spl = splhigh();
1117
1118 /*
1119 * Scan over text if mappings are to be changed:
1120 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
1121 	 * - Change to large-pages if possible and not overridden.
1122 */
1123 if (kernel_text_ps_4K && wpkernel) {
1124 vm_offset_t myva;
1125 for (myva = stext; myva < etext; myva += PAGE_SIZE) {
1126 pt_entry_t *ptep;
1127
1128 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1129 if (ptep) {
1130 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE);
1131 }
1132 }
1133 }
1134
1135 if (!kernel_text_ps_4K) {
1136 vm_offset_t myva;
1137
1138 /*
1139 * Release zero-filled page padding used for 2M-alignment.
1140 */
1141 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1142 (void *) eHIB, (void *) (stext - eHIB));
1143 ml_static_mfree(eHIB, stext - eHIB);
1144 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1145 (void *) etext, (void *) (sdata - etext));
1146 ml_static_mfree(etext, sdata - etext);
1147
1148 /*
1149 * Coalesce text pages into large pages.
1150 */
1151 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
1152 pt_entry_t *ptep;
1153 vm_offset_t pte_phys;
1154 pt_entry_t *pdep;
1155 pt_entry_t pde;
1156 ppnum_t KPT_ppn;
1157
1158 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
1159 KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
1160 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1161 DBG("myva: %p pdep: %p ptep: %p\n",
1162 (void *) myva, (void *) pdep, (void *) ptep);
1163 if ((*ptep & INTEL_PTE_VALID) == 0) {
1164 continue;
1165 }
1166 pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
1167 pde = *pdep & PTMASK; /* page attributes from pde */
1168 pde |= INTEL_PTE_PS; /* make it a 2M entry */
1169 pde |= pte_phys; /* take page frame from pte */
1170
1171 if (wpkernel) {
1172 pde &= ~INTEL_PTE_WRITE;
1173 }
1174 DBG("pmap_store_pte(%p,0x%llx)\n",
1175 (void *)pdep, pde);
1176 pmap_store_pte(pdep, pde);
1177
1178 /*
1179 * Free the now-unused level-1 pte.
1180 */
1181 pmap_free_early_PT(KPT_ppn, 1);
1182 }
1183
1184 /* Change variable read by sysctl machdep.pmap */
1185 pmap_kernel_text_ps = I386_LPGBYTES;
1186 }
1187
1188 vm_offset_t dva;
1189
1190 for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
1191 assert(((sdata | edata) & PAGE_MASK) == 0);
1192 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1193
1194 dpte = *dptep;
1195 assert((dpte & INTEL_PTE_VALID));
1196 dpte |= INTEL_PTE_NX;
1197 pmap_store_pte(dptep, dpte);
1198 dataptes++;
1199 }
1200 assert(dataptes > 0);
1201
1202 kernel_segment_command_t * seg;
1203 kernel_section_t * sec;
1204
1205 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
1206 if (!strcmp(seg->segname, "__TEXT") ||
1207 !strcmp(seg->segname, "__DATA")) {
1208 continue;
1209 }
1210 //XXX
1211 if (!strcmp(seg->segname, "__KLD")) {
1212 continue;
1213 }
1214 if (!strcmp(seg->segname, "__HIB")) {
1215 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
1216 if (sec->addr & PAGE_MASK) {
1217 panic("__HIB segment's sections misaligned");
1218 }
1219 if (!strcmp(sec->sectname, "__text")) {
1220 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
1221 } else {
1222 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
1223 }
1224 }
1225 } else {
1226 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
1227 }
1228 }
1229
1230 /*
1231 * If we're debugging, map the low global vector page at the fixed
1232 * virtual address. Otherwise, remove the mapping for this.
1233 */
1234 if (debug_boot_arg) {
1235 pt_entry_t *pte = NULL;
1236 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
1237 panic("lowmem pte");
1238 }
1239 /* make sure it is defined on page boundary */
1240 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
1241 pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)
1242 | INTEL_PTE_REF
1243 | INTEL_PTE_MOD
1244 | INTEL_PTE_WIRED
1245 | INTEL_PTE_VALID
1246 | INTEL_PTE_WRITE
1247 | INTEL_PTE_NX);
1248 } else {
1249 pmap_remove(kernel_pmap,
1250 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
1251 }
1252 pmap_tlbi_range(0, ~0ULL, true, 0);
1253 splx(spl);
1254 }
1255
1256 /*
1257 * Mark the const data segment as read-only, non-executable.
1258 */
1259 void
1260 x86_64_protect_data_const()
1261 {
1262 boolean_t doconstro = TRUE;
1263 #if DEVELOPMENT || DEBUG
1264 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1265 #endif
1266 if (doconstro) {
1267 if (sconst & PAGE_MASK) {
1268 panic("CONST segment misaligned 0x%lx 0x%lx\n",
1269 sconst, econst);
1270 }
1271 kprintf("Marking const DATA read-only\n");
1272 pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1273 }
1274 }
1275 /*
1276  * this function is only used for debugging from the vm layer
1277 */
1278 boolean_t
1279 pmap_verify_free(
1280 ppnum_t pn)
1281 {
1282 pv_rooted_entry_t pv_h;
1283 int pai;
1284 boolean_t result;
1285
1286 assert(pn != vm_page_fictitious_addr);
1287
1288 if (!pmap_initialized) {
1289 return TRUE;
1290 }
1291
1292 if (pn == vm_page_guard_addr) {
1293 return TRUE;
1294 }
1295
1296 pai = ppn_to_pai(pn);
1297 if (!IS_MANAGED_PAGE(pai)) {
1298 return FALSE;
1299 }
1300 pv_h = pai_to_pvh(pn);
1301 result = (pv_h->pmap == PMAP_NULL);
1302 return result;
1303 }
1304
1305 #if MACH_ASSERT
1306 void
1307 pmap_assert_free(ppnum_t pn)
1308 {
1309 int pai;
1310 pv_rooted_entry_t pv_h = NULL;
1311 pmap_t pmap = NULL;
1312 vm_offset_t va = 0;
1313 static char buffer[32];
1314 static char *pr_name = "not managed pn";
1315 uint_t attr;
1316 pt_entry_t *ptep;
1317 pt_entry_t pte = -1ull;
1318
1319 if (pmap_verify_free(pn)) {
1320 return;
1321 }
1322
1323 if (pn > last_managed_page) {
1324 attr = 0xff;
1325 goto done;
1326 }
1327
1328 pai = ppn_to_pai(pn);
1329 attr = pmap_phys_attributes[pai];
1330 pv_h = pai_to_pvh(pai);
1331 va = pv_h->va_and_flags;
1332 pmap = pv_h->pmap;
1333 if (pmap == kernel_pmap) {
1334 pr_name = "kernel";
1335 } else if (pmap == NULL) {
1336 pr_name = "pmap NULL";
1337 } else if (pmap->pmap_procname[0] != 0) {
1338 pr_name = &pmap->pmap_procname[0];
1339 } else {
1340 snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
1341 pr_name = buffer;
1342 }
1343
1344 if (pmap != NULL) {
1345 ptep = pmap_pte(pmap, va);
1346 if (ptep != NULL) {
1347 pte = (uintptr_t)*ptep;
1348 }
1349 }
1350
1351 done:
1352 panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
1353 (ulong_t)pn, attr, pr_name, va, pte);
1354 }
1355 #endif /* MACH_ASSERT */
1356
1357 boolean_t
1358 pmap_is_empty(
1359 pmap_t pmap,
1360 vm_map_offset_t va_start,
1361 vm_map_offset_t va_end)
1362 {
1363 vm_map_offset_t offset;
1364 ppnum_t phys_page;
1365
1366 if (pmap == PMAP_NULL) {
1367 return TRUE;
1368 }
1369
1370 /*
1371 * Check the resident page count
1372 * - if it's zero, the pmap is completely empty.
1373 * This short-circuit test prevents a virtual address scan which is
1374 * painfully slow for 64-bit spaces.
1375 	 * This assumes the count is correct; the debug kernel ought to
1376 	 * verify it, perhaps with a page table walk.
1377 */
1378 if (pmap->stats.resident_count == 0) {
1379 return TRUE;
1380 }
1381
1382 for (offset = va_start;
1383 offset < va_end;
1384 offset += PAGE_SIZE_64) {
1385 phys_page = pmap_find_phys(pmap, offset);
1386 if (phys_page) {
1387 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1388 "page %d at 0x%llx\n",
1389 pmap, va_start, va_end, phys_page, offset);
1390 return FALSE;
1391 }
1392 }
1393
1394 return TRUE;
1395 }
1396
1397 void
1398 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1399 {
1400 pmap_t p;
1401
1402 if ((ept_pmap == NULL) || (eptp == NULL)) {
1403 return;
1404 }
1405
1406 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1407 if (p == PMAP_NULL) {
1408 *ept_pmap = NULL;
1409 *eptp = NULL;
1410 return;
1411 }
1412
1413 assert(is_ept_pmap(p));
1414
1415 *ept_pmap = (void*)p;
1416 *eptp = (void*)(p->pm_eptp);
1417 return;
1418 }
1419
1420 /*
1421 * pmap_create() is used by some special, legacy 3rd party kexts.
1422 * In our kernel code, always use pmap_create_options().
1423 */
1424 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1425
1426 __attribute__((used))
1427 pmap_t
1428 pmap_create(
1429 ledger_t ledger,
1430 vm_map_size_t sz,
1431 boolean_t is_64bit)
1432 {
1433 return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1434 }
1435
1436 /*
1437 * Create and return a physical map.
1438 *
1439 * If the size specified for the map
1440 * is zero, the map is an actual physical
1441 * map, and may be referenced by the
1442 * hardware.
1443 *
1444 * If the size specified is non-zero,
1445 * the map will be used in software only, and
1446 * is bounded by that size.
1447 */
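/*
 * Typical call (a sketch; names are illustrative): a 64-bit address space
 * is obtained with
 *     pmap_t p = pmap_create_options(ledger, 0, PMAP_CREATE_64BIT);
 * Passing a non-zero size, or any flag outside PMAP_CREATE_KNOWN_FLAGS,
 * causes PMAP_NULL to be returned instead.
 */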
1448
1449 pmap_t
1450 pmap_create_options(
1451 ledger_t ledger,
1452 vm_map_size_t sz,
1453 unsigned int flags)
1454 {
1455 pmap_t p;
1456 vm_size_t size;
1457 pml4_entry_t *pml4;
1458 pml4_entry_t *kpml4;
1459 int i;
1460
1461 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);
1462
1463 size = (vm_size_t) sz;
1464
1465 /*
1466 * A software use-only map doesn't even need a map.
1467 */
1468
1469 if (size != 0) {
1470 return PMAP_NULL;
1471 }
1472
1473 /*
1474 * Return error when unrecognized flags are passed.
1475 */
1476 if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
1477 return PMAP_NULL;
1478 }
1479
1480 p = (pmap_t) zalloc(pmap_zone);
1481 if (PMAP_NULL == p) {
1482 panic("pmap_create zalloc");
1483 }
1484
1485 /* Zero all fields */
1486 bzero(p, sizeof(*p));
1487
1488 lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
1489 p->pmap_rwl.lck_rw_can_sleep = FALSE;
1490
1491 bzero(&p->stats, sizeof(p->stats));
1492 os_ref_init(&p->ref_count, NULL);
1493 #if DEVELOPMENT || DEBUG
1494 p->nx_enabled = 1;
1495 #endif
1496 p->pm_shared = FALSE;
1497 ledger_reference(ledger);
1498 p->ledger = ledger;
1499
1500 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1501
1502 p->pagezero_accessible = FALSE;
1503
1504 if (pmap_pcid_ncpus) {
1505 pmap_pcid_initialize(p);
1506 }
1507
1508 p->pm_pml4 = zalloc(pmap_anchor_zone);
1509 p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT
1510
1511 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1512 pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);
1513
1514 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1515 memset((char *)p->pm_upml4, 0, PAGE_SIZE);
1516
1517 if (flags & PMAP_CREATE_EPT) {
1518 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1519 p->pm_cr3 = 0;
1520 } else {
1521 p->pm_eptp = 0;
1522 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1523 p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
1524 }
1525
1526 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1527
1528 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
1529 if (NULL == p->pm_obj_pml4) {
1530 panic("pmap_create pdpt obj");
1531 }
1532
1533 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
1534 if (NULL == p->pm_obj_pdpt) {
1535 panic("pmap_create pdpt obj");
1536 }
1537
1538 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
1539 if (NULL == p->pm_obj) {
1540 panic("pmap_create pte obj");
1541 }
1542
1543 if (!(flags & PMAP_CREATE_EPT)) {
1544 /* All host pmaps share the kernel's pml4 */
1545 pml4 = pmap64_pml4(p, 0ULL);
1546 kpml4 = kernel_pmap->pm_pml4;
1547 for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
1548 pml4[i] = kpml4[i];
1549 }
1550 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1551 for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
1552 pml4[i] = kpml4[i];
1553 }
1554 pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1555 #if KASAN
1556 for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
1557 pml4[i] = kpml4[i];
1558 }
1559 #endif
1560 pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
1561 pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1562 }
1563
1564 #if MACH_ASSERT
1565 p->pmap_stats_assert = TRUE;
1566 p->pmap_pid = 0;
1567 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
1568 #endif /* MACH_ASSERT */
1569
1570 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1571 VM_KERNEL_ADDRHIDE(p));
1572
1573 return p;
1574 }
1575
1576 /*
1577 * We maintain stats and ledgers so that a task's physical footprint is:
1578 * phys_footprint = ((internal - alternate_accounting)
1579 * + (internal_compressed - alternate_accounting_compressed)
1580 * + iokit_mapped
1581 * + purgeable_nonvolatile
1582 * + purgeable_nonvolatile_compressed
1583 * + page_table)
1584 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1585 */
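/*
 * Worked example (numbers are illustrative only): with internal = 100 pages,
 * alternate_accounting = 10, internal_compressed = 20,
 * alternate_accounting_compressed = 0, iokit_mapped = 5, both purgeable
 * counts = 0 and page_table = 3, the formula above gives
 * phys_footprint = (100 - 10) + (20 - 0) + 5 + 0 + 0 + 3 = 118 pages.
 */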
1586
1587 #if MACH_ASSERT
1588 static void pmap_check_ledgers(pmap_t pmap);
1589 #else /* MACH_ASSERT */
1590 static inline void
1591 pmap_check_ledgers(__unused pmap_t pmap)
1592 {
1593 }
1594 #endif /* MACH_ASSERT */
1595
1596 /*
1597 * Retire the given physical map from service.
1598 * Should only be called if the map contains
1599 * no valid mappings.
1600 */
1601 extern int vm_wired_objects_page_count;
1602
1603 void
1604 pmap_destroy(pmap_t p)
1605 {
1606 os_ref_count_t c;
1607
1608 if (p == PMAP_NULL) {
1609 return;
1610 }
1611
1612 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1613 	    VM_KERNEL_ADDRHIDE(p));
1614
1615 PMAP_LOCK_EXCLUSIVE(p);
1616
1617 c = os_ref_release_locked(&p->ref_count);
1618
1619 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1620
1621 if (c == 0) {
1622 /*
1623 * If some cpu is not using the physical pmap pointer that it
1624 * is supposed to be (see set_dirbase), we might be using the
1625 * pmap that is being destroyed! Make sure we are
1626 * physically on the right pmap:
1627 */
1628 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1629 if (pmap_pcid_ncpus) {
1630 pmap_destroy_pcid_sync(p);
1631 }
1632 }
1633
1634 PMAP_UNLOCK_EXCLUSIVE(p);
1635
1636 if (c != 0) {
1637 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1638 pmap_assert(p == kernel_pmap);
1639 return; /* still in use */
1640 }
1641
1642 /*
1643 * Free the memory maps, then the
1644 * pmap structure.
1645 */
1646 int inuse_ptepages = 0;
1647
1648 zfree(pmap_anchor_zone, p->pm_pml4);
1649 zfree(pmap_uanchor_zone, p->pm_upml4);
1650
1651 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1652 vm_object_deallocate(p->pm_obj_pml4);
1653
1654 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1655 vm_object_deallocate(p->pm_obj_pdpt);
1656
1657 inuse_ptepages += p->pm_obj->resident_page_count;
1658 vm_object_deallocate(p->pm_obj);
1659
1660 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1661 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1662
1663 pmap_check_ledgers(p);
1664 ledger_dereference(p->ledger);
1665 zfree(pmap_zone, p);
1666
1667 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1668 }
1669
1670 /*
1671 * Add a reference to the specified pmap.
1672 */
1673
1674 void
1675 pmap_reference(pmap_t p)
1676 {
1677 if (p != PMAP_NULL) {
1678 PMAP_LOCK_EXCLUSIVE(p);
1679 os_ref_retain_locked(&p->ref_count);
1680 		PMAP_UNLOCK_EXCLUSIVE(p);
1681 }
1682 }
1683
1684 /*
1685 * Remove phys addr if mapped in specified map
1686 *
1687 */
1688 void
1689 pmap_remove_some_phys(
1690 __unused pmap_t map,
1691 __unused ppnum_t pn)
1692 {
1693 /* Implement to support working set code */
1694 }
1695
1696
1697 void
1698 pmap_protect(
1699 pmap_t map,
1700 vm_map_offset_t sva,
1701 vm_map_offset_t eva,
1702 vm_prot_t prot)
1703 {
1704 pmap_protect_options(map, sva, eva, prot, 0, NULL);
1705 }
1706
1707
1708 /*
1709 * Set the physical protection on the
1710 * specified range of this map as requested.
1711 *
1712 * VERY IMPORTANT: Will *NOT* increase permissions.
1713 * pmap_protect_options() should protect the range against any access types
1714 * that are not in "prot" but it should never grant extra access.
1715 * For example, if "prot" is READ|EXECUTE, that means "remove write
1716 * access" but it does *not* mean "add read and execute" access.
1717 * VM relies on getting soft-faults to enforce extra checks (code
1718  * signing, for example).
1719 * New access permissions are granted via pmap_enter() only.
1720 */
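/*
 * Example (a sketch of typical usage, not a call site in this file): to
 * strip write access from a range while leaving read/execute intact, the
 * caller passes the permissions that should remain:
 *     pmap_protect_options(map, sva, eva,
 *         VM_PROT_READ | VM_PROT_EXECUTE, 0, NULL);
 * Passing VM_PROT_NONE instead removes the mappings entirely via
 * pmap_remove_options(), as handled at the top of the function.
 */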
1721 void
1722 pmap_protect_options(
1723 pmap_t map,
1724 vm_map_offset_t sva,
1725 vm_map_offset_t eva,
1726 vm_prot_t prot,
1727 unsigned int options,
1728 void *arg)
1729 {
1730 pt_entry_t *pde;
1731 pt_entry_t *spte, *epte;
1732 vm_map_offset_t lva;
1733 vm_map_offset_t orig_sva;
1734 boolean_t set_NX;
1735 int num_found = 0;
1736 boolean_t is_ept;
1737
1738 pmap_intr_assert();
1739
1740 if (map == PMAP_NULL) {
1741 return;
1742 }
1743
1744 if (prot == VM_PROT_NONE) {
1745 pmap_remove_options(map, sva, eva, options);
1746 return;
1747 }
1748
1749 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1750 VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
1751 VM_KERNEL_ADDRHIDE(eva));
1752
1753 if (prot & VM_PROT_EXECUTE) {
1754 set_NX = FALSE;
1755 } else {
1756 set_NX = TRUE;
1757 }
1758
1759 #if DEVELOPMENT || DEBUG
1760 if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
1761 set_NX = FALSE;
1762 }
1763 #endif
1764 is_ept = is_ept_pmap(map);
1765
1766 PMAP_LOCK_EXCLUSIVE(map);
1767
1768 orig_sva = sva;
1769 while (sva < eva) {
1770 lva = (sva + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE - 1);
1771 if (lva > eva) {
1772 lva = eva;
1773 }
1774 pde = pmap_pde(map, sva);
1775 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1776 if (*pde & PTE_PS) {
1777 /* superpage */
1778 spte = pde;
1779 epte = spte + 1; /* excluded */
1780 } else {
1781 spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
1782 spte = &spte[ptenum(sva)];
1783 epte = &spte[intel_btop(lva - sva)];
1784 }
1785
1786 for (; spte < epte; spte++) {
1787 if (!(*spte & PTE_VALID_MASK(is_ept))) {
1788 continue;
1789 }
1790
1791 if (is_ept) {
1792 if (!(prot & VM_PROT_READ)) {
1793 pmap_update_pte(spte, PTE_READ(is_ept), 0);
1794 }
1795 }
1796 if (!(prot & VM_PROT_WRITE)) {
1797 pmap_update_pte(spte, PTE_WRITE(is_ept), 0);
1798 }
1799 #if DEVELOPMENT || DEBUG
1800 else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
1801 map == kernel_pmap) {
1802 pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
1803 }
1804 #endif /* DEVELOPMENT || DEBUG */
1805
1806 if (set_NX) {
1807 if (!is_ept) {
1808 pmap_update_pte(spte, 0, INTEL_PTE_NX);
1809 } else {
1810 pmap_update_pte(spte, INTEL_EPT_EX, 0);
1811 }
1812 }
1813 num_found++;
1814 }
1815 }
1816 sva = lva;
1817 }
1818 if (num_found) {
1819 if (options & PMAP_OPTIONS_NOFLUSH) {
1820 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1821 } else {
1822 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1823 }
1824 }
1825
1826 PMAP_UNLOCK_EXCLUSIVE(map);
1827
1828 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
1829 }
1830
1831 /* Map a (possibly) autogenned block */
1832 kern_return_t
1833 pmap_map_block(
1834 pmap_t pmap,
1835 addr64_t va,
1836 ppnum_t pa,
1837 uint32_t size,
1838 vm_prot_t prot,
1839 int attr,
1840 __unused unsigned int flags)
1841 {
1842 kern_return_t kr;
1843 addr64_t original_va = va;
1844 uint32_t page;
1845 int cur_page_size;
1846
1847 if (attr & VM_MEM_SUPERPAGE) {
1848 cur_page_size = SUPERPAGE_SIZE;
1849 } else {
1850 cur_page_size = PAGE_SIZE;
1851 }
1852
1853 for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
1854 kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
1855
1856 if (kr != KERN_SUCCESS) {
1857 /*
1858 * This will panic for now, as it is unclear that
1859 * removing the mappings is correct.
1860 */
1861 panic("%s: failed pmap_enter, "
1862 "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
1863 __FUNCTION__,
1864 pmap, va, pa, size, prot, flags);
1865
1866 pmap_remove(pmap, original_va, va - original_va);
1867 return kr;
1868 }
1869
1870 va += cur_page_size;
1871 pa += cur_page_size / PAGE_SIZE;
1872 }
1873
1874 return KERN_SUCCESS;
1875 }
1876
1877 kern_return_t
1878 pmap_expand_pml4(
1879 pmap_t map,
1880 vm_map_offset_t vaddr,
1881 unsigned int options)
1882 {
1883 vm_page_t m;
1884 pmap_paddr_t pa;
1885 uint64_t i;
1886 ppnum_t pn;
1887 pml4_entry_t *pml4p;
1888 boolean_t is_ept = is_ept_pmap(map);
1889
1890 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
1891
1892 /* With the exception of the kext "basement", the kernel's level 4
1893 * pagetables must not be dynamically expanded.
1894 */
1895 assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
1896 /*
1897 * Allocate a VM page for the pml4 page
1898 */
1899 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1900 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
1901 return KERN_RESOURCE_SHORTAGE;
1902 }
1903 VM_PAGE_WAIT();
1904 }
1905 /*
1906 * put the page into the pmap's obj list so it
1907 * can be found later.
1908 */
1909 pn = VM_PAGE_GET_PHYS_PAGE(m);
1910 pa = i386_ptob(pn);
1911 i = pml4idx(map, vaddr);
1912
1913 /*
1914 * Zero the page.
1915 */
1916 pmap_zero_page(pn);
1917
1918 vm_page_lockspin_queues();
1919 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1920 vm_page_unlock_queues();
1921
1922 OSAddAtomic(1, &inuse_ptepages_count);
1923 OSAddAtomic64(1, &alloc_ptepages_count);
1924 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1925
1926 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1927 vm_object_lock(map->pm_obj_pml4);
1928
1929 PMAP_LOCK_EXCLUSIVE(map);
1930 /*
1931 * See if someone else expanded us first
1932 */
1933 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
1934 PMAP_UNLOCK_EXCLUSIVE(map);
1935 vm_object_unlock(map->pm_obj_pml4);
1936
1937 VM_PAGE_FREE(m);
1938
1939 OSAddAtomic(-1, &inuse_ptepages_count);
1940 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1941 return KERN_SUCCESS;
1942 }
1943
1944 #if 0 /* DEBUG */
1945 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
1946 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1947 map, map->pm_obj_pml4, vaddr, i);
1948 }
1949 #endif
1950 vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1951 vm_object_unlock(map->pm_obj_pml4);
1952
1953 /*
1954 * Set the page directory entry for this page table.
1955 */
1956 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
1957
1958 pmap_store_pte(pml4p, pa_to_pte(pa)
1959 | PTE_READ(is_ept)
1960 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1961 | PTE_WRITE(is_ept));
1962 pml4_entry_t *upml4p;
1963
1964 upml4p = pmap64_user_pml4(map, vaddr);
1965 pmap_store_pte(upml4p, pa_to_pte(pa)
1966 | PTE_READ(is_ept)
1967 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1968 | PTE_WRITE(is_ept));
1969
1970 PMAP_UNLOCK_EXCLUSIVE(map);
1971
1972 return KERN_SUCCESS;
1973 }
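
/*
 * A minimal standalone sketch of the "allocate, lock, re-check, back off"
 * pattern used above: the new table page is obtained before taking the
 * pmap lock, and if another thread expanded this slot first, the page is
 * simply released and the call still succeeds.  Uses pthreads and malloc
 * purely for illustration; this is not pmap code.
 */
#if 0 /* illustrative example only */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;   /* stands in for the pml4/pdpt/pde entry */

static int
expand_slot(void)
{
	/* Allocate while unlocked (the allocation may block). */
	void *page = calloc(1, 4096);
	if (page == NULL) {
		return -1;
	}

	pthread_mutex_lock(&table_lock);
	if (slot != NULL) {
		/* Someone else expanded us first: discard and succeed. */
		pthread_mutex_unlock(&table_lock);
		free(page);
		return 0;
	}
	slot = page;          /* publish the new table under the lock */
	pthread_mutex_unlock(&table_lock);
	return 0;
}
#endif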
1974
1975 kern_return_t
1976 pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
1977 {
1978 vm_page_t m;
1979 pmap_paddr_t pa;
1980 uint64_t i;
1981 ppnum_t pn;
1982 pdpt_entry_t *pdptp;
1983 boolean_t is_ept = is_ept_pmap(map);
1984
1985 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
1986
1987 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
1988 kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
1989 if (pep4kr != KERN_SUCCESS) {
1990 return pep4kr;
1991 }
1992 }
1993
1994 /*
1995 * Allocate a VM page for the pdpt page
1996 */
1997 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1998 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
1999 return KERN_RESOURCE_SHORTAGE;
2000 }
2001 VM_PAGE_WAIT();
2002 }
2003
2004 /*
2005 * put the page into the pmap's obj list so it
2006 * can be found later.
2007 */
2008 pn = VM_PAGE_GET_PHYS_PAGE(m);
2009 pa = i386_ptob(pn);
2010 i = pdptidx(map, vaddr);
2011
2012 /*
2013 * Zero the page.
2014 */
2015 pmap_zero_page(pn);
2016
2017 vm_page_lockspin_queues();
2018 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
2019 vm_page_unlock_queues();
2020
2021 OSAddAtomic(1, &inuse_ptepages_count);
2022 OSAddAtomic64(1, &alloc_ptepages_count);
2023 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
2024
2025 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2026 vm_object_lock(map->pm_obj_pdpt);
2027
2028 PMAP_LOCK_EXCLUSIVE(map);
2029 /*
2030 * See if someone else expanded us first
2031 */
2032 if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
2033 PMAP_UNLOCK_EXCLUSIVE(map);
2034 vm_object_unlock(map->pm_obj_pdpt);
2035
2036 VM_PAGE_FREE(m);
2037
2038 OSAddAtomic(-1, &inuse_ptepages_count);
2039 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2040 return KERN_SUCCESS;
2041 }
2042
2043 #if 0 /* DEBUG */
2044 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
2045 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2046 map, map->pm_obj_pdpt, vaddr, i);
2047 }
2048 #endif
2049 vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2050 vm_object_unlock(map->pm_obj_pdpt);
2051
2052 /*
2053 * Set the page directory entry for this page table.
2054 */
2055 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
2056
2057 pmap_store_pte(pdptp, pa_to_pte(pa)
2058 | PTE_READ(is_ept)
2059 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2060 | PTE_WRITE(is_ept));
2061
2062 PMAP_UNLOCK_EXCLUSIVE(map);
2063
2064 return KERN_SUCCESS;
2065 }
2066
2067
2068
2069 /*
2070 * Routine: pmap_expand
2071 *
2072 * Expands a pmap to be able to map the specified virtual address.
2073 *
2074 * Allocates a new page table page for the level covering the
2075 * given virtual address and links it into the pmap's paging
2076 * hierarchy.
2077 *
2078 * Must be called with the pmap system and the pmap unlocked,
2079 * since these must be unlocked to use vm_allocate or vm_deallocate.
2080 * Thus it must be called in a loop that checks whether the map
2081 * has been expanded enough.
2082 * (We won't loop forever, since page tables aren't shrunk.)
2083 */
2084 kern_return_t
2085 pmap_expand(
2086 pmap_t map,
2087 vm_map_offset_t vaddr,
2088 unsigned int options)
2089 {
2090 pt_entry_t *pdp;
2091 vm_page_t m;
2092 pmap_paddr_t pa;
2093 uint64_t i;
2094 ppnum_t pn;
2095 boolean_t is_ept = is_ept_pmap(map);
2096
2097
2098 /*
2099 * For the kernel, the virtual address must be in or above the basement
2100 * which is for kexts and is in the 512GB immediately below the kernel.
2101 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
2102 */
2103 if (__improbable(map == kernel_pmap &&
2104 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
2105 if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
2106 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
2107 }
2108 }
2109
2110 while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
2111 assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
2112 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
2113 if (pepkr != KERN_SUCCESS) {
2114 return pepkr;
2115 }
2116 }
2117
2118 /*
2119 * Allocate a VM page for the pde entries.
2120 */
2121 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
2122 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
2123 return KERN_RESOURCE_SHORTAGE;
2124 }
2125 VM_PAGE_WAIT();
2126 }
2127
2128 /*
2129 * put the page into the pmap's obj list so it
2130 * can be found later.
2131 */
2132 pn = VM_PAGE_GET_PHYS_PAGE(m);
2133 pa = i386_ptob(pn);
2134 i = pdeidx(map, vaddr);
2135
2136 /*
2137 * Zero the page.
2138 */
2139 pmap_zero_page(pn);
2140
2141 vm_page_lockspin_queues();
2142 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
2143 vm_page_unlock_queues();
2144
2145 OSAddAtomic(1, &inuse_ptepages_count);
2146 OSAddAtomic64(1, &alloc_ptepages_count);
2147 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
2148
2149 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2150 vm_object_lock(map->pm_obj);
2151
2152 PMAP_LOCK_EXCLUSIVE(map);
2153
2154 /*
2155 * See if someone else expanded us first
2156 */
2157 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2158 PMAP_UNLOCK_EXCLUSIVE(map);
2159 vm_object_unlock(map->pm_obj);
2160
2161 VM_PAGE_FREE(m);
2162
2163 OSAddAtomic(-1, &inuse_ptepages_count); //todo replace all with inlines
2164 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2165 return KERN_SUCCESS;
2166 }
2167
2168 #if 0 /* DEBUG */
2169 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
2170 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2171 map, map->pm_obj, vaddr, i);
2172 }
2173 #endif
2174 vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2175 vm_object_unlock(map->pm_obj);
2176
2177 /*
2178 * Set the page directory entry for this page table.
2179 */
2180 pdp = pmap_pde(map, vaddr);
2181 pmap_store_pte(pdp, pa_to_pte(pa)
2182 | PTE_READ(is_ept)
2183 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2184 | PTE_WRITE(is_ept));
2185
2186 PMAP_UNLOCK_EXCLUSIVE(map);
2187
2188 return KERN_SUCCESS;
2189 }
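
/*
 * A minimal standalone sketch of the caller-side retry loop described in
 * the comment above pmap_expand(): keep expanding until a lookup for the
 * target address succeeds, bailing out if the expansion itself fails.
 * lookup()/expand() are stand-ins, not pmap interfaces.
 */
#if 0 /* illustrative example only */
#include <stddef.h>
#include <stdint.h>

static int
ensure_mapped_level(uint64_t vaddr,
    void *(*lookup)(uint64_t), int (*expand)(uint64_t))
{
	while (lookup(vaddr) == NULL) {
		int err = expand(vaddr);
		if (err != 0) {
			return err;     /* e.g. resource shortage */
		}
		/*
		 * The loop terminates: page tables are never shrunk, so the
		 * entry installed by expand() stays visible to lookup().
		 */
	}
	return 0;
}
#endif
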
2190 /*
2191 * Query a pmap to see what size a given virtual address is mapped with.
2192 * If the vaddr is not mapped, returns 0.
2193 */
2194 vm_size_t
2195 pmap_query_pagesize(
2196 pmap_t pmap,
2197 vm_map_offset_t vaddr)
2198 {
2199 pd_entry_t *pdep;
2200 vm_size_t size = 0;
2201
2202 assert(!is_ept_pmap(pmap));
2203 PMAP_LOCK_EXCLUSIVE(pmap);
2204
2205 pdep = pmap_pde(pmap, vaddr);
2206 if (pdep != PD_ENTRY_NULL) {
2207 if (*pdep & INTEL_PTE_PS) {
2208 size = I386_LPGBYTES;
2209 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2210 size = I386_PGBYTES;
2211 }
2212 }
2213
2214 PMAP_UNLOCK_EXCLUSIVE(pmap);
2215
2216 return size;
2217 }
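
/*
 * A minimal standalone sketch of how a caller might walk a range using a
 * "page size at this VA" query like the one above: advance by the large
 * page size where a 2MB mapping is reported, by 4K otherwise, and step
 * over unmapped addresses.  Assumes a mapping-aligned start; query_size()
 * is a stand-in, not the pmap interface.
 */
#if 0 /* illustrative example only */
#include <stdint.h>

#define EX_SMALL_PAGE  4096ULL
#define EX_LARGE_PAGE  (2ULL * 1024 * 1024)

static uint64_t
count_mapped_bytes(uint64_t start, uint64_t end,
    uint64_t (*query_size)(uint64_t))
{
	uint64_t mapped = 0;

	for (uint64_t va = start; va < end; ) {
		uint64_t sz = query_size(va);   /* 0, 4K, or 2MB */
		mapped += sz;
		/* An unmapped VA still has to be stepped over. */
		va += (sz != 0) ? sz : EX_SMALL_PAGE;
	}
	return mapped;
}
#endif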
2218
2219 /*
2220 * Ensure the page table hierarchy is filled in down to
2221 * the large page level. Additionally returns FAILURE if
2222 * a lower page table already exists.
2223 */
2224 static kern_return_t
2225 pmap_pre_expand_large_internal(
2226 pmap_t pmap,
2227 vm_map_offset_t vaddr)
2228 {
2229 ppnum_t pn;
2230 pt_entry_t *pte;
2231 boolean_t is_ept = is_ept_pmap(pmap);
2232 kern_return_t kr = KERN_SUCCESS;
2233
2234 if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2235 if (!pmap_next_page_hi(&pn, FALSE)) {
2236 panic("pmap_pre_expand_large no PDPT");
2237 }
2238
2239 pmap_zero_page(pn);
2240
2241 pte = pmap64_pml4(pmap, vaddr);
2242
2243 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2244 PTE_READ(is_ept) |
2245 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2246 PTE_WRITE(is_ept));
2247
2248 pte = pmap64_user_pml4(pmap, vaddr);
2249
2250 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2251 PTE_READ(is_ept) |
2252 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2253 PTE_WRITE(is_ept));
2254 }
2255
2256 if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2257 if (!pmap_next_page_hi(&pn, FALSE)) {
2258 panic("pmap_pre_expand_large no PDE");
2259 }
2260
2261 pmap_zero_page(pn);
2262
2263 pte = pmap64_pdpt(pmap, vaddr);
2264
2265 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2266 PTE_READ(is_ept) |
2267 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2268 PTE_WRITE(is_ept));
2269 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2270 kr = KERN_FAILURE;
2271 }
2272
2273 return kr;
2274 }
2275
2276 /*
2277 * Wrapper that locks the pmap.
2278 */
2279 kern_return_t
2280 pmap_pre_expand_large(
2281 pmap_t pmap,
2282 vm_map_offset_t vaddr)
2283 {
2284 kern_return_t kr;
2285
2286 PMAP_LOCK_EXCLUSIVE(pmap);
2287 kr = pmap_pre_expand_large_internal(pmap, vaddr);
2288 PMAP_UNLOCK_EXCLUSIVE(pmap);
2289 return kr;
2290 }
2291
2292 /*
2293 * On large memory machines, pmap_steal_memory() will allocate past
2294 * the 1GB of pre-allocated/mapped virtual kernel area. This function
2295 * expands the kernel page tables to cover a given vaddr. It uses pages
2296 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
2297 * isn't available yet.
2298 */
2299 void
2300 pmap_pre_expand(
2301 pmap_t pmap,
2302 vm_map_offset_t vaddr)
2303 {
2304 ppnum_t pn;
2305 pt_entry_t *pte;
2306 boolean_t is_ept = is_ept_pmap(pmap);
2307
2308 /*
2309 * This returns failure if a 4K page table already exists.
2310 * Otherwise it fills in the page table hierarchy down
2311 * to that level.
2312 */
2313 PMAP_LOCK_EXCLUSIVE(pmap);
2314 if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
2315 PMAP_UNLOCK_EXCLUSIVE(pmap);
2316 return;
2317 }
2318
2319 /* Add the lowest table */
2320 if (!pmap_next_page_hi(&pn, FALSE)) {
2321 panic("pmap_pre_expand");
2322 }
2323
2324 pmap_zero_page(pn);
2325
2326 pte = pmap_pde(pmap, vaddr);
2327
2328 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2329 PTE_READ(is_ept) |
2330 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2331 PTE_WRITE(is_ept));
2332 PMAP_UNLOCK_EXCLUSIVE(pmap);
2333 }
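
/*
 * A minimal standalone sketch of the 4-level x86_64 index split that the
 * pmap64_pml4()/pmap64_pdpt()/pmap_pde()/pmap_pte() walks above rely on:
 * each level consumes 9 bits of the virtual address, and the low 12 bits
 * are the page offset.  This is the architectural layout, shown here with
 * local arithmetic only.
 */
#if 0 /* illustrative example only */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t va = 0x00007f8012345678ULL;

	unsigned pml4_idx = (va >> 39) & 0x1ff;   /* bits 47..39 */
	unsigned pdpt_idx = (va >> 30) & 0x1ff;   /* bits 38..30 */
	unsigned pde_idx  = (va >> 21) & 0x1ff;   /* bits 29..21 */
	unsigned pte_idx  = (va >> 12) & 0x1ff;   /* bits 20..12 */
	unsigned offset   = va & 0xfff;           /* bits 11..0  */

	printf("pml4 %u pdpt %u pde %u pte %u offset 0x%x\n",
	    pml4_idx, pdpt_idx, pde_idx, pte_idx, offset);
	return 0;
}
#endif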
2334
2335 /*
2336 * pmap_sync_page_data_phys(ppnum_t pa)
2337 *
2338 * Invalidates all of the instruction cache on a physical page and
2339 * pushes any dirty data from the data cache for the same physical page.
2340 * Not required on i386.
2341 */
2342 void
2343 pmap_sync_page_data_phys(__unused ppnum_t pa)
2344 {
2345 return;
2346 }
2347
2348 /*
2349 * pmap_sync_page_attributes_phys(ppnum_t pa)
2350 *
2351 * Write back and invalidate all cachelines on a physical page.
2352 */
2353 void
2354 pmap_sync_page_attributes_phys(ppnum_t pa)
2355 {
2356 cache_flush_page_phys(pa);
2357 }
2358
2359 void
2360 pmap_copy_page(ppnum_t src, ppnum_t dst)
2361 {
2362 bcopy_phys((addr64_t)i386_ptob(src),
2363 (addr64_t)i386_ptob(dst),
2364 PAGE_SIZE);
2365 }
2366
2367
2368 /*
2369 * Routine: pmap_pageable
2370 * Function:
2371 * Make the specified pages (by pmap, offset)
2372 * pageable (or not) as requested.
2373 *
2374 * A page which is not pageable may not take
2375 * a fault; therefore, its page table entry
2376 * must remain valid for the duration.
2377 *
2378 * This routine is merely advisory; pmap_enter
2379 * will specify that these pages are to be wired
2380 * down (or not) as appropriate.
2381 */
2382 void
2383 pmap_pageable(
2384 __unused pmap_t pmap,
2385 __unused vm_map_offset_t start_addr,
2386 __unused vm_map_offset_t end_addr,
2387 __unused boolean_t pageable)
2388 {
2389 #ifdef lint
2390 pmap++; start_addr++; end_addr++; pageable++;
2391 #endif /* lint */
2392 }
2393
2394 void
2395 invalidate_icache(__unused vm_offset_t addr,
2396 __unused unsigned cnt,
2397 __unused int phys)
2398 {
2399 return;
2400 }
2401
2402 void
2403 flush_dcache(__unused vm_offset_t addr,
2404 __unused unsigned count,
2405 __unused int phys)
2406 {
2407 return;
2408 }
2409
2410 #if CONFIG_DTRACE
2411 /*
2412 * Constrain DTrace copyin/copyout actions
2413 */
2414 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2415 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2416
2417 kern_return_t
2418 dtrace_copyio_preflight(__unused addr64_t va)
2419 {
2420 thread_t thread = current_thread();
2421 uint64_t ccr3;
2422 if (current_map() == kernel_map) {
2423 return KERN_FAILURE;
2424 } else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
2425 return KERN_FAILURE;
2426 } else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
2427 return KERN_FAILURE;
2428 } else {
2429 return KERN_SUCCESS;
2430 }
2431 }
2432
2433 kern_return_t
2434 dtrace_copyio_postflight(__unused addr64_t va)
2435 {
2436 return KERN_SUCCESS;
2437 }
2438 #endif /* CONFIG_DTRACE */
2439
2440 #include <mach_vm_debug.h>
2441 #if MACH_VM_DEBUG
2442 #include <vm/vm_debug.h>
2443
2444 int
2445 pmap_list_resident_pages(
2446 __unused pmap_t pmap,
2447 __unused vm_offset_t *listp,
2448 __unused int space)
2449 {
2450 return 0;
2451 }
2452 #endif /* MACH_VM_DEBUG */
2453
2454
2455 #if CONFIG_COREDUMP
2456 /* temporary workaround */
2457 boolean_t
2458 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
2459 {
2460 #if 0
2461 pt_entry_t *ptep;
2462
2463 ptep = pmap_pte(map->pmap, va);
2464 if (0 == ptep) {
2465 return FALSE;
2466 }
2467 return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
2468 #else
2469 return TRUE;
2470 #endif
2471 }
2472 #endif
2473
2474 boolean_t
2475 phys_page_exists(ppnum_t pn)
2476 {
2477 assert(pn != vm_page_fictitious_addr);
2478
2479 if (!pmap_initialized) {
2480 return TRUE;
2481 }
2482
2483 if (pn == vm_page_guard_addr) {
2484 return FALSE;
2485 }
2486
2487 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2488 return FALSE;
2489 }
2490
2491 return TRUE;
2492 }
2493
2494
2495
2496 void
2497 pmap_switch(pmap_t tpmap)
2498 {
2499 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
2500 assert(ml_get_interrupts_enabled() == FALSE);
2501 set_dirbase(tpmap, current_thread(), cpu_number());
2502 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
2503 }
2504
2505
2506 /*
2507 * disable no-execute capability on
2508 * the specified pmap
2509 */
2510 void
2511 pmap_disable_NX(__unused pmap_t pmap)
2512 {
2513 #if DEVELOPMENT || DEBUG
2514 pmap->nx_enabled = 0;
2515 #endif
2516 }
2517
2518 void
2519 pt_fake_zone_init(int zone_index)
2520 {
2521 pt_fake_zone_index = zone_index;
2522 }
2523
2524 void
2525 pt_fake_zone_info(
2526 int *count,
2527 vm_size_t *cur_size,
2528 vm_size_t *max_size,
2529 vm_size_t *elem_size,
2530 vm_size_t *alloc_size,
2531 uint64_t *sum_size,
2532 int *collectable,
2533 int *exhaustable,
2534 int *caller_acct)
2535 {
2536 *count = inuse_ptepages_count;
2537 *cur_size = PAGE_SIZE * inuse_ptepages_count;
2538 *max_size = PAGE_SIZE * (inuse_ptepages_count +
2539 vm_page_inactive_count +
2540 vm_page_active_count +
2541 vm_page_free_count);
2542 *elem_size = PAGE_SIZE;
2543 *alloc_size = PAGE_SIZE;
2544 *sum_size = alloc_ptepages_count * PAGE_SIZE;
2545
2546 *collectable = 1;
2547 *exhaustable = 0;
2548 *caller_acct = 1;
2549 }
2550
2551
2552 void
2553 pmap_flush_context_init(pmap_flush_context *pfc)
2554 {
2555 pfc->pfc_cpus = 0;
2556 pfc->pfc_invalid_global = 0;
2557 }
2558
2559 static bool
2560 pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2561 {
2562 bool responded = false;
2563 bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2564 cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2565
2566 if (ngflush) {
2567 if (gflushed) {
2568 responded = true;
2569 }
2570 } else {
2571 if (gflushed) {
2572 responded = true;
2573 } else {
2574 bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2575 cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2576 if (lflushed) {
2577 responded = true;
2578 }
2579 }
2580 }
2581
2582 if (responded == false) {
2583 if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2584 !CPU_CR3_IS_ACTIVE(rcpu) ||
2585 !cpu_is_running(rcpu)) {
2586 responded = true;
2587 }
2588 }
2589 return responded;
2590 }
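
/*
 * A minimal standalone sketch of the generation-count handshake used in
 * pmap_tlbi_response() above: the initiator snapshots the target's flush
 * counter when it posts the request, and later treats any change in that
 * counter as an acknowledgement.  Plain counters stand in for the per-cpu
 * data; this is not pmap code.
 */
#if 0 /* illustrative example only */
#include <stdbool.h>
#include <stdint.h>

struct flush_target {
	volatile uint32_t flush_count;   /* bumped by the target on each flush        */
	uint32_t          snapshot;      /* taken by the initiator when signalling it  */
};

static void
post_flush_request(struct flush_target *t)
{
	t->snapshot = t->flush_count;    /* remember where the target was */
	/* ...signal the target (IPI) here... */
}

static bool
target_has_flushed(const struct flush_target *t)
{
	return t->flush_count != t->snapshot;
}
#endif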
2591
2592 extern uint64_t TLBTimeOut;
2593 void
2594 pmap_flush(
2595 pmap_flush_context *pfc)
2596 {
2597 unsigned int my_cpu;
2598 unsigned int cpu;
2599 cpumask_t cpu_bit;
2600 cpumask_t cpus_to_respond = 0;
2601 cpumask_t cpus_to_signal = 0;
2602 cpumask_t cpus_signaled = 0;
2603 boolean_t flush_self = FALSE;
2604 uint64_t deadline;
2605 bool need_global_flush = false;
2606
2607 mp_disable_preemption();
2608
2609 my_cpu = cpu_number();
2610 cpus_to_signal = pfc->pfc_cpus;
2611
2612 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
2613 NULL, cpus_to_signal);
2614
2615 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
2616 if (cpus_to_signal & cpu_bit) {
2617 cpus_to_signal &= ~cpu_bit;
2618
2619 if (!cpu_is_running(cpu)) {
2620 continue;
2621 }
2622
2623 if (pfc->pfc_invalid_global & cpu_bit) {
2624 cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
2625 need_global_flush = true;
2626 } else {
2627 cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
2628 }
2629 cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
2630 cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
2631 mfence();
2632
2633 if (cpu == my_cpu) {
2634 flush_self = TRUE;
2635 continue;
2636 }
2637 if (CPU_CR3_IS_ACTIVE(cpu)) {
2638 cpus_to_respond |= cpu_bit;
2639 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2640 }
2641 }
2642 }
2643 cpus_signaled = cpus_to_respond;
2644
2645 /*
2646 * Flush local tlb if required.
2647 * Do this now to overlap with other processors responding.
2648 */
2649 if (flush_self) {
2650 process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
2651 }
2652
2653 if (cpus_to_respond) {
2654 deadline = mach_absolute_time() +
2655 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2656 boolean_t is_timeout_traced = FALSE;
2657
2658 /*
2659 * Wait for those other cpus to acknowledge
2660 */
2661 while (cpus_to_respond != 0) {
2662 long orig_acks = 0;
2663
2664 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2665 bool responded = false;
2666 if ((cpus_to_respond & cpu_bit) != 0) {
2667 responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
2668 if (responded) {
2669 cpus_to_respond &= ~cpu_bit;
2670 }
2671 cpu_pause();
2672 }
2673
2674 if (cpus_to_respond == 0) {
2675 break;
2676 }
2677 }
2678 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2679 if (machine_timeout_suspended()) {
2680 continue;
2681 }
2682 if (TLBTimeOut == 0) {
2683 if (is_timeout_traced) {
2684 continue;
2685 }
2686
2687 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2688 NULL, cpus_to_signal, cpus_to_respond);
2689
2690 is_timeout_traced = TRUE;
2691 continue;
2692 }
2693 orig_acks = NMIPI_acks;
2694 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
2695 panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
2696 cpus_to_respond, orig_acks, NMIPI_acks, deadline);
2697 }
2698 }
2699 }
2700
2701 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
2702 NULL, cpus_signaled, flush_self);
2703
2704 mp_enable_preemption();
2705 }
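
/*
 * A minimal sketch of the delayed-flush pattern this context supports:
 * initialize a pmap_flush_context, run a batch of updates that pass
 * PMAP_OPTIONS_NOFLUSH (so their TLB work is only recorded in the
 * context), then issue a single pmap_flush() at the end.  The
 * batched_update() call is a stand-in for whatever pmap operation is
 * being batched, not a pmap API.
 */
#if 0 /* illustrative example only */
extern void batched_update(unsigned int options, pmap_flush_context *pfc); /* stand-in */

static void
batched_updates_example(void)
{
	pmap_flush_context pfc;

	pmap_flush_context_init(&pfc);

	/* Each call defers its TLB invalidations into &pfc. */
	for (int i = 0; i < 16; i++) {
		batched_update(PMAP_OPTIONS_NOFLUSH, &pfc);
	}

	/* One combined shootdown for everything recorded above. */
	pmap_flush(&pfc);
}
#endif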
2706
2707
2708 static void
2709 invept(void *eptp)
2710 {
2711 struct {
2712 uint64_t eptp;
2713 uint64_t reserved;
2714 } __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};
2715
2716 __asm__ volatile ("invept (%%rax), %%rcx"
2717 : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
2718 : "cc", "memory");
2719 }
2720
2721 /*
2722 * Called with pmap locked, we:
2723 * - scan through per-cpu data to see which other cpus need to flush
2724 * - send an IPI to each non-idle cpu to be flushed
2725 * - wait for all to signal back that they are inactive or we see that
2726 * they are at a safe point (idle).
2727 * - flush the local tlb if active for this pmap
2728 * - return ... the caller will unlock the pmap
2729 */
2730
2731 void
2732 pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2733 {
2734 unsigned int cpu;
2735 cpumask_t cpu_bit;
2736 cpumask_t cpus_to_signal = 0;
2737 unsigned int my_cpu = cpu_number();
2738 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2739 boolean_t flush_self = FALSE;
2740 uint64_t deadline;
2741 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
2742 bool need_global_flush = false;
2743 uint32_t event_code;
2744 vm_map_offset_t event_startv, event_endv;
2745 boolean_t is_ept = is_ept_pmap(pmap);
2746
2747 assert((processor_avail_count < 2) ||
2748 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2749
2750 assert((endv - startv) >= PAGE_SIZE);
2751 assert(((endv | startv) & PAGE_MASK) == 0);
2752
2753 if (__improbable(kdebug_enable)) {
2754 if (pmap == kernel_pmap) {
2755 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2756 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2757 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2758 } else if (__improbable(is_ept)) {
2759 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2760 event_startv = startv;
2761 event_endv = endv;
2762 } else {
2763 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2764 event_startv = startv;
2765 event_endv = endv;
2766 }
2767 }
2768
2769 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
2770 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
2771 event_startv, event_endv);
2772
2773 if (__improbable(is_ept)) {
2774 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2775 goto out;
2776 }
2777
2778 /*
2779 * Scan other cpus for matching active or task CR3.
2780 * For idle cpus (with no active map) we mark them invalid but
2781 * don't signal -- they'll check as they go busy.
2782 */
2783 if (pmap_pcid_ncpus) {
2784 if (pmap_is_shared) {
2785 need_global_flush = true;
2786 }
2787 pmap_pcid_invalidate_all_cpus(pmap);
2788 mfence();
2789 }
2790
2791 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2792 if (!cpu_is_running(cpu)) {
2793 continue;
2794 }
2795 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2796 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2797
2798 if ((pmap_cr3 == cpu_task_cr3) ||
2799 (pmap_cr3 == cpu_active_cr3) ||
2800 (pmap_is_shared)) {
2801 if (options & PMAP_DELAY_TLB_FLUSH) {
2802 if (need_global_flush == true) {
2803 pfc->pfc_invalid_global |= cpu_bit;
2804 }
2805 pfc->pfc_cpus |= cpu_bit;
2806
2807 continue;
2808 }
2809 if (need_global_flush == true) {
2810 cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
2811 cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
2812 } else {
2813 cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
2814 cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
2815 }
2816
2817 if (cpu == my_cpu) {
2818 flush_self = TRUE;
2819 continue;
2820 }
2821
2822 mfence();
2823
2824 /*
2825 * We don't need to signal processors which will flush
2826 * lazily at the idle state or kernel boundary.
2827 * For example, if we're invalidating the kernel pmap,
2828 * processors currently in userspace don't need to flush
2829 * their TLBs until the next time they enter the kernel.
2830 * Alterations to the address space of a task active
2831 * on a remote processor result in a signal, to
2832 * account for copy operations. (There may be room
2833 * for optimization in such cases).
2834 * The order of the loads below with respect
2835 * to the store to the "cpu_tlb_invalid" field above
2836 * is important--hence the barrier.
2837 */
2838 if (CPU_CR3_IS_ACTIVE(cpu) &&
2839 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
2840 pmap->pm_shared ||
2841 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
2842 cpus_to_signal |= cpu_bit;
2843 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2844 }
2845 }
2846 }
2847
2848 if ((options & PMAP_DELAY_TLB_FLUSH)) {
2849 goto out;
2850 }
2851
2852 /*
2853 * Flush local tlb if required.
2854 * Do this now to overlap with other processors responding.
2855 */
2856 if (flush_self) {
2857 process_pmap_updates(pmap, pmap_is_shared, startv, endv);
2858 }
2859
2860 if (cpus_to_signal) {
2861 cpumask_t cpus_to_respond = cpus_to_signal;
2862
2863 deadline = mach_absolute_time() +
2864 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2865 boolean_t is_timeout_traced = FALSE;
2866
2867 /*
2868 * Wait for those other cpus to acknowledge
2869 */
2870 while (cpus_to_respond != 0) {
2871 long orig_acks = 0;
2872
2873 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2874 bool responded = false;
2875 if ((cpus_to_respond & cpu_bit) != 0) {
2876 responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
2877 if (responded) {
2878 cpus_to_respond &= ~cpu_bit;
2879 }
2880 cpu_pause();
2881 }
2882 if (cpus_to_respond == 0) {
2883 break;
2884 }
2885 }
2886 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2887 if (machine_timeout_suspended()) {
2888 continue;
2889 }
2890 if (TLBTimeOut == 0) {
2891 /* cut tracepoint but don't panic */
2892 if (is_timeout_traced) {
2893 continue;
2894 }
2895
2896 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2897 VM_KERNEL_UNSLIDE_OR_PERM(pmap),
2898 cpus_to_signal,
2899 cpus_to_respond);
2900
2901 is_timeout_traced = TRUE;
2902 continue;
2903 }
2904 orig_acks = NMIPI_acks;
2905 uint64_t tstamp1 = mach_absolute_time();
2906 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
2907 uint64_t tstamp2 = mach_absolute_time();
2908 panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
2909 cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
2910 }
2911 }
2912 }
2913
2914 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
2915 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
2916 }
2917
2918 out:
2919 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
2920 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
2921 event_startv, event_endv);
2922 }
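
/*
 * A minimal standalone sketch of the acknowledgement loop used in
 * pmap_flush() and pmap_flush_tlbs() above: spin over the bitmap of
 * signalled CPUs, clearing each bit as that CPU responds, and give up
 * (here by returning; the kernel traces or panics) once a deadline
 * passes.  now()/responded() are stand-ins.
 */
#if 0 /* illustrative example only */
#include <stdbool.h>
#include <stdint.h>

static int
wait_for_acks(uint64_t cpus_to_respond, unsigned ncpus,
    uint64_t deadline, uint64_t (*now)(void), bool (*responded)(unsigned))
{
	while (cpus_to_respond != 0) {
		for (unsigned cpu = 0; cpu < ncpus; cpu++) {
			uint64_t cpu_bit = 1ULL << cpu;
			if ((cpus_to_respond & cpu_bit) && responded(cpu)) {
				cpus_to_respond &= ~cpu_bit;
			}
		}
		if (cpus_to_respond != 0 && now() > deadline) {
			return -1;   /* timed out; the kernel escalates here */
		}
	}
	return 0;
}
#endif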
2923
2924 static void
2925 process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
2926 {
2927 int ccpu = cpu_number();
2928 bool gtlbf = false;
2929
2930 pmap_assert(ml_get_interrupts_enabled() == 0 ||
2931 get_preemption_level() != 0);
2932
2933 if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
2934 cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
2935 cpu_datap(ccpu)->cpu_tlb_invalid = 0;
2936 gtlbf = true;
2937 } else {
2938 cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
2939 cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
2940 }
2941
2942 if (pmap_pcid_ncpus) {
2943 if (p) {
2944 /* TODO global generation count to
2945 * avoid potentially redundant
2946 * csw invalidations post-global invalidation
2947 */
2948 pmap_pcid_validate_cpu(p, ccpu);
2949 pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
2950 } else {
2951 pmap_pcid_validate_current();
2952 pmap_tlbi_range(istart, iend, true, 0);
2953 }
2954 } else {
2955 pmap_tlbi_range(0, ~0ULL, true, 0);
2956 }
2957 }
2958
2959 void
2960 pmap_update_interrupt(void)
2961 {
2962 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
2963
2964 if (current_cpu_datap()->cpu_tlb_invalid) {
2965 process_pmap_updates(NULL, true, 0ULL, ~0ULL);
2966 }
2967
2968 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
2969 }
2970
2971 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
2972 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
2973 * and identify ranges with mismatched VM permissions and PTE permissions
2974 */
2975 kern_return_t
2976 pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
2977 {
2978 vm_offset_t cv = sv;
2979 kern_return_t rv = KERN_SUCCESS;
2980 uint64_t skip4 = 0, skip2 = 0;
2981
2982 assert(!is_ept_pmap(ipmap));
2983
2984 sv &= ~PAGE_MASK_64;
2985 ev &= ~PAGE_MASK_64;
2986 while (cv < ev) {
2987 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
2988 (cv < 0xFFFF800000000000ULL))) {
2989 cv = 0xFFFF800000000000ULL;
2990 }
2991 /* Potential inconsistencies from not holding pmap lock
2992 * but harmless for the moment.
2993 */
2994 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
2995 if ((cv + NBPML4) > cv) {
2996 cv += NBPML4;
2997 } else {
2998 break;
2999 }
3000 skip4++;
3001 continue;
3002 }
3003 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
3004 if ((cv + NBPD) > cv) {
3005 cv += NBPD;
3006 } else {
3007 break;
3008 }
3009 skip2++;
3010 continue;
3011 }
3012
3013 pt_entry_t *ptep = pmap_pte(ipmap, cv);
3014 if (ptep && (*ptep & INTEL_PTE_VALID)) {
3015 if (*ptep & INTEL_PTE_WRITE) {
3016 if (!(*ptep & INTEL_PTE_NX)) {
3017 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
3018 rv = KERN_FAILURE;
3019 }
3020 }
3021 }
3022 cv += PAGE_SIZE;
3023 }
3024 kprintf("Completed pmap scan\n");
3025 cv = sv;
3026
3027 struct vm_region_submap_info_64 vbr;
3028 mach_msg_type_number_t vbrcount = 0;
3029 mach_vm_size_t vmsize;
3030 vm_prot_t prot;
3031 uint32_t nesting_depth = 0;
3032 kern_return_t kret;
3033
3034 while (cv < ev) {
3035 for (;;) {
3036 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
3037 if ((kret = mach_vm_region_recurse(ivmmap,
3038 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
3039 (vm_region_recurse_info_t)&vbr,
3040 &vbrcount)) != KERN_SUCCESS) {
3041 break;
3042 }
3043
3044 if (vbr.is_submap) {
3045 nesting_depth++;
3046 continue;
3047 } else {
3048 break;
3049 }
3050 }
3051
3052 if (kret != KERN_SUCCESS) {
3053 break;
3054 }
3055
3056 prot = vbr.protection;
3057
3058 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
3059 kprintf("W+X map entry at address 0x%lx\n", cv);
3060 rv = KERN_FAILURE;
3061 }
3062
3063 if (prot) {
3064 vm_offset_t pcv;
3065 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
3066 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
3067 vm_prot_t tprot;
3068
3069 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
3070 continue;
3071 }
3072 tprot = VM_PROT_READ;
3073 if (*ptep & INTEL_PTE_WRITE) {
3074 tprot |= VM_PROT_WRITE;
3075 }
3076 if ((*ptep & INTEL_PTE_NX) == 0) {
3077 tprot |= VM_PROT_EXECUTE;
3078 }
3079 if (tprot != prot) {
3080 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
3081 rv = KERN_FAILURE;
3082 }
3083 }
3084 }
3085 cv += vmsize;
3086 }
3087 return rv;
3088 }
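
/*
 * A minimal standalone sketch of the PTE-to-protection reconstruction done
 * in the scan above: a valid entry is at least readable, the write bit adds
 * write permission, and a clear NX bit adds execute permission.  The bit
 * positions are the architectural x86 valid/write/NX bits; the protection
 * constants are local to the example.
 */
#if 0 /* illustrative example only */
#include <stdint.h>

#define EX_PTE_VALID    (1ULL << 0)
#define EX_PTE_WRITE    (1ULL << 1)
#define EX_PTE_NX       (1ULL << 63)

#define EX_PROT_READ    0x1
#define EX_PROT_WRITE   0x2
#define EX_PROT_EXECUTE 0x4

static unsigned
prot_from_pte(uint64_t pte)
{
	unsigned prot = 0;

	if (pte & EX_PTE_VALID) {
		prot |= EX_PROT_READ;
		if (pte & EX_PTE_WRITE) {
			prot |= EX_PROT_WRITE;
		}
		if ((pte & EX_PTE_NX) == 0) {
			prot |= EX_PROT_EXECUTE;
		}
	}
	return prot;   /* W+X shows up as WRITE and EXECUTE both set */
}
#endif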
3089
3090 #if MACH_ASSERT
3091 extern int pmap_ledgers_panic;
3092 extern int pmap_ledgers_panic_leeway;
3093
3094 static void
3095 pmap_check_ledgers(
3096 pmap_t pmap)
3097 {
3098 int pid;
3099 char *procname;
3100
3101 if (pmap->pmap_pid == 0) {
3102 /*
3103 * This pmap was not or is no longer fully associated
3104 * with a task (e.g. the old pmap after a fork()/exec() or
3105 * spawn()). Its "ledger" still points at a task that is
3106 * now using a different (and active) address space, so
3107 * we can't check that all the pmap ledgers are balanced here.
3108 *
3109 * If the "pid" is set, that means that we went through
3110 * pmap_set_process() in task_terminate_internal(), so
3111 * this task's ledger should not have been re-used and
3112 * all the pmap ledgers should be back to 0.
3113 */
3114 return;
3115 }
3116
3117 pid = pmap->pmap_pid;
3118 procname = pmap->pmap_procname;
3119
3120 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3121
3122 if (pmap->stats.resident_count != 0 ||
3123 #if 35156815
3124 /*
3125 * "wired_count" is unfortunately a bit inaccurate, so let's
3126 * tolerate some slight deviation to limit the amount of
3127 * somewhat-spurious assertion failures.
3128 */
3129 pmap->stats.wired_count > 10 ||
3130 #else /* 35156815 */
3131 pmap->stats.wired_count != 0 ||
3132 #endif /* 35156815 */
3133 pmap->stats.device != 0 ||
3134 pmap->stats.internal != 0 ||
3135 pmap->stats.external != 0 ||
3136 pmap->stats.reusable != 0 ||
3137 pmap->stats.compressed != 0) {
3138 if (pmap_stats_assert &&
3139 pmap->pmap_stats_assert) {
3140 panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3141 pmap, pid, procname,
3142 pmap->stats.resident_count,
3143 pmap->stats.wired_count,
3144 pmap->stats.device,
3145 pmap->stats.internal,
3146 pmap->stats.external,
3147 pmap->stats.reusable,
3148 pmap->stats.compressed);
3149 } else {
3150 printf("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3151 pmap, pid, procname,
3152 pmap->stats.resident_count,
3153 pmap->stats.wired_count,
3154 pmap->stats.device,
3155 pmap->stats.internal,
3156 pmap->stats.external,
3157 pmap->stats.reusable,
3158 pmap->stats.compressed);
3159 }
3160 }
3161 }
3162
3163 void
3164 pmap_set_process(
3165 pmap_t pmap,
3166 int pid,
3167 char *procname)
3168 {
3169 if (pmap == NULL) {
3170 return;
3171 }
3172
3173 pmap->pmap_pid = pid;
3174 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3175 if (pmap_ledgers_panic_leeway) {
3176 /*
3177 * XXX FBDP
3178 * Some processes somehow trigger some issues that make
3179 * the pmap stats and ledgers go off track, causing
3180 * some assertion failures and ledger panics.
3181 * Turn off the sanity checks if we allow some ledger leeway
3182 * because of that. We'll still do a final check in
3183 * pmap_check_ledgers() for discrepancies larger than the
3184 * allowed leeway after the address space has been fully
3185 * cleaned up.
3186 */
3187 pmap->pmap_stats_assert = FALSE;
3188 ledger_disable_panic_on_negative(pmap->ledger,
3189 task_ledgers.phys_footprint);
3190 ledger_disable_panic_on_negative(pmap->ledger,
3191 task_ledgers.internal);
3192 ledger_disable_panic_on_negative(pmap->ledger,
3193 task_ledgers.internal_compressed);
3194 ledger_disable_panic_on_negative(pmap->ledger,
3195 task_ledgers.iokit_mapped);
3196 ledger_disable_panic_on_negative(pmap->ledger,
3197 task_ledgers.alternate_accounting);
3198 ledger_disable_panic_on_negative(pmap->ledger,
3199 task_ledgers.alternate_accounting_compressed);
3200 }
3201 }
3202 #endif /* MACH_ASSERT */
3203
3204
3205 #if DEVELOPMENT || DEBUG
3206 int pmap_pagezero_mitigation = 1;
3207 #endif
3208
3209 void
3210 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3211 {
3212 #if DEVELOPMENT || DEBUG
3213 if (pmap_pagezero_mitigation == 0) {
3214 lpmap->pagezero_accessible = FALSE;
3215 return;
3216 }
3217 #endif
3218 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3219 if (lpmap == current_pmap()) {
3220 mp_disable_preemption();
3221 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3222 mp_enable_preemption();
3223 }
3224 }
3225
3226 uintptr_t
3227 pmap_verify_noncacheable(uintptr_t vaddr)
3228 {
3229 pt_entry_t *ptep = NULL;
3230 ptep = pmap_pte(kernel_pmap, vaddr);
3231 if (ptep == NULL) {
3232 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3233 }
3234 /* Non-cacheable OK */
3235 if (*ptep & (INTEL_PTE_NCACHE)) {
3236 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3237 }
3238 /* Write-combined OK */
3239 if (*ptep & (INTEL_PTE_PAT)) {
3240 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3241 }
3242 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3243 /*NOTREACHED*/
3244 return 0;
3245 }
3246
3247 void
3248 trust_cache_init(void)
3249 {
3250 // Unsupported on this architecture.
3251 }
3252
3253 kern_return_t
3254 pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache __unused *trust_cache,
3255 const vm_size_t __unused trust_cache_len)
3256 {
3257 // Unsupported on this architecture.
3258 return KERN_NOT_SUPPORTED;
3259 }
3260
3261 pmap_tc_ret_t
3262 pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cache,
3263 const vm_size_t __unused trust_cache_len,
3264 uint8_t const * __unused img4_manifest,
3265 const vm_size_t __unused img4_manifest_buffer_len,
3266 const vm_size_t __unused img4_manifest_actual_len,
3267 bool __unused dry_run)
3268 {
3269 // Unsupported on this architecture.
3270 return PMAP_TC_UNKNOWN_FORMAT;
3271 }
3272
3273
3274 bool
3275 pmap_is_trust_cache_loaded(const uuid_t __unused uuid)
3276 {
3277 // Unsupported on this architecture.
3278 return false;
3279 }
3280
3281 bool
3282 pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
3283 {
3284 // Unsupported on this architecture.
3285 return false;
3286 }
3287
3288 uint32_t
3289 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3290 {
3291 // Unsupported on this architecture.
3292 return false;
3293 }
3294
3295 bool
3296 pmap_in_ppl(void)
3297 {
3298 // Nonexistent on this architecture.
3299 return false;
3300 }
3301
3302 void *
3303 pmap_claim_reserved_ppl_page(void)
3304 {
3305 // Unsupported on this architecture.
3306 return NULL;
3307 }
3308
3309 void
3310 pmap_free_reserved_ppl_page(void __unused *kva)
3311 {
3312 // Unsupported on this architecture.
3313 }