1/*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58
59/*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91#include <string.h>
92#include <mach_ldebug.h>
93
94#include <libkern/OSAtomic.h>
95
96#include <mach/machine/vm_types.h>
97
98#include <mach/boolean.h>
99#include <kern/thread.h>
100#include <kern/zalloc.h>
101#include <kern/queue.h>
102#include <kern/ledger.h>
103#include <kern/mach_param.h>
104
105#include <kern/spl.h>
106
107#include <vm/pmap.h>
108#include <vm/vm_map.h>
109#include <vm/vm_kern.h>
110#include <mach/vm_param.h>
111#include <mach/vm_prot.h>
112#include <vm/vm_object.h>
113#include <vm/vm_page.h>
114
115#include <mach/machine/vm_param.h>
116#include <machine/thread.h>
117
118#include <kern/misc_protos.h> /* prototyping */
119#include <i386/misc_protos.h>
120#include <i386/i386_lowmem.h>
121#include <x86_64/lowglobals.h>
122
123#include <i386/cpuid.h>
124#include <i386/cpu_data.h>
125#include <i386/cpu_number.h>
126#include <i386/machine_cpu.h>
127#include <i386/seg.h>
128#include <i386/serial_io.h>
129#include <i386/cpu_capabilities.h>
130#include <i386/machine_routines.h>
131#include <i386/proc_reg.h>
132#include <i386/tsc.h>
133#include <i386/pmap_internal.h>
134#include <i386/pmap_pcid.h>
135#if CONFIG_VMX
136#include <i386/vmx/vmx_cpu.h>
137#endif
138
139#include <vm/vm_protos.h>
140#include <san/kasan.h>
141
142#include <i386/mp.h>
143#include <i386/mp_desc.h>
144#include <libkern/kernel_mach_header.h>
145
146#include <pexpert/i386/efi.h>
147#include <libkern/section_keywords.h>
148#if MACH_ASSERT
149int pmap_stats_assert = 1;
150#endif /* MACH_ASSERT */
151
152#ifdef IWANTTODEBUG
153#undef DEBUG
154#define DEBUG 1
155#define POSTCODE_DELAY 1
156#include <i386/postcode.h>
157#endif /* IWANTTODEBUG */
158
159#ifdef PMAP_DEBUG
160#define DBG(x...) kprintf("DBG: " x)
161#else
162#define DBG(x...)
163#endif
164/* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
165 * in the trampolines for kernel/user boundary TLB coherency.
166 */
167char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
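/*
 * Note on the declaration above: the conditional array size is a compile-time
 * check. If the offset/alignment conditions hold, the array has size 1; if
 * they fail, the size evaluates to -1 and the compiler rejects the build.
 */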
168boolean_t pmap_trace = FALSE;
169
170boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
171
172#if DEVELOPMENT || DEBUG
173int nx_enabled = 1; /* enable no-execute protection -- set during boot */
174#else
175const int nx_enabled = 1;
176#endif
177
178#if DEBUG || DEVELOPMENT
179int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
180int allow_stack_exec = 0; /* No apps may execute from the stack by default */
181#else /* DEBUG || DEVELOPMENT */
182const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
183const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
184#endif /* DEBUG || DEVELOPMENT */
185
186uint64_t max_preemption_latency_tsc = 0;
187
188pv_hashed_entry_t *pv_hash_table; /* hash lists */
189
190uint32_t npvhashmask = 0, npvhashbuckets = 0;
191
192pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
193pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
194SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
195SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
196SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
197SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
198
199SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
200
201/*
202 * First and last physical addresses that we maintain any information
203 * for. Initialized to zero so that pmap operations done before
204 * pmap_init won't touch any non-existent structures.
205 */
206boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
207
208static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
209static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
210static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;
211
212/*
 213 * Array of physical page attributes for managed pages.
214 * One byte per physical page.
215 */
216char *pmap_phys_attributes;
217ppnum_t last_managed_page = 0;
218
219unsigned pmap_memory_region_count;
220unsigned pmap_memory_region_current;
221
222pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
223
224/*
225 * Other useful macros.
226 */
227#define current_pmap() (vm_map_pmap(current_thread()->map))
228
229struct pmap kernel_pmap_store;
230SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = NULL;
231SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
232SECURITY_READ_ONLY_LATE(zone_t) pmap_anchor_zone;
233SECURITY_READ_ONLY_LATE(zone_t) pmap_uanchor_zone;
234int pmap_debug = 0; /* flag for debugging prints */
235
236unsigned int inuse_ptepages_count = 0;
237long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
238unsigned int bootstrap_wired_pages = 0;
239
240extern long NMIPI_acks;
241
242SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE;
243
244extern char end;
245
246static int nkpt;
247
248#if DEVELOPMENT || DEBUG
249SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE;
250SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE;
251SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE;
252#else
253const boolean_t wpkernel = TRUE;
254#endif
255
256extern long __stack_chk_guard[];
257
258static uint64_t pmap_eptp_flags = 0;
259boolean_t pmap_ept_support_ad = FALSE;
260
261static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
262/*
263 * Map memory at initialization. The physical addresses being
264 * mapped are not managed and are never unmapped.
265 *
266 * For now, VM is already on, we only need to map the
267 * specified memory.
268 */
269vm_offset_t
270pmap_map(
271 vm_offset_t virt,
272 vm_map_offset_t start_addr,
273 vm_map_offset_t end_addr,
274 vm_prot_t prot,
275 unsigned int flags)
276{
277 kern_return_t kr;
278 int ps;
279
280 ps = PAGE_SIZE;
281 while (start_addr < end_addr) {
282 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
283 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
284
285 if (kr != KERN_SUCCESS) {
286 panic("%s: failed pmap_enter, "
287 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
288 __FUNCTION__,
289 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
290 }
291
292 virt += ps;
293 start_addr += ps;
294 }
295 return virt;
296}
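/*
 * Note: pmap_map() returns the virtual address immediately past the range it
 * mapped (the input 'virt' advanced one page per page entered), so callers
 * can continue mapping from the returned value.
 */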
297
298extern char *first_avail;
299extern vm_offset_t virtual_avail, virtual_end;
300extern pmap_paddr_t avail_start, avail_end;
301extern vm_offset_t sHIB;
302extern vm_offset_t eHIB;
303extern vm_offset_t stext;
304extern vm_offset_t etext;
305extern vm_offset_t sdata, edata;
306extern vm_offset_t sconst, econst;
307
308extern void *KPTphys;
309
310boolean_t pmap_smep_enabled = FALSE;
311boolean_t pmap_smap_enabled = FALSE;
312
313void
314pmap_cpu_init(void)
315{
316 cpu_data_t *cdp = current_cpu_datap();
317
318 set_cr4(get_cr4() | CR4_PGE);
319
320 /*
321 * Initialize the per-cpu, TLB-related fields.
322 */
323 cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
324 cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
325 cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
326 cdp->cpu_tlb_invalid = 0;
327 cdp->cpu_task_map = TASK_MAP_64BIT;
328
329 pmap_pcid_configure();
330 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
331 pmap_smep_enabled = TRUE;
332#if DEVELOPMENT || DEBUG
333 boolean_t nsmep;
334 if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
335 pmap_smep_enabled = FALSE;
336 }
337#endif
338 if (pmap_smep_enabled) {
339 set_cr4(get_cr4() | CR4_SMEP);
340 }
341 }
342 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
343 pmap_smap_enabled = TRUE;
344#if DEVELOPMENT || DEBUG
345 boolean_t nsmap;
346 if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
347 pmap_smap_enabled = FALSE;
348 }
349#endif
350 if (pmap_smap_enabled) {
351 set_cr4(get_cr4() | CR4_SMAP);
352 }
353 }
354
355#if !MONOTONIC
356 if (cdp->cpu_fixed_pmcs_enabled) {
357 boolean_t enable = TRUE;
358 cpu_pmc_control(&enable);
359 }
360#endif /* !MONOTONIC */
361}
362
363static uint32_t
364pmap_scale_shift(void)
365{
366 uint32_t scale = 0;
367
368 if (sane_size <= 8 * GB) {
369 scale = (uint32_t)(sane_size / (2 * GB));
370 } else if (sane_size <= 32 * GB) {
371 scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
372 } else {
373 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
374 }
375 return scale;
376}
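/*
 * Worked examples of the scaling above: with 4GB of memory the shift is 2;
 * with 16GB it is 4 + (16-8)/4 = 6; with 64GB it is capped at 10 + 4 = 14.
 * The shift is applied to NPVHASHBUCKETS in pmap_bootstrap() to size the
 * pv hash for larger memory configurations.
 */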
377
378LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
379LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
380
381/*
382 * Bootstrap the system enough to run with virtual memory.
383 * Map the kernel's code and data, and allocate the system page table.
384 * Called with mapping OFF. Page_size must already be set.
385 */
386
387void
388pmap_bootstrap(
389 __unused vm_offset_t load_start,
390 __unused boolean_t IA32e)
391{
392 assert(IA32e);
393
394 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
395 * known to VM */
396 /*
397 * The kernel's pmap is statically allocated so we don't
398 * have to use pmap_create, which is unlikely to work
399 * correctly at this part of the boot sequence.
400 */
401
402 kernel_pmap = &kernel_pmap_store;
403 os_ref_init(&kernel_pmap->ref_count, NULL);
404#if DEVELOPMENT || DEBUG
405 kernel_pmap->nx_enabled = TRUE;
406#endif
407 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
408 kernel_pmap->pm_obj = (vm_object_t) NULL;
409 kernel_pmap->pm_pml4 = IdlePML4;
410 kernel_pmap->pm_upml4 = IdlePML4;
411 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
412 kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
413 kernel_pmap->pm_eptp = 0;
414
415 pmap_pcid_initialize_kernel(kernel_pmap);
416
417 current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
418
419 nkpt = NKPT;
420 OSAddAtomic(NKPT, &inuse_ptepages_count);
421 OSAddAtomic64(NKPT, &alloc_ptepages_count);
422 bootstrap_wired_pages = NKPT;
423
424 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
425 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
426
427 if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
428 npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
429 }
430
431 npvhashbuckets = npvhashmask + 1;
432
433 if (0 != ((npvhashbuckets) & npvhashmask)) {
434 panic("invalid hash %d, must be ((2^N)-1), "
435 "using default %d\n", npvhashmask, NPVHASHMASK);
436 }
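 /*
 * Example: npvhashmask must be one less than a power of two so that
 * (npvhashbuckets & npvhashmask) == 0 in the check above; e.g. a boot-arg
 * of npvhash=0xFFF yields 0x1000 hash buckets.
 */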
437
438 lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
439 kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;
440
441 pmap_cpu_init();
442
443 if (pmap_pcid_ncpus) {
444 printf("PMAP: PCID enabled\n");
445 }
446
447 if (pmap_smep_enabled) {
448 printf("PMAP: Supervisor Mode Execute Protection enabled\n");
449 }
450 if (pmap_smap_enabled) {
451 printf("PMAP: Supervisor Mode Access Protection enabled\n");
452 }
453
454#if DEBUG
455 printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
456 printf("early_random(): 0x%qx\n", early_random());
457#endif
458#if DEVELOPMENT || DEBUG
459 boolean_t ptmp;
460 /* Check if the user has requested disabling stack or heap no-execute
461 * enforcement. These are "const" variables; that qualifier is cast away
462 * when altering them. The TEXT/DATA const sections are marked
463 * write protected later in the kernel startup sequence, so altering
464 * them is possible at this point, in pmap_bootstrap().
465 */
466 if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
467 boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
468 *pdknxp = TRUE;
469 }
470
471 if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
472 boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
473 *pdknhp = TRUE;
474 }
475#endif /* DEVELOPMENT || DEBUG */
476
477 boot_args *args = (boot_args *)PE_state.bootArgs;
478 if (args->efiMode == kBootArgsEfiMode32) {
479 printf("EFI32: kernel virtual space limited to 4GB\n");
480 virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
481 }
482 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
483 (long)KERNEL_BASE, (long)virtual_end);
484 kprintf("Available physical space from 0x%llx to 0x%llx\n",
485 avail_start, avail_end);
486
487 /*
488 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
489 * in the DEBUG kernel) to force the kernel to switch to its own map
490 * (and cr3) when control is in kernelspace. The kernel's map does not
491 * include (i.e. share) userspace so wild references will cause
492 * a panic. Only copyin and copyout are exempt from this.
493 */
494 (void) PE_parse_boot_argn("-no_shared_cr3",
495 &no_shared_cr3, sizeof(no_shared_cr3));
496 if (no_shared_cr3) {
497 kprintf("Kernel not sharing user map\n");
498 }
499
500#ifdef PMAP_TRACES
501 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
502 kprintf("Kernel traces for pmap operations enabled\n");
503 }
504#endif /* PMAP_TRACES */
505
506#if MACH_ASSERT
507 PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
508 PE_parse_boot_argn("pmap_stats_assert",
509 &pmap_stats_assert,
510 sizeof(pmap_stats_assert));
511#endif /* MACH_ASSERT */
512}
513
514void
515pmap_virtual_space(
516 vm_offset_t *startp,
517 vm_offset_t *endp)
518{
519 *startp = virtual_avail;
520 *endp = virtual_end;
521}
522
523
524
525
526#if HIBERNATION
527
528#include <IOKit/IOHibernatePrivate.h>
529#include <machine/pal_hibernate.h>
530
531int32_t pmap_npages;
532int32_t pmap_teardown_last_valid_compact_indx = -1;
533
534void pmap_pack_index(uint32_t);
535int32_t pmap_unpack_index(pv_rooted_entry_t);
536
537int32_t
538pmap_unpack_index(pv_rooted_entry_t pv_h)
539{
540 int32_t indx = 0;
541
542 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
543 indx = indx << 16;
544 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
545
546 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
547 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
548
549 return indx;
550}
551
552
553void
554pmap_pack_index(uint32_t indx)
555{
556 pv_rooted_entry_t pv_h;
557
558 pv_h = &pv_head_table[indx];
559
560 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
561 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
562
563 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
564 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
565}
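/*
 * Packing example: the 32-bit pv_head_table index is split into two 16-bit
 * halves and stowed in the top 16 bits of qlink.next (high half) and
 * qlink.prev (low half). For index 0x00012345, bits 63:48 of next become
 * 0x0001 and bits 63:48 of prev become 0x2345. pmap_unpack_index()
 * reassembles the index and then restores the 0xffff pattern that canonical
 * kernel pointers carry in those bits.
 */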
566
567
568void
569pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
570{
571 int32_t i;
572 int32_t compact_target_indx;
573
574 compact_target_indx = 0;
575
576 for (i = 0; i < pmap_npages; i++) {
577 if (pv_head_table[i].pmap == PMAP_NULL) {
578 if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
579 compact_target_indx = i;
580 }
581 } else {
582 pmap_pack_index((uint32_t)i);
583
584 if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
585 /*
586 * we've got a hole to fill, so
 587 * move this pv_rooted_entry_t to its new home
588 */
589 pv_head_table[compact_target_indx] = pv_head_table[i];
590 pv_head_table[i].pmap = PMAP_NULL;
591
592 pmap_teardown_last_valid_compact_indx = compact_target_indx;
593 compact_target_indx++;
594 } else {
595 pmap_teardown_last_valid_compact_indx = i;
596 }
597 }
598 }
599 *unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
600 *unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];
601
602 HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
603}
604
605
606void
607pal_hib_rebuild_pmap_structs(void)
608{
609 int32_t cindx, eindx, rindx = 0;
610 pv_rooted_entry_t pv_h;
611
612 eindx = (int32_t)pmap_npages;
613
614 for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
615 pv_h = &pv_head_table[cindx];
616
617 rindx = pmap_unpack_index(pv_h);
618 assert(rindx < pmap_npages);
619
620 if (rindx != cindx) {
621 /*
622 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
623 * so move it back to its real location
624 */
625 pv_head_table[rindx] = pv_head_table[cindx];
626 }
627 if (rindx + 1 != eindx) {
628 /*
 629 * the 'hole' between this pv_rooted_entry_t and the previous
 630 * pv_rooted_entry_t we moved needs to be initialized as
 631 * a range of zeroed pv_rooted_entry_t's
632 */
633 bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
634 }
635 eindx = rindx;
636 }
637 if (rindx) {
638 bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
639 }
640
641 HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
642}
643
644#endif
645
646/*
647 * Create pv entries for kernel pages mapped by early startup code.
648 * These have to exist so we can ml_static_mfree() them later.
649 */
650static void
651pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
652{
653 ppnum_t ppn;
654 pv_rooted_entry_t pv_h;
655 uint32_t pgsz;
656
657 start_va = round_page(start_va);
658 end_va = trunc_page(end_va);
659 while (start_va < end_va) {
660 pgsz = PAGE_SIZE;
661 ppn = pmap_find_phys(kernel_pmap, start_va);
662 if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
663 pv_h = pai_to_pvh(ppn);
664 assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */
665 assert(pv_h->pmap == 0);
666 pv_h->va_and_flags = start_va;
667 pv_h->pmap = kernel_pmap;
668 queue_init(&pv_h->qlink);
669 if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
670 pgsz = I386_LPGBYTES;
671 }
672 }
673 start_va += pgsz;
674 }
675}
676
677/*
678 * Initialize the pmap module.
679 * Called by vm_init, to initialize any structures that the pmap
680 * system needs to map virtual memory.
681 */
682void
683pmap_init(void)
684{
685 long npages;
686 vm_offset_t addr;
687 vm_size_t s, vsize;
688 vm_map_offset_t vaddr;
689 ppnum_t ppn;
690
691
692 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
693 _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);
694
695 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
696 _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);
697
698 kernel_pmap->pm_obj = &kptobj_object_store;
699 _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);
700
701 /*
702 * Allocate memory for the pv_head_table and its lock bits,
703 * the modify bit array, and the pte_page table.
704 */
705
706 /*
 707 * Zero-bias all these arrays now (rather than biasing them off avail_start)
 708 * so that they cover all of memory.
709 */
710
711 npages = i386_btop(avail_end);
712#if HIBERNATION
713 pmap_npages = (uint32_t)npages;
714#endif
715 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
716 + (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets))
717 + pv_lock_table_size(npages)
718 + pv_hash_lock_table_size((npvhashbuckets))
719 + npages);
720 s = round_page(s);
721 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
722 KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PMAP)
723 != KERN_SUCCESS) {
724 panic("pmap_init");
725 }
726
727 memset((char *)addr, 0, s);
728
729 vaddr = addr;
730 vsize = s;
731
732#if PV_DEBUG
733 if (0 == npvhashmask) {
734 panic("npvhashmask not initialized");
735 }
736#endif
737
738 /*
739 * Allocate the structures first to preserve word-alignment.
740 */
741 pv_head_table = (pv_rooted_entry_t) addr;
742 addr = (vm_offset_t) (pv_head_table + npages);
743
744 pv_hash_table = (pv_hashed_entry_t *)addr;
745 addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));
746
747 pv_lock_table = (char *) addr;
748 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
749
750 pv_hash_lock_table = (char *) addr;
751 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));
752
753 pmap_phys_attributes = (char *) addr;
754
755 ppnum_t last_pn = i386_btop(avail_end);
756 unsigned int i;
757 pmap_memory_region_t *pmptr = pmap_memory_regions;
758 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
759 if (pmptr->type != kEfiConventionalMemory) {
760 continue;
761 }
762 ppnum_t pn;
763 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
764 if (pn < last_pn) {
765 pmap_phys_attributes[pn] |= PHYS_MANAGED;
766
767 if (pn > last_managed_page) {
768 last_managed_page = pn;
769 }
770
771 if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
772 (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
773 pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
774 }
775 }
776 }
777 }
778 while (vsize) {
779 ppn = pmap_find_phys(kernel_pmap, vaddr);
780
781 pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;
782
783 vaddr += PAGE_SIZE;
784 vsize -= PAGE_SIZE;
785 }
786 /*
787 * Create the zone of physical maps,
788 * and of the physical-to-virtual entries.
789 */
790 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
791 ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
792
793 /* The anchor is required to be page aligned. Zone debugging adds
794 * padding which may violate that requirement. Tell the zone
795 * subsystem that alignment is required.
796 */
797 pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
798 ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);
799
 800/* TODO: possible general optimisation: pre-allocate commonly created
 801 * level-3/level-2 pagetables via zones
802 */
803 /* The anchor is required to be page aligned. Zone debugging adds
804 * padding which may violate that requirement. Tell the zone
805 * subsystem that alignment is required.
806 */
807 pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
808 ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);
809
810 pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
811 ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);
812
813 /*
814 * Create pv entries for kernel pages that might get pmap_remove()ed.
815 *
816 * - very low pages that were identity mapped.
817 * - vm_pages[] entries that might be unused and reclaimed.
818 */
819 assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
820 pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
821 pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);
822
823 pmap_initialized = TRUE;
824
825 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
826
827 /*
828 * Ensure the kernel's PML4 entry exists for the basement
829 * before this is shared with any user.
830 */
831 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
832
833#if CONFIG_VMX
834 pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
835 pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
836#endif /* CONFIG_VMX */
837}
838
839void
840pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
841{
842 uint64_t ev = sv + nxrosz, cv = sv;
843 pd_entry_t *pdep;
844 pt_entry_t *ptep = NULL;
845
846 /* XXX what if nxrosz is 0? we end up marking the page whose address is passed in via sv -- is that kosher? */
847 assert(!is_ept_pmap(npmap));
848
849 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
850
851 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
852 uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
853
854 if (*pdep & INTEL_PTE_PS) {
855#ifdef REMAP_DEBUG
856 if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
857 kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
858 (*pdep & INTEL_PTE_VALID) ? "R" : "",
859 (*pdep & INTEL_PTE_WRITE) ? "W" : "",
860 (*pdep & INTEL_PTE_NX) ? "" : "X",
861 "R",
862 ro ? "" : "W",
863 NX ? "" : "X");
864 }
865#endif
866
867 if (NX) {
868 *pdep |= INTEL_PTE_NX;
869 } else {
870 *pdep &= ~INTEL_PTE_NX;
871 }
872 if (ro) {
873 *pdep &= ~INTEL_PTE_WRITE;
874 } else {
875 *pdep |= INTEL_PTE_WRITE;
876 }
877 cv += NBPD;
878 cv &= ~((uint64_t) PDEMASK);
879 pdep = pmap_pde(npmap, cv);
880 continue;
881 }
882
883 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
884#ifdef REMAP_DEBUG
885 if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
886 kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
887 (*ptep & INTEL_PTE_VALID) ? "R" : "",
888 (*ptep & INTEL_PTE_WRITE) ? "W" : "",
889 (*ptep & INTEL_PTE_NX) ? "" : "X",
890 "R",
891 ro ? "" : "W",
892 NX ? "" : "X");
893 }
894#endif
895 if (NX) {
896 *ptep |= INTEL_PTE_NX;
897 } else {
898 *ptep &= ~INTEL_PTE_NX;
899 }
900 if (ro) {
901 *ptep &= ~INTEL_PTE_WRITE;
902 } else {
903 *ptep |= INTEL_PTE_WRITE;
904 }
905 cv += NBPT;
906 ptep = pmap_pte(npmap, cv);
907 }
908 }
909 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
910}
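/*
 * Usage note: pmap_lowmem_finalize() below calls, for example,
 * pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE)
 * to leave the __HIB __text section executable but read-only, and passes
 * (TRUE, FALSE) for the other __HIB sections to make them NX and writable.
 */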
911
912/*
913 * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
914 * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
915 * so we can free it using its address in that array.
916 */
917static void
918pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
919{
920 ppnum_t KPTphys_ppn;
921 vm_offset_t offset;
922
923 KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
924 assert(ppn >= KPTphys_ppn);
925 assert(ppn + cnt <= KPTphys_ppn + NKPT);
926 offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
927 ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
928}
929
930/*
931 * Called once VM is fully initialized so that we can release unused
932 * sections of low memory to the general pool.
933 * Also complete the set-up of identity-mapped sections of the kernel:
934 * 1) write-protect kernel text
935 * 2) map kernel text using large pages if possible
936 * 3) read and write-protect page zero (for K32)
937 * 4) map the global page at the appropriate virtual address.
938 *
939 * Use of large pages
940 * ------------------
941 * To effectively map and write-protect all kernel text pages, the text
942 * must be 2M-aligned at the base, and the data section above must also be
943 * 2M-aligned. That is, there's padding below and above. This is achieved
944 * through linker directives. Large pages are used only if this alignment
 945 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
946 * memory layout is:
947 *
948 * : :
949 * | __DATA |
950 * sdata: ================== 2Meg
951 * | |
952 * | zero-padding |
953 * | |
954 * etext: ------------------
955 * | |
956 * : :
957 * | |
958 * | __TEXT |
959 * | |
960 * : :
961 * | |
962 * stext: ================== 2Meg
963 * | |
964 * | zero-padding |
965 * | |
966 * eHIB: ------------------
967 * | __HIB |
968 * : :
969 *
970 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
971 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
972 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
973 * The now unused level-1 PTE pages are also freed.
974 */
975extern ppnum_t vm_kernel_base_page;
976static uint32_t dataptes = 0;
977
978void
979pmap_lowmem_finalize(void)
980{
981 spl_t spl;
982 int i;
983
984 /*
985 * Update wired memory statistics for early boot pages
986 */
987 PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
988
989 /*
990 * Free pages in pmap regions below the base:
991 * rdar://6332712
992 * We can't free all the pages to VM that EFI reports available.
993 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
 994 * There's also a size miscalculation here: pend is one page less
 995 * than it should be, but this is left unfixed for backwards
 996 * compatibility.
 997 * This is important for KASLR because up to 256*2MB = 512MB of space
 998 * has to be released to VM.
999 */
1000 for (i = 0;
1001 pmap_memory_regions[i].end < vm_kernel_base_page;
1002 i++) {
1003 vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
1004 vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end + 1);
1005
1006 DBG("pmap region %d [%p..[%p\n",
1007 i, (void *) pbase, (void *) pend);
1008
1009 if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
1010 continue;
1011 }
1012 /*
1013 * rdar://6332712
1014 * Adjust limits not to free pages in range 0xc0000-0xff000.
1015 */
1016 if (pbase >= 0xc0000 && pend <= 0x100000) {
1017 continue;
1018 }
1019 if (pbase < 0xc0000 && pend > 0x100000) {
1020 /* page range entirely within region, free lower part */
1021 DBG("- ml_static_mfree(%p,%p)\n",
1022 (void *) ml_static_ptovirt(pbase),
1023 (void *) (0xc0000 - pbase));
1024 ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
1025 pbase = 0x100000;
1026 }
1027 if (pbase < 0xc0000) {
1028 pend = MIN(pend, 0xc0000);
1029 }
1030 if (pend > 0x100000) {
1031 pbase = MAX(pbase, 0x100000);
1032 }
1033 DBG("- ml_static_mfree(%p,%p)\n",
1034 (void *) ml_static_ptovirt(pbase),
1035 (void *) (pend - pbase));
1036 ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
1037 }
1038
1039 /* A final pass to get rid of all initial identity mappings to
1040 * low pages.
1041 */
1042 DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
1043
1044 /*
1045 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
1046 * Non-boot-cpu GDT aliases will be remapped later as needed.
1047 */
1048 pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
1049
1050 /*
1051 * Release any memory for early boot 4K page table pages that got replaced
1052 * with large page mappings for vm_pages[]. We know this memory is part of
1053 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
1054 * it using that address.
1055 */
1056 pmap_free_early_PT(released_PT_ppn, released_PT_cnt);
1057
1058 /*
1059 * If text and data are both 2MB-aligned,
1060 * we can map text with large-pages,
1061 * unless the -kernel_text_ps_4K boot-arg overrides.
1062 */
1063 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
1064 kprintf("Kernel text is 2MB aligned");
1065 kernel_text_ps_4K = FALSE;
1066 if (PE_parse_boot_argn("-kernel_text_ps_4K",
1067 &kernel_text_ps_4K,
1068 sizeof(kernel_text_ps_4K))) {
1069 kprintf(" but will be mapped with 4K pages\n");
1070 } else {
1071 kprintf(" and will be mapped with 2M pages\n");
1072 }
1073 }
1074#if DEVELOPMENT || DEBUG
1075 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
1076#endif
1077 if (wpkernel) {
1078 kprintf("Kernel text %p-%p to be write-protected\n",
1079 (void *) stext, (void *) etext);
1080 }
1081
1082 spl = splhigh();
1083
1084 /*
1085 * Scan over text if mappings are to be changed:
1086 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
 1087 * - Change to large-pages if possible and not overridden.
1088 */
1089 if (kernel_text_ps_4K && wpkernel) {
1090 vm_offset_t myva;
1091 for (myva = stext; myva < etext; myva += PAGE_SIZE) {
1092 pt_entry_t *ptep;
1093
1094 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1095 if (ptep) {
1096 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE);
1097 }
1098 }
1099 }
1100
1101 if (!kernel_text_ps_4K) {
1102 vm_offset_t myva;
1103
1104 /*
1105 * Release zero-filled page padding used for 2M-alignment.
1106 */
1107 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1108 (void *) eHIB, (void *) (stext - eHIB));
1109 ml_static_mfree(eHIB, stext - eHIB);
1110 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1111 (void *) etext, (void *) (sdata - etext));
1112 ml_static_mfree(etext, sdata - etext);
1113
1114 /*
1115 * Coalesce text pages into large pages.
1116 */
1117 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
1118 pt_entry_t *ptep;
1119 vm_offset_t pte_phys;
1120 pt_entry_t *pdep;
1121 pt_entry_t pde;
1122 ppnum_t KPT_ppn;
1123
1124 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
1125 KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
1126 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1127 DBG("myva: %p pdep: %p ptep: %p\n",
1128 (void *) myva, (void *) pdep, (void *) ptep);
1129 if ((*ptep & INTEL_PTE_VALID) == 0) {
1130 continue;
1131 }
1132 pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
1133 pde = *pdep & PTMASK; /* page attributes from pde */
1134 pde |= INTEL_PTE_PS; /* make it a 2M entry */
1135 pde |= pte_phys; /* take page frame from pte */
1136
1137 if (wpkernel) {
1138 pde &= ~INTEL_PTE_WRITE;
1139 }
1140 DBG("pmap_store_pte(%p,0x%llx)\n",
1141 (void *)pdep, pde);
1142 pmap_store_pte(pdep, pde);
1143
1144 /*
1145 * Free the now-unused level-1 pte.
1146 */
1147 pmap_free_early_PT(KPT_ppn, 1);
1148 }
1149
1150 /* Change variable read by sysctl machdep.pmap */
1151 pmap_kernel_text_ps = I386_LPGBYTES;
1152 }
1153
1154 vm_offset_t dva;
1155
1156 for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
1157 assert(((sdata | edata) & PAGE_MASK) == 0);
1158 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1159
1160 dpte = *dptep;
1161 assert((dpte & INTEL_PTE_VALID));
1162 dpte |= INTEL_PTE_NX;
1163 pmap_store_pte(dptep, dpte);
1164 dataptes++;
1165 }
1166 assert(dataptes > 0);
1167
1168 kernel_segment_command_t * seg;
1169 kernel_section_t * sec;
1170 kc_format_t kc_format;
1171
1172 PE_get_primary_kc_format(&kc_format);
1173
1174 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
1175 if (!strcmp(seg->segname, "__TEXT") ||
1176 !strcmp(seg->segname, "__DATA")) {
1177 continue;
1178 }
1179
1180 /* XXX: FIXME_IN_dyld: This is a workaround (see below) */
1181 if (kc_format != KCFormatFileset) {
1182 //XXX
1183 if (!strcmp(seg->segname, "__KLD")) {
1184 continue;
1185 }
1186 }
1187
1188 if (!strcmp(seg->segname, "__HIB")) {
1189 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
1190 if (sec->addr & PAGE_MASK) {
1191 panic("__HIB segment's sections misaligned");
1192 }
1193 if (!strcmp(sec->sectname, "__text")) {
1194 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
1195 } else {
1196 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
1197 }
1198 }
1199 } else {
1200 if (kc_format == KCFormatFileset) {
1201#if 0
1202 /*
1203 * This block of code is commented out because it may or may not have induced an earlier panic
1204 * in ledger init.
1205 */
1206
1207
1208 boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
1209 robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;
1210
1211 /*
 1212 * XXX: FIXME_IN_dyld: This is a workaround for the primary KC containing inaccurate
1213 * initprot for segments containing code.
1214 */
1215 if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
1216 NXbit = FALSE;
1217 robit = FALSE;
1218 }
1219
1220 pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
1221 round_page_64(seg->vmsize), NXbit, robit);
1222#endif
1223
1224 /*
1225 * XXX: We are marking *every* segment with rwx permissions as a workaround
1226 * XXX: until the primary KC's kernel segments are page-aligned.
1227 */
1228 kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
1229 (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
1230 pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
1231 round_page_64(seg->vmsize), FALSE, FALSE);
1232 } else {
1233 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
1234 }
1235 }
1236 }
1237
1238 /*
1239 * If we're debugging, map the low global vector page at the fixed
1240 * virtual address. Otherwise, remove the mapping for this.
1241 */
1242 if (debug_boot_arg) {
1243 pt_entry_t *pte = NULL;
1244 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
1245 panic("lowmem pte");
1246 }
1247 /* make sure it is defined on page boundary */
1248 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
1249 pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)
1250 | INTEL_PTE_REF
1251 | INTEL_PTE_MOD
1252 | INTEL_PTE_WIRED
1253 | INTEL_PTE_VALID
1254 | INTEL_PTE_WRITE
1255 | INTEL_PTE_NX);
1256 } else {
1257 pmap_remove(kernel_pmap,
1258 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
1259 }
1260 pmap_tlbi_range(0, ~0ULL, true, 0);
1261 splx(spl);
1262}
1263
1264/*
1265 * Mark the const data segment as read-only, non-executable.
1266 */
1267void
1268x86_64_protect_data_const()
1269{
1270 boolean_t doconstro = TRUE;
1271#if DEVELOPMENT || DEBUG
1272 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1273#endif
1274 if (doconstro) {
1275 if (sconst & PAGE_MASK) {
1276 panic("CONST segment misaligned 0x%lx 0x%lx\n",
1277 sconst, econst);
1278 }
1279 kprintf("Marking const DATA read-only\n");
1280 pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1281 }
1282}
1283/*
 1284 * this function is only used for debugging from the vm layer
1285 */
1286boolean_t
1287pmap_verify_free(
1288 ppnum_t pn)
1289{
1290 pv_rooted_entry_t pv_h;
1291 int pai;
1292 boolean_t result;
1293
1294 assert(pn != vm_page_fictitious_addr);
1295
1296 if (!pmap_initialized) {
1297 return TRUE;
1298 }
1299
1300 if (pn == vm_page_guard_addr) {
1301 return TRUE;
1302 }
1303
1304 pai = ppn_to_pai(pn);
1305 if (!IS_MANAGED_PAGE(pai)) {
1306 return FALSE;
1307 }
1308 pv_h = pai_to_pvh(pn);
1309 result = (pv_h->pmap == PMAP_NULL);
1310 return result;
1311}
1312
1313#if MACH_ASSERT
1314void
1315pmap_assert_free(ppnum_t pn)
1316{
1317 int pai;
1318 pv_rooted_entry_t pv_h = NULL;
1319 pmap_t pmap = NULL;
1320 vm_offset_t va = 0;
1321 static char buffer[32];
1322 static char *pr_name = "not managed pn";
1323 uint_t attr;
1324 pt_entry_t *ptep;
1325 pt_entry_t pte = -1ull;
1326
1327 if (pmap_verify_free(pn)) {
1328 return;
1329 }
1330
1331 if (pn > last_managed_page) {
1332 attr = 0xff;
1333 goto done;
1334 }
1335
1336 pai = ppn_to_pai(pn);
1337 attr = pmap_phys_attributes[pai];
1338 pv_h = pai_to_pvh(pai);
1339 va = pv_h->va_and_flags;
1340 pmap = pv_h->pmap;
1341 if (pmap == kernel_pmap) {
1342 pr_name = "kernel";
1343 } else if (pmap == NULL) {
1344 pr_name = "pmap NULL";
1345 } else if (pmap->pmap_procname[0] != 0) {
1346 pr_name = &pmap->pmap_procname[0];
1347 } else {
1348 snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
1349 pr_name = buffer;
1350 }
1351
1352 if (pmap != NULL) {
1353 ptep = pmap_pte(pmap, va);
1354 if (ptep != NULL) {
1355 pte = (uintptr_t)*ptep;
1356 }
1357 }
1358
1359done:
1360 panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
1361 (ulong_t)pn, attr, pr_name, va, pte);
1362}
1363#endif /* MACH_ASSERT */
1364
1365boolean_t
1366pmap_is_empty(
1367 pmap_t pmap,
1368 vm_map_offset_t va_start,
1369 vm_map_offset_t va_end)
1370{
1371 vm_map_offset_t offset;
1372 ppnum_t phys_page;
1373
1374 if (pmap == PMAP_NULL) {
1375 return TRUE;
1376 }
1377
1378 /*
1379 * Check the resident page count
1380 * - if it's zero, the pmap is completely empty.
1381 * This short-circuit test prevents a virtual address scan which is
1382 * painfully slow for 64-bit spaces.
 1383 * This assumes the count is correct;
 1384 * the debug kernel ought to verify it, perhaps by a page table walk.
1385 */
1386 if (pmap->stats.resident_count == 0) {
1387 return TRUE;
1388 }
1389
1390 for (offset = va_start;
1391 offset < va_end;
1392 offset += PAGE_SIZE_64) {
1393 phys_page = pmap_find_phys(pmap, offset);
1394 if (phys_page) {
1395 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1396 "page %d at 0x%llx\n",
1397 pmap, va_start, va_end, phys_page, offset);
1398 return FALSE;
1399 }
1400 }
1401
1402 return TRUE;
1403}
1404
1405void
1406hv_ept_pmap_create(void **ept_pmap, void **eptp)
1407{
1408 pmap_t p;
1409
1410 if ((ept_pmap == NULL) || (eptp == NULL)) {
1411 return;
1412 }
1413
1414 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1415 if (p == PMAP_NULL) {
1416 *ept_pmap = NULL;
1417 *eptp = NULL;
1418 return;
1419 }
1420
1421 assert(is_ept_pmap(p));
1422
1423 *ept_pmap = (void*)p;
1424 *eptp = (void*)(p->pm_eptp);
1425 return;
1426}
1427
1428/*
1429 * pmap_create() is used by some special, legacy 3rd party kexts.
1430 * In our kernel code, always use pmap_create_options().
1431 */
1432extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1433
1434__attribute__((used))
1435pmap_t
1436pmap_create(
1437 ledger_t ledger,
1438 vm_map_size_t sz,
1439 boolean_t is_64bit)
1440{
1441 return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1442}
1443
1444/*
1445 * Create and return a physical map.
1446 *
1447 * If the size specified for the map
1448 * is zero, the map is an actual physical
1449 * map, and may be referenced by the
1450 * hardware.
1451 *
1452 * If the size specified is non-zero,
1453 * the map will be used in software only, and
1454 * is bounded by that size.
1455 */
1456
1457pmap_t
1458pmap_create_options(
1459 ledger_t ledger,
1460 vm_map_size_t sz,
1461 unsigned int flags)
1462{
1463 pmap_t p;
1464 vm_size_t size;
1465 pml4_entry_t *pml4;
1466 pml4_entry_t *kpml4;
1467 int i;
1468
1469 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);
1470
1471 size = (vm_size_t) sz;
1472
1473 /*
1474 * A software use-only map doesn't even need a map.
1475 */
1476
1477 if (size != 0) {
1478 return PMAP_NULL;
1479 }
1480
1481 /*
1482 * Return error when unrecognized flags are passed.
1483 */
1484 if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
1485 return PMAP_NULL;
1486 }
1487
1488 p = (pmap_t) zalloc(pmap_zone);
1489 if (PMAP_NULL == p) {
1490 panic("pmap_create zalloc");
1491 }
1492
1493 /* Zero all fields */
1494 bzero(p, sizeof(*p));
1495
1496 lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
1497 p->pmap_rwl.lck_rw_can_sleep = FALSE;
1498
1499 bzero(&p->stats, sizeof(p->stats));
1500 os_ref_init(&p->ref_count, NULL);
1501#if DEVELOPMENT || DEBUG
1502 p->nx_enabled = 1;
1503#endif
1504 p->pm_shared = FALSE;
1505 ledger_reference(ledger);
1506 p->ledger = ledger;
1507
1508 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1509
1510 p->pagezero_accessible = FALSE;
1511 p->pm_vm_map_cs_enforced = FALSE;
1512
1513 if (pmap_pcid_ncpus) {
1514 pmap_pcid_initialize(p);
1515 }
1516
1517 p->pm_pml4 = zalloc(pmap_anchor_zone);
1518 p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT
1519
1520 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1521 pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);
1522
1523 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1524 memset((char *)p->pm_upml4, 0, PAGE_SIZE);
1525
1526 if (flags & PMAP_CREATE_EPT) {
1527 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1528 p->pm_cr3 = 0;
1529 } else {
1530 p->pm_eptp = 0;
1531 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1532 p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
1533 }
1534
1535 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1536
1537 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
1538 if (NULL == p->pm_obj_pml4) {
1539 panic("pmap_create pdpt obj");
1540 }
1541
1542 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
1543 if (NULL == p->pm_obj_pdpt) {
1544 panic("pmap_create pdpt obj");
1545 }
1546
1547 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
1548 if (NULL == p->pm_obj) {
1549 panic("pmap_create pte obj");
1550 }
1551
1552 if (!(flags & PMAP_CREATE_EPT)) {
1553 /* All host pmaps share the kernel's pml4 */
1554 pml4 = pmap64_pml4(p, 0ULL);
1555 kpml4 = kernel_pmap->pm_pml4;
1556 for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
1557 pml4[i] = kpml4[i];
1558 }
1559 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1560 for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
1561 pml4[i] = kpml4[i];
1562 }
1563 pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1564#if KASAN
1565 for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
1566 pml4[i] = kpml4[i];
1567 }
1568#endif
1569 pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
1570 pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1571 }
1572
1573#if MACH_ASSERT
1574 p->pmap_stats_assert = TRUE;
1575 p->pmap_pid = 0;
1576 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
1577#endif /* MACH_ASSERT */
1578
1579 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1580 VM_KERNEL_ADDRHIDE(p));
1581
1582 return p;
1583}
1584
1585/*
1586 * We maintain stats and ledgers so that a task's physical footprint is:
1587 * phys_footprint = ((internal - alternate_accounting)
1588 * + (internal_compressed - alternate_accounting_compressed)
1589 * + iokit_mapped
1590 * + purgeable_nonvolatile
1591 * + purgeable_nonvolatile_compressed
1592 * + page_table)
1593 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1594 */
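/*
 * In other words, memory that "alternate_accounting" already attributes to
 * the iokit_mapped or purgeable ledgers is subtracted back out of the
 * internal (and internal_compressed) terms, so it is not counted twice in
 * phys_footprint.
 */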
1595
1596#if MACH_ASSERT
1597static void pmap_check_ledgers(pmap_t pmap);
1598#else /* MACH_ASSERT */
1599static inline void
1600pmap_check_ledgers(__unused pmap_t pmap)
1601{
1602}
1603#endif /* MACH_ASSERT */
1604
1605/*
1606 * Retire the given physical map from service.
1607 * Should only be called if the map contains
1608 * no valid mappings.
1609 */
1610extern int vm_wired_objects_page_count;
1611
1612void
1613pmap_destroy(pmap_t p)
1614{
1615 os_ref_count_t c;
1616
1617 if (p == PMAP_NULL) {
1618 return;
1619 }
1620
1621 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
 1622 VM_KERNEL_ADDRHIDE(p));
1623
1624 PMAP_LOCK_EXCLUSIVE(p);
1625
1626 c = os_ref_release_locked(&p->ref_count);
1627
1628 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1629
1630 if (c == 0) {
1631 /*
1632 * If some cpu is not using the physical pmap pointer that it
1633 * is supposed to be (see set_dirbase), we might be using the
1634 * pmap that is being destroyed! Make sure we are
1635 * physically on the right pmap:
1636 */
1637 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1638 if (pmap_pcid_ncpus) {
1639 pmap_destroy_pcid_sync(p);
1640 }
1641 }
1642
1643 PMAP_UNLOCK_EXCLUSIVE(p);
1644
1645 if (c != 0) {
1646 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1647 pmap_assert(p == kernel_pmap);
1648 return; /* still in use */
1649 }
1650
1651 /*
1652 * Free the memory maps, then the
1653 * pmap structure.
1654 */
1655 int inuse_ptepages = 0;
1656
1657 zfree(pmap_anchor_zone, p->pm_pml4);
1658 zfree(pmap_uanchor_zone, p->pm_upml4);
1659
1660 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1661 vm_object_deallocate(p->pm_obj_pml4);
1662
1663 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1664 vm_object_deallocate(p->pm_obj_pdpt);
1665
1666 inuse_ptepages += p->pm_obj->resident_page_count;
1667 vm_object_deallocate(p->pm_obj);
1668
1669 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1670 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1671
1672 pmap_check_ledgers(p);
1673 ledger_dereference(p->ledger);
1674 lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1675 zfree(pmap_zone, p);
1676
1677 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1678}
1679
1680/*
1681 * Add a reference to the specified pmap.
1682 */
1683
1684void
1685pmap_reference(pmap_t p)
1686{
1687 if (p != PMAP_NULL) {
1688 PMAP_LOCK_EXCLUSIVE(p);
1689 os_ref_retain_locked(&p->ref_count);
 1690 PMAP_UNLOCK_EXCLUSIVE(p);
1691 }
1692}
1693
1694/*
1695 * Remove phys addr if mapped in specified map
1696 *
1697 */
1698void
1699pmap_remove_some_phys(
1700 __unused pmap_t map,
1701 __unused ppnum_t pn)
1702{
1703/* Implement to support working set code */
1704}
1705
1706
1707void
1708pmap_protect(
1709 pmap_t map,
1710 vm_map_offset_t sva,
1711 vm_map_offset_t eva,
1712 vm_prot_t prot)
1713{
1714 pmap_protect_options(map, sva, eva, prot, 0, NULL);
1715}
1716
1717
1718/*
1719 * Set the physical protection on the
1720 * specified range of this map as requested.
1721 *
1722 * VERY IMPORTANT: Will *NOT* increase permissions.
1723 * pmap_protect_options() should protect the range against any access types
1724 * that are not in "prot" but it should never grant extra access.
1725 * For example, if "prot" is READ|EXECUTE, that means "remove write
1726 * access" but it does *not* mean "add read and execute" access.
1727 * VM relies on getting soft-faults to enforce extra checks (code
 1728 * signing, for example).
1729 * New access permissions are granted via pmap_enter() only.
1730 */
1731void
1732pmap_protect_options(
1733 pmap_t map,
1734 vm_map_offset_t sva,
1735 vm_map_offset_t eva,
1736 vm_prot_t prot,
1737 unsigned int options,
1738 void *arg)
1739{
1740 pt_entry_t *pde;
1741 pt_entry_t *spte, *epte;
1742 vm_map_offset_t lva;
1743 vm_map_offset_t orig_sva;
1744 boolean_t set_NX;
1745 int num_found = 0;
1746 boolean_t is_ept;
1747
1748 pmap_intr_assert();
1749
1750 if (map == PMAP_NULL) {
1751 return;
1752 }
1753
1754 if (prot == VM_PROT_NONE) {
1755 pmap_remove_options(map, sva, eva, options);
1756 return;
1757 }
1758
1759 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1760 VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
1761 VM_KERNEL_ADDRHIDE(eva));
1762
1763 if (prot & VM_PROT_EXECUTE) {
1764 set_NX = FALSE;
1765 } else {
1766 set_NX = TRUE;
1767 }
1768
1769#if DEVELOPMENT || DEBUG
1770 if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
1771 set_NX = FALSE;
1772 }
1773#endif
1774 is_ept = is_ept_pmap(map);
1775
1776 PMAP_LOCK_EXCLUSIVE(map);
1777
1778 orig_sva = sva;
1779 while (sva < eva) {
1780 lva = (sva + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE - 1);
1781 if (lva > eva) {
1782 lva = eva;
1783 }
1784 pde = pmap_pde(map, sva);
1785 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1786 if (*pde & PTE_PS) {
1787 /* superpage */
1788 spte = pde;
1789 epte = spte + 1; /* excluded */
1790 } else {
1791 spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
1792 spte = &spte[ptenum(sva)];
1793 epte = &spte[intel_btop(lva - sva)];
1794 }
1795
1796 for (; spte < epte; spte++) {
1797 if (!(*spte & PTE_VALID_MASK(is_ept))) {
1798 continue;
1799 }
1800
1801 if (is_ept) {
1802 if (!(prot & VM_PROT_READ)) {
1803 pmap_update_pte(spte, PTE_READ(is_ept), 0);
1804 }
1805 }
1806 if (!(prot & VM_PROT_WRITE)) {
1807 pmap_update_pte(spte, PTE_WRITE(is_ept), 0);
1808 }
1809#if DEVELOPMENT || DEBUG
1810 else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
1811 map == kernel_pmap) {
1812 pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
1813 }
1814#endif /* DEVELOPMENT || DEBUG */
1815
1816 if (set_NX) {
1817 if (!is_ept) {
1818 pmap_update_pte(spte, 0, INTEL_PTE_NX);
1819 } else {
1820 pmap_update_pte(spte, INTEL_EPT_EX, 0);
1821 }
1822 }
1823 num_found++;
1824 }
1825 }
1826 sva = lva;
1827 }
1828 if (num_found) {
1829 if (options & PMAP_OPTIONS_NOFLUSH) {
1830 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1831 } else {
1832 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1833 }
1834 }
1835
1836 PMAP_UNLOCK_EXCLUSIVE(map);
1837
1838 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
1839}
1840
1841/* Map a (possibly) autogenned block */
1842kern_return_t
1843pmap_map_block(
1844 pmap_t pmap,
1845 addr64_t va,
1846 ppnum_t pa,
1847 uint32_t size,
1848 vm_prot_t prot,
1849 int attr,
1850 __unused unsigned int flags)
1851{
1852 kern_return_t kr;
1853 addr64_t original_va = va;
1854 uint32_t page;
1855 int cur_page_size;
1856
1857 if (attr & VM_MEM_SUPERPAGE) {
1858 cur_page_size = SUPERPAGE_SIZE;
1859 } else {
1860 cur_page_size = PAGE_SIZE;
1861 }
1862
1863 for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
1864 kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
1865
1866 if (kr != KERN_SUCCESS) {
1867 /*
1868 * This will panic for now, as it is unclear that
1869 * removing the mappings is correct.
1870 */
1871 panic("%s: failed pmap_enter, "
1872 "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
1873 __FUNCTION__,
1874 pmap, va, pa, size, prot, flags);
1875
 1876 pmap_remove(pmap, original_va, va);
1877 return kr;
1878 }
1879
1880 va += cur_page_size;
1881 pa += cur_page_size / PAGE_SIZE;
1882 }
1883
1884 return KERN_SUCCESS;
1885}
1886
1887kern_return_t
1888pmap_expand_pml4(
1889 pmap_t map,
1890 vm_map_offset_t vaddr,
1891 unsigned int options)
1892{
1893 vm_page_t m;
1894 pmap_paddr_t pa;
1895 uint64_t i;
1896 ppnum_t pn;
1897 pml4_entry_t *pml4p;
1898 boolean_t is_ept = is_ept_pmap(map);
1899
1900 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
1901
1902 /* With the exception of the kext "basement", the kernel's level 4
1903 * pagetables must not be dynamically expanded.
1904 */
1905 assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
1906 /*
1907 * Allocate a VM page for the pml4 page
1908 */
1909 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1910 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
1911 return KERN_RESOURCE_SHORTAGE;
1912 }
1913 VM_PAGE_WAIT();
1914 }
1915 /*
1916 * put the page into the pmap's obj list so it
1917 * can be found later.
1918 */
1919 pn = VM_PAGE_GET_PHYS_PAGE(m);
1920 pa = i386_ptob(pn);
1921 i = pml4idx(map, vaddr);
1922
1923 /*
1924 * Zero the page.
1925 */
1926 pmap_zero_page(pn);
1927
1928 vm_page_lockspin_queues();
1929 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1930 vm_page_unlock_queues();
1931
1932 OSAddAtomic(1, &inuse_ptepages_count);
1933 OSAddAtomic64(1, &alloc_ptepages_count);
1934 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1935
1936	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1937 vm_object_lock(map->pm_obj_pml4);
1938
1939 PMAP_LOCK_EXCLUSIVE(map);
1940 /*
1941 * See if someone else expanded us first
1942 */
1943 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
1944 PMAP_UNLOCK_EXCLUSIVE(map);
1945 vm_object_unlock(map->pm_obj_pml4);
1946
1947 VM_PAGE_FREE(m);
1948
1949 OSAddAtomic(-1, &inuse_ptepages_count);
1950 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1951 return KERN_SUCCESS;
1952 }
1953
1954#if 0 /* DEBUG */
1955 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
1956 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1957 map, map->pm_obj_pml4, vaddr, i);
1958 }
1959#endif
1960 vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1961 vm_object_unlock(map->pm_obj_pml4);
1962
1963 /*
1964 * Set the page directory entry for this page table.
1965 */
1966 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
1967
1968 pmap_store_pte(pml4p, pa_to_pte(pa)
1969 | PTE_READ(is_ept)
1970 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1971 | PTE_WRITE(is_ept));
1972 pml4_entry_t *upml4p;
1973
1974 upml4p = pmap64_user_pml4(map, vaddr);
1975 pmap_store_pte(upml4p, pa_to_pte(pa)
1976 | PTE_READ(is_ept)
1977 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1978 | PTE_WRITE(is_ept));
1979
1980 PMAP_UNLOCK_EXCLUSIVE(map);
1981
1982 return KERN_SUCCESS;
1983}
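
/*
 * Note: the allocation pattern above is repeated by pmap_expand_pdpt() and
 * pmap_expand() below: the page is grabbed, wired and zeroed before any
 * locks are taken; only then are the object lock and the exclusive pmap
 * lock acquired (mutex before spinlock), and the target level is re-checked
 * (pmap64_pdpt() here) in case another thread expanded the same slot first,
 * in which case the freshly allocated page is simply released.
 */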
1984
1985kern_return_t
1986pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
1987{
1988 vm_page_t m;
1989 pmap_paddr_t pa;
1990 uint64_t i;
1991 ppnum_t pn;
1992 pdpt_entry_t *pdptp;
1993 boolean_t is_ept = is_ept_pmap(map);
1994
1995 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
1996
1997 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
1998 kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
1999 if (pep4kr != KERN_SUCCESS) {
2000 return pep4kr;
2001 }
2002 }
2003
2004 /*
2005 * Allocate a VM page for the pdpt page
2006 */
2007 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
2008 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
2009 return KERN_RESOURCE_SHORTAGE;
2010 }
2011 VM_PAGE_WAIT();
2012 }
2013
2014 /*
2015 * put the page into the pmap's obj list so it
2016 * can be found later.
2017 */
2018 pn = VM_PAGE_GET_PHYS_PAGE(m);
2019 pa = i386_ptob(pn);
2020 i = pdptidx(map, vaddr);
2021
2022 /*
2023 * Zero the page.
2024 */
2025 pmap_zero_page(pn);
2026
2027 vm_page_lockspin_queues();
2028 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
2029 vm_page_unlock_queues();
2030
2031 OSAddAtomic(1, &inuse_ptepages_count);
2032 OSAddAtomic64(1, &alloc_ptepages_count);
2033 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
2034
2035	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2036 vm_object_lock(map->pm_obj_pdpt);
2037
2038 PMAP_LOCK_EXCLUSIVE(map);
2039 /*
2040 * See if someone else expanded us first
2041 */
2042 if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
2043 PMAP_UNLOCK_EXCLUSIVE(map);
2044 vm_object_unlock(map->pm_obj_pdpt);
2045
2046 VM_PAGE_FREE(m);
2047
2048 OSAddAtomic(-1, &inuse_ptepages_count);
2049 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2050 return KERN_SUCCESS;
2051 }
2052
2053#if 0 /* DEBUG */
2054 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
2055 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2056 map, map->pm_obj_pdpt, vaddr, i);
2057 }
2058#endif
2059 vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2060 vm_object_unlock(map->pm_obj_pdpt);
2061
2062 /*
2063 * Set the page directory entry for this page table.
2064 */
2065 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
2066
2067 pmap_store_pte(pdptp, pa_to_pte(pa)
2068 | PTE_READ(is_ept)
2069 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2070 | PTE_WRITE(is_ept));
2071
2072 PMAP_UNLOCK_EXCLUSIVE(map);
2073
2074 return KERN_SUCCESS;
2075}
2076
2077
2078
2079/*
2080 * Routine: pmap_expand
2081 *
2082 * Expands a pmap to be able to map the specified virtual address.
2083 *
2084 * Allocates a new page-table page for the region containing the
2085 * specified virtual address and installs the corresponding
2086 * page-directory entry.
2087 *
2088 * Must be called with the pmap system and the pmap unlocked,
2089 * since page allocation may block (VM_PAGE_WAIT).
2090 * Thus it must be called in a loop that checks whether the map
2091 * has been expanded enough.
2092 * (We won't loop forever, since page tables aren't shrunk.)
2093 */
2094kern_return_t
2095pmap_expand(
2096 pmap_t map,
2097 vm_map_offset_t vaddr,
2098 unsigned int options)
2099{
2100 pt_entry_t *pdp;
2101 vm_page_t m;
2102 pmap_paddr_t pa;
2103 uint64_t i;
2104 ppnum_t pn;
2105 boolean_t is_ept = is_ept_pmap(map);
2106
2107
2108 /*
2109 * For the kernel, the virtual address must be in or above the basement
2110	 * which is for kexts and is in the 512GB immediately below the kernel.
2111 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
2112 */
2113 if (__improbable(map == kernel_pmap &&
2114 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
2115 if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
2116 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
2117 }
2118 }
2119
2120 while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
2121 assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
2122 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
2123 if (pepkr != KERN_SUCCESS) {
2124 return pepkr;
2125 }
2126 }
2127
2128 /*
2129 * Allocate a VM page for the pde entries.
2130 */
2131 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
2132 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
2133 return KERN_RESOURCE_SHORTAGE;
2134 }
2135 VM_PAGE_WAIT();
2136 }
2137
2138 /*
2139 * put the page into the pmap's obj list so it
2140 * can be found later.
2141 */
2142 pn = VM_PAGE_GET_PHYS_PAGE(m);
2143 pa = i386_ptob(pn);
2144 i = pdeidx(map, vaddr);
2145
2146 /*
2147 * Zero the page.
2148 */
2149 pmap_zero_page(pn);
2150
2151 vm_page_lockspin_queues();
2152 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
2153 vm_page_unlock_queues();
2154
2155 OSAddAtomic(1, &inuse_ptepages_count);
2156 OSAddAtomic64(1, &alloc_ptepages_count);
2157 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
2158
2159	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2160 vm_object_lock(map->pm_obj);
2161
2162 PMAP_LOCK_EXCLUSIVE(map);
2163
2164 /*
2165 * See if someone else expanded us first
2166 */
2167 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2168 PMAP_UNLOCK_EXCLUSIVE(map);
2169 vm_object_unlock(map->pm_obj);
2170
2171 VM_PAGE_FREE(m);
2172
2173		OSAddAtomic(-1, &inuse_ptepages_count); // TODO: replace all with inlines
2174 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2175 return KERN_SUCCESS;
2176 }
2177
2178#if 0 /* DEBUG */
2179 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
2180 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2181 map, map->pm_obj, vaddr, i);
2182 }
2183#endif
2184 vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2185 vm_object_unlock(map->pm_obj);
2186
2187 /*
2188 * Set the page directory entry for this page table.
2189 */
2190 pdp = pmap_pde(map, vaddr);
2191 pmap_store_pte(pdp, pa_to_pte(pa)
2192 | PTE_READ(is_ept)
2193 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2194 | PTE_WRITE(is_ept));
2195
2196 PMAP_UNLOCK_EXCLUSIVE(map);
2197
2198 return KERN_SUCCESS;
2199}
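
/*
 * As the block comment above notes, pmap_expand() is intended to be called
 * in a retry loop until the leaf page table covering the faulting address
 * exists.  Illustrative caller sketch below; the example_* name is
 * hypothetical.
 */
#if 0 /* illustrative sketch only */
static kern_return_t
example_expand_until_mapped(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	/* Loop: another thread may expand (or we may race) between checks. */
	while (pmap_pte(map, vaddr) == PT_ENTRY_NULL) {
		kern_return_t kr = pmap_expand(map, vaddr, options);
		if (kr != KERN_SUCCESS) {
			/* e.g. KERN_RESOURCE_SHORTAGE with PMAP_EXPAND_OPTIONS_NOWAIT */
			return kr;
		}
	}
	return KERN_SUCCESS;
}
#endif
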
2200/*
2201 * Query a pmap to see what size a given virtual address is mapped with.
2202 * If the vaddr is not mapped, returns 0.
2203 */
2204vm_size_t
2205pmap_query_pagesize(
2206 pmap_t pmap,
2207 vm_map_offset_t vaddr)
2208{
2209 pd_entry_t *pdep;
2210 vm_size_t size = 0;
2211
2212 assert(!is_ept_pmap(pmap));
2213 PMAP_LOCK_EXCLUSIVE(pmap);
2214
2215 pdep = pmap_pde(pmap, vaddr);
2216 if (pdep != PD_ENTRY_NULL) {
2217 if (*pdep & INTEL_PTE_PS) {
2218 size = I386_LPGBYTES;
2219 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2220 size = I386_PGBYTES;
2221 }
2222 }
2223
2224 PMAP_UNLOCK_EXCLUSIVE(pmap);
2225
2226 return size;
2227}
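
/*
 * The return value is one of three cases: 0 (unmapped), I386_PGBYTES (a 4KB
 * leaf PTE) or I386_LPGBYTES (a 2MB PDE with the PS bit set).  Illustrative
 * caller sketch below; the example_* name is hypothetical.
 */
#if 0 /* illustrative sketch only */
static boolean_t
example_is_superpage_mapped(pmap_t pmap, vm_map_offset_t vaddr)
{
	vm_size_t sz = pmap_query_pagesize(pmap, vaddr);

	if (sz == I386_LPGBYTES) {
		return TRUE;	/* backed by a large (2MB) mapping */
	}
	/* sz == I386_PGBYTES: ordinary 4KB mapping; sz == 0: not mapped */
	return FALSE;
}
#endif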
2228
2229/*
2230 * Ensure the page table hierarchy is filled in down to
2231 * the large page level. Returns KERN_FAILURE if
2232 * a lower-level (4KB) page table already exists.
2233 */
2234static kern_return_t
2235pmap_pre_expand_large_internal(
2236 pmap_t pmap,
2237 vm_map_offset_t vaddr)
2238{
2239 ppnum_t pn;
2240 pt_entry_t *pte;
2241 boolean_t is_ept = is_ept_pmap(pmap);
2242 kern_return_t kr = KERN_SUCCESS;
2243
2244 if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2245 if (!pmap_next_page_hi(&pn, FALSE)) {
2246 panic("pmap_pre_expand_large no PDPT");
2247 }
2248
2249 pmap_zero_page(pn);
2250
2251 pte = pmap64_pml4(pmap, vaddr);
2252
2253 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2254 PTE_READ(is_ept) |
2255 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2256 PTE_WRITE(is_ept));
2257
2258 pte = pmap64_user_pml4(pmap, vaddr);
2259
2260 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2261 PTE_READ(is_ept) |
2262 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2263 PTE_WRITE(is_ept));
2264 }
2265
2266 if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2267 if (!pmap_next_page_hi(&pn, FALSE)) {
2268 panic("pmap_pre_expand_large no PDE");
2269 }
2270
2271 pmap_zero_page(pn);
2272
2273 pte = pmap64_pdpt(pmap, vaddr);
2274
2275 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2276 PTE_READ(is_ept) |
2277 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2278 PTE_WRITE(is_ept));
2279 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2280 kr = KERN_FAILURE;
2281 }
2282
2283 return kr;
2284}
2285
2286/*
2287 * Wrapper that locks the pmap.
2288 */
2289kern_return_t
2290pmap_pre_expand_large(
2291 pmap_t pmap,
2292 vm_map_offset_t vaddr)
2293{
2294 kern_return_t kr;
2295
2296 PMAP_LOCK_EXCLUSIVE(pmap);
2297 kr = pmap_pre_expand_large_internal(pmap, vaddr);
2298 PMAP_UNLOCK_EXCLUSIVE(pmap);
2299 return kr;
2300}
2301
2302/*
2303 * On large memory machines, pmap_steal_memory() will allocate past
2304 * the 1GB of pre-allocated/mapped virtual kernel area. This function
2305 * expands the kernel page tables to cover a given vaddr. It uses pages
2306 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
2307 * isn't available yet.
2308 */
2309void
2310pmap_pre_expand(
2311 pmap_t pmap,
2312 vm_map_offset_t vaddr)
2313{
2314 ppnum_t pn;
2315 pt_entry_t *pte;
2316 boolean_t is_ept = is_ept_pmap(pmap);
2317
2318 /*
2319 * This returns failure if a 4K page table already exists.
2320	 * Otherwise it fills in the page table hierarchy down
2321 * to that level.
2322 */
2323 PMAP_LOCK_EXCLUSIVE(pmap);
2324 if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
2325 PMAP_UNLOCK_EXCLUSIVE(pmap);
2326 return;
2327 }
2328
2329 /* Add the lowest table */
2330 if (!pmap_next_page_hi(&pn, FALSE)) {
2331 panic("pmap_pre_expand");
2332 }
2333
2334 pmap_zero_page(pn);
2335
2336 pte = pmap_pde(pmap, vaddr);
2337
2338 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
2339 PTE_READ(is_ept) |
2340 (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
2341 PTE_WRITE(is_ept));
2342 PMAP_UNLOCK_EXCLUSIVE(pmap);
2343}
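
/*
 * Note: between them, the two routines above fill the hierarchy top-down
 * during early boot.  The _large_internal helper installs the PML4E and
 * PDPTE covering vaddr (leaving the region mappable with 2MB pages), and
 * pmap_pre_expand() then adds the PDE pointing at a new, zeroed 4KB page
 * table.  All backing pages come from pmap_next_page_hi(), the same early
 * pool used by pmap_steal_memory(), since vm_page_grab() is not yet
 * available.
 */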
2344
2345/*
2346 * pmap_sync_page_data_phys(ppnum_t pa)
2347 *
2348 * Invalidates all of the instruction cache on a physical page and
2349 * pushes any dirty data from the data cache for the same physical page.
2350 * Not required on i386.
2351 */
2352void
2353pmap_sync_page_data_phys(__unused ppnum_t pa)
2354{
2355 return;
2356}
2357
2358/*
2359 * pmap_sync_page_attributes_phys(ppnum_t pa)
2360 *
2361 * Write back and invalidate all cachelines on a physical page.
2362 */
2363void
2364pmap_sync_page_attributes_phys(ppnum_t pa)
2365{
2366 cache_flush_page_phys(pa);
2367}
2368
2369void
2370pmap_copy_page(ppnum_t src, ppnum_t dst)
2371{
2372 bcopy_phys((addr64_t)i386_ptob(src),
2373 (addr64_t)i386_ptob(dst),
2374 PAGE_SIZE);
2375}
2376
2377
2378/*
2379 * Routine: pmap_pageable
2380 * Function:
2381 * Make the specified pages (by pmap, offset)
2382 * pageable (or not) as requested.
2383 *
2384 * A page which is not pageable may not take
2385 * a fault; therefore, its page table entry
2386 * must remain valid for the duration.
2387 *
2388 * This routine is merely advisory; pmap_enter
2389 * will specify that these pages are to be wired
2390 * down (or not) as appropriate.
2391 */
2392void
2393pmap_pageable(
2394 __unused pmap_t pmap,
2395 __unused vm_map_offset_t start_addr,
2396 __unused vm_map_offset_t end_addr,
2397 __unused boolean_t pageable)
2398{
2399#ifdef lint
2400 pmap++; start_addr++; end_addr++; pageable++;
2401#endif /* lint */
2402}
2403
2404void
2405invalidate_icache(__unused vm_offset_t addr,
2406 __unused unsigned cnt,
2407 __unused int phys)
2408{
2409 return;
2410}
2411
2412void
2413flush_dcache(__unused vm_offset_t addr,
2414 __unused unsigned count,
2415 __unused int phys)
2416{
2417 return;
2418}
2419
2420#if CONFIG_DTRACE
2421/*
2422 * Constrain DTrace copyin/copyout actions
2423 */
2424extern kern_return_t dtrace_copyio_preflight(addr64_t);
2425extern kern_return_t dtrace_copyio_postflight(addr64_t);
2426
2427kern_return_t
2428dtrace_copyio_preflight(__unused addr64_t va)
2429{
2430 thread_t thread = current_thread();
2431 uint64_t ccr3;
2432 if (current_map() == kernel_map) {
2433 return KERN_FAILURE;
2434 } else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
2435 return KERN_FAILURE;
2436 } else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
2437 return KERN_FAILURE;
2438 } else {
2439 return KERN_SUCCESS;
2440 }
2441}
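
/*
 * Note: the preflight above rejects DTrace copyin/copyout in three cases:
 * the current map is the kernel map (no user address space to copy from),
 * the active CR3 base does not match the current thread's pmap while shared
 * kernel/user CR3s are in use, or, with no_shared_cr3 set, the CPU is not
 * running on the kernel pmap's CR3.  In each case the user address cannot
 * safely be dereferenced from probe context.
 */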
2442
2443kern_return_t
2444dtrace_copyio_postflight(__unused addr64_t va)
2445{
2446 return KERN_SUCCESS;
2447}
2448#endif /* CONFIG_DTRACE */
2449
2450#include <mach_vm_debug.h>
2451#if MACH_VM_DEBUG
2452#include <vm/vm_debug.h>
2453
2454int
2455pmap_list_resident_pages(
2456 __unused pmap_t pmap,
2457 __unused vm_offset_t *listp,
2458 __unused int space)
2459{
2460 return 0;
2461}
2462#endif /* MACH_VM_DEBUG */
2463
2464
2465#if CONFIG_COREDUMP
2466/* temporary workaround */
2467boolean_t
2468coredumpok(__unused vm_map_t map, __unused mach_vm_offset_t va)
2469{
2470#if 0
2471 pt_entry_t *ptep;
2472
2473 ptep = pmap_pte(map->pmap, va);
2474 if (0 == ptep) {
2475 return FALSE;
2476 }
2477 return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
2478#else
2479 return TRUE;
2480#endif
2481}
2482#endif
2483
2484boolean_t
2485phys_page_exists(ppnum_t pn)
2486{
2487 assert(pn != vm_page_fictitious_addr);
2488
2489 if (!pmap_initialized) {
2490 return TRUE;
2491 }
2492
2493 if (pn == vm_page_guard_addr) {
2494 return FALSE;
2495 }
2496
2497 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2498 return FALSE;
2499 }
2500
2501 return TRUE;
2502}
2503
2504
2505
2506void
2507pmap_switch(pmap_t tpmap)
2508{
2509 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
2510 assert(ml_get_interrupts_enabled() == FALSE);
2511 set_dirbase(tpmap, current_thread(), cpu_number());
2512 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
2513}
2514
2515void
2516pmap_require(pmap_t pmap)
2517{
2518 if (pmap != kernel_pmap) {
2519 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2520 }
2521}
2522
2523/*
2524 * disable no-execute capability on
2525 * the specified pmap
2526 */
2527void
2528pmap_disable_NX(__unused pmap_t pmap)
2529{
2530#if DEVELOPMENT || DEBUG
2531 pmap->nx_enabled = 0;
2532#endif
2533}
2534
2535void
2536pmap_flush_context_init(pmap_flush_context *pfc)
2537{
2538 pfc->pfc_cpus = 0;
2539 pfc->pfc_invalid_global = 0;
2540}
2541
2542static bool
2543pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2544{
2545 bool responded = false;
2546 bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2547 cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2548
2549 if (ngflush) {
2550 if (gflushed) {
2551 responded = true;
2552 }
2553 } else {
2554 if (gflushed) {
2555 responded = true;
2556 } else {
2557 bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2558 cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2559 if (lflushed) {
2560 responded = true;
2561 }
2562 }
2563 }
2564
2565 if (responded == false) {
2566 if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2567 !CPU_CR3_IS_ACTIVE(rcpu) ||
2568 !cpu_is_running(rcpu)) {
2569 responded = true;
2570 }
2571 }
2572 return responded;
2573}
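
/*
 * Note: a remote CPU counts as having responded to a shootdown once the
 * generation counter it bumps in process_pmap_updates() (the global
 * counter, or the local one for non-global requests) has advanced past the
 * snapshot the initiator recorded when posting the request, or when it can
 * be disregarded because its invalid flag is already clear, its CR3 is not
 * active, or it is not running.
 */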
2574
2575extern uint64_t TLBTimeOut;
2576void
2577pmap_flush(
2578 pmap_flush_context *pfc)
2579{
2580 unsigned int my_cpu;
2581 unsigned int cpu;
2582 cpumask_t cpu_bit;
2583 cpumask_t cpus_to_respond = 0;
2584 cpumask_t cpus_to_signal = 0;
2585 cpumask_t cpus_signaled = 0;
2586 boolean_t flush_self = FALSE;
2587 uint64_t deadline;
2588 bool need_global_flush = false;
2589
2590 mp_disable_preemption();
2591
2592 my_cpu = cpu_number();
2593 cpus_to_signal = pfc->pfc_cpus;
2594
2595 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
2596 NULL, cpus_to_signal);
2597
2598 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
2599 if (cpus_to_signal & cpu_bit) {
2600 cpus_to_signal &= ~cpu_bit;
2601
2602 if (!cpu_is_running(cpu)) {
2603 continue;
2604 }
2605
2606 if (pfc->pfc_invalid_global & cpu_bit) {
2607 cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
2608 need_global_flush = true;
2609 } else {
2610 cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
2611 }
2612 cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
2613 cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
2614 mfence();
2615
2616 if (cpu == my_cpu) {
2617 flush_self = TRUE;
2618 continue;
2619 }
2620 if (CPU_CR3_IS_ACTIVE(cpu)) {
2621 cpus_to_respond |= cpu_bit;
2622 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2623 }
2624 }
2625 }
2626 cpus_signaled = cpus_to_respond;
2627
2628 /*
2629 * Flush local tlb if required.
2630 * Do this now to overlap with other processors responding.
2631 */
2632 if (flush_self) {
2633 process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
2634 }
2635
2636 if (cpus_to_respond) {
2637 deadline = mach_absolute_time() +
2638 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2639 boolean_t is_timeout_traced = FALSE;
2640
2641 /*
2642 * Wait for those other cpus to acknowledge
2643 */
2644 while (cpus_to_respond != 0) {
2645 long orig_acks = 0;
2646
2647 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2648 bool responded = false;
2649 if ((cpus_to_respond & cpu_bit) != 0) {
2650 responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
2651 if (responded) {
2652 cpus_to_respond &= ~cpu_bit;
2653 }
2654 cpu_pause();
2655 }
2656
2657 if (cpus_to_respond == 0) {
2658 break;
2659 }
2660 }
2661 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2662 if (machine_timeout_suspended()) {
2663 continue;
2664 }
2665 if (TLBTimeOut == 0) {
2666 if (is_timeout_traced) {
2667 continue;
2668 }
2669
2670 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2671 NULL, cpus_to_signal, cpus_to_respond);
2672
2673 is_timeout_traced = TRUE;
2674 continue;
2675 }
2676 orig_acks = NMIPI_acks;
2677 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
2678 panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
2679 cpus_to_respond, orig_acks, NMIPI_acks, deadline);
2680 }
2681 }
2682 }
2683
2684 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
2685 NULL, cpus_signaled, flush_self);
2686
2687 mp_enable_preemption();
2688}
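
/*
 * Note: pmap_flush() replays the shootdowns recorded in a
 * pmap_flush_context.  Each CPU in pfc_cpus is marked invalid (globally or
 * locally, per pfc_invalid_global), CPUs with an active CR3 are sent
 * MP_TLB_FLUSH IPIs, the local TLB is flushed while the others respond, and
 * the initiator then spins on pmap_tlbi_response() under a
 * TLBTimeOut/LockTimeOut deadline.  If the deadline expires with TLBTimeOut
 * configured, unresponsive CPUs are NMIed and the system panics; with
 * TLBTimeOut of zero the timeout is only traced.
 */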
2689
2690
2691static void
2692invept(void *eptp)
2693{
2694 struct {
2695 uint64_t eptp;
2696 uint64_t reserved;
2697 } __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};
2698
2699 __asm__ volatile ("invept (%%rax), %%rcx"
2700 : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
2701 : "cc", "memory");
2702}
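
/*
 * Note: INVEPT takes a 128-bit in-memory descriptor of the form
 * { EPTP, reserved (zero) } plus an invalidation type in a register.  Here
 * %rcx carries PMAP_INVEPT_SINGLE_CONTEXT and %rax the descriptor's
 * address, so only guest-physical mappings tagged with this EPT pointer are
 * invalidated, and only on the CPU executing the instruction, which is why
 * pmap_flush_tlbs() below broadcasts it with mp_cpus_call(CPUMASK_ALL, ...).
 */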
2703
2704/*
2705 * Called with pmap locked, we:
2706 * - scan through per-cpu data to see which other cpus need to flush
2707 * - send an IPI to each non-idle cpu to be flushed
2708 * - wait for all to signal back that they are inactive or we see that
2709 * they are at a safe point (idle).
2710 * - flush the local tlb if active for this pmap
2711 * - return ... the caller will unlock the pmap
2712 */
2713
2714void
2715pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2716{
2717 unsigned int cpu;
2718 cpumask_t cpu_bit;
2719 cpumask_t cpus_to_signal = 0;
2720 unsigned int my_cpu = cpu_number();
2721 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2722 boolean_t flush_self = FALSE;
2723 uint64_t deadline;
2724 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
2725 bool need_global_flush = false;
2726 uint32_t event_code;
2727 vm_map_offset_t event_startv, event_endv;
2728 boolean_t is_ept = is_ept_pmap(pmap);
2729
2730 assert((processor_avail_count < 2) ||
2731 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2732
2733 assert((endv - startv) >= PAGE_SIZE);
2734 assert(((endv | startv) & PAGE_MASK) == 0);
2735
2736 if (__improbable(kdebug_enable)) {
2737 if (pmap == kernel_pmap) {
2738 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2739 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2740 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2741 } else if (__improbable(is_ept)) {
2742 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2743 event_startv = startv;
2744 event_endv = endv;
2745 } else {
2746 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2747 event_startv = startv;
2748 event_endv = endv;
2749 }
2750 }
2751
2752 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
2753 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
2754 event_startv, event_endv);
2755
2756 if (__improbable(is_ept)) {
2757 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2758 goto out;
2759 }
2760
2761 /*
2762 * Scan other cpus for matching active or task CR3.
2763 * For idle cpus (with no active map) we mark them invalid but
2764 * don't signal -- they'll check as they go busy.
2765 */
2766 if (pmap_pcid_ncpus) {
2767 if (pmap_is_shared) {
2768 need_global_flush = true;
2769 }
2770 pmap_pcid_invalidate_all_cpus(pmap);
2771 mfence();
2772 }
2773
2774 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2775 if (!cpu_is_running(cpu)) {
2776 continue;
2777 }
2778 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2779 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2780
2781 if ((pmap_cr3 == cpu_task_cr3) ||
2782 (pmap_cr3 == cpu_active_cr3) ||
2783 (pmap_is_shared)) {
2784 if (options & PMAP_DELAY_TLB_FLUSH) {
2785 if (need_global_flush == true) {
2786 pfc->pfc_invalid_global |= cpu_bit;
2787 }
2788 pfc->pfc_cpus |= cpu_bit;
2789
2790 continue;
2791 }
2792 if (need_global_flush == true) {
2793 cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
2794 cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
2795 } else {
2796 cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
2797 cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
2798 }
2799
2800 if (cpu == my_cpu) {
2801 flush_self = TRUE;
2802 continue;
2803 }
2804
2805 mfence();
2806
2807 /*
2808 * We don't need to signal processors which will flush
2809 * lazily at the idle state or kernel boundary.
2810 * For example, if we're invalidating the kernel pmap,
2811 * processors currently in userspace don't need to flush
2812 * their TLBs until the next time they enter the kernel.
2813 * Alterations to the address space of a task active
2814 * on a remote processor result in a signal, to
2815 * account for copy operations. (There may be room
2816 * for optimization in such cases).
2817 * The order of the loads below with respect
2818 * to the store to the "cpu_tlb_invalid" field above
2819 * is important--hence the barrier.
2820 */
2821 if (CPU_CR3_IS_ACTIVE(cpu) &&
2822 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
2823 pmap->pm_shared ||
2824 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
2825 cpus_to_signal |= cpu_bit;
2826 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2827 }
2828 }
2829 }
2830
2831 if ((options & PMAP_DELAY_TLB_FLUSH)) {
2832 goto out;
2833 }
2834
2835 /*
2836 * Flush local tlb if required.
2837 * Do this now to overlap with other processors responding.
2838 */
2839 if (flush_self) {
2840 process_pmap_updates(pmap, pmap_is_shared, startv, endv);
2841 }
2842
2843 if (cpus_to_signal) {
2844 cpumask_t cpus_to_respond = cpus_to_signal;
2845
2846 deadline = mach_absolute_time() +
2847 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2848 boolean_t is_timeout_traced = FALSE;
2849
2850 /*
2851 * Wait for those other cpus to acknowledge
2852 */
2853 while (cpus_to_respond != 0) {
2854 long orig_acks = 0;
2855
2856 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2857 bool responded = false;
2858 if ((cpus_to_respond & cpu_bit) != 0) {
2859 responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
2860 if (responded) {
2861 cpus_to_respond &= ~cpu_bit;
2862 }
2863 cpu_pause();
2864 }
2865 if (cpus_to_respond == 0) {
2866 break;
2867 }
2868 }
2869 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2870 if (machine_timeout_suspended()) {
2871 continue;
2872 }
2873 if (TLBTimeOut == 0) {
2874 /* cut tracepoint but don't panic */
2875 if (is_timeout_traced) {
2876 continue;
2877 }
2878
2879 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2880 VM_KERNEL_UNSLIDE_OR_PERM(pmap),
2881 cpus_to_signal,
2882 cpus_to_respond);
2883
2884 is_timeout_traced = TRUE;
2885 continue;
2886 }
2887 orig_acks = NMIPI_acks;
2888 uint64_t tstamp1 = mach_absolute_time();
2889 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
2890 uint64_t tstamp2 = mach_absolute_time();
2891 panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
2892 cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
2893 }
2894 }
2895 }
2896
2897 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
2898 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
2899 }
2900
2901out:
2902 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
2903 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
2904 event_startv, event_endv);
2905}
2906
2907static void
2908process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
2909{
2910 int ccpu = cpu_number();
2911 bool gtlbf = false;
2912
2913 pmap_assert(ml_get_interrupts_enabled() == 0 ||
2914 get_preemption_level() != 0);
2915
2916 if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
2917 cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
2918 cpu_datap(ccpu)->cpu_tlb_invalid = 0;
2919 gtlbf = true;
2920 } else {
2921 cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
2922 cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
2923 }
2924
2925 if (pmap_pcid_ncpus) {
2926 if (p) {
2927 /* TODO global generation count to
2928 * avoid potentially redundant
2929 * csw invalidations post-global invalidation
2930 */
2931 pmap_pcid_validate_cpu(p, ccpu);
2932 pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
2933 } else {
2934 pmap_pcid_validate_current();
2935 pmap_tlbi_range(istart, iend, true, 0);
2936 }
2937 } else {
2938 pmap_tlbi_range(0, ~0ULL, true, 0);
2939 }
2940}
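
/*
 * Note: on the responding CPU the matching generation counter is bumped (a
 * global request also clears the whole cpu_tlb_invalid word) before the
 * invalidation itself.  With PCIDs enabled the flush is scoped: the target
 * pmap's PCID is revalidated for this CPU and the requested [istart, iend)
 * range is invalidated, including global mappings when the pmap is shared
 * or a global request was observed; without PCID support the entire TLB is
 * flushed unconditionally.
 */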
2941
2942void
2943pmap_update_interrupt(void)
2944{
2945 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
2946
2947 if (current_cpu_datap()->cpu_tlb_invalid) {
2948 process_pmap_updates(NULL, true, 0ULL, ~0ULL);
2949 }
2950
2951 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
2952}
2953
2954#include <mach/mach_vm.h> /* mach_vm_region_recurse() */
2955/* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
2956 * and identify ranges with mismatched VM permissions and PTE permissions
2957 */
2958kern_return_t
2959pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
2960{
2961 vm_offset_t cv = sv;
2962 kern_return_t rv = KERN_SUCCESS;
2963 uint64_t skip4 = 0, skip2 = 0;
2964
2965 assert(!is_ept_pmap(ipmap));
2966
2967 sv &= ~PAGE_MASK_64;
2968 ev &= ~PAGE_MASK_64;
2969 while (cv < ev) {
2970 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
2971 (cv < 0xFFFF800000000000ULL))) {
2972 cv = 0xFFFF800000000000ULL;
2973 }
2974 /* Potential inconsistencies from not holding pmap lock
2975 * but harmless for the moment.
2976 */
2977 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
2978 if ((cv + NBPML4) > cv) {
2979 cv += NBPML4;
2980 } else {
2981 break;
2982 }
2983 skip4++;
2984 continue;
2985 }
2986 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
2987 if ((cv + NBPD) > cv) {
2988 cv += NBPD;
2989 } else {
2990 break;
2991 }
2992 skip2++;
2993 continue;
2994 }
2995
2996 pt_entry_t *ptep = pmap_pte(ipmap, cv);
2997 if (ptep && (*ptep & INTEL_PTE_VALID)) {
2998 if (*ptep & INTEL_PTE_WRITE) {
2999 if (!(*ptep & INTEL_PTE_NX)) {
3000 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
3001 rv = KERN_FAILURE;
3002 }
3003 }
3004 }
3005 cv += PAGE_SIZE;
3006 }
3007 kprintf("Completed pmap scan\n");
3008 cv = sv;
3009
3010 struct vm_region_submap_info_64 vbr;
3011 mach_msg_type_number_t vbrcount = 0;
3012 mach_vm_size_t vmsize;
3013 vm_prot_t prot;
3014 uint32_t nesting_depth = 0;
3015 kern_return_t kret;
3016
3017 while (cv < ev) {
3018 for (;;) {
3019 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
3020 if ((kret = mach_vm_region_recurse(ivmmap,
3021 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
3022 (vm_region_recurse_info_t)&vbr,
3023 &vbrcount)) != KERN_SUCCESS) {
3024 break;
3025 }
3026
3027 if (vbr.is_submap) {
3028 nesting_depth++;
3029 continue;
3030 } else {
3031 break;
3032 }
3033 }
3034
3035 if (kret != KERN_SUCCESS) {
3036 break;
3037 }
3038
3039 prot = vbr.protection;
3040
3041 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
3042 kprintf("W+X map entry at address 0x%lx\n", cv);
3043 rv = KERN_FAILURE;
3044 }
3045
3046 if (prot) {
3047 vm_offset_t pcv;
3048 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
3049 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
3050 vm_prot_t tprot;
3051
3052 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
3053 continue;
3054 }
3055 tprot = VM_PROT_READ;
3056 if (*ptep & INTEL_PTE_WRITE) {
3057 tprot |= VM_PROT_WRITE;
3058 }
3059 if ((*ptep & INTEL_PTE_NX) == 0) {
3060 tprot |= VM_PROT_EXECUTE;
3061 }
3062 if (tprot != prot) {
3063 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
3064 rv = KERN_FAILURE;
3065 }
3066 }
3067 }
3068 cv += vmsize;
3069 }
3070 return rv;
3071}
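
/*
 * Note: the verifier above makes two passes over [sv, ev).  The first is a
 * raw PTE walk of the given pmap that reports any valid, writable, non-NX
 * page as W+X, stepping over unmapped 512GB (PML4) and 2MB (PD) regions and
 * hopping across the non-canonical address hole.  The second walks the VM
 * map with mach_vm_region_recurse(), reporting W+X map entries and any
 * resident PTE whose read/write/execute bits disagree with the owning
 * entry's protection.  Either pass downgrades the return value to
 * KERN_FAILURE but keeps scanning.
 */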
3072
3073#if MACH_ASSERT
3074extern int pmap_ledgers_panic;
3075extern int pmap_ledgers_panic_leeway;
3076
3077static void
3078pmap_check_ledgers(
3079 pmap_t pmap)
3080{
3081 int pid;
3082 char *procname;
3083
3084 if (pmap->pmap_pid == 0) {
3085 /*
3086 * This pmap was not or is no longer fully associated
3087 * with a task (e.g. the old pmap after a fork()/exec() or
3088 * spawn()). Its "ledger" still points at a task that is
3089 * now using a different (and active) address space, so
3090 * we can't check that all the pmap ledgers are balanced here.
3091 *
3092 * If the "pid" is set, that means that we went through
3093 * pmap_set_process() in task_terminate_internal(), so
3094 * this task's ledger should not have been re-used and
3095 * all the pmap ledgers should be back to 0.
3096 */
3097 return;
3098 }
3099
3100 pid = pmap->pmap_pid;
3101 procname = pmap->pmap_procname;
3102
3103 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3104
3105 if (pmap->stats.resident_count != 0 ||
3106#if 35156815
3107 /*
3108 * "wired_count" is unfortunately a bit inaccurate, so let's
3109 * tolerate some slight deviation to limit the amount of
3110 * somewhat-spurious assertion failures.
3111 */
3112 pmap->stats.wired_count > 10 ||
3113#else /* 35156815 */
3114 pmap->stats.wired_count != 0 ||
3115#endif /* 35156815 */
3116 pmap->stats.device != 0 ||
3117 pmap->stats.internal != 0 ||
3118 pmap->stats.external != 0 ||
3119 pmap->stats.reusable != 0 ||
3120 pmap->stats.compressed != 0) {
3121 if (pmap_stats_assert &&
3122 pmap->pmap_stats_assert) {
3123 panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3124 pmap, pid, procname,
3125 pmap->stats.resident_count,
3126 pmap->stats.wired_count,
3127 pmap->stats.device,
3128 pmap->stats.internal,
3129 pmap->stats.external,
3130 pmap->stats.reusable,
3131 pmap->stats.compressed);
3132 } else {
3133 printf("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3134 pmap, pid, procname,
3135 pmap->stats.resident_count,
3136 pmap->stats.wired_count,
3137 pmap->stats.device,
3138 pmap->stats.internal,
3139 pmap->stats.external,
3140 pmap->stats.reusable,
3141 pmap->stats.compressed);
3142 }
3143 }
3144}
3145
3146void
3147pmap_set_process(
3148 pmap_t pmap,
3149 int pid,
3150 char *procname)
3151{
3152 if (pmap == NULL) {
3153 return;
3154 }
3155
3156 pmap->pmap_pid = pid;
3157 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3158 if (pmap_ledgers_panic_leeway) {
3159 /*
3160 * XXX FBDP
3161 * Some processes somehow trigger some issues that make
3162 * the pmap stats and ledgers go off track, causing
3163 * some assertion failures and ledger panics.
3164 * Turn off the sanity checks if we allow some ledger leeway
3165 * because of that. We'll still do a final check in
3166 * pmap_check_ledgers() for discrepancies larger than the
3167 * allowed leeway after the address space has been fully
3168 * cleaned up.
3169 */
3170 pmap->pmap_stats_assert = FALSE;
3171 ledger_disable_panic_on_negative(pmap->ledger,
3172 task_ledgers.phys_footprint);
3173 ledger_disable_panic_on_negative(pmap->ledger,
3174 task_ledgers.internal);
3175 ledger_disable_panic_on_negative(pmap->ledger,
3176 task_ledgers.internal_compressed);
3177 ledger_disable_panic_on_negative(pmap->ledger,
3178 task_ledgers.iokit_mapped);
3179 ledger_disable_panic_on_negative(pmap->ledger,
3180 task_ledgers.alternate_accounting);
3181 ledger_disable_panic_on_negative(pmap->ledger,
3182 task_ledgers.alternate_accounting_compressed);
3183 }
3184}
3185#endif /* MACH_ASSERT */
3186
3187
3188#if DEVELOPMENT || DEBUG
3189int pmap_pagezero_mitigation = 1;
3190#endif
3191
3192void
3193pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3194{
3195#if DEVELOPMENT || DEBUG
3196 if (pmap_pagezero_mitigation == 0) {
3197 lpmap->pagezero_accessible = FALSE;
3198 return;
3199 }
3200#endif
3201 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3202 if (lpmap == current_pmap()) {
3203 mp_disable_preemption();
3204 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3205 mp_enable_preemption();
3206 }
3207}
3208
3209uintptr_t
3210pmap_verify_noncacheable(uintptr_t vaddr)
3211{
3212 pt_entry_t *ptep = NULL;
3213 ptep = pmap_pte(kernel_pmap, vaddr);
3214 if (ptep == NULL) {
3215 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3216 }
3217 /* Non-cacheable OK */
3218 if (*ptep & (INTEL_PTE_NCACHE)) {
3219 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3220 }
3221 /* Write-combined OK */
3222 if (*ptep & (INTEL_PTE_PAT)) {
3223 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3224 }
3225 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3226 /*NOTREACHED*/
3227 return 0;
3228}
3229
3230void
3231trust_cache_init(void)
3232{
3233 // Unsupported on this architecture.
3234}
3235
3236kern_return_t
3237pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache __unused *trust_cache,
3238 const vm_size_t __unused trust_cache_len)
3239{
3240 // Unsupported on this architecture.
3241 return KERN_NOT_SUPPORTED;
3242}
3243
3244pmap_tc_ret_t
3245pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cache,
3246 const vm_size_t __unused trust_cache_len,
3247 uint8_t const * __unused img4_manifest,
3248 const vm_size_t __unused img4_manifest_buffer_len,
3249 const vm_size_t __unused img4_manifest_actual_len,
3250 bool __unused dry_run)
3251{
3252 // Unsupported on this architecture.
3253 return PMAP_TC_UNKNOWN_FORMAT;
3254}
3255
3256
3257bool
3258pmap_is_trust_cache_loaded(const uuid_t __unused uuid)
3259{
3260 // Unsupported on this architecture.
3261 return false;
3262}
3263
3264bool
3265pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
3266{
3267 // Unsupported on this architecture.
3268 return false;
3269}
3270
3271uint32_t
3272pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3273{
3274 // Unsupported on this architecture.
3275 return false;
3276}
3277
3278SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
3279uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
3280
3281void
3282pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3283{
3284 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3285 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
3286 simple_unlock(&pmap_compilation_service_cdhash_lock);
3287
3288#if DEVELOPMENT || DEBUG
3289	printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[3]);
3290#endif
3291}
3292
3293bool
3294pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3295{
3296 bool match = false;
3297
3298 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3299 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
3300 match = true;
3301 }
3302 simple_unlock(&pmap_compilation_service_cdhash_lock);
3303
3304#if DEVELOPMENT || DEBUG
3305 if (match) {
3306 printf("Matched Compilation Service CDHash through the PMAP\n");
3307 }
3308#endif
3309
3310 return match;
3311}
3312
3313bool
3314pmap_in_ppl(void)
3315{
3316 // Nonexistent on this architecture.
3317 return false;
3318}
3319
3320void
3321pmap_lockdown_image4_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
3322{
3323 // Unsupported on this architecture.
3324}
3325
3326kern_return_t
3327pmap_cs_allow_invalid(__unused pmap_t pmap)
3328{
3329 // Unsupported on this architecture.
3330 return KERN_SUCCESS;
3331}
3332
3333void *
3334pmap_claim_reserved_ppl_page(void)
3335{
3336 // Unsupported on this architecture.
3337 return NULL;
3338}
3339
3340void
3341pmap_free_reserved_ppl_page(void __unused *kva)
3342{
3343 // Unsupported on this architecture.
3344}
3345
3346#if DEVELOPMENT || DEBUG
3347/*
3348 * Used for unit testing recovery from text corruptions.
3349 */
3350kern_return_t
3351pmap_test_text_corruption(pmap_paddr_t pa)
3352{
3353 int pai;
3354 uint8_t *va;
3355
3356 pai = ppn_to_pai(atop(pa));
3357 if (!IS_MANAGED_PAGE(pai)) {
3358 return KERN_FAILURE;
3359 }
3360
3361 va = (uint8_t *)PHYSMAP_PTOV(pa);
3362 va[0] = 0x0f; /* opcode for UD2 */
3363 va[1] = 0x0b;
3364
3365 return KERN_SUCCESS;
3366}
3367#endif /* DEVELOPMENT || DEBUG */