b0d623f7 1/*
6d2010ae 2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58
59/*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidation or protection-reduction
 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91#include <string.h>
92#include <mach_ldebug.h>
93
94#include <libkern/OSAtomic.h>
95
96#include <mach/machine/vm_types.h>
97
98#include <mach/boolean.h>
99#include <kern/thread.h>
100#include <kern/zalloc.h>
101#include <kern/queue.h>
316670eb 102#include <kern/ledger.h>
6d2010ae 103#include <kern/mach_param.h>
b0d623f7 104
105#include <kern/kalloc.h>
106#include <kern/spl.h>
107
108#include <vm/pmap.h>
109#include <vm/vm_map.h>
110#include <vm/vm_kern.h>
111#include <mach/vm_param.h>
112#include <mach/vm_prot.h>
113#include <vm/vm_object.h>
114#include <vm/vm_page.h>
115
116#include <mach/machine/vm_param.h>
117#include <machine/thread.h>
118
119#include <kern/misc_protos.h> /* prototyping */
120#include <i386/misc_protos.h>
6d2010ae 121#include <i386/i386_lowmem.h>
122#include <x86_64/lowglobals.h>
123
124#include <i386/cpuid.h>
125#include <i386/cpu_data.h>
126#include <i386/cpu_number.h>
127#include <i386/machine_cpu.h>
128#include <i386/seg.h>
129#include <i386/serial_io.h>
130#include <i386/cpu_capabilities.h>
131#include <i386/machine_routines.h>
132#include <i386/proc_reg.h>
133#include <i386/tsc.h>
134#include <i386/pmap_internal.h>
6d2010ae 135#include <i386/pmap_pcid.h>
136#if CONFIG_VMX
137#include <i386/vmx/vmx_cpu.h>
138#endif
b0d623f7 139
140#include <vm/vm_protos.h>
141
142#include <i386/mp.h>
143#include <i386/mp_desc.h>
144#include <libkern/kernel_mach_header.h>
145
146#include <pexpert/i386/efi.h>
b0d623f7 147
148#if MACH_ASSERT
149int pmap_stats_assert = 1;
150#endif /* MACH_ASSERT */
b0d623f7 151
152#ifdef IWANTTODEBUG
153#undef DEBUG
154#define DEBUG 1
155#define POSTCODE_DELAY 1
156#include <i386/postcode.h>
157#endif /* IWANTTODEBUG */
158
159#ifdef PMAP_DEBUG
160#define DBG(x...) kprintf("DBG: " x)
161#else
162#define DBG(x...)
163#endif
164/* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
165 * in the trampolines for kernel/user boundary TLB coherency.
b0d623f7 166 */
167char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
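/*
 * A minimal illustration of the negative-array-size idiom used above
 * (hypothetical struct and field names, for exposition only): if the
 * offset condition is false the array size evaluates to -1 and the
 * compile fails; if it is true the assert costs a single byte.
 */
#if 0	/* illustrative sketch, not part of the original source */
struct cpu_layout_example {
	uint64_t	active_cr3;	/* expected at offset 0 */
	uint64_t	tlb_invalid;	/* expected at offset 8 */
};
char cpu_layout_example_assert[
	(offsetof(struct cpu_layout_example, tlb_invalid) == 8) ? 1 : -1];
#endif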
168boolean_t pmap_trace = FALSE;
b0d623f7 169
6d2010ae 170boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
b0d623f7 171
172int nx_enabled = 1; /* enable no-execute protection -- set during boot */
173
174#if DEBUG || DEVELOPMENT
175int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
176int allow_stack_exec = 0; /* No apps may execute from the stack by default */
177#else /* DEBUG || DEVELOPMENT */
178const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
179const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
180#endif /* DEBUG || DEVELOPMENT */
181
const boolean_t cpu_64bit = TRUE; /* But of course! */
183
184uint64_t max_preemption_latency_tsc = 0;
185
186pv_hashed_entry_t *pv_hash_table; /* hash lists */
187
fe8ab488 188uint32_t npvhashmask = 0, npvhashbuckets = 0;
b0d623f7 189
190pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
191pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
192decl_simple_lock_data(,pv_hashed_free_list_lock)
193decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
194decl_simple_lock_data(,pv_hash_table_lock)
195
196decl_simple_lock_data(,phys_backup_lock)
197
198zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
199
200/*
201 * First and last physical addresses that we maintain any information
202 * for. Initialized to zero so that pmap operations done before
203 * pmap_init won't touch any non-existent structures.
204 */
205boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
206
207static struct vm_object kptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
208static struct vm_object kpml4obj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
209static struct vm_object kpdptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
210
/*
 * Array of physical page attributes for managed pages.
 * One byte per physical page.
 */
215char *pmap_phys_attributes;
316670eb 216ppnum_t last_managed_page = 0;
217
218/*
219 * Amount of virtual memory mapped by one
220 * page-directory entry.
221 */
222
223uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
224
225unsigned pmap_memory_region_count;
226unsigned pmap_memory_region_current;
227
228pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
229
230/*
231 * Other useful macros.
232 */
233#define current_pmap() (vm_map_pmap(current_thread()->map))
234
235struct pmap kernel_pmap_store;
236pmap_t kernel_pmap;
237
238struct zone *pmap_zone; /* zone of pmap structures */
239
240struct zone *pmap_anchor_zone;
241int pmap_debug = 0; /* flag for debugging prints */
242
b0d623f7 243unsigned int inuse_ptepages_count = 0;
244long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
245unsigned int bootstrap_wired_pages = 0;
246int pt_fake_zone_index = -1;
b0d623f7 247
6d2010ae 248extern long NMIPI_acks;
b0d623f7 249
250boolean_t kernel_text_ps_4K = TRUE;
251boolean_t wpkernel = TRUE;
252
253extern char end;
254
255static int nkpt;
256
257pt_entry_t *DMAP1, *DMAP2;
258caddr_t DADDR1;
259caddr_t DADDR2;
b0d623f7 260
261boolean_t pmap_disable_kheap_nx = FALSE;
262boolean_t pmap_disable_kstack_nx = FALSE;
b0d623f7 263
316670eb 264extern long __stack_chk_guard[];
b0d623f7 265
7e41aa88 266static uint64_t pmap_eptp_flags = 0;
267boolean_t pmap_ept_support_ad = FALSE;
268
269
270/*
271 * Map memory at initialization. The physical addresses being
272 * mapped are not managed and are never unmapped.
273 *
274 * For now, VM is already on, we only need to map the
275 * specified memory.
276 */
277vm_offset_t
278pmap_map(
279 vm_offset_t virt,
280 vm_map_offset_t start_addr,
281 vm_map_offset_t end_addr,
282 vm_prot_t prot,
283 unsigned int flags)
284{
285 int ps;
286
287 ps = PAGE_SIZE;
288 while (start_addr < end_addr) {
289 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
316670eb 290 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
291 virt += ps;
292 start_addr += ps;
293 }
294 return(virt);
295}
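/*
 * A minimal usage sketch of pmap_map() (hypothetical helper and
 * addresses, for exposition only): map a single physical page
 * read/write into the kernel pmap during early bring-up and return the
 * first virtual address past the new mapping.
 */
#if 0	/* illustrative sketch, not part of the original source */
static vm_offset_t
pmap_map_example(vm_offset_t va_cursor, vm_map_offset_t pa)
{
	return pmap_map(va_cursor, pa, pa + PAGE_SIZE,
	    VM_PROT_READ | VM_PROT_WRITE, VM_WIMG_USE_DEFAULT);
}
#endif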
296
297extern char *first_avail;
298extern vm_offset_t virtual_avail, virtual_end;
299extern pmap_paddr_t avail_start, avail_end;
300extern vm_offset_t sHIB;
301extern vm_offset_t eHIB;
302extern vm_offset_t stext;
303extern vm_offset_t etext;
316670eb 304extern vm_offset_t sdata, edata;
39037602 305extern vm_offset_t sconst, econst;
b0d623f7 306
6d2010ae
A
307extern void *KPTphys;
308
13f56ec4 309boolean_t pmap_smep_enabled = FALSE;
fe8ab488 310boolean_t pmap_smap_enabled = FALSE;
13f56ec4 311
b0d623f7
A
312void
313pmap_cpu_init(void)
314{
bd504ef0 315 cpu_data_t *cdp = current_cpu_datap();
b0d623f7
A
316 /*
317 * Here early in the life of a processor (from cpu_mode_init()).
6d2010ae 318 * Ensure global page feature is disabled at this point.
b0d623f7 319 */
6d2010ae 320
b0d623f7
A
321 set_cr4(get_cr4() &~ CR4_PGE);
322
323 /*
324 * Initialize the per-cpu, TLB-related fields.
325 */
bd504ef0
A
326 cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
327 cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
328 cdp->cpu_tlb_invalid = FALSE;
329 cdp->cpu_task_map = TASK_MAP_64BIT;
6d2010ae 330 pmap_pcid_configure();
13f56ec4 331 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
39037602
A
332 pmap_smep_enabled = TRUE;
333#if DEVELOPMENT || DEBUG
13f56ec4 334 boolean_t nsmep;
39037602
A
335 if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
336 pmap_smep_enabled = FALSE;
337 }
338#endif
339 if (pmap_smep_enabled) {
13f56ec4 340 set_cr4(get_cr4() | CR4_SMEP);
13f56ec4 341 }
39037602 342
13f56ec4 343 }
04b8595b 344 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
39037602
A
345 pmap_smap_enabled = TRUE;
346#if DEVELOPMENT || DEBUG
04b8595b 347 boolean_t nsmap;
39037602
A
348 if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
349 pmap_smap_enabled = FALSE;
350 }
351#endif
352 if (pmap_smap_enabled) {
04b8595b 353 set_cr4(get_cr4() | CR4_SMAP);
04b8595b
A
354 }
355 }
bd504ef0
A
356
357 if (cdp->cpu_fixed_pmcs_enabled) {
358 boolean_t enable = TRUE;
359 cpu_pmc_control(&enable);
360 }
b0d623f7
A
361}
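/*
 * Note on the SMEP/SMAP overrides above: they are honored only on
 * DEVELOPMENT/DEBUG kernels. A hypothetical way to exercise them is to
 * add "-pmap_smep_disable" or "-pmap_smap_disable" to the boot-args
 * nvram variable before booting a development kernel.
 */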
362
363static uint32_t pmap_scale_shift(void) {
364 uint32_t scale = 0;
b0d623f7 365
366 if (sane_size <= 8*GB) {
367 scale = (uint32_t)(sane_size / (2 * GB));
368 } else if (sane_size <= 32*GB) {
369 scale = 4 + (uint32_t)((sane_size - (8 * GB))/ (4 * GB));
370 } else {
371 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB))/ (8 * GB)));
372 }
373 return scale;
374}
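/*
 * Worked examples of the scaling above (values computed from the code,
 * not taken from the original source): sane_size = 4GB falls in the
 * first branch and yields scale = 4/2 = 2; 16GB yields 4 + (16-8)/4 = 6;
 * 64GB yields 10 + MIN(4, (64-32)/8) = 14. The result is used below to
 * size the PV hash: npvhashmask = (NPVHASHBUCKETS << scale) - 1.
 */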
375
376/*
377 * Bootstrap the system enough to run with virtual memory.
378 * Map the kernel's code and data, and allocate the system page table.
379 * Called with mapping OFF. Page_size must already be set.
380 */
381
382void
383pmap_bootstrap(
384 __unused vm_offset_t load_start,
385 __unused boolean_t IA32e)
386{
387#if NCOPY_WINDOWS > 0
388 vm_offset_t va;
389 int i;
390#endif
b0d623f7
A
391 assert(IA32e);
392
393 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
394 * known to VM */
395 /*
396 * The kernel's pmap is statically allocated so we don't
397 * have to use pmap_create, which is unlikely to work
398 * correctly at this part of the boot sequence.
399 */
400
401 kernel_pmap = &kernel_pmap_store;
402 kernel_pmap->ref_count = 1;
316670eb 403 kernel_pmap->nx_enabled = TRUE;
b0d623f7
A
404 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
405 kernel_pmap->pm_obj = (vm_object_t) NULL;
406 kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD);
407 kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT);
408 kernel_pmap->pm_pml4 = IdlePML4;
409 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
3e170ce0 410 kernel_pmap->pm_eptp = 0;
6d2010ae 411 pmap_pcid_initialize_kernel(kernel_pmap);
b0d623f7 412
6d2010ae 413
b0d623f7
A
414
415 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
416
417 nkpt = NKPT;
418 OSAddAtomic(NKPT, &inuse_ptepages_count);
6d2010ae
A
419 OSAddAtomic64(NKPT, &alloc_ptepages_count);
420 bootstrap_wired_pages = NKPT;
b0d623f7
A
421
422 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
423 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
424
425#if NCOPY_WINDOWS > 0
426 /*
427 * Reserve some special page table entries/VA space for temporary
428 * mapping of pages.
429 */
430#define SYSMAP(c, p, v, n) \
431 v = (c)va; va += ((n)*INTEL_PGBYTES);
432
433 va = virtual_avail;
434
435 for (i=0; i<PMAP_NWINDOWS; i++) {
436#if 1
437 kprintf("trying to do SYSMAP idx %d %p\n", i,
438 current_cpu_datap());
439 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
440 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
441 kprintf("two stuff %p %p\n",
442 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
443 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
444#endif
445 SYSMAP(caddr_t,
446 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
447 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
448 1);
449 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
450 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
451 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
452 }
453
454 /* DMAP user for debugger */
455 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
456 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
457
458 virtual_avail = va;
459#endif
460 if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof (npvhashmask))) {
461 npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
b0d623f7 462
463 }
464
465 npvhashbuckets = npvhashmask + 1;
466
467 if (0 != ((npvhashbuckets) & npvhashmask)) {
468 panic("invalid hash %d, must be ((2^N)-1), "
469 "using default %d\n", npvhashmask, NPVHASHMASK);
470 }
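/*
 * A short illustration of the check above (hypothetical boot-arg
 * values): npvhash=4095 (0xfff) gives npvhashbuckets = 4096 and
 * (4096 & 4095) == 0, so the table size is a power of two and the mask
 * can stand in for a modulo when hashing. npvhash=4000 would leave
 * (4001 & 4000) != 0 and trip the panic.
 */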
471
472 simple_lock_init(&kernel_pmap->lock, 0);
473 simple_lock_init(&pv_hashed_free_list_lock, 0);
474 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
475 simple_lock_init(&pv_hash_table_lock,0);
fe8ab488 476 simple_lock_init(&phys_backup_lock, 0);
b0d623f7
A
477
478 pmap_cpu_init();
479
6d2010ae
A
480 if (pmap_pcid_ncpus)
481 printf("PMAP: PCID enabled\n");
482
13f56ec4
A
483 if (pmap_smep_enabled)
484 printf("PMAP: Supervisor Mode Execute Protection enabled\n");
04b8595b
A
485 if (pmap_smap_enabled)
486 printf("PMAP: Supervisor Mode Access Protection enabled\n");
7ddcb079 487
316670eb
A
488#if DEBUG
489 printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
fe8ab488 490 printf("early_random(): 0x%qx\n", early_random());
316670eb
A
491#endif
492 boolean_t ptmp;
493 /* Check if the user has requested disabling stack or heap no-execute
494 * enforcement. These are "const" variables; that qualifier is cast away
495 * when altering them. The TEXT/DATA const sections are marked
496 * write protected later in the kernel startup sequence, so altering
497 * them is possible at this point, in pmap_bootstrap().
498 */
499 if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
500 boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
501 *pdknxp = TRUE;
502 }
503
504 if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
505 boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
506 *pdknhp = TRUE;
507 }
508
6d2010ae
A
509 boot_args *args = (boot_args *)PE_state.bootArgs;
510 if (args->efiMode == kBootArgsEfiMode32) {
511 printf("EFI32: kernel virtual space limited to 4GB\n");
512 virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
513 }
b0d623f7
A
514 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
515 (long)KERNEL_BASE, (long)virtual_end);
516 kprintf("Available physical space from 0x%llx to 0x%llx\n",
517 avail_start, avail_end);
518
519 /*
520 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
521 * in the DEBUG kernel) to force the kernel to switch to its own map
522 * (and cr3) when control is in kernelspace. The kernel's map does not
523 * include (i.e. share) userspace so wild references will cause
524 * a panic. Only copyin and copyout are exempt from this.
525 */
526 (void) PE_parse_boot_argn("-no_shared_cr3",
527 &no_shared_cr3, sizeof (no_shared_cr3));
528 if (no_shared_cr3)
529 kprintf("Kernel not sharing user map\n");
530
531#ifdef PMAP_TRACES
532 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
533 kprintf("Kernel traces for pmap operations enabled\n");
534 }
535#endif /* PMAP_TRACES */
39037602
A
536
537#if MACH_ASSERT
538 PE_parse_boot_argn("pmap_stats_assert",
539 &pmap_stats_assert,
540 sizeof (pmap_stats_assert));
541#endif /* MACH_ASSERT */
b0d623f7
A
542}
543
544void
545pmap_virtual_space(
546 vm_offset_t *startp,
547 vm_offset_t *endp)
548{
549 *startp = virtual_avail;
550 *endp = virtual_end;
551}
552
553
554
555
556#if HIBERNATION
557
558#include <IOKit/IOHibernatePrivate.h>
559
560int32_t pmap_npages;
561int32_t pmap_teardown_last_valid_compact_indx = -1;
562
563
564void hibernate_rebuild_pmap_structs(void);
565void hibernate_teardown_pmap_structs(addr64_t *, addr64_t *);
566void pmap_pack_index(uint32_t);
567int32_t pmap_unpack_index(pv_rooted_entry_t);
568
569
570int32_t
571pmap_unpack_index(pv_rooted_entry_t pv_h)
572{
573 int32_t indx = 0;
574
575 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
576 indx = indx << 16;
577 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
578
579 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
580 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
581
582 return (indx);
583}
584
585
586void
587pmap_pack_index(uint32_t indx)
588{
589 pv_rooted_entry_t pv_h;
590
591 pv_h = &pv_head_table[indx];
592
593 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
594 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
595
596 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
597 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
598}
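/*
 * A worked example of the packing scheme above (illustrative index
 * value only): pmap_pack_index(0x00123456) stores 0x0012 in the top 16
 * bits of qlink.next and 0x3456 in the top 16 bits of qlink.prev,
 * reusing bits that are otherwise constant in kernel pointers.
 * pmap_unpack_index() reassembles (0x0012 << 16) | 0x3456 = 0x00123456
 * and then restores both top halves to 0xffff, the canonical form for
 * kernel virtual addresses.
 */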
599
600
601void
602hibernate_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
603{
604 int32_t i;
605 int32_t compact_target_indx;
606
607 compact_target_indx = 0;
608
609 for (i = 0; i < pmap_npages; i++) {
610 if (pv_head_table[i].pmap == PMAP_NULL) {
611
612 if (pv_head_table[compact_target_indx].pmap != PMAP_NULL)
613 compact_target_indx = i;
614 } else {
615 pmap_pack_index((uint32_t)i);
616
617 if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to its new home
				 */
622 pv_head_table[compact_target_indx] = pv_head_table[i];
623 pv_head_table[i].pmap = PMAP_NULL;
624
625 pmap_teardown_last_valid_compact_indx = compact_target_indx;
626 compact_target_indx++;
627 } else
628 pmap_teardown_last_valid_compact_indx = i;
629 }
630 }
631 *unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx+1];
632 *unneeded_end = (addr64_t)&pv_head_table[pmap_npages-1];
633
634 HIBLOG("hibernate_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
635}
636
637
638void
639hibernate_rebuild_pmap_structs(void)
640{
39037602 641 int32_t cindx, eindx, rindx = 0;
39236c6e
A
642 pv_rooted_entry_t pv_h;
643
644 eindx = (int32_t)pmap_npages;
645
646 for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
647
648 pv_h = &pv_head_table[cindx];
649
650 rindx = pmap_unpack_index(pv_h);
651 assert(rindx < pmap_npages);
652
653 if (rindx != cindx) {
654 /*
655 * this pv_rooted_entry_t was moved by hibernate_teardown_pmap_structs,
656 * so move it back to its real location
657 */
658 pv_head_table[rindx] = pv_head_table[cindx];
659 }
660 if (rindx+1 != eindx) {
			/*
			 * the 'hole' between this pv_rooted_entry_t and the previous
			 * pv_rooted_entry_t we moved needs to be initialized as
			 * a range of zeroed pv_rooted_entry_t's
			 */
666 bzero((char *)&pv_head_table[rindx+1], (eindx - rindx - 1) * sizeof (struct pv_rooted_entry));
667 }
668 eindx = rindx;
669 }
670 if (rindx)
671 bzero ((char *)&pv_head_table[0], rindx * sizeof (struct pv_rooted_entry));
672
673 HIBLOG("hibernate_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
674}
675
676#endif
677
b0d623f7
A
678/*
679 * Initialize the pmap module.
680 * Called by vm_init, to initialize any structures that the pmap
681 * system needs to map virtual memory.
682 */
683void
684pmap_init(void)
685{
686 long npages;
687 vm_offset_t addr;
060df5ea 688 vm_size_t s, vsize;
b0d623f7
A
689 vm_map_offset_t vaddr;
690 ppnum_t ppn;
691
692
693 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
39236c6e 694 _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);
b0d623f7
A
695
696 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
39236c6e 697 _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);
b0d623f7
A
698
699 kernel_pmap->pm_obj = &kptobj_object_store;
39236c6e 700 _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);
b0d623f7
A
701
702 /*
703 * Allocate memory for the pv_head_table and its lock bits,
704 * the modify bit array, and the pte_page table.
705 */
706
707 /*
708 * zero bias all these arrays now instead of off avail_start
709 * so we cover all memory
710 */
711
712 npages = i386_btop(avail_end);
39236c6e
A
713#if HIBERNATION
714 pmap_npages = (uint32_t)npages;
715#endif
b0d623f7 716 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
fe8ab488 717 + (sizeof (struct pv_hashed_entry_t *) * (npvhashbuckets))
b0d623f7 718 + pv_lock_table_size(npages)
fe8ab488 719 + pv_hash_lock_table_size((npvhashbuckets))
b0d623f7 720 + npages);
b0d623f7
A
721 s = round_page(s);
722 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
3e170ce0 723 KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PMAP)
b0d623f7
A
724 != KERN_SUCCESS)
725 panic("pmap_init");
726
727 memset((char *)addr, 0, s);
728
060df5ea
A
729 vaddr = addr;
730 vsize = s;
731
b0d623f7 732#if PV_DEBUG
fe8ab488 733 if (0 == npvhashmask) panic("npvhashmask not initialized");
b0d623f7
A
734#endif
735
736 /*
737 * Allocate the structures first to preserve word-alignment.
738 */
739 pv_head_table = (pv_rooted_entry_t) addr;
740 addr = (vm_offset_t) (pv_head_table + npages);
741
742 pv_hash_table = (pv_hashed_entry_t *)addr;
fe8ab488 743 addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));
b0d623f7
A
744
745 pv_lock_table = (char *) addr;
746 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
747
748 pv_hash_lock_table = (char *) addr;
fe8ab488 749 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));
b0d623f7
A
750
751 pmap_phys_attributes = (char *) addr;
752
753 ppnum_t last_pn = i386_btop(avail_end);
754 unsigned int i;
755 pmap_memory_region_t *pmptr = pmap_memory_regions;
756 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
757 if (pmptr->type != kEfiConventionalMemory)
758 continue;
316670eb 759 ppnum_t pn;
b0d623f7
A
760 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
761 if (pn < last_pn) {
762 pmap_phys_attributes[pn] |= PHYS_MANAGED;
060df5ea 763
b0d623f7
A
764 if (pn > last_managed_page)
765 last_managed_page = pn;
060df5ea 766
7ddcb079 767 if (pn >= lowest_hi && pn <= highest_hi)
060df5ea 768 pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
b0d623f7
A
769 }
770 }
771 }
060df5ea
A
772 while (vsize) {
773 ppn = pmap_find_phys(kernel_pmap, vaddr);
b0d623f7 774
060df5ea
A
775 pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;
776
777 vaddr += PAGE_SIZE;
778 vsize -= PAGE_SIZE;
779 }
b0d623f7
A
780 /*
781 * Create the zone of physical maps,
782 * and of the physical-to-virtual entries.
783 */
784 s = (vm_size_t) sizeof(struct pmap);
785 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
060df5ea
A
786 zone_change(pmap_zone, Z_NOENCRYPT, TRUE);
787
6d2010ae
A
788 pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors");
789 zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE);
790
6d2010ae 791 /* The anchor is required to be page aligned. Zone debugging adds
316670eb
A
792 * padding which may violate that requirement. Tell the zone
793 * subsystem that alignment is required.
6d2010ae 794 */
316670eb
A
795
796 zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
6d2010ae 797
b0d623f7 798 s = (vm_size_t) sizeof(struct pv_hashed_entry);
6d2010ae
A
799 pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */,
800 4096 * 3 /* LCM x86_64*/, "pv_list");
060df5ea 801 zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);
b0d623f7
A
802
803 /* create pv entries for kernel pages mapped by low level
804 startup code. these have to exist so we can pmap_remove()
805 e.g. kext pages from the middle of our addr space */
806
807 vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
6d2010ae 808 for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) {
b0d623f7
A
809 pv_rooted_entry_t pv_e;
810
811 pv_e = pai_to_pvh(ppn);
39037602 812 pv_e->va_and_flags = vaddr;
b0d623f7
A
813 vaddr += PAGE_SIZE;
814 pv_e->pmap = kernel_pmap;
815 queue_init(&pv_e->qlink);
816 }
817 pmap_initialized = TRUE;
818
b0d623f7
A
819 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
820
821 /*
822 * Ensure the kernel's PML4 entry exists for the basement
823 * before this is shared with any user.
824 */
316670eb 825 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
3e170ce0
A
826
827#if CONFIG_VMX
828 pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
7e41aa88 829 pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
3e170ce0 830#endif /* CONFIG_VMX */
316670eb
A
831}
832
833static
834void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) {
835 uint64_t ev = sv + nxrosz, cv = sv;
836 pd_entry_t *pdep;
837 pt_entry_t *ptep = NULL;
838
3e170ce0
A
839 assert(!is_ept_pmap(npmap));
840
316670eb
A
841 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
842
843 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
844 uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
845
846 if (*pdep & INTEL_PTE_PS) {
847 if (NX)
848 *pdep |= INTEL_PTE_NX;
849 if (ro)
850 *pdep &= ~INTEL_PTE_WRITE;
851 cv += NBPD;
852 cv &= ~((uint64_t) PDEMASK);
853 pdep = pmap_pde(npmap, cv);
854 continue;
855 }
856
857 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
858 if (NX)
859 *ptep |= INTEL_PTE_NX;
860 if (ro)
861 *ptep &= ~INTEL_PTE_WRITE;
862 cv += NBPT;
863 ptep = pmap_pte(npmap, cv);
864 }
865 }
866 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
b0d623f7
A
867}
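/*
 * A minimal usage sketch of pmap_mark_range() (hypothetical helper and
 * range, for exposition only): mark an already-mapped, page-aligned
 * kernel range both non-executable and read-only. Both the start and
 * the size must be page-aligned, per the assert above.
 */
#if 0	/* illustrative sketch, not part of the original source */
static void
pmap_mark_range_example(uint64_t start, uint64_t len)
{
	pmap_mark_range(kernel_pmap, start, len, TRUE /* NX */, TRUE /* ro */);
}
#endif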
868
6d2010ae
A
869/*
870 * Called once VM is fully initialized so that we can release unused
871 * sections of low memory to the general pool.
872 * Also complete the set-up of identity-mapped sections of the kernel:
873 * 1) write-protect kernel text
874 * 2) map kernel text using large pages if possible
875 * 3) read and write-protect page zero (for K32)
876 * 4) map the global page at the appropriate virtual address.
877 *
878 * Use of large pages
879 * ------------------
880 * To effectively map and write-protect all kernel text pages, the text
881 * must be 2M-aligned at the base, and the data section above must also be
882 * 2M-aligned. That is, there's padding below and above. This is achieved
883 * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
885 * memory layout is:
886 *
887 * : :
888 * | __DATA |
889 * sdata: ================== 2Meg
890 * | |
891 * | zero-padding |
892 * | |
893 * etext: ------------------
894 * | |
895 * : :
896 * | |
897 * | __TEXT |
898 * | |
899 * : :
900 * | |
901 * stext: ================== 2Meg
902 * | |
903 * | zero-padding |
904 * | |
905 * eHIB: ------------------
906 * | __HIB |
907 * : :
908 *
909 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
910 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
911 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
912 * The now unused level-1 PTE pages are also freed.
913 */
316670eb 914extern ppnum_t vm_kernel_base_page;
39037602
A
915static uint32_t constptes = 0, dataptes = 0;
916
917void pmap_lowmem_finalize(void) {
6d2010ae
A
918 spl_t spl;
919 int i;
920
6d2010ae
A
921 /*
922 * Update wired memory statistics for early boot pages
923 */
316670eb 924 PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
6d2010ae
A
925
	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 * We can't free all the pages to VM that EFI reports available.
	 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 * There's also a size miscalculation here: pend is one page less
	 * than it should be but this is not fixed to be backwards
	 * compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
937 for (i = 0;
316670eb 938 pmap_memory_regions[i].end < vm_kernel_base_page;
6d2010ae 939 i++) {
316670eb
A
940 vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
941 vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end+1);
6d2010ae 942
316670eb
A
943 DBG("pmap region %d [%p..[%p\n",
944 i, (void *) pbase, (void *) pend);
945
946 if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED)
947 continue;
948 /*
949 * rdar://6332712
950 * Adjust limits not to free pages in range 0xc0000-0xff000.
951 */
952 if (pbase >= 0xc0000 && pend <= 0x100000)
953 continue;
954 if (pbase < 0xc0000 && pend > 0x100000) {
955 /* page range entirely within region, free lower part */
956 DBG("- ml_static_mfree(%p,%p)\n",
957 (void *) ml_static_ptovirt(pbase),
958 (void *) (0xc0000-pbase));
959 ml_static_mfree(ml_static_ptovirt(pbase),0xc0000-pbase);
960 pbase = 0x100000;
961 }
962 if (pbase < 0xc0000)
963 pend = MIN(pend, 0xc0000);
964 if (pend > 0x100000)
965 pbase = MAX(pbase, 0x100000);
966 DBG("- ml_static_mfree(%p,%p)\n",
6d2010ae 967 (void *) ml_static_ptovirt(pbase),
316670eb 968 (void *) (pend - pbase));
6d2010ae
A
969 ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
970 }
971
316670eb
A
972 /* A final pass to get rid of all initial identity mappings to
973 * low pages.
974 */
975 DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
976
143464d5
A
977 /*
978 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
979 * Non-boot-cpu GDT aliases will be remapped later as needed.
980 */
316670eb
A
981 pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
982
6d2010ae
A
983 /*
984 * If text and data are both 2MB-aligned,
985 * we can map text with large-pages,
986 * unless the -kernel_text_ps_4K boot-arg overrides.
987 */
988 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
989 kprintf("Kernel text is 2MB aligned");
990 kernel_text_ps_4K = FALSE;
991 if (PE_parse_boot_argn("-kernel_text_ps_4K",
992 &kernel_text_ps_4K,
993 sizeof (kernel_text_ps_4K)))
994 kprintf(" but will be mapped with 4K pages\n");
995 else
996 kprintf(" and will be mapped with 2M pages\n");
997 }
998
999 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel));
1000 if (wpkernel)
1001 kprintf("Kernel text %p-%p to be write-protected\n",
1002 (void *) stext, (void *) etext);
1003
1004 spl = splhigh();
1005
1006 /*
1007 * Scan over text if mappings are to be changed:
1008 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
1010 */
1011 if (kernel_text_ps_4K && wpkernel) {
1012 vm_offset_t myva;
1013 for (myva = stext; myva < etext; myva += PAGE_SIZE) {
1014 pt_entry_t *ptep;
1015
1016 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1017 if (ptep)
316670eb 1018 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE);
6d2010ae
A
1019 }
1020 }
1021
1022 if (!kernel_text_ps_4K) {
1023 vm_offset_t myva;
1024
1025 /*
1026 * Release zero-filled page padding used for 2M-alignment.
1027 */
1028 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1029 (void *) eHIB, (void *) (stext - eHIB));
1030 ml_static_mfree(eHIB, stext - eHIB);
1031 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1032 (void *) etext, (void *) (sdata - etext));
1033 ml_static_mfree(etext, sdata - etext);
1034
1035 /*
1036 * Coalesce text pages into large pages.
1037 */
1038 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
1039 pt_entry_t *ptep;
1040 vm_offset_t pte_phys;
1041 pt_entry_t *pdep;
1042 pt_entry_t pde;
1043
1044 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
1045 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1046 DBG("myva: %p pdep: %p ptep: %p\n",
1047 (void *) myva, (void *) pdep, (void *) ptep);
1048 if ((*ptep & INTEL_PTE_VALID) == 0)
1049 continue;
1050 pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
1051 pde = *pdep & PTMASK; /* page attributes from pde */
1052 pde |= INTEL_PTE_PS; /* make it a 2M entry */
1053 pde |= pte_phys; /* take page frame from pte */
1054
1055 if (wpkernel)
316670eb 1056 pde &= ~INTEL_PTE_WRITE;
6d2010ae
A
1057 DBG("pmap_store_pte(%p,0x%llx)\n",
1058 (void *)pdep, pde);
1059 pmap_store_pte(pdep, pde);
1060
1061 /*
1062 * Free the now-unused level-1 pte.
1063 * Note: ptep is a virtual address to the pte in the
1064 * recursive map. We can't use this address to free
1065 * the page. Instead we need to compute its address
1066 * in the Idle PTEs in "low memory".
1067 */
1068 vm_offset_t vm_ptep = (vm_offset_t) KPTphys
1069 + (pte_phys >> PTPGSHIFT);
1070 DBG("ml_static_mfree(%p,0x%x) for pte\n",
1071 (void *) vm_ptep, PAGE_SIZE);
1072 ml_static_mfree(vm_ptep, PAGE_SIZE);
1073 }
1074
1075 /* Change variable read by sysctl machdep.pmap */
1076 pmap_kernel_text_ps = I386_LPGBYTES;
1077 }
1078
316670eb 1079 boolean_t doconstro = TRUE;
39037602 1080#if DEVELOPMENT || DEBUG
316670eb 1081 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
39037602
A
1082#endif
1083 if (doconstro) {
1084 if (sconst & PAGE_MASK) {
1085 panic("CONST segment misaligned 0x%lx 0x%lx\n",
1086 sconst, econst);
1087 }
316670eb 1088 kprintf("Marking const DATA read-only\n");
39037602
A
1089 }
1090
316670eb
A
1091 vm_offset_t dva;
1092
1093 for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
1094 assert(((sdata | edata) & PAGE_MASK) == 0);
39037602 1095 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
316670eb 1096
39037602
A
1097 dpte = *dptep;
1098 assert((dpte & INTEL_PTE_VALID));
1099 dpte |= INTEL_PTE_NX;
1100 pmap_store_pte(dptep, dpte);
1101 dataptes++;
1102 }
1103 assert(dataptes > 0);
1104
1105 for (dva = sconst; dva < econst; dva += I386_PGBYTES) {
316670eb
A
1106 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1107
1108 dpte = *dptep;
1109
1110 assert((dpte & INTEL_PTE_VALID));
316670eb 1111 dpte |= INTEL_PTE_NX;
39037602
A
1112 dpte &= ~INTEL_PTE_WRITE;
1113 constptes++;
316670eb
A
1114 pmap_store_pte(dptep, dpte);
1115 }
39037602
A
1116
1117 assert(constptes > 0);
1118
316670eb
A
1119 kernel_segment_command_t * seg;
1120 kernel_section_t * sec;
1121
1122 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
1123 if (!strcmp(seg->segname, "__TEXT") ||
1124 !strcmp(seg->segname, "__DATA")) {
1125 continue;
1126 }
1127 //XXX
1128 if (!strcmp(seg->segname, "__KLD")) {
1129 continue;
1130 }
1131 if (!strcmp(seg->segname, "__HIB")) {
1132 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
1133 if (sec->addr & PAGE_MASK)
1134 panic("__HIB segment's sections misaligned");
1135 if (!strcmp(sec->sectname, "__text")) {
1136 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
1137 } else {
1138 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
1139 }
1140 }
1141 } else {
1142 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
1143 }
1144 }
1145
1146 /*
1147 * If we're debugging, map the low global vector page at the fixed
1148 * virtual address. Otherwise, remove the mapping for this.
1149 */
1150 if (debug_boot_arg) {
1151 pt_entry_t *pte = NULL;
1152 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS)))
1153 panic("lowmem pte");
1154 /* make sure it is defined on page boundary */
1155 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
1156 pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)
1157 | INTEL_PTE_REF
1158 | INTEL_PTE_MOD
1159 | INTEL_PTE_WIRED
1160 | INTEL_PTE_VALID
1161 | INTEL_PTE_WRITE
1162 | INTEL_PTE_NX);
1163 } else {
1164 pmap_remove(kernel_pmap,
1165 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
1166 }
1167
6d2010ae
A
1168 splx(spl);
1169 if (pmap_pcid_ncpus)
1170 tlb_flush_global();
1171 else
1172 flush_tlb_raw();
1173}
b0d623f7
A
1174
/*
 * This function is only used for debugging from the VM layer.
 */
1178boolean_t
1179pmap_verify_free(
1180 ppnum_t pn)
1181{
1182 pv_rooted_entry_t pv_h;
1183 int pai;
1184 boolean_t result;
1185
1186 assert(pn != vm_page_fictitious_addr);
1187
1188 if (!pmap_initialized)
1189 return(TRUE);
1190
1191 if (pn == vm_page_guard_addr)
1192 return TRUE;
1193
1194 pai = ppn_to_pai(pn);
1195 if (!IS_MANAGED_PAGE(pai))
1196 return(FALSE);
1197 pv_h = pai_to_pvh(pn);
1198 result = (pv_h->pmap == PMAP_NULL);
1199 return(result);
1200}
1201
1202boolean_t
1203pmap_is_empty(
1204 pmap_t pmap,
1205 vm_map_offset_t va_start,
1206 vm_map_offset_t va_end)
1207{
1208 vm_map_offset_t offset;
1209 ppnum_t phys_page;
1210
1211 if (pmap == PMAP_NULL) {
1212 return TRUE;
1213 }
1214
1215 /*
1216 * Check the resident page count
1217 * - if it's zero, the pmap is completely empty.
1218 * This short-circuit test prevents a virtual address scan which is
1219 * painfully slow for 64-bit spaces.
	 * This assumes the count is correct; the debug kernel ought to
	 * verify it, perhaps by a page table walk.
1222 */
1223 if (pmap->stats.resident_count == 0)
1224 return TRUE;
1225
1226 for (offset = va_start;
1227 offset < va_end;
1228 offset += PAGE_SIZE_64) {
1229 phys_page = pmap_find_phys(pmap, offset);
1230 if (phys_page) {
1231 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1232 "page %d at 0x%llx\n",
1233 pmap, va_start, va_end, phys_page, offset);
1234 return FALSE;
1235 }
1236 }
1237
1238 return TRUE;
1239}
1240
3e170ce0
A
1241void
1242hv_ept_pmap_create(void **ept_pmap, void **eptp)
1243{
1244 pmap_t p;
1245
1246 if ((ept_pmap == NULL) || (eptp == NULL)) {
1247 return;
1248 }
1249
1250 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1251 if (p == PMAP_NULL) {
1252 *ept_pmap = NULL;
1253 *eptp = NULL;
1254 return;
1255 }
1256
1257 assert(is_ept_pmap(p));
1258
1259 *ept_pmap = (void*)p;
1260 *eptp = (void*)(p->pm_eptp);
1261 return;
1262}
b0d623f7
A
1263
1264/*
1265 * Create and return a physical map.
1266 *
1267 * If the size specified for the map
1268 * is zero, the map is an actual physical
1269 * map, and may be referenced by the
1270 * hardware.
1271 *
1272 * If the size specified is non-zero,
1273 * the map will be used in software only, and
1274 * is bounded by that size.
1275 */
1276pmap_t
3e170ce0
A
1277pmap_create_options(
1278 ledger_t ledger,
1279 vm_map_size_t sz,
1280 int flags)
b0d623f7
A
1281{
1282 pmap_t p;
1283 vm_size_t size;
1284 pml4_entry_t *pml4;
1285 pml4_entry_t *kpml4;
1286
1287 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
3e170ce0 1288 (uint32_t) (sz>>32), (uint32_t) sz, flags, 0, 0);
b0d623f7
A
1289
1290 size = (vm_size_t) sz;
1291
1292 /*
1293 * A software use-only map doesn't even need a map.
1294 */
1295
1296 if (size != 0) {
1297 return(PMAP_NULL);
1298 }
1299
3e170ce0
A
1300 /*
1301 * Return error when unrecognized flags are passed.
1302 */
1303 if ((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0) {
1304 return(PMAP_NULL);
1305 }
1306
b0d623f7
A
1307 p = (pmap_t) zalloc(pmap_zone);
1308 if (PMAP_NULL == p)
1309 panic("pmap_create zalloc");
6d2010ae
A
1310 /* Zero all fields */
1311 bzero(p, sizeof(*p));
b0d623f7
A
1312 /* init counts now since we'll be bumping some */
1313 simple_lock_init(&p->lock, 0);
39236c6e 1314#if 00
b0d623f7
A
1315 p->stats.resident_count = 0;
1316 p->stats.resident_max = 0;
1317 p->stats.wired_count = 0;
39236c6e
A
1318#else
1319 bzero(&p->stats, sizeof (p->stats));
1320#endif
b0d623f7
A
1321 p->ref_count = 1;
1322 p->nx_enabled = 1;
1323 p->pm_shared = FALSE;
316670eb
A
1324 ledger_reference(ledger);
1325 p->ledger = ledger;
b0d623f7 1326
3e170ce0 1327 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
39037602
A
1328
1329 p->pagezero_accessible = FALSE;
1330
1331 if (pmap_pcid_ncpus) {
6d2010ae 1332 pmap_pcid_initialize(p);
39037602 1333 }
316670eb 1334
6d2010ae 1335 p->pm_pml4 = zalloc(pmap_anchor_zone);
b0d623f7 1336
6d2010ae 1337 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
b0d623f7 1338
6d2010ae 1339 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
b0d623f7 1340
3e170ce0 1341 if (flags & PMAP_CREATE_EPT) {
7e41aa88 1342 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
3e170ce0
A
1343 p->pm_cr3 = 0;
1344 } else {
1345 p->pm_eptp = 0;
1346 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1347 }
b0d623f7
A
1348
1349 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1350
39236c6e 1351 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) * PAGE_SIZE);
b0d623f7
A
1352 if (NULL == p->pm_obj_pml4)
1353 panic("pmap_create pdpt obj");
1354
39236c6e 1355 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) * PAGE_SIZE);
b0d623f7
A
1356 if (NULL == p->pm_obj_pdpt)
1357 panic("pmap_create pdpt obj");
1358
39236c6e 1359 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) * PAGE_SIZE);
b0d623f7
A
1360 if (NULL == p->pm_obj)
1361 panic("pmap_create pte obj");
1362
490019cf
A
1363 if (!(flags & PMAP_CREATE_EPT)) {
1364 /* All host pmaps share the kernel's pml4 */
1365 pml4 = pmap64_pml4(p, 0ULL);
1366 kpml4 = kernel_pmap->pm_pml4;
1367 pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX];
1368 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1369 pml4[KERNEL_PHYSMAP_PML4_INDEX] = kpml4[KERNEL_PHYSMAP_PML4_INDEX];
1370 }
b0d623f7 1371
39037602
A
1372#if MACH_ASSERT
1373 p->pmap_pid = 0;
1374 strlcpy(p->pmap_procname, "<nil>", sizeof (p->pmap_procname));
1375#endif /* MACH_ASSERT */
1376
b0d623f7 1377 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
3e170ce0 1378 p, flags, 0, 0, 0);
b0d623f7
A
1379
1380 return(p);
1381}
1382
3e170ce0
A
1383pmap_t
1384pmap_create(
1385 ledger_t ledger,
1386 vm_map_size_t sz,
1387 boolean_t is_64bit)
1388{
1389 return pmap_create_options(ledger, sz, ((is_64bit) ? PMAP_CREATE_64BIT : 0));
1390}
1391
1392/*
1393 * We maintain stats and ledgers so that a task's physical footprint is:
1394 * phys_footprint = ((internal - alternate_accounting)
1395 * + (internal_compressed - alternate_accounting_compressed)
1396 * + iokit_mapped
1397 * + purgeable_nonvolatile
1398 * + purgeable_nonvolatile_compressed
1399 * + page_table)
1400 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1401 */
1402
1403#if MACH_ASSERT
1404struct {
1405 uint64_t num_pmaps_checked;
1406
1407 int phys_footprint_over;
1408 ledger_amount_t phys_footprint_over_total;
1409 ledger_amount_t phys_footprint_over_max;
1410 int phys_footprint_under;
1411 ledger_amount_t phys_footprint_under_total;
1412 ledger_amount_t phys_footprint_under_max;
1413
1414 int internal_over;
1415 ledger_amount_t internal_over_total;
1416 ledger_amount_t internal_over_max;
1417 int internal_under;
1418 ledger_amount_t internal_under_total;
1419 ledger_amount_t internal_under_max;
1420
1421 int internal_compressed_over;
1422 ledger_amount_t internal_compressed_over_total;
1423 ledger_amount_t internal_compressed_over_max;
1424 int internal_compressed_under;
1425 ledger_amount_t internal_compressed_under_total;
1426 ledger_amount_t internal_compressed_under_max;
1427
1428 int iokit_mapped_over;
1429 ledger_amount_t iokit_mapped_over_total;
1430 ledger_amount_t iokit_mapped_over_max;
1431 int iokit_mapped_under;
1432 ledger_amount_t iokit_mapped_under_total;
1433 ledger_amount_t iokit_mapped_under_max;
1434
1435 int alternate_accounting_over;
1436 ledger_amount_t alternate_accounting_over_total;
1437 ledger_amount_t alternate_accounting_over_max;
1438 int alternate_accounting_under;
1439 ledger_amount_t alternate_accounting_under_total;
1440 ledger_amount_t alternate_accounting_under_max;
1441
1442 int alternate_accounting_compressed_over;
1443 ledger_amount_t alternate_accounting_compressed_over_total;
1444 ledger_amount_t alternate_accounting_compressed_over_max;
1445 int alternate_accounting_compressed_under;
1446 ledger_amount_t alternate_accounting_compressed_under_total;
1447 ledger_amount_t alternate_accounting_compressed_under_max;
1448
1449 int page_table_over;
1450 ledger_amount_t page_table_over_total;
1451 ledger_amount_t page_table_over_max;
1452 int page_table_under;
1453 ledger_amount_t page_table_under_total;
1454 ledger_amount_t page_table_under_max;
1455
1456 int purgeable_volatile_over;
1457 ledger_amount_t purgeable_volatile_over_total;
1458 ledger_amount_t purgeable_volatile_over_max;
1459 int purgeable_volatile_under;
1460 ledger_amount_t purgeable_volatile_under_total;
1461 ledger_amount_t purgeable_volatile_under_max;
1462
1463 int purgeable_nonvolatile_over;
1464 ledger_amount_t purgeable_nonvolatile_over_total;
1465 ledger_amount_t purgeable_nonvolatile_over_max;
1466 int purgeable_nonvolatile_under;
1467 ledger_amount_t purgeable_nonvolatile_under_total;
1468 ledger_amount_t purgeable_nonvolatile_under_max;
1469
1470 int purgeable_volatile_compressed_over;
1471 ledger_amount_t purgeable_volatile_compressed_over_total;
1472 ledger_amount_t purgeable_volatile_compressed_over_max;
1473 int purgeable_volatile_compressed_under;
1474 ledger_amount_t purgeable_volatile_compressed_under_total;
1475 ledger_amount_t purgeable_volatile_compressed_under_max;
1476
1477 int purgeable_nonvolatile_compressed_over;
1478 ledger_amount_t purgeable_nonvolatile_compressed_over_total;
1479 ledger_amount_t purgeable_nonvolatile_compressed_over_max;
1480 int purgeable_nonvolatile_compressed_under;
1481 ledger_amount_t purgeable_nonvolatile_compressed_under_total;
1482 ledger_amount_t purgeable_nonvolatile_compressed_under_max;
1483} pmap_ledgers_drift;
1484static void pmap_check_ledgers(pmap_t pmap);
1485#else /* MACH_ASSERT */
1486static inline void pmap_check_ledgers(__unused pmap_t pmap) {}
1487#endif /* MACH_ASSERT */
1488
b0d623f7
A
1489/*
1490 * Retire the given physical map from service.
1491 * Should only be called if the map contains
1492 * no valid mappings.
1493 */
3e170ce0 1494extern int vm_wired_objects_page_count;
b0d623f7
A
1495
1496void
6d2010ae 1497pmap_destroy(pmap_t p)
b0d623f7 1498{
6d2010ae 1499 int c;
b0d623f7
A
1500
1501 if (p == PMAP_NULL)
1502 return;
1503
1504 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1505 p, 0, 0, 0, 0);
1506
1507 PMAP_LOCK(p);
1508
1509 c = --p->ref_count;
1510
6d2010ae
A
1511 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1512
b0d623f7
A
1513 if (c == 0) {
1514 /*
1515 * If some cpu is not using the physical pmap pointer that it
1516 * is supposed to be (see set_dirbase), we might be using the
1517 * pmap that is being destroyed! Make sure we are
1518 * physically on the right pmap:
1519 */
1520 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
ebb1b9f4
A
1521 if (pmap_pcid_ncpus)
1522 pmap_destroy_pcid_sync(p);
b0d623f7 1523 }
ebb1b9f4 1524
b0d623f7
A
1525 PMAP_UNLOCK(p);
1526
1527 if (c != 0) {
1528 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1529 p, 1, 0, 0, 0);
6d2010ae 1530 pmap_assert(p == kernel_pmap);
b0d623f7
A
1531 return; /* still in use */
1532 }
1533
1534 /*
1535 * Free the memory maps, then the
1536 * pmap structure.
1537 */
1538 int inuse_ptepages = 0;
1539
6d2010ae 1540 zfree(pmap_anchor_zone, p->pm_pml4);
b0d623f7
A
1541
1542 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1543 vm_object_deallocate(p->pm_obj_pml4);
1544
1545 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1546 vm_object_deallocate(p->pm_obj_pdpt);
1547
1548 inuse_ptepages += p->pm_obj->resident_page_count;
1549 vm_object_deallocate(p->pm_obj);
1550
1551 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
316670eb 1552 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
39037602
A
1553
1554 pmap_check_ledgers(p);
316670eb 1555 ledger_dereference(p->ledger);
b0d623f7
A
1556 zfree(pmap_zone, p);
1557
1558 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1559 0, 0, 0, 0, 0);
1560}
1561
1562/*
1563 * Add a reference to the specified pmap.
1564 */
1565
1566void
1567pmap_reference(pmap_t p)
1568{
1569 if (p != PMAP_NULL) {
1570 PMAP_LOCK(p);
1571 p->ref_count++;
		PMAP_UNLOCK(p);
1573 }
1574}
1575
b0d623f7
A
1576/*
1577 * Remove phys addr if mapped in specified map
1578 *
1579 */
1580void
1581pmap_remove_some_phys(
1582 __unused pmap_t map,
1583 __unused ppnum_t pn)
1584{
1585
1586/* Implement to support working set code */
1587
1588}
1589
39236c6e
A
1590
1591void
1592pmap_protect(
1593 pmap_t map,
1594 vm_map_offset_t sva,
1595 vm_map_offset_t eva,
1596 vm_prot_t prot)
1597{
1598 pmap_protect_options(map, sva, eva, prot, 0, NULL);
1599}
1600
1601
b0d623f7
A
1602/*
1603 * Set the physical protection on the
1604 * specified range of this map as requested.
1605 * Will not increase permissions.
1606 */
1607void
39236c6e 1608pmap_protect_options(
b0d623f7
A
1609 pmap_t map,
1610 vm_map_offset_t sva,
1611 vm_map_offset_t eva,
39236c6e
A
1612 vm_prot_t prot,
1613 unsigned int options,
1614 void *arg)
b0d623f7
A
1615{
1616 pt_entry_t *pde;
1617 pt_entry_t *spte, *epte;
1618 vm_map_offset_t lva;
1619 vm_map_offset_t orig_sva;
1620 boolean_t set_NX;
1621 int num_found = 0;
3e170ce0 1622 boolean_t is_ept;
b0d623f7
A
1623
1624 pmap_intr_assert();
1625
1626 if (map == PMAP_NULL)
1627 return;
1628
1629 if (prot == VM_PROT_NONE) {
39236c6e 1630 pmap_remove_options(map, sva, eva, options);
b0d623f7
A
1631 return;
1632 }
1633 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1634 map,
1635 (uint32_t) (sva >> 32), (uint32_t) sva,
1636 (uint32_t) (eva >> 32), (uint32_t) eva);
1637
1638 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
1639 set_NX = FALSE;
1640 else
1641 set_NX = TRUE;
1642
3e170ce0
A
1643 is_ept = is_ept_pmap(map);
1644
1645
b0d623f7
A
1646 PMAP_LOCK(map);
1647
1648 orig_sva = sva;
1649 while (sva < eva) {
1650 lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
1651 if (lva > eva)
1652 lva = eva;
1653 pde = pmap_pde(map, sva);
3e170ce0
A
1654 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1655 if (*pde & PTE_PS) {
b0d623f7
A
1656 /* superpage */
1657 spte = pde;
1658 epte = spte+1; /* excluded */
1659 } else {
1660 spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
1661 spte = &spte[ptenum(sva)];
1662 epte = &spte[intel_btop(lva - sva)];
1663 }
1664
1665 for (; spte < epte; spte++) {
3e170ce0 1666 if (!(*spte & PTE_VALID_MASK(is_ept)))
b0d623f7
A
1667 continue;
1668
3e170ce0
A
1669 if (is_ept) {
1670 if (prot & VM_PROT_READ)
1671 pmap_update_pte(spte, 0, PTE_READ(is_ept));
1672 else
1673 pmap_update_pte(spte, PTE_READ(is_ept), 0);
1674 }
b0d623f7 1675 if (prot & VM_PROT_WRITE)
3e170ce0 1676 pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
b0d623f7 1677 else
3e170ce0 1678 pmap_update_pte(spte, PTE_WRITE(is_ept), 0);
b0d623f7 1679
3e170ce0
A
1680 if (set_NX) {
1681 if (!is_ept)
1682 pmap_update_pte(spte, 0, INTEL_PTE_NX);
1683 else
1684 pmap_update_pte(spte, INTEL_EPT_EX, 0);
1685 } else {
1686 if (!is_ept)
1687 pmap_update_pte(spte, INTEL_PTE_NX, 0);
1688 else
1689 pmap_update_pte(spte, 0, INTEL_EPT_EX);
1690 }
b0d623f7
A
1691 num_found++;
1692 }
1693 }
1694 sva = lva;
1695 }
39236c6e
A
1696 if (num_found) {
1697 if (options & PMAP_OPTIONS_NOFLUSH)
1698 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1699 else
1700 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1701 }
b0d623f7
A
1702 PMAP_UNLOCK(map);
1703
1704 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
1705 0, 0, 0, 0, 0);
1706
1707}
1708
1709/* Map a (possibly) autogenned block */
1710void
1711pmap_map_block(
1712 pmap_t pmap,
1713 addr64_t va,
1714 ppnum_t pa,
1715 uint32_t size,
1716 vm_prot_t prot,
1717 int attr,
1718 __unused unsigned int flags)
1719{
1720 uint32_t page;
1721 int cur_page_size;
1722
1723 if (attr & VM_MEM_SUPERPAGE)
1724 cur_page_size = SUPERPAGE_SIZE;
1725 else
1726 cur_page_size = PAGE_SIZE;
1727
1728 for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) {
316670eb 1729 pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
1730 va += cur_page_size;
1731 pa+=cur_page_size/PAGE_SIZE;
1732 }
1733}
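/*
 * A minimal usage sketch of pmap_map_block() (hypothetical helper and
 * values, for exposition only): map a 64KB window, i.e. 16 base pages,
 * starting at a given page number. Note that "size" is expressed in 4K
 * pages, not bytes.
 */
#if 0	/* illustrative sketch, not part of the original source */
static void
pmap_map_block_example(pmap_t pmap, addr64_t va, ppnum_t first_page)
{
	pmap_map_block(pmap, va, first_page, 16 /* pages */,
	    VM_PROT_READ | VM_PROT_WRITE, 0 /* attr */, 0 /* flags */);
}
#endif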
1734
316670eb 1735kern_return_t
b0d623f7
A
1736pmap_expand_pml4(
1737 pmap_t map,
316670eb
A
1738 vm_map_offset_t vaddr,
1739 unsigned int options)
b0d623f7
A
1740{
1741 vm_page_t m;
1742 pmap_paddr_t pa;
1743 uint64_t i;
1744 ppnum_t pn;
1745 pml4_entry_t *pml4p;
3e170ce0 1746 boolean_t is_ept = is_ept_pmap(map);
b0d623f7
A
1747
1748 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
1749
1750 /*
1751 * Allocate a VM page for the pml4 page
1752 */
316670eb
A
1753 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1754 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1755 return KERN_RESOURCE_SHORTAGE;
b0d623f7 1756 VM_PAGE_WAIT();
316670eb 1757 }
b0d623f7
A
1758 /*
1759 * put the page into the pmap's obj list so it
1760 * can be found later.
1761 */
39037602 1762 pn = VM_PAGE_GET_PHYS_PAGE(m);
b0d623f7
A
1763 pa = i386_ptob(pn);
1764 i = pml4idx(map, vaddr);
1765
1766 /*
1767 * Zero the page.
1768 */
1769 pmap_zero_page(pn);
1770
1771 vm_page_lockspin_queues();
3e170ce0 1772 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
b0d623f7
A
1773 vm_page_unlock_queues();
1774
1775 OSAddAtomic(1, &inuse_ptepages_count);
6d2010ae 1776 OSAddAtomic64(1, &alloc_ptepages_count);
316670eb 1777 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1778
1779	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1780 vm_object_lock(map->pm_obj_pml4);
1781
1782 PMAP_LOCK(map);
1783 /*
1784 * See if someone else expanded us first
1785 */
1786 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
1787 PMAP_UNLOCK(map);
1788 vm_object_unlock(map->pm_obj_pml4);
1789
1790 VM_PAGE_FREE(m);
1791
1792 OSAddAtomic(-1, &inuse_ptepages_count);
1793 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1794 return KERN_SUCCESS;
1795 }
1796
1797#if 0 /* DEBUG */
39236c6e 1798 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
1799 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1800 map, map->pm_obj_pml4, vaddr, i);
1801 }
1802#endif
3e170ce0 1803 vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1804 vm_object_unlock(map->pm_obj_pml4);
1805
1806 /*
1807 * Set the page directory entry for this page table.
1808 */
1809 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
1810
1811 pmap_store_pte(pml4p, pa_to_pte(pa)
1812 | PTE_READ(is_ept)
1813 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1814 | PTE_WRITE(is_ept));
1815
1816 PMAP_UNLOCK(map);
1817
316670eb 1818 return KERN_SUCCESS;
1819}
1820
1821kern_return_t
1822pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
1823{
1824 vm_page_t m;
1825 pmap_paddr_t pa;
1826 uint64_t i;
1827 ppnum_t pn;
1828 pdpt_entry_t *pdptp;
3e170ce0 1829 boolean_t is_ept = is_ept_pmap(map);
1830
1831 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
1832
1833 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
1834 kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
1835 if (pep4kr != KERN_SUCCESS)
1836 return pep4kr;
1837 }
1838
1839 /*
1840 * Allocate a VM page for the pdpt page
1841 */
1842 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1843 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1844 return KERN_RESOURCE_SHORTAGE;
b0d623f7 1845 VM_PAGE_WAIT();
316670eb 1846 }
1847
1848 /*
1849 * put the page into the pmap's obj list so it
1850 * can be found later.
1851 */
39037602 1852 pn = VM_PAGE_GET_PHYS_PAGE(m);
1853 pa = i386_ptob(pn);
1854 i = pdptidx(map, vaddr);
1855
1856 /*
1857 * Zero the page.
1858 */
1859 pmap_zero_page(pn);
1860
1861 vm_page_lockspin_queues();
3e170ce0 1862 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1863 vm_page_unlock_queues();
1864
1865 OSAddAtomic(1, &inuse_ptepages_count);
6d2010ae 1866 OSAddAtomic64(1, &alloc_ptepages_count);
316670eb 1867 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1868
1869	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1870 vm_object_lock(map->pm_obj_pdpt);
1871
1872 PMAP_LOCK(map);
1873 /*
1874 * See if someone else expanded us first
1875 */
1876 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
1877 PMAP_UNLOCK(map);
1878 vm_object_unlock(map->pm_obj_pdpt);
1879
1880 VM_PAGE_FREE(m);
1881
1882 OSAddAtomic(-1, &inuse_ptepages_count);
1883 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1884 return KERN_SUCCESS;
1885 }
1886
1887#if 0 /* DEBUG */
39236c6e 1888 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
1889 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1890 map, map->pm_obj_pdpt, vaddr, i);
1891 }
1892#endif
3e170ce0 1893 vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1894 vm_object_unlock(map->pm_obj_pdpt);
1895
1896 /*
1897 * Set the page directory entry for this page table.
1898 */
1899 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
1900
1901 pmap_store_pte(pdptp, pa_to_pte(pa)
1902 | PTE_READ(is_ept)
1903 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1904 | PTE_WRITE(is_ept));
1905
1906 PMAP_UNLOCK(map);
1907
316670eb 1908 return KERN_SUCCESS;
1909
1910}
1911
1912
1913
1914/*
1915 * Routine: pmap_expand
1916 *
1917 * Expands a pmap to be able to map the specified virtual address.
1918 *
1919 * Allocates and wires a new page-table page, enters it in the
1920 * pmap's page-table object so it can be found (and freed) later,
1921 * and points the corresponding page-directory entry at it.
1922 *
1923 * Must be called with the pmap system and the pmap unlocked,
1924 * since the page allocation may block (VM_PAGE_WAIT).
1925 * Thus it must be called in a loop that checks whether the map
1926 * has been expanded enough.
1927 * (We won't loop forever, since page tables aren't shrunk.)
1928 */
316670eb 1929kern_return_t
1930pmap_expand(
1931 pmap_t map,
1932 vm_map_offset_t vaddr,
1933 unsigned int options)
1934{
1935 pt_entry_t *pdp;
1936 vm_page_t m;
1937 pmap_paddr_t pa;
1938 uint64_t i;
1939 ppnum_t pn;
3e170ce0 1940 boolean_t is_ept = is_ept_pmap(map);
1941
1942
1943 /*
1944 * For the kernel, the virtual address must be in or above the basement,
1945 * which is for kexts and is in the 512GB immediately below the kernel.
1946 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
1947 */
1948 if (map == kernel_pmap &&
1949 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))
1950 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
1951
1952
1953 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
1954 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
1955 if (pepkr != KERN_SUCCESS)
1956 return pepkr;
1957 }
1958
1959 /*
1960 * Allocate a VM page for the pde entries.
1961 */
1962 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1963 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1964 return KERN_RESOURCE_SHORTAGE;
b0d623f7 1965 VM_PAGE_WAIT();
316670eb 1966 }
1967
1968 /*
1969 * put the page into the pmap's obj list so it
1970 * can be found later.
1971 */
39037602 1972 pn = VM_PAGE_GET_PHYS_PAGE(m);
1973 pa = i386_ptob(pn);
1974 i = pdeidx(map, vaddr);
1975
1976 /*
1977 * Zero the page.
1978 */
1979 pmap_zero_page(pn);
1980
1981 vm_page_lockspin_queues();
3e170ce0 1982 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1983 vm_page_unlock_queues();
1984
1985 OSAddAtomic(1, &inuse_ptepages_count);
6d2010ae 1986 OSAddAtomic64(1, &alloc_ptepages_count);
316670eb 1987 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1988
1989	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1990 vm_object_lock(map->pm_obj);
1991
1992 PMAP_LOCK(map);
1993
1994 /*
1995 * See if someone else expanded us first
1996 */
1997 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
1998 PMAP_UNLOCK(map);
1999 vm_object_unlock(map->pm_obj);
2000
2001 VM_PAGE_FREE(m);
2002
2003 OSAddAtomic(-1, &inuse_ptepages_count);
2004 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2005 return KERN_SUCCESS;
2006 }
2007
2008#if 0 /* DEBUG */
39236c6e 2009 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
2010 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2011 map, map->pm_obj, vaddr, i);
2012 }
2013#endif
3e170ce0 2014 vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2015 vm_object_unlock(map->pm_obj);
2016
2017 /*
2018 * Set the page directory entry for this page table.
2019 */
2020 pdp = pmap_pde(map, vaddr);
2021 pmap_store_pte(pdp, pa_to_pte(pa)
2022 | PTE_READ(is_ept)
2023 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2024 | PTE_WRITE(is_ept));
2025
2026 PMAP_UNLOCK(map);
2027
316670eb 2028 return KERN_SUCCESS;
2029}
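/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * honoring the contract described above -- expand until the PTE slot for
 * 'vaddr' exists, re-checking in a loop because the pmap is unlocked while
 * the new page-table page is allocated.
 */
#if 0	/* illustrative example, not compiled */
static kern_return_t
demo_expand_until_mapped(pmap_t map, vm_map_offset_t vaddr)
{
	while (pmap_pte(map, vaddr) == PT_ENTRY_NULL) {
		kern_return_t kr = pmap_expand(map, vaddr, 0 /* no options */);
		if (kr != KERN_SUCCESS)
			return kr;	/* e.g. KERN_RESOURCE_SHORTAGE under NOWAIT */
	}
	return KERN_SUCCESS;
}
#endif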
2030
2031/* On K64 machines with more than 32GB of memory, pmap_steal_memory
2032 * will allocate past the 1GB of pre-expanded virtual kernel area. This
2033 * function allocates all the page tables using memory from the same pool
2034 * that pmap_steal_memory uses, rather than calling vm_page_grab (which
2035 * isn't available yet). */
2036void
2037pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr)
2038{
2039 ppnum_t pn;
2040 pt_entry_t *pte;
3e170ce0 2041 boolean_t is_ept = is_ept_pmap(pmap);
2042
2043 PMAP_LOCK(pmap);
2044
2045 if(pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
0b4c1975 2046 if (!pmap_next_page_hi(&pn))
2047 panic("pmap_pre_expand");
2048
2049 pmap_zero_page(pn);
2050
2051 pte = pmap64_pml4(pmap, vaddr);
2052
2053 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2054 | PTE_READ(is_ept)
2055 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2056 | PTE_WRITE(is_ept));
2057 }
2058
2059 if(pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) {
0b4c1975 2060 if (!pmap_next_page_hi(&pn))
2061 panic("pmap_pre_expand");
2062
2063 pmap_zero_page(pn);
2064
2065 pte = pmap64_pdpt(pmap, vaddr);
2066
2067 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2068 | PTE_READ(is_ept)
2069 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2070 | PTE_WRITE(is_ept));
2071 }
2072
2073 if(pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
0b4c1975 2074 if (!pmap_next_page_hi(&pn))
2075 panic("pmap_pre_expand");
2076
2077 pmap_zero_page(pn);
2078
2079 pte = pmap64_pde(pmap, vaddr);
2080
2081 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2082 | PTE_READ(is_ept)
2083 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2084 | PTE_WRITE(is_ept));
2085 }
2086
2087 PMAP_UNLOCK(pmap);
2088}
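/*
 * Illustrative sketch (not part of the original file): hypothetical early-boot
 * use of pmap_pre_expand() to populate page tables for a VA range in NBPD
 * (one pde's worth, 2MB) steps, before vm_page_grab() is usable.
 */
#if 0	/* illustrative example, not compiled */
static void
demo_pre_expand_range(vm_map_offset_t base, vm_map_offset_t len)
{
	vm_map_offset_t cur;

	for (cur = base; cur < base + len; cur += NBPD)
		pmap_pre_expand(kernel_pmap, cur);
}
#endif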
2089
2090/*
2091 * pmap_sync_page_data_phys(ppnum_t pa)
2092 *
2093 * Invalidates all of the instruction cache on a physical page and
2094 * pushes any dirty data from the data cache for the same physical page
2095 * Not required in i386.
2096 */
2097void
2098pmap_sync_page_data_phys(__unused ppnum_t pa)
2099{
2100 return;
2101}
2102
2103/*
2104 * pmap_sync_page_attributes_phys(ppnum_t pa)
2105 *
2106 * Write back and invalidate all cachelines on a physical page.
2107 */
2108void
2109pmap_sync_page_attributes_phys(ppnum_t pa)
2110{
2111 cache_flush_page_phys(pa);
2112}
2113
2114
2115
2116#ifdef CURRENTLY_UNUSED_AND_UNTESTED
2117
2118int collect_ref;
2119int collect_unref;
2120
2121/*
2122 * Routine: pmap_collect
2123 * Function:
2124 * Garbage collects the physical map system for
2125 * pages which are no longer used.
2126 * Success need not be guaranteed -- that is, there
2127 * may well be pages which are not referenced, but
2128 * others may be collected.
2129 * Usage:
2130 * Called by the pageout daemon when pages are scarce.
2131 */
2132void
2133pmap_collect(
2134 pmap_t p)
2135{
39037602 2136 pt_entry_t *pdp, *ptp;
2137 pt_entry_t *eptp;
2138 int wired;
3e170ce0 2139 boolean_t is_ept;
b0d623f7
A
2140
2141 if (p == PMAP_NULL)
2142 return;
2143
2144 if (p == kernel_pmap)
2145 return;
2146
2147 is_ept = is_ept_pmap(p);
2148
2149 /*
2150 * Garbage collect map.
2151 */
2152 PMAP_LOCK(p);
2153
2154 for (pdp = (pt_entry_t *)p->dirbase;
2155 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
2156 pdp++)
2157 {
2158 if (*pdp & PTE_VALID_MASK(is_ept)) {
2159 if (*pdp & PTE_REF(is_ept)) {
2160 pmap_store_pte(pdp, *pdp & ~PTE_REF(is_ept));
2161 collect_ref++;
2162 } else {
2163 collect_unref++;
2164 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
2165 eptp = ptp + NPTEPG;
b0d623f7 2166
2167 /*
2168 * If the pte page has any wired mappings, we cannot
2169 * free it.
2170 */
2171 wired = 0;
2172 {
39037602 2173 pt_entry_t *ptep;
2174 for (ptep = ptp; ptep < eptp; ptep++) {
2175 if (iswired(*ptep)) {
2176 wired = 1;
2177 break;
2178 }
2179 }
2180 }
2181 if (!wired) {
2182 /*
2183 * Remove the virtual addresses mapped by this pte page.
2184 */
2185 pmap_remove_range(p,
2186 pdetova(pdp - (pt_entry_t *)p->dirbase),
2187 ptp,
2188 eptp);
2189
2190 /*
2191 * Invalidate the page directory pointer.
2192 */
2193 pmap_store_pte(pdp, 0x0);
2194
2195 PMAP_UNLOCK(p);
2196
2197 /*
2198 * And free the pte page itself.
2199 */
2200 {
39037602 2201 vm_page_t m;
2202
2203 vm_object_lock(p->pm_obj);
2204
2205 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]) * PAGE_SIZE);
2206 if (m == VM_PAGE_NULL)
2207 panic("pmap_collect: pte page not in object");
2208
2209 vm_object_unlock(p->pm_obj);
2210
2211 VM_PAGE_FREE(m);
2212
2213 OSAddAtomic(-1, &inuse_ptepages_count);
2214 PMAP_ZINFO_PFREE(p, PAGE_SIZE);
2215 }
2216
2217 PMAP_LOCK(p);
2218 }
b0d623f7 2219 }
b0d623f7 2220 }
2221 }
2222
2223 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
2224 PMAP_UNLOCK(p);
2225 return;
2226}
2227#endif
2228
2229
2230void
2231pmap_copy_page(ppnum_t src, ppnum_t dst)
2232{
2233 bcopy_phys((addr64_t)i386_ptob(src),
2234 (addr64_t)i386_ptob(dst),
2235 PAGE_SIZE);
2236}
2237
2238
2239/*
2240 * Routine: pmap_pageable
2241 * Function:
2242 * Make the specified pages (by pmap, offset)
2243 * pageable (or not) as requested.
2244 *
2245 * A page which is not pageable may not take
2246 * a fault; therefore, its page table entry
2247 * must remain valid for the duration.
2248 *
2249 * This routine is merely advisory; pmap_enter
2250 * will specify that these pages are to be wired
2251 * down (or not) as appropriate.
2252 */
2253void
2254pmap_pageable(
2255 __unused pmap_t pmap,
2256 __unused vm_map_offset_t start_addr,
2257 __unused vm_map_offset_t end_addr,
2258 __unused boolean_t pageable)
2259{
2260#ifdef lint
2261 pmap++; start_addr++; end_addr++; pageable++;
2262#endif /* lint */
2263}
2264
2265void
2266invalidate_icache(__unused vm_offset_t addr,
2267 __unused unsigned cnt,
2268 __unused int phys)
2269{
2270 return;
2271}
2272
2273void
2274flush_dcache(__unused vm_offset_t addr,
2275 __unused unsigned count,
2276 __unused int phys)
2277{
2278 return;
2279}
2280
2281#if CONFIG_DTRACE
2282/*
2283 * Constrain DTrace copyin/copyout actions
2284 */
2285extern kern_return_t dtrace_copyio_preflight(addr64_t);
2286extern kern_return_t dtrace_copyio_postflight(addr64_t);
2287
2288kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
2289{
2290 thread_t thread = current_thread();
6d2010ae 2291 uint64_t ccr3;
2292 if (current_map() == kernel_map)
2293 return KERN_FAILURE;
2294 else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE))
2295 return KERN_FAILURE;
2296 else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3))
b0d623f7 2297 return KERN_FAILURE;
2298 else
2299 return KERN_SUCCESS;
2300}
2301
2302kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
2303{
2304 return KERN_SUCCESS;
2305}
2306#endif /* CONFIG_DTRACE */
2307
2308#include <mach_vm_debug.h>
2309#if MACH_VM_DEBUG
2310#include <vm/vm_debug.h>
2311
2312int
2313pmap_list_resident_pages(
2314 __unused pmap_t pmap,
2315 __unused vm_offset_t *listp,
2316 __unused int space)
2317{
2318 return 0;
2319}
2320#endif /* MACH_VM_DEBUG */
2321
2322
39037602 2323#if CONFIG_COREDUMP
2324/* temporary workaround */
2325boolean_t
2326coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
2327{
2328#if 0
2329 pt_entry_t *ptep;
2330
2331 ptep = pmap_pte(map->pmap, va);
2332 if (0 == ptep)
2333 return FALSE;
2334 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
2335#else
2336 return TRUE;
2337#endif
2338}
39037602 2339#endif
2340
2341boolean_t
2342phys_page_exists(ppnum_t pn)
2343{
2344 assert(pn != vm_page_fictitious_addr);
2345
2346 if (!pmap_initialized)
2347 return TRUE;
2348
2349 if (pn == vm_page_guard_addr)
2350 return FALSE;
2351
2352 if (!IS_MANAGED_PAGE(ppn_to_pai(pn)))
2353 return FALSE;
2354
2355 return TRUE;
2356}
2357
2358
2359
2360void
2361pmap_switch(pmap_t tpmap)
2362{
2363 spl_t s;
2364
2365 s = splhigh(); /* Make sure interruptions are disabled */
fe8ab488 2366 set_dirbase(tpmap, current_thread(), cpu_number());
2367 splx(s);
2368}
2369
2370
2371/*
2372 * disable no-execute capability on
2373 * the specified pmap
2374 */
2375void
2376pmap_disable_NX(pmap_t pmap)
2377{
2378 pmap->nx_enabled = 0;
2379}
2380
2381void
2382pt_fake_zone_init(int zone_index)
2383{
2384 pt_fake_zone_index = zone_index;
2385}
2386
2387void
2388pt_fake_zone_info(
2389 int *count,
2390 vm_size_t *cur_size,
2391 vm_size_t *max_size,
2392 vm_size_t *elem_size,
2393 vm_size_t *alloc_size,
6d2010ae 2394 uint64_t *sum_size,
b0d623f7 2395 int *collectable,
2396 int *exhaustable,
2397 int *caller_acct)
2398{
2399 *count = inuse_ptepages_count;
2400 *cur_size = PAGE_SIZE * inuse_ptepages_count;
2401 *max_size = PAGE_SIZE * (inuse_ptepages_count +
2402 vm_page_inactive_count +
2403 vm_page_active_count +
2404 vm_page_free_count);
2405 *elem_size = PAGE_SIZE;
2406 *alloc_size = PAGE_SIZE;
6d2010ae 2407 *sum_size = alloc_ptepages_count * PAGE_SIZE;
2408
2409 *collectable = 1;
2410 *exhaustable = 0;
6d2010ae 2411 *caller_acct = 1;
2412}
2413
2414
2415void
2416pmap_flush_context_init(pmap_flush_context *pfc)
2417{
2418 pfc->pfc_cpus = 0;
2419 pfc->pfc_invalid_global = 0;
2420}
2421
39037602 2422extern uint64_t TLBTimeOut;
2423void
2424pmap_flush(
2425 pmap_flush_context *pfc)
2426{
2427 unsigned int my_cpu;
2428 unsigned int cpu;
2429 unsigned int cpu_bit;
2430 cpumask_t cpus_to_respond = 0;
2431 cpumask_t cpus_to_signal = 0;
2432 cpumask_t cpus_signaled = 0;
2433 boolean_t flush_self = FALSE;
2434 uint64_t deadline;
2435
2436 mp_disable_preemption();
2437
2438 my_cpu = cpu_number();
2439 cpus_to_signal = pfc->pfc_cpus;
2440
2441 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
2442 NULL, cpus_to_signal, 0, 0, 0);
2443
2444 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
2445
2446 if (cpus_to_signal & cpu_bit) {
2447
2448 cpus_to_signal &= ~cpu_bit;
2449
2450 if (!cpu_datap(cpu)->cpu_running)
2451 continue;
2452
2453 if (pfc->pfc_invalid_global & cpu_bit)
2454 cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
2455 else
2456 cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
2457 mfence();
2458
2459 if (cpu == my_cpu) {
2460 flush_self = TRUE;
2461 continue;
2462 }
2463 if (CPU_CR3_IS_ACTIVE(cpu)) {
2464 cpus_to_respond |= cpu_bit;
2465 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2466 }
2467 }
2468 }
2469 cpus_signaled = cpus_to_respond;
2470
2471 /*
2472 * Flush local tlb if required.
2473 * Do this now to overlap with other processors responding.
2474 */
2475 if (flush_self && cpu_datap(my_cpu)->cpu_tlb_invalid != FALSE)
2476 process_pmap_updates();
2477
2478 if (cpus_to_respond) {
2479
2480 deadline = mach_absolute_time() +
2481 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2482 boolean_t is_timeout_traced = FALSE;
2483
2484 /*
2485 * Wait for those other cpus to acknowledge
2486 */
2487 while (cpus_to_respond != 0) {
2488 long orig_acks = 0;
2489
2490 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2491 /* Consider checking local/global invalidity
2492 * as appropriate in the PCID case.
2493 */
2494 if ((cpus_to_respond & cpu_bit) != 0) {
2495 if (!cpu_datap(cpu)->cpu_running ||
2496 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
2497 !CPU_CR3_IS_ACTIVE(cpu)) {
2498 cpus_to_respond &= ~cpu_bit;
2499 }
2500 cpu_pause();
2501 }
2502 if (cpus_to_respond == 0)
2503 break;
2504 }
2505 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2506 if (machine_timeout_suspended())
2507 continue;
2508 if (TLBTimeOut == 0) {
2509 if (is_timeout_traced)
2510 continue;
2511 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2512 NULL, cpus_to_signal, cpus_to_respond, 0, 0);
2513 is_timeout_traced = TRUE;
2514 continue;
2515 }
2516 pmap_tlb_flush_timeout = TRUE;
2517 orig_acks = NMIPI_acks;
fe8ab488 2518 mp_cpus_NMIPI(cpus_to_respond);
2519
2520 panic("TLB invalidation IPI timeout: "
3e170ce0 2521 "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%llx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
2522 cpus_to_respond, orig_acks, NMIPI_acks);
2523 }
2524 }
2525 }
2526 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
2527 NULL, cpus_signaled, flush_self, 0, 0);
2528
2529 mp_enable_preemption();
2530}
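/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * of the delayed-flush interface. PMAP_OPTIONS_NOFLUSH makes each call record
 * the cpus needing invalidation in the context instead of flushing, and a
 * single pmap_flush() then issues one round of shootdown IPIs.
 */
#if 0	/* illustrative example, not compiled */
static void
demo_batched_protect(pmap_t map, vm_map_offset_t s1, vm_map_offset_t e1,
    vm_map_offset_t s2, vm_map_offset_t e2)
{
	pmap_flush_context pfc;

	pmap_flush_context_init(&pfc);

	pmap_protect_options(map, s1, e1, VM_PROT_READ,
	    PMAP_OPTIONS_NOFLUSH, (void *)&pfc);
	pmap_protect_options(map, s2, e2, VM_PROT_READ,
	    PMAP_OPTIONS_NOFLUSH, (void *)&pfc);

	/* One set of TLB-shootdown IPIs covers everything queued above. */
	pmap_flush(&pfc);
}
#endif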
2531
2532
2533static void
2534invept(void *eptp)
2535{
2536 struct {
2537 uint64_t eptp;
2538 uint64_t reserved;
2539 } __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};
2540
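	/*
	 * Single-context INVEPT: invalidates the guest-physical and combined
	 * mappings derived from the EPTP named in the 16-byte descriptor
	 * above (the second quadword of the descriptor must be zero).
	 */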
2541 __asm__ volatile("invept (%%rax), %%rcx"
2542 : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
2543 : "cc", "memory");
2544}
2545
2546/*
2547 * Called with pmap locked, we:
2548 * - scan through per-cpu data to see which other cpus need to flush
2549 * - send an IPI to each non-idle cpu to be flushed
2550 * - wait for all to signal back that they are inactive or we see that
2551 * they are at a safe point (idle).
2552 * - flush the local tlb if active for this pmap
2553 * - return ... the caller will unlock the pmap
2554 */
6d2010ae 2555
b0d623f7 2556void
39236c6e 2557pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2558{
2559 unsigned int cpu;
2560 unsigned int cpu_bit;
39037602 2561 cpumask_t cpus_to_signal = 0;
2562 unsigned int my_cpu = cpu_number();
2563 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2564 boolean_t flush_self = FALSE;
2565 uint64_t deadline;
6d2010ae 2566 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
39236c6e 2567 boolean_t need_global_flush = FALSE;
fe8ab488 2568 uint32_t event_code;
4bd07ac2 2569 vm_map_offset_t event_startv, event_endv;
3e170ce0 2570 boolean_t is_ept = is_ept_pmap(pmap);
2571
2572 assert((processor_avail_count < 2) ||
2573 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2574
2575 if (pmap == kernel_pmap) {
2576 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2577 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2578 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2579 } else if (is_ept) {
2580 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2581 event_startv = startv;
2582 event_endv = endv;
2583 } else {
2584 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2585 event_startv = startv;
2586 event_endv = endv;
2587 }
2588
fe8ab488 2589 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
4bd07ac2 2590 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options, event_startv, event_endv, 0);
fe8ab488 2591
2592 if (is_ept) {
2593 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2594 goto out;
2595 }
2596
2597 /*
2598 * Scan other cpus for matching active or task CR3.
2599 * For idle cpus (with no active map) we mark them invalid but
2600 * don't signal -- they'll check as they go busy.
2601 */
6d2010ae 2602 if (pmap_pcid_ncpus) {
2603 if (pmap_is_shared)
2604 need_global_flush = TRUE;
6d2010ae 2605 pmap_pcid_invalidate_all_cpus(pmap);
39236c6e 2606 mfence();
6d2010ae 2607 }
2608 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2609 if (!cpu_datap(cpu)->cpu_running)
2610 continue;
2611 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2612 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2613
2614 if ((pmap_cr3 == cpu_task_cr3) ||
2615 (pmap_cr3 == cpu_active_cr3) ||
6d2010ae 2616 (pmap_is_shared)) {
2617
2618 if (options & PMAP_DELAY_TLB_FLUSH) {
2619 if (need_global_flush == TRUE)
2620 pfc->pfc_invalid_global |= cpu_bit;
2621 pfc->pfc_cpus |= cpu_bit;
2622
2623 continue;
2624 }
2625 if (cpu == my_cpu) {
2626 flush_self = TRUE;
2627 continue;
2628 }
39236c6e 2629 if (need_global_flush == TRUE)
2630 cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
2631 else
2632 cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
39236c6e 2633 mfence();
2634
2635 /*
2636 * We don't need to signal processors which will flush
2637 * lazily at the idle state or kernel boundary.
2638 * For example, if we're invalidating the kernel pmap,
2639 * processors currently in userspace don't need to flush
2640 * their TLBs until the next time they enter the kernel.
2641 * Alterations to the address space of a task active
2642 * on a remote processor result in a signal, to
2643 * account for copy operations. (There may be room
2644 * for optimization in such cases).
2645 * The order of the loads below with respect
2646 * to the store to the "cpu_tlb_invalid" field above
2647 * is important--hence the barrier.
2648 */
2649 if (CPU_CR3_IS_ACTIVE(cpu) &&
2650 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
2651 pmap->pm_shared ||
2652 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
2653 cpus_to_signal |= cpu_bit;
2654 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2655 }
2656 }
2657 }
39236c6e 2658 if ((options & PMAP_DELAY_TLB_FLUSH))
fe8ab488 2659 goto out;
b0d623f7 2660
2661 /*
2662 * Flush local tlb if required.
2663 * Do this now to overlap with other processors responding.
2664 */
2665 if (flush_self) {
2666 if (pmap_pcid_ncpus) {
2667 pmap_pcid_validate_cpu(pmap, my_cpu);
2668 if (pmap_is_shared)
2669 tlb_flush_global();
2670 else
2671 flush_tlb_raw();
2672 }
2673 else
2674 flush_tlb_raw();
2675 }
2676
2677 if (cpus_to_signal) {
2678 cpumask_t cpus_to_respond = cpus_to_signal;
2679
2680 deadline = mach_absolute_time() +
2681 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2682 boolean_t is_timeout_traced = FALSE;
b0d623f7 2683
2684 /*
2685 * Wait for those other cpus to acknowledge
2686 */
2687 while (cpus_to_respond != 0) {
060df5ea 2688 long orig_acks = 0;
2689
2690 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2691 /* Consider checking local/global invalidity
2692 * as appropriate in the PCID case.
2693 */
2694 if ((cpus_to_respond & cpu_bit) != 0) {
2695 if (!cpu_datap(cpu)->cpu_running ||
2696 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
2697 !CPU_CR3_IS_ACTIVE(cpu)) {
2698 cpus_to_respond &= ~cpu_bit;
2699 }
2700 cpu_pause();
2701 }
2702 if (cpus_to_respond == 0)
2703 break;
2704 }
6d2010ae 2705 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2706 if (machine_timeout_suspended())
2707 continue;
2708 if (TLBTimeOut == 0) {
2709 /* cut tracepoint but don't panic */
2710 if (is_timeout_traced)
2711 continue;
2712 PMAP_TRACE_CONSTANT(
2713 PMAP_CODE(PMAP__FLUSH_TLBS_TO),
4bd07ac2 2714 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal, cpus_to_respond, 0, 0);
2715 is_timeout_traced = TRUE;
2716 continue;
2717 }
2718 pmap_tlb_flush_timeout = TRUE;
2719 orig_acks = NMIPI_acks;
fe8ab488 2720 mp_cpus_NMIPI(cpus_to_respond);
2721
2722 panic("TLB invalidation IPI timeout: "
3e170ce0 2723 "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%llx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
2724 cpus_to_respond, orig_acks, NMIPI_acks);
2725 }
2726 }
2727 }
2728
316670eb 2729 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
2730 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
2731 }
2732
2733out:
2734 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
4bd07ac2 2735 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal, event_startv, event_endv, 0);
316670eb 2736
2737}
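/*
 * Hedged note (not part of the original file): the PMAP_UPDATE_TLBS and
 * PMAP_UPDATE_TLBS_DELAYED macros used earlier in this file are assumed to
 * reach pmap_flush_tlbs() roughly as sketched below; the real definitions
 * live in the i386 pmap headers, not here.
 */
#if 0	/* illustrative example, not compiled */
#define PMAP_UPDATE_TLBS_DEMO(pmap, s, e)			\
	pmap_flush_tlbs((pmap), (s), (e), 0, NULL)

#define PMAP_UPDATE_TLBS_DELAYED_DEMO(pmap, s, e, pfc)		\
	pmap_flush_tlbs((pmap), (s), (e), PMAP_DELAY_TLB_FLUSH, (pfc))
#endif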
2738
2739void
2740process_pmap_updates(void)
2741{
2742 int ccpu = cpu_number();
2743 pmap_assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
2744 if (pmap_pcid_ncpus) {
2745 pmap_pcid_validate_current();
2746 if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
2747 cpu_datap(ccpu)->cpu_tlb_invalid = FALSE;
2748 tlb_flush_global();
2749 }
2750 else {
2751 cpu_datap(ccpu)->cpu_tlb_invalid_local = FALSE;
2752 flush_tlb_raw();
2753 }
2754 }
2755 else {
2756 current_cpu_datap()->cpu_tlb_invalid = FALSE;
2757 flush_tlb_raw();
2758 }
b0d623f7 2759
39236c6e 2760 mfence();
2761}
2762
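/*
 * Descriptive note (added): pmap_update_interrupt() is the per-cpu handler
 * reached when another processor signals MP_TLB_FLUSH (see the
 * i386_signal_cpu() calls above); it drains whatever invalidation was noted
 * in cpu_tlb_invalid before the IPI was sent.
 */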
2763void
2764pmap_update_interrupt(void)
2765{
2766 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
2767 0, 0, 0, 0, 0);
2768
2769 if (current_cpu_datap()->cpu_tlb_invalid)
2770 process_pmap_updates();
2771
2772 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
2773 0, 0, 0, 0, 0);
2774}
2775
2776#include <mach/mach_vm.h> /* mach_vm_region_recurse() */
2777/* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
2778 * and identify ranges with mismatched VM permissions and PTE permissions
2779 */
2780kern_return_t
2781pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev) {
2782 vm_offset_t cv = sv;
2783 kern_return_t rv = KERN_SUCCESS;
2784 uint64_t skip4 = 0, skip2 = 0;
2785
2786 assert(!is_ept_pmap(ipmap));
2787
2788 sv &= ~PAGE_MASK_64;
2789 ev &= ~PAGE_MASK_64;
2790 while (cv < ev) {
2791 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
2792 (cv < 0xFFFF800000000000ULL))) {
2793 cv = 0xFFFF800000000000ULL;
2794 }
2795 /* Potential inconsistencies from not holding pmap lock
2796 * but harmless for the moment.
2797 */
2798 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
2799 if ((cv + NBPML4) > cv)
2800 cv += NBPML4;
2801 else
2802 break;
2803 skip4++;
2804 continue;
2805 }
2806 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
2807 if ((cv + NBPD) > cv)
2808 cv += NBPD;
2809 else
2810 break;
2811 skip2++;
2812 continue;
2813 }
2814
2815 pt_entry_t *ptep = pmap_pte(ipmap, cv);
2816 if (ptep && (*ptep & INTEL_PTE_VALID)) {
2817 if (*ptep & INTEL_PTE_WRITE) {
2818 if (!(*ptep & INTEL_PTE_NX)) {
2819 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap64_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
2820 rv = KERN_FAILURE;
2821 }
2822 }
2823 }
2824 cv += PAGE_SIZE;
2825 }
2826 kprintf("Completed pmap scan\n");
2827 cv = sv;
2828
2829 struct vm_region_submap_info_64 vbr;
2830 mach_msg_type_number_t vbrcount = 0;
2831 mach_vm_size_t vmsize;
2832 vm_prot_t prot;
2833 uint32_t nesting_depth = 0;
2834 kern_return_t kret;
2835
2836 while (cv < ev) {
2837
2838 for (;;) {
2839 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
2840 if((kret = mach_vm_region_recurse(ivmmap,
2841 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
2842 (vm_region_recurse_info_t)&vbr,
2843 &vbrcount)) != KERN_SUCCESS) {
2844 break;
2845 }
2846
2847 if(vbr.is_submap) {
2848 nesting_depth++;
2849 continue;
2850 } else {
2851 break;
2852 }
2853 }
2854
2855 if(kret != KERN_SUCCESS)
2856 break;
2857
2858 prot = vbr.protection;
2859
2860 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2861 kprintf("W+X map entry at address 0x%lx\n", cv);
2862 rv = KERN_FAILURE;
2863 }
2864
2865 if (prot) {
2866 vm_offset_t pcv;
2867 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
2868 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
2869 vm_prot_t tprot;
2870
2871 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID))
2872 continue;
2873 tprot = VM_PROT_READ;
2874 if (*ptep & INTEL_PTE_WRITE)
2875 tprot |= VM_PROT_WRITE;
2876 if ((*ptep & INTEL_PTE_NX) == 0)
2877 tprot |= VM_PROT_EXECUTE;
2878 if (tprot != prot) {
2879 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
2880 rv = KERN_FAILURE;
2881 }
2882 }
2883 }
2884 cv += vmsize;
2885 }
2886 return rv;
2887}
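/*
 * Illustrative sketch (not part of the original file): a hypothetical audit
 * call covering the kernel address space; the bounds shown are only one
 * reasonable choice.
 */
#if 0	/* illustrative example, not compiled */
static void
demo_audit_kernel_wx(void)
{
	kern_return_t kr;

	kr = pmap_permissions_verify(kernel_pmap, kernel_map,
	    VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS);
	if (kr != KERN_SUCCESS)
		kprintf("W+X or PTE/map permission mismatch detected\n");
}
#endif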
2888
2889#if MACH_ASSERT
2890extern int pmap_ledgers_panic;
2891static void
2892pmap_check_ledgers(
2893 pmap_t pmap)
2894{
2895 ledger_amount_t bal;
2896 int pid;
2897 char *procname;
2898 boolean_t do_panic;
2899
2900 if (pmap->pmap_pid == 0) {
2901 /*
2902 * This pmap was not or is no longer fully associated
2903 * with a task (e.g. the old pmap after a fork()/exec() or
2904 * spawn()). Its "ledger" still points at a task that is
2905 * now using a different (and active) address space, so
2906 * we can't check that all the pmap ledgers are balanced here.
2907 *
2908 * If the "pid" is set, that means that we went through
2909 * pmap_set_process() in task_terminate_internal(), so
2910 * this task's ledger should not have been re-used and
2911 * all the pmap ledgers should be back to 0.
2912 */
2913 return;
2914 }
2915
2916 do_panic = FALSE;
2917 pid = pmap->pmap_pid;
2918 procname = pmap->pmap_procname;
2919
2920 pmap_ledgers_drift.num_pmaps_checked++;
2921
2922 ledger_get_balance(pmap->ledger,
2923 task_ledgers.phys_footprint,
2924 &bal);
2925 if (bal != 0) {
2926 do_panic = TRUE;
2927 printf("LEDGER BALANCE proc %d (%s) "
2928 "\"phys_footprint\" = %lld\n",
2929 pid, procname, bal);
2930 if (bal > 0) {
2931 pmap_ledgers_drift.phys_footprint_over++;
2932 pmap_ledgers_drift.phys_footprint_over_total += bal;
2933 if (bal > pmap_ledgers_drift.phys_footprint_over_max) {
2934 pmap_ledgers_drift.phys_footprint_over_max = bal;
2935 }
2936 } else {
2937 pmap_ledgers_drift.phys_footprint_under++;
2938 pmap_ledgers_drift.phys_footprint_under_total += bal;
2939 if (bal < pmap_ledgers_drift.phys_footprint_under_max) {
2940 pmap_ledgers_drift.phys_footprint_under_max = bal;
2941 }
2942 }
2943 }
2944 ledger_get_balance(pmap->ledger,
2945 task_ledgers.internal,
2946 &bal);
2947 if (bal != 0) {
2948 do_panic = TRUE;
2949 printf("LEDGER BALANCE proc %d (%s) "
2950 "\"internal\" = %lld\n",
2951 pid, procname, bal);
2952 if (bal > 0) {
2953 pmap_ledgers_drift.internal_over++;
2954 pmap_ledgers_drift.internal_over_total += bal;
2955 if (bal > pmap_ledgers_drift.internal_over_max) {
2956 pmap_ledgers_drift.internal_over_max = bal;
2957 }
2958 } else {
2959 pmap_ledgers_drift.internal_under++;
2960 pmap_ledgers_drift.internal_under_total += bal;
2961 if (bal < pmap_ledgers_drift.internal_under_max) {
2962 pmap_ledgers_drift.internal_under_max = bal;
2963 }
2964 }
2965 }
2966 ledger_get_balance(pmap->ledger,
2967 task_ledgers.internal_compressed,
2968 &bal);
2969 if (bal != 0) {
2970 do_panic = TRUE;
2971 printf("LEDGER BALANCE proc %d (%s) "
2972 "\"internal_compressed\" = %lld\n",
2973 pid, procname, bal);
2974 if (bal > 0) {
2975 pmap_ledgers_drift.internal_compressed_over++;
2976 pmap_ledgers_drift.internal_compressed_over_total += bal;
2977 if (bal > pmap_ledgers_drift.internal_compressed_over_max) {
2978 pmap_ledgers_drift.internal_compressed_over_max = bal;
2979 }
2980 } else {
2981 pmap_ledgers_drift.internal_compressed_under++;
2982 pmap_ledgers_drift.internal_compressed_under_total += bal;
2983 if (bal < pmap_ledgers_drift.internal_compressed_under_max) {
2984 pmap_ledgers_drift.internal_compressed_under_max = bal;
2985 }
2986 }
2987 }
2988 ledger_get_balance(pmap->ledger,
2989 task_ledgers.iokit_mapped,
2990 &bal);
2991 if (bal != 0) {
2992 do_panic = TRUE;
2993 printf("LEDGER BALANCE proc %d (%s) "
2994 "\"iokit_mapped\" = %lld\n",
2995 pid, procname, bal);
2996 if (bal > 0) {
2997 pmap_ledgers_drift.iokit_mapped_over++;
2998 pmap_ledgers_drift.iokit_mapped_over_total += bal;
2999 if (bal > pmap_ledgers_drift.iokit_mapped_over_max) {
3000 pmap_ledgers_drift.iokit_mapped_over_max = bal;
3001 }
3002 } else {
3003 pmap_ledgers_drift.iokit_mapped_under++;
3004 pmap_ledgers_drift.iokit_mapped_under_total += bal;
3005 if (bal < pmap_ledgers_drift.iokit_mapped_under_max) {
3006 pmap_ledgers_drift.iokit_mapped_under_max = bal;
3007 }
3008 }
3009 }
3010 ledger_get_balance(pmap->ledger,
3011 task_ledgers.alternate_accounting,
3012 &bal);
3013 if (bal != 0) {
3014 do_panic = TRUE;
3015 printf("LEDGER BALANCE proc %d (%s) "
3016 "\"alternate_accounting\" = %lld\n",
3017 pid, procname, bal);
3018 if (bal > 0) {
3019 pmap_ledgers_drift.alternate_accounting_over++;
3020 pmap_ledgers_drift.alternate_accounting_over_total += bal;
3021 if (bal > pmap_ledgers_drift.alternate_accounting_over_max) {
3022 pmap_ledgers_drift.alternate_accounting_over_max = bal;
3023 }
3024 } else {
3025 pmap_ledgers_drift.alternate_accounting_under++;
3026 pmap_ledgers_drift.alternate_accounting_under_total += bal;
3027 if (bal < pmap_ledgers_drift.alternate_accounting_under_max) {
3028 pmap_ledgers_drift.alternate_accounting_under_max = bal;
3029 }
3030 }
3031 }
3032 ledger_get_balance(pmap->ledger,
3033 task_ledgers.alternate_accounting_compressed,
3034 &bal);
3035 if (bal != 0) {
3036 do_panic = TRUE;
3037 printf("LEDGER BALANCE proc %d (%s) "
3038 "\"alternate_accounting_compressed\" = %lld\n",
3039 pid, procname, bal);
3040 if (bal > 0) {
3041 pmap_ledgers_drift.alternate_accounting_compressed_over++;
3042 pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal;
3043 if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) {
3044 pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal;
3045 }
3046 } else {
3047 pmap_ledgers_drift.alternate_accounting_compressed_under++;
3048 pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal;
3049 if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) {
3050 pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal;
3051 }
3052 }
3053 }
3054 ledger_get_balance(pmap->ledger,
3055 task_ledgers.page_table,
3056 &bal);
3057 if (bal != 0) {
3058 do_panic = TRUE;
3059 printf("LEDGER BALANCE proc %d (%s) "
3060 "\"page_table\" = %lld\n",
3061 pid, procname, bal);
3062 if (bal > 0) {
3063 pmap_ledgers_drift.page_table_over++;
3064 pmap_ledgers_drift.page_table_over_total += bal;
3065 if (bal > pmap_ledgers_drift.page_table_over_max) {
3066 pmap_ledgers_drift.page_table_over_max = bal;
3067 }
3068 } else {
3069 pmap_ledgers_drift.page_table_under++;
3070 pmap_ledgers_drift.page_table_under_total += bal;
3071 if (bal < pmap_ledgers_drift.page_table_under_max) {
3072 pmap_ledgers_drift.page_table_under_max = bal;
3073 }
3074 }
3075 }
3076 ledger_get_balance(pmap->ledger,
3077 task_ledgers.purgeable_volatile,
3078 &bal);
3079 if (bal != 0) {
3080 do_panic = TRUE;
3081 printf("LEDGER BALANCE proc %d (%s) "
3082 "\"purgeable_volatile\" = %lld\n",
3083 pid, procname, bal);
3084 if (bal > 0) {
3085 pmap_ledgers_drift.purgeable_volatile_over++;
3086 pmap_ledgers_drift.purgeable_volatile_over_total += bal;
3087 if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) {
3088 pmap_ledgers_drift.purgeable_volatile_over_max = bal;
3089 }
3090 } else {
3091 pmap_ledgers_drift.purgeable_volatile_under++;
3092 pmap_ledgers_drift.purgeable_volatile_under_total += bal;
3093 if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) {
3094 pmap_ledgers_drift.purgeable_volatile_under_max = bal;
3095 }
3096 }
3097 }
3098 ledger_get_balance(pmap->ledger,
3099 task_ledgers.purgeable_nonvolatile,
3100 &bal);
3101 if (bal != 0) {
3102 do_panic = TRUE;
3103 printf("LEDGER BALANCE proc %d (%s) "
3104 "\"purgeable_nonvolatile\" = %lld\n",
3105 pid, procname, bal);
3106 if (bal > 0) {
3107 pmap_ledgers_drift.purgeable_nonvolatile_over++;
3108 pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal;
3109 if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) {
3110 pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal;
3111 }
3112 } else {
3113 pmap_ledgers_drift.purgeable_nonvolatile_under++;
3114 pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal;
3115 if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) {
3116 pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal;
3117 }
3118 }
3119 }
3120 ledger_get_balance(pmap->ledger,
3121 task_ledgers.purgeable_volatile_compressed,
3122 &bal);
3123 if (bal != 0) {
3124 do_panic = TRUE;
3125 printf("LEDGER BALANCE proc %d (%s) "
3126 "\"purgeable_volatile_compressed\" = %lld\n",
3127 pid, procname, bal);
3128 if (bal > 0) {
3129 pmap_ledgers_drift.purgeable_volatile_compressed_over++;
3130 pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal;
3131 if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) {
3132 pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal;
3133 }
3134 } else {
3135 pmap_ledgers_drift.purgeable_volatile_compressed_under++;
3136 pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal;
3137 if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) {
3138 pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal;
3139 }
3140 }
3141 }
3142 ledger_get_balance(pmap->ledger,
3143 task_ledgers.purgeable_nonvolatile_compressed,
3144 &bal);
3145 if (bal != 0) {
3146 do_panic = TRUE;
3147 printf("LEDGER BALANCE proc %d (%s) "
3148 "\"purgeable_nonvolatile_compressed\" = %lld\n",
3149 pid, procname, bal);
3150 if (bal > 0) {
3151 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++;
3152 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal;
3153 if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) {
3154 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal;
3155 }
3156 } else {
3157 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++;
3158 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal;
3159 if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) {
3160 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal;
3161 }
3162 }
3163 }
3164
3165 if (do_panic) {
3166 if (pmap_ledgers_panic) {
3167 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
3168 pmap, pid, procname);
3169 } else {
3170 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
3171 pmap, pid, procname);
3172 }
3173 }
3174
3175 if (pmap->stats.resident_count != 0 ||
3176 pmap->stats.wired_count != 0 ||
3177 pmap->stats.device != 0 ||
3178 pmap->stats.internal != 0 ||
3179 pmap->stats.external != 0 ||
3180 pmap->stats.reusable != 0 ||
3181 pmap->stats.compressed != 0) {
3182 if (pmap_stats_assert) {
3183 panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3184 pmap, pid, procname,
3185 pmap->stats.resident_count,
3186 pmap->stats.wired_count,
3187 pmap->stats.device,
3188 pmap->stats.internal,
3189 pmap->stats.external,
3190 pmap->stats.reusable,
3191 pmap->stats.compressed);
3192 } else {
3193 printf("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3194 pmap, pid, procname,
3195 pmap->stats.resident_count,
3196 pmap->stats.wired_count,
3197 pmap->stats.device,
3198 pmap->stats.internal,
3199 pmap->stats.external,
3200 pmap->stats.reusable,
3201 pmap->stats.compressed);
3202 }
3203 }
3204}
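/*
 * Illustrative sketch (not part of the original file): the per-ledger check
 * repeated above, factored into one hypothetical helper. Only the shared
 * get-balance/report step is shown; the per-field drift counters would still
 * need per-ledger code.
 */
#if 0	/* illustrative example, not compiled */
static void
demo_check_one_ledger(ledger_t ledger, int entry, const char *name,
    int pid, const char *procname, boolean_t *do_panic)
{
	ledger_amount_t bal;

	ledger_get_balance(ledger, entry, &bal);
	if (bal == 0)
		return;
	*do_panic = TRUE;
	printf("LEDGER BALANCE proc %d (%s) \"%s\" = %lld\n",
	    pid, procname, name, bal);
}
#endif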
3205
3206void
3207pmap_set_process(
3208 pmap_t pmap,
3209 int pid,
3210 char *procname)
3211{
3212 if (pmap == NULL)
3213 return;
3214
3215 pmap->pmap_pid = pid;
3216 strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname));
3217}
3218#endif /* MACH_ASSERT */
3219
3220
3221#if DEVELOPMENT || DEBUG
3222int pmap_pagezero_mitigation = 1;
3223#endif
3224
3225void pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound) {
3226#if DEVELOPMENT || DEBUG
3227 if (pmap_pagezero_mitigation == 0) {
3228 lpmap->pagezero_accessible = FALSE;
3229 return;
3230 }
3231#endif
3232 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3233 if (lpmap == current_pmap()) {
3234 mp_disable_preemption();
3235 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3236 mp_enable_preemption();
3237 }
3238}
3239
3240void pmap_verify_noncacheable(uintptr_t vaddr) {
3241 pt_entry_t *ptep = NULL;
3242 ptep = pmap_pte(kernel_pmap, vaddr);
3243 if (ptep == NULL) {
3244 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3245 }
3246 /* Non-cacheable OK */
3247 if (*ptep & (INTEL_PTE_NCACHE))
3248 return;
3249 /* Write-combined OK */
3250 if (*ptep & (INTEL_PTE_PTA))
3251 return;
3252 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3253}