1 /*
2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduce-protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/queue.h>
102 #include <kern/ledger.h>
103 #include <kern/mach_param.h>
104
105 #include <kern/kalloc.h>
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_kern.h>
111 #include <mach/vm_param.h>
112 #include <mach/vm_prot.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115
116 #include <mach/machine/vm_param.h>
117 #include <machine/thread.h>
118
119 #include <kern/misc_protos.h> /* prototyping */
120 #include <i386/misc_protos.h>
121 #include <i386/i386_lowmem.h>
122 #include <x86_64/lowglobals.h>
123
124 #include <i386/cpuid.h>
125 #include <i386/cpu_data.h>
126 #include <i386/cpu_number.h>
127 #include <i386/machine_cpu.h>
128 #include <i386/seg.h>
129 #include <i386/serial_io.h>
130 #include <i386/cpu_capabilities.h>
131 #include <i386/machine_routines.h>
132 #include <i386/proc_reg.h>
133 #include <i386/tsc.h>
134 #include <i386/pmap_internal.h>
135 #include <i386/pmap_pcid.h>
136 #if CONFIG_VMX
137 #include <i386/vmx/vmx_cpu.h>
138 #endif
139
140 #include <vm/vm_protos.h>
141
142 #include <i386/mp.h>
143 #include <i386/mp_desc.h>
144 #include <libkern/kernel_mach_header.h>
145
146 #include <pexpert/i386/efi.h>
147
148 #if MACH_ASSERT
149 int pmap_stats_assert = 1;
150 #endif /* MACH_ASSERT */
151
152 #ifdef IWANTTODEBUG
153 #undef DEBUG
154 #define DEBUG 1
155 #define POSTCODE_DELAY 1
156 #include <i386/postcode.h>
157 #endif /* IWANTTODEBUG */
158
159 #ifdef PMAP_DEBUG
160 #define DBG(x...) kprintf("DBG: " x)
161 #else
162 #define DBG(x...)
163 #endif
164 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
165 * in the trampolines for kernel/user boundary TLB coherency.
166 */
167 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
168 boolean_t pmap_trace = FALSE;
169
170 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
171
172 int nx_enabled = 1; /* enable no-execute protection -- set during boot */
173
174 #if DEBUG || DEVELOPMENT
175 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
176 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
177 #else /* DEBUG || DEVELOPMENT */
178 const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
179 const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
180 #endif /* DEBUG || DEVELOPMENT */
181
182 const boolean_t cpu_64bit = TRUE; /* Mais oui! */
183
184 uint64_t max_preemption_latency_tsc = 0;
185
186 pv_hashed_entry_t *pv_hash_table; /* hash lists */
187
188 uint32_t npvhashmask = 0, npvhashbuckets = 0;
189
190 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
191 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
192 decl_simple_lock_data(,pv_hashed_free_list_lock)
193 decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
194 decl_simple_lock_data(,pv_hash_table_lock)
195
196 decl_simple_lock_data(,phys_backup_lock)
197
198 zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
199
200 /*
201 * First and last physical addresses that we maintain any information
202 * for. Initialized to zero so that pmap operations done before
203 * pmap_init won't touch any non-existent structures.
204 */
205 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
206
207 static struct vm_object kptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
208 static struct vm_object kpml4obj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
209 static struct vm_object kpdptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
210
211 /*
212 * Array of physical page attributes for managed pages.
213 * One byte per physical page.
214 */
215 char *pmap_phys_attributes;
216 ppnum_t last_managed_page = 0;
217
218 /*
219 * Amount of virtual memory mapped by one
220 * page-directory entry.
221 */
222
223 uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
224
225 unsigned pmap_memory_region_count;
226 unsigned pmap_memory_region_current;
227
228 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
229
230 /*
231 * Other useful macros.
232 */
233 #define current_pmap() (vm_map_pmap(current_thread()->map))
234
235 struct pmap kernel_pmap_store;
236 pmap_t kernel_pmap;
237
238 struct zone *pmap_zone; /* zone of pmap structures */
239
240 struct zone *pmap_anchor_zone;
241 int pmap_debug = 0; /* flag for debugging prints */
242
243 unsigned int inuse_ptepages_count = 0;
244 long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
245 unsigned int bootstrap_wired_pages = 0;
246 int pt_fake_zone_index = -1;
247
248 extern long NMIPI_acks;
249
250 boolean_t kernel_text_ps_4K = TRUE;
251 boolean_t wpkernel = TRUE;
252
253 extern char end;
254
255 static int nkpt;
256
257 pt_entry_t *DMAP1, *DMAP2;
258 caddr_t DADDR1;
259 caddr_t DADDR2;
260
261 boolean_t pmap_disable_kheap_nx = FALSE;
262 boolean_t pmap_disable_kstack_nx = FALSE;
263
264 extern long __stack_chk_guard[];
265
266 static uint64_t pmap_eptp_flags = 0;
267 boolean_t pmap_ept_support_ad = FALSE;
268
269
270 /*
271 * Map memory at initialization. The physical addresses being
272 * mapped are not managed and are never unmapped.
273 *
274 * For now, VM is already on, we only need to map the
275 * specified memory.
276 */
277 vm_offset_t
278 pmap_map(
279 vm_offset_t virt,
280 vm_map_offset_t start_addr,
281 vm_map_offset_t end_addr,
282 vm_prot_t prot,
283 unsigned int flags)
284 {
285 int ps;
286
287 ps = PAGE_SIZE;
288 while (start_addr < end_addr) {
289 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
290 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
291 virt += ps;
292 start_addr += ps;
293 }
294 return(virt);
295 }
296
297 extern char *first_avail;
298 extern vm_offset_t virtual_avail, virtual_end;
299 extern pmap_paddr_t avail_start, avail_end;
300 extern vm_offset_t sHIB;
301 extern vm_offset_t eHIB;
302 extern vm_offset_t stext;
303 extern vm_offset_t etext;
304 extern vm_offset_t sdata, edata;
305 extern vm_offset_t sconst, econst;
306
307 extern void *KPTphys;
308
309 boolean_t pmap_smep_enabled = FALSE;
310 boolean_t pmap_smap_enabled = FALSE;
311
312 void
313 pmap_cpu_init(void)
314 {
315 cpu_data_t *cdp = current_cpu_datap();
316 /*
317 * Here early in the life of a processor (from cpu_mode_init()).
318 * Ensure global page feature is disabled at this point.
319 */
320
321 set_cr4(get_cr4() &~ CR4_PGE);
322
323 /*
324 * Initialize the per-cpu, TLB-related fields.
325 */
326 cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
327 cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
328 cdp->cpu_tlb_invalid = FALSE;
329 cdp->cpu_task_map = TASK_MAP_64BIT;
330 pmap_pcid_configure();
331 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
332 pmap_smep_enabled = TRUE;
333 #if DEVELOPMENT || DEBUG
334 boolean_t nsmep;
335 if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
336 pmap_smep_enabled = FALSE;
337 }
338 #endif
339 if (pmap_smep_enabled) {
340 set_cr4(get_cr4() | CR4_SMEP);
341 }
342
343 }
344 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
345 pmap_smap_enabled = TRUE;
346 #if DEVELOPMENT || DEBUG
347 boolean_t nsmap;
348 if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
349 pmap_smap_enabled = FALSE;
350 }
351 #endif
352 if (pmap_smap_enabled) {
353 set_cr4(get_cr4() | CR4_SMAP);
354 }
355 }
356
357 if (cdp->cpu_fixed_pmcs_enabled) {
358 boolean_t enable = TRUE;
359 cpu_pmc_control(&enable);
360 }
361 }
362
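/*
 * Shift count used to scale the default pv hash size with physical
 * memory: roughly one extra doubling per 2GB up to 8GB, then per 4GB
 * up to 32GB, then per 8GB, capped at 14.  pmap_bootstrap() shifts
 * NPVHASHBUCKETS by this amount when computing the default npvhashmask.
 */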
363 static uint32_t pmap_scale_shift(void) {
364 uint32_t scale = 0;
365
366 if (sane_size <= 8*GB) {
367 scale = (uint32_t)(sane_size / (2 * GB));
368 } else if (sane_size <= 32*GB) {
369 scale = 4 + (uint32_t)((sane_size - (8 * GB))/ (4 * GB));
370 } else {
371 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB))/ (8 * GB)));
372 }
373 return scale;
374 }
375
376 /*
377 * Bootstrap the system enough to run with virtual memory.
378 * Map the kernel's code and data, and allocate the system page table.
379 * Called with mapping OFF. Page_size must already be set.
380 */
381
382 void
383 pmap_bootstrap(
384 __unused vm_offset_t load_start,
385 __unused boolean_t IA32e)
386 {
387 #if NCOPY_WINDOWS > 0
388 vm_offset_t va;
389 int i;
390 #endif
391 assert(IA32e);
392
393 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
394 * known to VM */
395 /*
396 * The kernel's pmap is statically allocated so we don't
397 * have to use pmap_create, which is unlikely to work
398 * correctly at this part of the boot sequence.
399 */
400
401 kernel_pmap = &kernel_pmap_store;
402 kernel_pmap->ref_count = 1;
403 kernel_pmap->nx_enabled = TRUE;
404 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
405 kernel_pmap->pm_obj = (vm_object_t) NULL;
406 kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD);
407 kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT);
408 kernel_pmap->pm_pml4 = IdlePML4;
409 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
410 kernel_pmap->pm_eptp = 0;
411 pmap_pcid_initialize_kernel(kernel_pmap);
412
413
414
415 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
416
417 nkpt = NKPT;
418 OSAddAtomic(NKPT, &inuse_ptepages_count);
419 OSAddAtomic64(NKPT, &alloc_ptepages_count);
420 bootstrap_wired_pages = NKPT;
421
422 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
423 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
424
425 #if NCOPY_WINDOWS > 0
426 /*
427 * Reserve some special page table entries/VA space for temporary
428 * mapping of pages.
429 */
430 #define SYSMAP(c, p, v, n) \
431 v = (c)va; va += ((n)*INTEL_PGBYTES);
432
433 va = virtual_avail;
434
435 for (i=0; i<PMAP_NWINDOWS; i++) {
436 #if 1
437 kprintf("trying to do SYSMAP idx %d %p\n", i,
438 current_cpu_datap());
439 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
440 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
441 kprintf("two stuff %p %p\n",
442 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
443 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
444 #endif
445 SYSMAP(caddr_t,
446 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
447 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
448 1);
449 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
450 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
451 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
452 }
453
454 /* DMAP user for debugger */
455 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
456 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
457
458 virtual_avail = va;
459 #endif
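/*
 * Size the pv hash: npvhashmask must be of the form (2^N)-1 so that
 * npvhashbuckets is a power of two.  Unless an "npvhash" boot-arg
 * overrides it, the default is scaled with physical memory via
 * pmap_scale_shift().
 */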
460 if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof (npvhashmask))) {
461 npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
462
463 }
464
465 npvhashbuckets = npvhashmask + 1;
466
467 if (0 != ((npvhashbuckets) & npvhashmask)) {
468 panic("invalid hash %d, must be ((2^N)-1), "
469 "using default %d\n", npvhashmask, NPVHASHMASK);
470 }
471
472 simple_lock_init(&kernel_pmap->lock, 0);
473 simple_lock_init(&pv_hashed_free_list_lock, 0);
474 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
475 simple_lock_init(&pv_hash_table_lock,0);
476 simple_lock_init(&phys_backup_lock, 0);
477
478 pmap_cpu_init();
479
480 if (pmap_pcid_ncpus)
481 printf("PMAP: PCID enabled\n");
482
483 if (pmap_smep_enabled)
484 printf("PMAP: Supervisor Mode Execute Protection enabled\n");
485 if (pmap_smap_enabled)
486 printf("PMAP: Supervisor Mode Access Protection enabled\n");
487
488 #if DEBUG
489 printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
490 printf("early_random(): 0x%qx\n", early_random());
491 #endif
492 boolean_t ptmp;
493 /* Check if the user has requested disabling stack or heap no-execute
494 * enforcement. These are "const" variables; that qualifier is cast away
495 * when altering them. The TEXT/DATA const sections are marked
496 * write protected later in the kernel startup sequence, so altering
497 * them is possible at this point, in pmap_bootstrap().
498 */
499 if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
500 boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
501 *pdknxp = TRUE;
502 }
503
504 if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
505 boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
506 *pdknhp = TRUE;
507 }
508
509 boot_args *args = (boot_args *)PE_state.bootArgs;
510 if (args->efiMode == kBootArgsEfiMode32) {
511 printf("EFI32: kernel virtual space limited to 4GB\n");
512 virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
513 }
514 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
515 (long)KERNEL_BASE, (long)virtual_end);
516 kprintf("Available physical space from 0x%llx to 0x%llx\n",
517 avail_start, avail_end);
518
519 /*
520 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
521 * in the DEBUG kernel) to force the kernel to switch to its own map
522 * (and cr3) when control is in kernelspace. The kernel's map does not
523 * include (i.e. share) userspace so wild references will cause
524 * a panic. Only copyin and copyout are exempt from this.
525 */
526 (void) PE_parse_boot_argn("-no_shared_cr3",
527 &no_shared_cr3, sizeof (no_shared_cr3));
528 if (no_shared_cr3)
529 kprintf("Kernel not sharing user map\n");
530
531 #ifdef PMAP_TRACES
532 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
533 kprintf("Kernel traces for pmap operations enabled\n");
534 }
535 #endif /* PMAP_TRACES */
536
537 #if MACH_ASSERT
538 PE_parse_boot_argn("pmap_stats_assert",
539 &pmap_stats_assert,
540 sizeof (pmap_stats_assert));
541 #endif /* MACH_ASSERT */
542 }
543
544 void
545 pmap_virtual_space(
546 vm_offset_t *startp,
547 vm_offset_t *endp)
548 {
549 *startp = virtual_avail;
550 *endp = virtual_end;
551 }
552
553
554
555
556 #if HIBERNATION
557
558 #include <IOKit/IOHibernatePrivate.h>
559
560 int32_t pmap_npages;
561 int32_t pmap_teardown_last_valid_compact_indx = -1;
562
563
564 void hibernate_rebuild_pmap_structs(void);
565 void hibernate_teardown_pmap_structs(addr64_t *, addr64_t *);
566 void pmap_pack_index(uint32_t);
567 int32_t pmap_unpack_index(pv_rooted_entry_t);
568
569
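/*
 * During hibernation teardown the pv_head_table is compacted (see
 * hibernate_teardown_pmap_structs below).  pmap_pack_index() stashes an
 * entry's original table index in the upper 16 bits of its qlink.next
 * (high half) and qlink.prev (low half) pointers -- bits that are
 * otherwise the all-ones canonical-address bits of kernel pointers.
 * pmap_unpack_index() recovers the index and restores those bits to 0xffff.
 */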
570 int32_t
571 pmap_unpack_index(pv_rooted_entry_t pv_h)
572 {
573 int32_t indx = 0;
574
575 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
576 indx = indx << 16;
577 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
578
579 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
580 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
581
582 return (indx);
583 }
584
585
586 void
587 pmap_pack_index(uint32_t indx)
588 {
589 pv_rooted_entry_t pv_h;
590
591 pv_h = &pv_head_table[indx];
592
593 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
594 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
595
596 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
597 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
598 }
599
600
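/*
 * Compact pv_head_table in place for hibernation: slide each in-use
 * entry (pmap != PMAP_NULL) down into the earliest free slot, recording
 * its original index via pmap_pack_index(), and report the tail of the
 * table beyond the last valid compacted entry as a range the hibernation
 * code need not preserve.
 */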
601 void
602 hibernate_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
603 {
604 int32_t i;
605 int32_t compact_target_indx;
606
607 compact_target_indx = 0;
608
609 for (i = 0; i < pmap_npages; i++) {
610 if (pv_head_table[i].pmap == PMAP_NULL) {
611
612 if (pv_head_table[compact_target_indx].pmap != PMAP_NULL)
613 compact_target_indx = i;
614 } else {
615 pmap_pack_index((uint32_t)i);
616
617 if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
618 /*
619 * we've got a hole to fill, so
620 * move this pv_rooted_entry_t to its new home
621 */
622 pv_head_table[compact_target_indx] = pv_head_table[i];
623 pv_head_table[i].pmap = PMAP_NULL;
624
625 pmap_teardown_last_valid_compact_indx = compact_target_indx;
626 compact_target_indx++;
627 } else
628 pmap_teardown_last_valid_compact_indx = i;
629 }
630 }
631 *unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx+1];
632 *unneeded_end = (addr64_t)&pv_head_table[pmap_npages-1];
633
634 HIBLOG("hibernate_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
635 }
636
637
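/*
 * Inverse of hibernate_teardown_pmap_structs(): walk the compacted
 * entries from the last one back to index 0, unpack each entry's
 * original index, move it back to that slot, and zero the gaps so the
 * table is fully restored after wake.
 */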
638 void
639 hibernate_rebuild_pmap_structs(void)
640 {
641 int32_t cindx, eindx, rindx = 0;
642 pv_rooted_entry_t pv_h;
643
644 eindx = (int32_t)pmap_npages;
645
646 for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
647
648 pv_h = &pv_head_table[cindx];
649
650 rindx = pmap_unpack_index(pv_h);
651 assert(rindx < pmap_npages);
652
653 if (rindx != cindx) {
654 /*
655 * this pv_rooted_entry_t was moved by hibernate_teardown_pmap_structs,
656 * so move it back to its real location
657 */
658 pv_head_table[rindx] = pv_head_table[cindx];
659 }
660 if (rindx+1 != eindx) {
661 /*
662 * the 'hole' between this pv_rooted_entry_t and the previous
663 * pv_rooted_entry_t we moved needs to be initialized as
664 * a range of zero'd pv_rooted_entry_t's
665 */
666 bzero((char *)&pv_head_table[rindx+1], (eindx - rindx - 1) * sizeof (struct pv_rooted_entry));
667 }
668 eindx = rindx;
669 }
670 if (rindx)
671 bzero ((char *)&pv_head_table[0], rindx * sizeof (struct pv_rooted_entry));
672
673 HIBLOG("hibernate_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
674 }
675
676 #endif
677
678 /*
679 * Initialize the pmap module.
680 * Called by vm_init, to initialize any structures that the pmap
681 * system needs to map virtual memory.
682 */
683 void
684 pmap_init(void)
685 {
686 long npages;
687 vm_offset_t addr;
688 vm_size_t s, vsize;
689 vm_map_offset_t vaddr;
690 ppnum_t ppn;
691
692
693 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
694 _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);
695
696 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
697 _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);
698
699 kernel_pmap->pm_obj = &kptobj_object_store;
700 _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);
701
702 /*
703 * Allocate memory for the pv_head_table and its lock bits,
704 * the modify bit array, and the pte_page table.
705 */
706
707 /*
708 * zero bias all these arrays now instead of off avail_start
709 * so we cover all memory
710 */
711
712 npages = i386_btop(avail_end);
713 #if HIBERNATION
714 pmap_npages = (uint32_t)npages;
715 #endif
716 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
717 + (sizeof (struct pv_hashed_entry_t *) * (npvhashbuckets))
718 + pv_lock_table_size(npages)
719 + pv_hash_lock_table_size((npvhashbuckets))
720 + npages);
721 s = round_page(s);
722 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
723 KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PMAP)
724 != KERN_SUCCESS)
725 panic("pmap_init");
726
727 memset((char *)addr, 0, s);
728
729 vaddr = addr;
730 vsize = s;
731
732 #if PV_DEBUG
733 if (0 == npvhashmask) panic("npvhashmask not initialized");
734 #endif
735
736 /*
737 * Allocate the structures first to preserve word-alignment.
738 */
739 pv_head_table = (pv_rooted_entry_t) addr;
740 addr = (vm_offset_t) (pv_head_table + npages);
741
742 pv_hash_table = (pv_hashed_entry_t *)addr;
743 addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));
744
745 pv_lock_table = (char *) addr;
746 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
747
748 pv_hash_lock_table = (char *) addr;
749 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));
750
751 pmap_phys_attributes = (char *) addr;
752
753 ppnum_t last_pn = i386_btop(avail_end);
754 unsigned int i;
755 pmap_memory_region_t *pmptr = pmap_memory_regions;
756 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
757 if (pmptr->type != kEfiConventionalMemory)
758 continue;
759 ppnum_t pn;
760 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
761 if (pn < last_pn) {
762 pmap_phys_attributes[pn] |= PHYS_MANAGED;
763
764 if (pn > last_managed_page)
765 last_managed_page = pn;
766
767 if (pn >= lowest_hi && pn <= highest_hi)
768 pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
769 }
770 }
771 }
772 while (vsize) {
773 ppn = pmap_find_phys(kernel_pmap, vaddr);
774
775 pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;
776
777 vaddr += PAGE_SIZE;
778 vsize -= PAGE_SIZE;
779 }
780 /*
781 * Create the zone of physical maps,
782 * and of the physical-to-virtual entries.
783 */
784 s = (vm_size_t) sizeof(struct pmap);
785 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
786 zone_change(pmap_zone, Z_NOENCRYPT, TRUE);
787
788 pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors");
789 zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE);
790
791 /* The anchor is required to be page aligned. Zone debugging adds
792 * padding which may violate that requirement. Tell the zone
793 * subsystem that alignment is required.
794 */
795
796 zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
797
798 s = (vm_size_t) sizeof(struct pv_hashed_entry);
799 pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */,
800 4096 * 3 /* LCM x86_64*/, "pv_list");
801 zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);
802
803 /* create pv entries for kernel pages mapped by low level
804 startup code. these have to exist so we can pmap_remove()
805 e.g. kext pages from the middle of our addr space */
806
807 vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
808 for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) {
809 pv_rooted_entry_t pv_e;
810
811 pv_e = pai_to_pvh(ppn);
812 pv_e->va_and_flags = vaddr;
813 vaddr += PAGE_SIZE;
814 pv_e->pmap = kernel_pmap;
815 queue_init(&pv_e->qlink);
816 }
817 pmap_initialized = TRUE;
818
819 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
820
821 /*
822 * Ensure the kernel's PML4 entry exists for the basement
823 * before this is shared with any user.
824 */
825 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
826
827 #if CONFIG_VMX
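/*
 * Determine whether EPT supports hardware accessed/dirty flags
 * (advertised via the IA32_VMX_EPT_VPID_CAP MSR) and precompute the
 * EPTP flags (write-back memory type, 4-level walk, and A/D enable
 * when supported) used when creating EPT pmaps.
 */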
828 pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
829 pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
830 #endif /* CONFIG_VMX */
831 }
832
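/*
 * Walk the page tables of a (non-EPT) pmap over [sv, sv+nxrosz), which
 * must be page-aligned, and apply no-execute and/or read-only
 * protection to each mapping.  2MB superpage PDEs are updated directly;
 * otherwise the underlying 4KB PTEs are modified.
 */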
833 static
834 void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) {
835 uint64_t ev = sv + nxrosz, cv = sv;
836 pd_entry_t *pdep;
837 pt_entry_t *ptep = NULL;
838
839 assert(!is_ept_pmap(npmap));
840
841 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
842
843 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
844 uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
845
846 if (*pdep & INTEL_PTE_PS) {
847 if (NX)
848 *pdep |= INTEL_PTE_NX;
849 if (ro)
850 *pdep &= ~INTEL_PTE_WRITE;
851 cv += NBPD;
852 cv &= ~((uint64_t) PDEMASK);
853 pdep = pmap_pde(npmap, cv);
854 continue;
855 }
856
857 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
858 if (NX)
859 *ptep |= INTEL_PTE_NX;
860 if (ro)
861 *ptep &= ~INTEL_PTE_WRITE;
862 cv += NBPT;
863 ptep = pmap_pte(npmap, cv);
864 }
865 }
866 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
867 }
868
869 /*
870 * Called once VM is fully initialized so that we can release unused
871 * sections of low memory to the general pool.
872 * Also complete the set-up of identity-mapped sections of the kernel:
873 * 1) write-protect kernel text
874 * 2) map kernel text using large pages if possible
875 * 3) read and write-protect page zero (for K32)
876 * 4) map the global page at the appropriate virtual address.
877 *
878 * Use of large pages
879 * ------------------
880 * To effectively map and write-protect all kernel text pages, the text
881 * must be 2M-aligned at the base, and the data section above must also be
882 * 2M-aligned. That is, there's padding below and above. This is achieved
883 * through linker directives. Large pages are used only if this alignment
884 * exists (and is not overridden by the -kernel_text_ps_4K boot-arg). The
885 * memory layout is:
886 *
887 * : :
888 * | __DATA |
889 * sdata: ================== 2Meg
890 * | |
891 * | zero-padding |
892 * | |
893 * etext: ------------------
894 * | |
895 * : :
896 * | |
897 * | __TEXT |
898 * | |
899 * : :
900 * | |
901 * stext: ================== 2Meg
902 * | |
903 * | zero-padding |
904 * | |
905 * eHIB: ------------------
906 * | __HIB |
907 * : :
908 *
909 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
910 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
911 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
912 * The now unused level-1 PTE pages are also freed.
913 */
914 extern ppnum_t vm_kernel_base_page;
915 static uint32_t constptes = 0, dataptes = 0;
916
917 void pmap_lowmem_finalize(void) {
918 spl_t spl;
919 int i;
920
921 /*
922 * Update wired memory statistics for early boot pages
923 */
924 PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
925
926 /*
927 * Free pages in pmap regions below the base:
928 * rdar://6332712
929 * We can't free all the pages to VM that EFI reports available.
930 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
931 * There's also a size miscalculation here: pend is one page less
932 * than it should be but this is not fixed to be backwards
933 * compatible.
934 * This is important for KASLR because up to 256*2MB = 512MB of space
935 * has to be released to VM.
936 */
937 for (i = 0;
938 pmap_memory_regions[i].end < vm_kernel_base_page;
939 i++) {
940 vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
941 vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end+1);
942
943 DBG("pmap region %d [%p..[%p\n",
944 i, (void *) pbase, (void *) pend);
945
946 if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED)
947 continue;
948 /*
949 * rdar://6332712
950 * Adjust limits not to free pages in range 0xc0000-0xff000.
951 */
952 if (pbase >= 0xc0000 && pend <= 0x100000)
953 continue;
954 if (pbase < 0xc0000 && pend > 0x100000) {
955 /* page range entirely within region, free lower part */
956 DBG("- ml_static_mfree(%p,%p)\n",
957 (void *) ml_static_ptovirt(pbase),
958 (void *) (0xc0000-pbase));
959 ml_static_mfree(ml_static_ptovirt(pbase),0xc0000-pbase);
960 pbase = 0x100000;
961 }
962 if (pbase < 0xc0000)
963 pend = MIN(pend, 0xc0000);
964 if (pend > 0x100000)
965 pbase = MAX(pbase, 0x100000);
966 DBG("- ml_static_mfree(%p,%p)\n",
967 (void *) ml_static_ptovirt(pbase),
968 (void *) (pend - pbase));
969 ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
970 }
971
972 /* A final pass to get rid of all initial identity mappings to
973 * low pages.
974 */
975 DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
976
977 /*
978 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
979 * Non-boot-cpu GDT aliases will be remapped later as needed.
980 */
981 pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
982
983 /*
984 * If text and data are both 2MB-aligned,
985 * we can map text with large-pages,
986 * unless the -kernel_text_ps_4K boot-arg overrides.
987 */
988 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
989 kprintf("Kernel text is 2MB aligned");
990 kernel_text_ps_4K = FALSE;
991 if (PE_parse_boot_argn("-kernel_text_ps_4K",
992 &kernel_text_ps_4K,
993 sizeof (kernel_text_ps_4K)))
994 kprintf(" but will be mapped with 4K pages\n");
995 else
996 kprintf(" and will be mapped with 2M pages\n");
997 }
998
999 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel));
1000 if (wpkernel)
1001 kprintf("Kernel text %p-%p to be write-protected\n",
1002 (void *) stext, (void *) etext);
1003
1004 spl = splhigh();
1005
1006 /*
1007 * Scan over text if mappings are to be changed:
1008 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
1009 * - Change to large-pages if possible and not overridden.
1010 */
1011 if (kernel_text_ps_4K && wpkernel) {
1012 vm_offset_t myva;
1013 for (myva = stext; myva < etext; myva += PAGE_SIZE) {
1014 pt_entry_t *ptep;
1015
1016 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1017 if (ptep)
1018 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE);
1019 }
1020 }
1021
1022 if (!kernel_text_ps_4K) {
1023 vm_offset_t myva;
1024
1025 /*
1026 * Release zero-filled page padding used for 2M-alignment.
1027 */
1028 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1029 (void *) eHIB, (void *) (stext - eHIB));
1030 ml_static_mfree(eHIB, stext - eHIB);
1031 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1032 (void *) etext, (void *) (sdata - etext));
1033 ml_static_mfree(etext, sdata - etext);
1034
1035 /*
1036 * Coalesce text pages into large pages.
1037 */
1038 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
1039 pt_entry_t *ptep;
1040 vm_offset_t pte_phys;
1041 pt_entry_t *pdep;
1042 pt_entry_t pde;
1043
1044 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
1045 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1046 DBG("myva: %p pdep: %p ptep: %p\n",
1047 (void *) myva, (void *) pdep, (void *) ptep);
1048 if ((*ptep & INTEL_PTE_VALID) == 0)
1049 continue;
1050 pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
1051 pde = *pdep & PTMASK; /* page attributes from pde */
1052 pde |= INTEL_PTE_PS; /* make it a 2M entry */
1053 pde |= pte_phys; /* take page frame from pte */
1054
1055 if (wpkernel)
1056 pde &= ~INTEL_PTE_WRITE;
1057 DBG("pmap_store_pte(%p,0x%llx)\n",
1058 (void *)pdep, pde);
1059 pmap_store_pte(pdep, pde);
1060
1061 /*
1062 * Free the now-unused level-1 pte.
1063 * Note: ptep is a virtual address to the pte in the
1064 * recursive map. We can't use this address to free
1065 * the page. Instead we need to compute its address
1066 * in the Idle PTEs in "low memory".
1067 */
1068 vm_offset_t vm_ptep = (vm_offset_t) KPTphys
1069 + (pte_phys >> PTPGSHIFT);
1070 DBG("ml_static_mfree(%p,0x%x) for pte\n",
1071 (void *) vm_ptep, PAGE_SIZE);
1072 ml_static_mfree(vm_ptep, PAGE_SIZE);
1073 }
1074
1075 /* Change variable read by sysctl machdep.pmap */
1076 pmap_kernel_text_ps = I386_LPGBYTES;
1077 }
1078
1079 boolean_t doconstro = TRUE;
1080 #if DEVELOPMENT || DEBUG
1081 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1082 #endif
1083 if (doconstro) {
1084 if (sconst & PAGE_MASK) {
1085 panic("CONST segment misaligned 0x%lx 0x%lx\n",
1086 sconst, econst);
1087 }
1088 kprintf("Marking const DATA read-only\n");
1089 }
1090
1091 vm_offset_t dva;
1092
1093 for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
1094 assert(((sdata | edata) & PAGE_MASK) == 0);
1095 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1096
1097 dpte = *dptep;
1098 assert((dpte & INTEL_PTE_VALID));
1099 dpte |= INTEL_PTE_NX;
1100 pmap_store_pte(dptep, dpte);
1101 dataptes++;
1102 }
1103 assert(dataptes > 0);
1104
1105 for (dva = sconst; dva < econst; dva += I386_PGBYTES) {
1106 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1107
1108 dpte = *dptep;
1109
1110 assert((dpte & INTEL_PTE_VALID));
1111 dpte |= INTEL_PTE_NX;
1112 dpte &= ~INTEL_PTE_WRITE;
1113 constptes++;
1114 pmap_store_pte(dptep, dpte);
1115 }
1116
1117 assert(constptes > 0);
1118
1119 kernel_segment_command_t * seg;
1120 kernel_section_t * sec;
1121
1122 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
1123 if (!strcmp(seg->segname, "__TEXT") ||
1124 !strcmp(seg->segname, "__DATA")) {
1125 continue;
1126 }
1127 //XXX
1128 if (!strcmp(seg->segname, "__KLD")) {
1129 continue;
1130 }
1131 if (!strcmp(seg->segname, "__HIB")) {
1132 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
1133 if (sec->addr & PAGE_MASK)
1134 panic("__HIB segment's sections misaligned");
1135 if (!strcmp(sec->sectname, "__text")) {
1136 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
1137 } else {
1138 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
1139 }
1140 }
1141 } else {
1142 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
1143 }
1144 }
1145
1146 /*
1147 * If we're debugging, map the low global vector page at the fixed
1148 * virtual address. Otherwise, remove the mapping for this.
1149 */
1150 if (debug_boot_arg) {
1151 pt_entry_t *pte = NULL;
1152 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS)))
1153 panic("lowmem pte");
1154 /* make sure it is defined on page boundary */
1155 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
1156 pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)
1157 | INTEL_PTE_REF
1158 | INTEL_PTE_MOD
1159 | INTEL_PTE_WIRED
1160 | INTEL_PTE_VALID
1161 | INTEL_PTE_WRITE
1162 | INTEL_PTE_NX);
1163 } else {
1164 pmap_remove(kernel_pmap,
1165 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
1166 }
1167
1168 splx(spl);
1169 if (pmap_pcid_ncpus)
1170 tlb_flush_global();
1171 else
1172 flush_tlb_raw();
1173 }
1174
1175 /*
1176 * this function is only used for debugging from the vm layer
1177 */
1178 boolean_t
1179 pmap_verify_free(
1180 ppnum_t pn)
1181 {
1182 pv_rooted_entry_t pv_h;
1183 int pai;
1184 boolean_t result;
1185
1186 assert(pn != vm_page_fictitious_addr);
1187
1188 if (!pmap_initialized)
1189 return(TRUE);
1190
1191 if (pn == vm_page_guard_addr)
1192 return TRUE;
1193
1194 pai = ppn_to_pai(pn);
1195 if (!IS_MANAGED_PAGE(pai))
1196 return(FALSE);
1197 pv_h = pai_to_pvh(pn);
1198 result = (pv_h->pmap == PMAP_NULL);
1199 return(result);
1200 }
1201
1202 boolean_t
1203 pmap_is_empty(
1204 pmap_t pmap,
1205 vm_map_offset_t va_start,
1206 vm_map_offset_t va_end)
1207 {
1208 vm_map_offset_t offset;
1209 ppnum_t phys_page;
1210
1211 if (pmap == PMAP_NULL) {
1212 return TRUE;
1213 }
1214
1215 /*
1216 * Check the resident page count
1217 * - if it's zero, the pmap is completely empty.
1218 * This short-circuit test prevents a virtual address scan which is
1219 * painfully slow for 64-bit spaces.
1220 * This assumes the count is correct
1221 * .. the debug kernel ought to verify this, perhaps by a page table walk.
1222 */
1223 if (pmap->stats.resident_count == 0)
1224 return TRUE;
1225
1226 for (offset = va_start;
1227 offset < va_end;
1228 offset += PAGE_SIZE_64) {
1229 phys_page = pmap_find_phys(pmap, offset);
1230 if (phys_page) {
1231 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1232 "page %d at 0x%llx\n",
1233 pmap, va_start, va_end, phys_page, offset);
1234 return FALSE;
1235 }
1236 }
1237
1238 return TRUE;
1239 }
1240
1241 void
1242 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1243 {
1244 pmap_t p;
1245
1246 if ((ept_pmap == NULL) || (eptp == NULL)) {
1247 return;
1248 }
1249
1250 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1251 if (p == PMAP_NULL) {
1252 *ept_pmap = NULL;
1253 *eptp = NULL;
1254 return;
1255 }
1256
1257 assert(is_ept_pmap(p));
1258
1259 *ept_pmap = (void*)p;
1260 *eptp = (void*)(p->pm_eptp);
1261 return;
1262 }
1263
1264 /*
1265 * Create and return a physical map.
1266 *
1267 * If the size specified for the map
1268 * is zero, the map is an actual physical
1269 * map, and may be referenced by the
1270 * hardware.
1271 *
1272 * If the size specified is non-zero,
1273 * the map will be used in software only, and
1274 * is bounded by that size.
1275 */
1276 pmap_t
1277 pmap_create_options(
1278 ledger_t ledger,
1279 vm_map_size_t sz,
1280 int flags)
1281 {
1282 pmap_t p;
1283 vm_size_t size;
1284 pml4_entry_t *pml4;
1285 pml4_entry_t *kpml4;
1286
1287 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1288 (uint32_t) (sz>>32), (uint32_t) sz, flags, 0, 0);
1289
1290 size = (vm_size_t) sz;
1291
1292 /*
1293 * A software use-only map doesn't even need a pmap.
1294 */
1295
1296 if (size != 0) {
1297 return(PMAP_NULL);
1298 }
1299
1300 /*
1301 * Return error when unrecognized flags are passed.
1302 */
1303 if ((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0) {
1304 return(PMAP_NULL);
1305 }
1306
1307 p = (pmap_t) zalloc(pmap_zone);
1308 if (PMAP_NULL == p)
1309 panic("pmap_create zalloc");
1310 /* Zero all fields */
1311 bzero(p, sizeof(*p));
1312 /* init counts now since we'll be bumping some */
1313 simple_lock_init(&p->lock, 0);
1314 #if 00
1315 p->stats.resident_count = 0;
1316 p->stats.resident_max = 0;
1317 p->stats.wired_count = 0;
1318 #else
1319 bzero(&p->stats, sizeof (p->stats));
1320 #endif
1321 p->ref_count = 1;
1322 p->nx_enabled = 1;
1323 p->pm_shared = FALSE;
1324 ledger_reference(ledger);
1325 p->ledger = ledger;
1326
1327 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1328
1329 p->pagezero_accessible = FALSE;
1330
1331 if (pmap_pcid_ncpus) {
1332 pmap_pcid_initialize(p);
1333 }
1334
1335 p->pm_pml4 = zalloc(pmap_anchor_zone);
1336
1337 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1338
1339 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1340
1341 if (flags & PMAP_CREATE_EPT) {
1342 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1343 p->pm_cr3 = 0;
1344 } else {
1345 p->pm_eptp = 0;
1346 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1347 }
1348
1349 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1350
1351 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) * PAGE_SIZE);
1352 if (NULL == p->pm_obj_pml4)
1353 panic("pmap_create pml4 obj");
1354
1355 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) * PAGE_SIZE);
1356 if (NULL == p->pm_obj_pdpt)
1357 panic("pmap_create pdpt obj");
1358
1359 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) * PAGE_SIZE);
1360 if (NULL == p->pm_obj)
1361 panic("pmap_create pte obj");
1362
1363 if (!(flags & PMAP_CREATE_EPT)) {
1364 /* All host pmaps share the kernel's pml4 */
1365 pml4 = pmap64_pml4(p, 0ULL);
1366 kpml4 = kernel_pmap->pm_pml4;
1367 pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX];
1368 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1369 pml4[KERNEL_PHYSMAP_PML4_INDEX] = kpml4[KERNEL_PHYSMAP_PML4_INDEX];
1370 }
1371
1372 #if MACH_ASSERT
1373 p->pmap_pid = 0;
1374 strlcpy(p->pmap_procname, "<nil>", sizeof (p->pmap_procname));
1375 #endif /* MACH_ASSERT */
1376
1377 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1378 p, flags, 0, 0, 0);
1379
1380 return(p);
1381 }
1382
1383 pmap_t
1384 pmap_create(
1385 ledger_t ledger,
1386 vm_map_size_t sz,
1387 boolean_t is_64bit)
1388 {
1389 return pmap_create_options(ledger, sz, ((is_64bit) ? PMAP_CREATE_64BIT : 0));
1390 }
1391
1392 /*
1393 * We maintain stats and ledgers so that a task's physical footprint is:
1394 * phys_footprint = ((internal - alternate_accounting)
1395 * + (internal_compressed - alternate_accounting_compressed)
1396 * + iokit_mapped
1397 * + purgeable_nonvolatile
1398 * + purgeable_nonvolatile_compressed
1399 * + page_table)
1400 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1401 */
1402
1403 #if MACH_ASSERT
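/*
 * Cumulative drift statistics gathered by pmap_check_ledgers() when
 * pmaps are destroyed: for each ledger field, how many pmaps were found
 * over or under the expected balance, plus the total and maximum
 * discrepancies observed.
 */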
1404 struct {
1405 uint64_t num_pmaps_checked;
1406
1407 int phys_footprint_over;
1408 ledger_amount_t phys_footprint_over_total;
1409 ledger_amount_t phys_footprint_over_max;
1410 int phys_footprint_under;
1411 ledger_amount_t phys_footprint_under_total;
1412 ledger_amount_t phys_footprint_under_max;
1413
1414 int internal_over;
1415 ledger_amount_t internal_over_total;
1416 ledger_amount_t internal_over_max;
1417 int internal_under;
1418 ledger_amount_t internal_under_total;
1419 ledger_amount_t internal_under_max;
1420
1421 int internal_compressed_over;
1422 ledger_amount_t internal_compressed_over_total;
1423 ledger_amount_t internal_compressed_over_max;
1424 int internal_compressed_under;
1425 ledger_amount_t internal_compressed_under_total;
1426 ledger_amount_t internal_compressed_under_max;
1427
1428 int iokit_mapped_over;
1429 ledger_amount_t iokit_mapped_over_total;
1430 ledger_amount_t iokit_mapped_over_max;
1431 int iokit_mapped_under;
1432 ledger_amount_t iokit_mapped_under_total;
1433 ledger_amount_t iokit_mapped_under_max;
1434
1435 int alternate_accounting_over;
1436 ledger_amount_t alternate_accounting_over_total;
1437 ledger_amount_t alternate_accounting_over_max;
1438 int alternate_accounting_under;
1439 ledger_amount_t alternate_accounting_under_total;
1440 ledger_amount_t alternate_accounting_under_max;
1441
1442 int alternate_accounting_compressed_over;
1443 ledger_amount_t alternate_accounting_compressed_over_total;
1444 ledger_amount_t alternate_accounting_compressed_over_max;
1445 int alternate_accounting_compressed_under;
1446 ledger_amount_t alternate_accounting_compressed_under_total;
1447 ledger_amount_t alternate_accounting_compressed_under_max;
1448
1449 int page_table_over;
1450 ledger_amount_t page_table_over_total;
1451 ledger_amount_t page_table_over_max;
1452 int page_table_under;
1453 ledger_amount_t page_table_under_total;
1454 ledger_amount_t page_table_under_max;
1455
1456 int purgeable_volatile_over;
1457 ledger_amount_t purgeable_volatile_over_total;
1458 ledger_amount_t purgeable_volatile_over_max;
1459 int purgeable_volatile_under;
1460 ledger_amount_t purgeable_volatile_under_total;
1461 ledger_amount_t purgeable_volatile_under_max;
1462
1463 int purgeable_nonvolatile_over;
1464 ledger_amount_t purgeable_nonvolatile_over_total;
1465 ledger_amount_t purgeable_nonvolatile_over_max;
1466 int purgeable_nonvolatile_under;
1467 ledger_amount_t purgeable_nonvolatile_under_total;
1468 ledger_amount_t purgeable_nonvolatile_under_max;
1469
1470 int purgeable_volatile_compressed_over;
1471 ledger_amount_t purgeable_volatile_compressed_over_total;
1472 ledger_amount_t purgeable_volatile_compressed_over_max;
1473 int purgeable_volatile_compressed_under;
1474 ledger_amount_t purgeable_volatile_compressed_under_total;
1475 ledger_amount_t purgeable_volatile_compressed_under_max;
1476
1477 int purgeable_nonvolatile_compressed_over;
1478 ledger_amount_t purgeable_nonvolatile_compressed_over_total;
1479 ledger_amount_t purgeable_nonvolatile_compressed_over_max;
1480 int purgeable_nonvolatile_compressed_under;
1481 ledger_amount_t purgeable_nonvolatile_compressed_under_total;
1482 ledger_amount_t purgeable_nonvolatile_compressed_under_max;
1483 } pmap_ledgers_drift;
1484 static void pmap_check_ledgers(pmap_t pmap);
1485 #else /* MACH_ASSERT */
1486 static inline void pmap_check_ledgers(__unused pmap_t pmap) {}
1487 #endif /* MACH_ASSERT */
1488
1489 /*
1490 * Retire the given physical map from service.
1491 * Should only be called if the map contains
1492 * no valid mappings.
1493 */
1494 extern int vm_wired_objects_page_count;
1495
1496 void
1497 pmap_destroy(pmap_t p)
1498 {
1499 int c;
1500
1501 if (p == PMAP_NULL)
1502 return;
1503
1504 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1505 p, 0, 0, 0, 0);
1506
1507 PMAP_LOCK(p);
1508
1509 c = --p->ref_count;
1510
1511 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1512
1513 if (c == 0) {
1514 /*
1515 * If some cpu is not using the physical pmap pointer that it
1516 * is supposed to be (see set_dirbase), we might be using the
1517 * pmap that is being destroyed! Make sure we are
1518 * physically on the right pmap:
1519 */
1520 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1521 if (pmap_pcid_ncpus)
1522 pmap_destroy_pcid_sync(p);
1523 }
1524
1525 PMAP_UNLOCK(p);
1526
1527 if (c != 0) {
1528 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1529 p, 1, 0, 0, 0);
1530 pmap_assert(p == kernel_pmap);
1531 return; /* still in use */
1532 }
1533
1534 /*
1535 * Free the memory maps, then the
1536 * pmap structure.
1537 */
1538 int inuse_ptepages = 0;
1539
1540 zfree(pmap_anchor_zone, p->pm_pml4);
1541
1542 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1543 vm_object_deallocate(p->pm_obj_pml4);
1544
1545 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1546 vm_object_deallocate(p->pm_obj_pdpt);
1547
1548 inuse_ptepages += p->pm_obj->resident_page_count;
1549 vm_object_deallocate(p->pm_obj);
1550
1551 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1552 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1553
1554 pmap_check_ledgers(p);
1555 ledger_dereference(p->ledger);
1556 zfree(pmap_zone, p);
1557
1558 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1559 0, 0, 0, 0, 0);
1560 }
1561
1562 /*
1563 * Add a reference to the specified pmap.
1564 */
1565
1566 void
1567 pmap_reference(pmap_t p)
1568 {
1569 if (p != PMAP_NULL) {
1570 PMAP_LOCK(p);
1571 p->ref_count++;
1572 PMAP_UNLOCK(p);
1573 }
1574 }
1575
1576 /*
1577 * Remove phys addr if mapped in specified map
1578 *
1579 */
1580 void
1581 pmap_remove_some_phys(
1582 __unused pmap_t map,
1583 __unused ppnum_t pn)
1584 {
1585
1586 /* Implement to support working set code */
1587
1588 }
1589
1590
1591 void
1592 pmap_protect(
1593 pmap_t map,
1594 vm_map_offset_t sva,
1595 vm_map_offset_t eva,
1596 vm_prot_t prot)
1597 {
1598 pmap_protect_options(map, sva, eva, prot, 0, NULL);
1599 }
1600
1601
1602 /*
1603 * Set the physical protection on the
1604 * specified range of this map as requested.
1605 * Will not increase permissions.
1606 */
1607 void
1608 pmap_protect_options(
1609 pmap_t map,
1610 vm_map_offset_t sva,
1611 vm_map_offset_t eva,
1612 vm_prot_t prot,
1613 unsigned int options,
1614 void *arg)
1615 {
1616 pt_entry_t *pde;
1617 pt_entry_t *spte, *epte;
1618 vm_map_offset_t lva;
1619 vm_map_offset_t orig_sva;
1620 boolean_t set_NX;
1621 int num_found = 0;
1622 boolean_t is_ept;
1623
1624 pmap_intr_assert();
1625
1626 if (map == PMAP_NULL)
1627 return;
1628
1629 if (prot == VM_PROT_NONE) {
1630 pmap_remove_options(map, sva, eva, options);
1631 return;
1632 }
1633 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1634 map,
1635 (uint32_t) (sva >> 32), (uint32_t) sva,
1636 (uint32_t) (eva >> 32), (uint32_t) eva);
1637
1638 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
1639 set_NX = FALSE;
1640 else
1641 set_NX = TRUE;
1642
1643 is_ept = is_ept_pmap(map);
1644
1645
1646 PMAP_LOCK(map);
1647
1648 orig_sva = sva;
1649 while (sva < eva) {
1650 lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
1651 if (lva > eva)
1652 lva = eva;
1653 pde = pmap_pde(map, sva);
1654 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1655 if (*pde & PTE_PS) {
1656 /* superpage */
1657 spte = pde;
1658 epte = spte+1; /* excluded */
1659 } else {
1660 spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
1661 spte = &spte[ptenum(sva)];
1662 epte = &spte[intel_btop(lva - sva)];
1663 }
1664
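/*
 * EPT entries carry an explicit read-permission bit, while legacy
 * PTEs are always readable when valid.  Execute permission is
 * granted by INTEL_EPT_EX for EPT but denied by INTEL_PTE_NX for
 * legacy PTEs, hence the inverted handling below.
 */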
1665 for (; spte < epte; spte++) {
1666 if (!(*spte & PTE_VALID_MASK(is_ept)))
1667 continue;
1668
1669 if (is_ept) {
1670 if (prot & VM_PROT_READ)
1671 pmap_update_pte(spte, 0, PTE_READ(is_ept));
1672 else
1673 pmap_update_pte(spte, PTE_READ(is_ept), 0);
1674 }
1675 if (prot & VM_PROT_WRITE)
1676 pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
1677 else
1678 pmap_update_pte(spte, PTE_WRITE(is_ept), 0);
1679
1680 if (set_NX) {
1681 if (!is_ept)
1682 pmap_update_pte(spte, 0, INTEL_PTE_NX);
1683 else
1684 pmap_update_pte(spte, INTEL_EPT_EX, 0);
1685 } else {
1686 if (!is_ept)
1687 pmap_update_pte(spte, INTEL_PTE_NX, 0);
1688 else
1689 pmap_update_pte(spte, 0, INTEL_EPT_EX);
1690 }
1691 num_found++;
1692 }
1693 }
1694 sva = lva;
1695 }
1696 if (num_found) {
1697 if (options & PMAP_OPTIONS_NOFLUSH)
1698 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1699 else
1700 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1701 }
1702 PMAP_UNLOCK(map);
1703
1704 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
1705 0, 0, 0, 0, 0);
1706
1707 }
1708
1709 /* Map a (possibly) autogenned block */
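/*
 * 'size' and 'pa' are in units of 4KB pages; when VM_MEM_SUPERPAGE is
 * set in 'attr', each iteration is expected to map a superpage and the
 * loop advances by a superpage's worth of 4KB pages.
 */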
1710 void
1711 pmap_map_block(
1712 pmap_t pmap,
1713 addr64_t va,
1714 ppnum_t pa,
1715 uint32_t size,
1716 vm_prot_t prot,
1717 int attr,
1718 __unused unsigned int flags)
1719 {
1720 uint32_t page;
1721 int cur_page_size;
1722
1723 if (attr & VM_MEM_SUPERPAGE)
1724 cur_page_size = SUPERPAGE_SIZE;
1725 else
1726 cur_page_size = PAGE_SIZE;
1727
1728 for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) {
1729 pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
1730 va += cur_page_size;
1731 pa+=cur_page_size/PAGE_SIZE;
1732 }
1733 }
1734
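/*
 * Allocate, zero and wire a new page-directory-pointer-table page and
 * install it in the PML4 slot covering 'vaddr', unless another thread
 * expanded this pmap first.  With PMAP_EXPAND_OPTIONS_NOWAIT the
 * routine returns KERN_RESOURCE_SHORTAGE rather than blocking for a
 * free page.
 */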
1735 kern_return_t
1736 pmap_expand_pml4(
1737 pmap_t map,
1738 vm_map_offset_t vaddr,
1739 unsigned int options)
1740 {
1741 vm_page_t m;
1742 pmap_paddr_t pa;
1743 uint64_t i;
1744 ppnum_t pn;
1745 pml4_entry_t *pml4p;
1746 boolean_t is_ept = is_ept_pmap(map);
1747
1748 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
1749
1750 /*
1751 * Allocate a VM page for the pml4 page
1752 */
1753 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1754 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1755 return KERN_RESOURCE_SHORTAGE;
1756 VM_PAGE_WAIT();
1757 }
1758 /*
1759 * put the page into the pmap's obj list so it
1760 * can be found later.
1761 */
1762 pn = VM_PAGE_GET_PHYS_PAGE(m);
1763 pa = i386_ptob(pn);
1764 i = pml4idx(map, vaddr);
1765
1766 /*
1767 * Zero the page.
1768 */
1769 pmap_zero_page(pn);
1770
1771 vm_page_lockspin_queues();
1772 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1773 vm_page_unlock_queues();
1774
1775 OSAddAtomic(1, &inuse_ptepages_count);
1776 OSAddAtomic64(1, &alloc_ptepages_count);
1777 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1778
1779 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1780 vm_object_lock(map->pm_obj_pml4);
1781
1782 PMAP_LOCK(map);
1783 /*
1784 * See if someone else expanded us first
1785 */
1786 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
1787 PMAP_UNLOCK(map);
1788 vm_object_unlock(map->pm_obj_pml4);
1789
1790 VM_PAGE_FREE(m);
1791
1792 OSAddAtomic(-1, &inuse_ptepages_count);
1793 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1794 return KERN_SUCCESS;
1795 }
1796
1797 #if 0 /* DEBUG */
1798 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
1799 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1800 map, map->pm_obj_pml4, vaddr, i);
1801 }
1802 #endif
1803 vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1804 vm_object_unlock(map->pm_obj_pml4);
1805
1806 /*
1807 * Set the page directory entry for this page table.
1808 */
1809 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
1810
1811 pmap_store_pte(pml4p, pa_to_pte(pa)
1812 | PTE_READ(is_ept)
1813 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1814 | PTE_WRITE(is_ept));
1815
1816 PMAP_UNLOCK(map);
1817
1818 return KERN_SUCCESS;
1819 }
1820
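/*
 * As pmap_expand_pml4(), one level down: ensure the PML4 entry covering
 * 'vaddr' exists (expanding if necessary), then allocate and install a
 * new page-directory page in the corresponding PDPT slot.
 */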
1821 kern_return_t
1822 pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
1823 {
1824 vm_page_t m;
1825 pmap_paddr_t pa;
1826 uint64_t i;
1827 ppnum_t pn;
1828 pdpt_entry_t *pdptp;
1829 boolean_t is_ept = is_ept_pmap(map);
1830
1831 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
1832
1833 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
1834 kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
1835 if (pep4kr != KERN_SUCCESS)
1836 return pep4kr;
1837 }
1838
1839 /*
1840 * Allocate a VM page for the pdpt page
1841 */
1842 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1843 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1844 return KERN_RESOURCE_SHORTAGE;
1845 VM_PAGE_WAIT();
1846 }
1847
1848 /*
1849 * put the page into the pmap's obj list so it
1850 * can be found later.
1851 */
1852 pn = VM_PAGE_GET_PHYS_PAGE(m);
1853 pa = i386_ptob(pn);
1854 i = pdptidx(map, vaddr);
1855
1856 /*
1857 * Zero the page.
1858 */
1859 pmap_zero_page(pn);
1860
1861 vm_page_lockspin_queues();
1862 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1863 vm_page_unlock_queues();
1864
1865 OSAddAtomic(1, &inuse_ptepages_count);
1866 OSAddAtomic64(1, &alloc_ptepages_count);
1867 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1868
1869 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1870 vm_object_lock(map->pm_obj_pdpt);
1871
1872 PMAP_LOCK(map);
1873 /*
1874 * See if someone else expanded us first
1875 */
1876 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
1877 PMAP_UNLOCK(map);
1878 vm_object_unlock(map->pm_obj_pdpt);
1879
1880 VM_PAGE_FREE(m);
1881
1882 OSAddAtomic(-1, &inuse_ptepages_count);
1883 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1884 return KERN_SUCCESS;
1885 }
1886
1887 #if 0 /* DEBUG */
1888 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
1889 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1890 map, map->pm_obj_pdpt, vaddr, i);
1891 }
1892 #endif
1893 vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
1894 vm_object_unlock(map->pm_obj_pdpt);
1895
1896 /*
1897 * Set the PDPT entry for this new page directory page.
1898 */
1899 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
1900
1901 pmap_store_pte(pdptp, pa_to_pte(pa)
1902 | PTE_READ(is_ept)
1903 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
1904 | PTE_WRITE(is_ept));
1905
1906 PMAP_UNLOCK(map);
1907
1908 return KERN_SUCCESS;
1909
1910 }
1911
1912
1913
1914 /*
1915 * Routine: pmap_expand
1916 *
1917 * Expands a pmap to be able to map the specified virtual address.
1918 *
1919 * Allocates a new page-table page covering the specified virtual
1920 * address and installs the corresponding page directory entry,
1921 * expanding the PML4 and PDPT levels first if necessary.
1922 *
1923 * Must be called with the pmap system and the pmap unlocked,
1924 * since these must be unlocked to use vm_allocate or vm_deallocate.
1925 * Thus it must be called in a loop that checks whether the map
1926 * has been expanded enough.
1927 * (We won't loop forever, since page tables aren't shrunk.)
1928 */
1929 kern_return_t
1930 pmap_expand(
1931 pmap_t map,
1932 vm_map_offset_t vaddr,
1933 unsigned int options)
1934 {
1935 pt_entry_t *pdp;
1936 vm_page_t m;
1937 pmap_paddr_t pa;
1938 uint64_t i;
1939 ppnum_t pn;
1940 boolean_t is_ept = is_ept_pmap(map);
1941
1942
1943 /*
1944 * For the kernel, the virtual address must be in or above the basement
1945 * which holds kexts and occupies the 512GB immediately below the kernel.
1946 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
1947 */
1948 if (map == kernel_pmap &&
1949 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))
1950 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
1951
1952
1953 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
1954 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
1955 if (pepkr != KERN_SUCCESS)
1956 return pepkr;
1957 }
1958
1959 /*
1960 * Allocate a VM page for the new page table.
1961 */
1962 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1963 if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1964 return KERN_RESOURCE_SHORTAGE;
1965 VM_PAGE_WAIT();
1966 }
1967
1968 /*
1969 * put the page into the pmap's obj list so it
1970 * can be found later.
1971 */
1972 pn = VM_PAGE_GET_PHYS_PAGE(m);
1973 pa = i386_ptob(pn);
1974 i = pdeidx(map, vaddr);
1975
1976 /*
1977 * Zero the page.
1978 */
1979 pmap_zero_page(pn);
1980
1981 vm_page_lockspin_queues();
1982 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
1983 vm_page_unlock_queues();
1984
1985 OSAddAtomic(1, &inuse_ptepages_count);
1986 OSAddAtomic64(1, &alloc_ptepages_count);
1987 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1988
1989 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1990 vm_object_lock(map->pm_obj);
1991
1992 PMAP_LOCK(map);
1993
1994 /*
1995 * See if someone else expanded us first
1996 */
1997 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
1998 PMAP_UNLOCK(map);
1999 vm_object_unlock(map->pm_obj);
2000
2001 VM_PAGE_FREE(m);
2002
2003 OSAddAtomic(-1, &inuse_ptepages_count);
2004 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2005 return KERN_SUCCESS;
2006 }
2007
2008 #if 0 /* DEBUG */
2009 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
2010 panic("pmap_expand: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2011 map, map->pm_obj, vaddr, i);
2012 }
2013 #endif
2014 vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2015 vm_object_unlock(map->pm_obj);
2016
2017 /*
2018 * Set the page directory entry for this page table.
2019 */
2020 pdp = pmap_pde(map, vaddr);
2021 pmap_store_pte(pdp, pa_to_pte(pa)
2022 | PTE_READ(is_ept)
2023 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2024 | PTE_WRITE(is_ept));
2025
2026 PMAP_UNLOCK(map);
2027
2028 return KERN_SUCCESS;
2029 }
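/*
 * Editorial sketch (not in the original source): per the routine comment
 * above, pmap_expand() is meant to be called in a loop that rechecks the
 * mapping, mirroring how pmap_expand() itself loops on pmap_expand_pdpt().
 * The helper name below is hypothetical.
 */
#if 0 /* illustrative sketch only */
static kern_return_t
pmap_expand_until_mapped(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	pt_entry_t *ptep;

	/* Another thread may expand the pmap first; the loop handles that. */
	while ((ptep = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
		kern_return_t kr = pmap_expand(map, vaddr, options);
		if (kr != KERN_SUCCESS)
			return kr; /* e.g. KERN_RESOURCE_SHORTAGE with PMAP_EXPAND_OPTIONS_NOWAIT */
	}
	return KERN_SUCCESS;
}
#endif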
2030
2031 /* On K64 machines with more than 32GB of memory, pmap_steal_memory
2032 * will allocate past the 1GB of pre-expanded virtual kernel area. This
2033 * function allocates all the page tables using memory from the same pool
2034 * that pmap_steal_memory uses, rather than calling vm_page_grab (which
2035 * isn't available yet). */
2036 void
2037 pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr)
2038 {
2039 ppnum_t pn;
2040 pt_entry_t *pte;
2041 boolean_t is_ept = is_ept_pmap(pmap);
2042
2043 PMAP_LOCK(pmap);
2044
2045 if(pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2046 if (!pmap_next_page_hi(&pn))
2047 panic("pmap_pre_expand");
2048
2049 pmap_zero_page(pn);
2050
2051 pte = pmap64_pml4(pmap, vaddr);
2052
2053 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2054 | PTE_READ(is_ept)
2055 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2056 | PTE_WRITE(is_ept));
2057 }
2058
2059 if(pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2060 if (!pmap_next_page_hi(&pn))
2061 panic("pmap_pre_expand");
2062
2063 pmap_zero_page(pn);
2064
2065 pte = pmap64_pdpt(pmap, vaddr);
2066
2067 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2068 | PTE_READ(is_ept)
2069 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2070 | PTE_WRITE(is_ept));
2071 }
2072
2073 if(pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
2074 if (!pmap_next_page_hi(&pn))
2075 panic("pmap_pre_expand");
2076
2077 pmap_zero_page(pn);
2078
2079 pte = pmap64_pde(pmap, vaddr);
2080
2081 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2082 | PTE_READ(is_ept)
2083 | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
2084 | PTE_WRITE(is_ept));
2085 }
2086
2087 PMAP_UNLOCK(pmap);
2088 }
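/*
 * Editorial sketch (not in the original source): a hypothetical early-boot
 * caller pre-expanding the kernel pmap over a virtual range before
 * vm_page_grab() is usable; each pmap_pre_expand() call is assumed here to
 * cover one page table's worth of VA, i.e. NBPD bytes.
 */
#if 0 /* illustrative sketch only */
static void
pmap_pre_expand_range(vm_map_offset_t start, vm_map_offset_t end)
{
	vm_map_offset_t va;

	for (va = start; va < end; va += NBPD)
		pmap_pre_expand(kernel_pmap, va);
}
#endif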
2089
2090 /*
2091 * pmap_sync_page_data_phys(ppnum_t pa)
2092 *
2093 * Invalidates all of the instruction cache on a physical page and
2094 * pushes any dirty data from the data cache for the same physical page.
2095 * Not required on i386.
2096 */
2097 void
2098 pmap_sync_page_data_phys(__unused ppnum_t pa)
2099 {
2100 return;
2101 }
2102
2103 /*
2104 * pmap_sync_page_attributes_phys(ppnum_t pa)
2105 *
2106 * Write back and invalidate all cachelines on a physical page.
2107 */
2108 void
2109 pmap_sync_page_attributes_phys(ppnum_t pa)
2110 {
2111 cache_flush_page_phys(pa);
2112 }
2113
2114
2115
2116 #ifdef CURRENTLY_UNUSED_AND_UNTESTED
2117
2118 int collect_ref;
2119 int collect_unref;
2120
2121 /*
2122 * Routine: pmap_collect
2123 * Function:
2124 * Garbage collects the physical map system for
2125 * pages which are no longer used.
2126 * Success need not be guaranteed -- that is, some
2127 * pages that are no longer referenced may be left
2128 * uncollected, while others are collected.
2129 * Usage:
2130 * Called by the pageout daemon when pages are scarce.
2131 */
2132 void
2133 pmap_collect(
2134 pmap_t p)
2135 {
2136 pt_entry_t *pdp, *ptp;
2137 pt_entry_t *eptp;
2138 int wired;
2139 boolean_t is_ept;
2140
2141 if (p == PMAP_NULL)
2142 return;
2143
2144 if (p == kernel_pmap)
2145 return;
2146
2147 is_ept = is_ept_pmap(p);
2148
2149 /*
2150 * Garbage collect map.
2151 */
2152 PMAP_LOCK(p);
2153
2154 for (pdp = (pt_entry_t *)p->dirbase;
2155 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
2156 pdp++)
2157 {
2158 if (*pdp & PTE_VALID_MASK(is_ept)) {
2159 if (*pdp & PTE_REF(is_ept)) {
2160 pmap_store_pte(pdp, *pdp & ~PTE_REF(is_ept));
2161 collect_ref++;
2162 } else {
2163 collect_unref++;
2164 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
2165 eptp = ptp + NPTEPG;
2166
2167 /*
2168 * If the pte page has any wired mappings, we cannot
2169 * free it.
2170 */
2171 wired = 0;
2172 {
2173 pt_entry_t *ptep;
2174 for (ptep = ptp; ptep < eptp; ptep++) {
2175 if (iswired(*ptep)) {
2176 wired = 1;
2177 break;
2178 }
2179 }
2180 }
2181 if (!wired) {
2182 /*
2183 * Remove the virtual addresses mapped by this pte page.
2184 */
2185 pmap_remove_range(p,
2186 pdetova(pdp - (pt_entry_t *)p->dirbase),
2187 ptp,
2188 eptp);
2189
2190 /*
2191 * Invalidate the page directory pointer.
2192 */
2193 pmap_store_pte(pdp, 0x0);
2194
2195 PMAP_UNLOCK(p);
2196
2197 /*
2198 * And free the pte page itself.
2199 */
2200 {
2201 vm_page_t m;
2202
2203 vm_object_lock(p->pm_obj);
2204
2205 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]) * PAGE_SIZE);
2206 if (m == VM_PAGE_NULL)
2207 panic("pmap_collect: pte page not in object");
2208
2209 vm_object_unlock(p->pm_obj);
2210
2211 VM_PAGE_FREE(m);
2212
2213 OSAddAtomic(-1, &inuse_ptepages_count);
2214 PMAP_ZINFO_PFREE(p, PAGE_SIZE);
2215 }
2216
2217 PMAP_LOCK(p);
2218 }
2219 }
2220 }
2221 }
2222
2223 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
2224 PMAP_UNLOCK(p);
2225 return;
2226 }
2227 #endif
2228
2229
2230 void
2231 pmap_copy_page(ppnum_t src, ppnum_t dst)
2232 {
2233 bcopy_phys((addr64_t)i386_ptob(src),
2234 (addr64_t)i386_ptob(dst),
2235 PAGE_SIZE);
2236 }
2237
2238
2239 /*
2240 * Routine: pmap_pageable
2241 * Function:
2242 * Make the specified pages (by pmap, offset)
2243 * pageable (or not) as requested.
2244 *
2245 * A page which is not pageable may not take
2246 * a fault; therefore, its page table entry
2247 * must remain valid for the duration.
2248 *
2249 * This routine is merely advisory; pmap_enter
2250 * will specify that these pages are to be wired
2251 * down (or not) as appropriate.
2252 */
2253 void
2254 pmap_pageable(
2255 __unused pmap_t pmap,
2256 __unused vm_map_offset_t start_addr,
2257 __unused vm_map_offset_t end_addr,
2258 __unused boolean_t pageable)
2259 {
2260 #ifdef lint
2261 pmap++; start_addr++; end_addr++; pageable++;
2262 #endif /* lint */
2263 }
2264
2265 void
2266 invalidate_icache(__unused vm_offset_t addr,
2267 __unused unsigned cnt,
2268 __unused int phys)
2269 {
2270 return;
2271 }
2272
2273 void
2274 flush_dcache(__unused vm_offset_t addr,
2275 __unused unsigned count,
2276 __unused int phys)
2277 {
2278 return;
2279 }
2280
2281 #if CONFIG_DTRACE
2282 /*
2283 * Constrain DTrace copyin/copyout actions
2284 */
2285 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2286 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2287
2288 kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
2289 {
2290 thread_t thread = current_thread();
2291 uint64_t ccr3;
2292 if (current_map() == kernel_map)
2293 return KERN_FAILURE;
2294 else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE))
2295 return KERN_FAILURE;
2296 else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3))
2297 return KERN_FAILURE;
2298 else
2299 return KERN_SUCCESS;
2300 }
2301
2302 kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
2303 {
2304 return KERN_SUCCESS;
2305 }
2306 #endif /* CONFIG_DTRACE */
2307
2308 #include <mach_vm_debug.h>
2309 #if MACH_VM_DEBUG
2310 #include <vm/vm_debug.h>
2311
2312 int
2313 pmap_list_resident_pages(
2314 __unused pmap_t pmap,
2315 __unused vm_offset_t *listp,
2316 __unused int space)
2317 {
2318 return 0;
2319 }
2320 #endif /* MACH_VM_DEBUG */
2321
2322
2323 #if CONFIG_COREDUMP
2324 /* temporary workaround */
2325 boolean_t
2326 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
2327 {
2328 #if 0
2329 pt_entry_t *ptep;
2330
2331 ptep = pmap_pte(map->pmap, va);
2332 if (0 == ptep)
2333 return FALSE;
2334 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
2335 #else
2336 return TRUE;
2337 #endif
2338 }
2339 #endif
2340
2341 boolean_t
2342 phys_page_exists(ppnum_t pn)
2343 {
2344 assert(pn != vm_page_fictitious_addr);
2345
2346 if (!pmap_initialized)
2347 return TRUE;
2348
2349 if (pn == vm_page_guard_addr)
2350 return FALSE;
2351
2352 if (!IS_MANAGED_PAGE(ppn_to_pai(pn)))
2353 return FALSE;
2354
2355 return TRUE;
2356 }
2357
2358
2359
2360 void
2361 pmap_switch(pmap_t tpmap)
2362 {
2363 spl_t s;
2364
2365 s = splhigh(); /* Make sure interrupts are disabled */
2366 set_dirbase(tpmap, current_thread(), cpu_number());
2367 splx(s);
2368 }
2369
2370
2371 /*
2372 * disable no-execute capability on
2373 * the specified pmap
2374 */
2375 void
2376 pmap_disable_NX(pmap_t pmap)
2377 {
2378 pmap->nx_enabled = 0;
2379 }
2380
2381 void
2382 pt_fake_zone_init(int zone_index)
2383 {
2384 pt_fake_zone_index = zone_index;
2385 }
2386
2387 void
2388 pt_fake_zone_info(
2389 int *count,
2390 vm_size_t *cur_size,
2391 vm_size_t *max_size,
2392 vm_size_t *elem_size,
2393 vm_size_t *alloc_size,
2394 uint64_t *sum_size,
2395 int *collectable,
2396 int *exhaustable,
2397 int *caller_acct)
2398 {
2399 *count = inuse_ptepages_count;
2400 *cur_size = PAGE_SIZE * inuse_ptepages_count;
2401 *max_size = PAGE_SIZE * (inuse_ptepages_count +
2402 vm_page_inactive_count +
2403 vm_page_active_count +
2404 vm_page_free_count);
2405 *elem_size = PAGE_SIZE;
2406 *alloc_size = PAGE_SIZE;
2407 *sum_size = alloc_ptepages_count * PAGE_SIZE;
2408
2409 *collectable = 1;
2410 *exhaustable = 0;
2411 *caller_acct = 1;
2412 }
2413
2414
2415 void
2416 pmap_flush_context_init(pmap_flush_context *pfc)
2417 {
2418 pfc->pfc_cpus = 0;
2419 pfc->pfc_invalid_global = 0;
2420 }
2421
2422 extern uint64_t TLBTimeOut;
2423 void
2424 pmap_flush(
2425 pmap_flush_context *pfc)
2426 {
2427 unsigned int my_cpu;
2428 unsigned int cpu;
2429 unsigned int cpu_bit;
2430 cpumask_t cpus_to_respond = 0;
2431 cpumask_t cpus_to_signal = 0;
2432 cpumask_t cpus_signaled = 0;
2433 boolean_t flush_self = FALSE;
2434 uint64_t deadline;
2435
2436 mp_disable_preemption();
2437
2438 my_cpu = cpu_number();
2439 cpus_to_signal = pfc->pfc_cpus;
2440
2441 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
2442 NULL, cpus_to_signal, 0, 0, 0);
2443
2444 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
2445
2446 if (cpus_to_signal & cpu_bit) {
2447
2448 cpus_to_signal &= ~cpu_bit;
2449
2450 if (!cpu_datap(cpu)->cpu_running)
2451 continue;
2452
2453 if (pfc->pfc_invalid_global & cpu_bit)
2454 cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
2455 else
2456 cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
2457 mfence();
2458
2459 if (cpu == my_cpu) {
2460 flush_self = TRUE;
2461 continue;
2462 }
2463 if (CPU_CR3_IS_ACTIVE(cpu)) {
2464 cpus_to_respond |= cpu_bit;
2465 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2466 }
2467 }
2468 }
2469 cpus_signaled = cpus_to_respond;
2470
2471 /*
2472 * Flush local tlb if required.
2473 * Do this now to overlap with other processors responding.
2474 */
2475 if (flush_self && cpu_datap(my_cpu)->cpu_tlb_invalid != FALSE)
2476 process_pmap_updates();
2477
2478 if (cpus_to_respond) {
2479
2480 deadline = mach_absolute_time() +
2481 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2482 boolean_t is_timeout_traced = FALSE;
2483
2484 /*
2485 * Wait for those other cpus to acknowledge
2486 */
2487 while (cpus_to_respond != 0) {
2488 long orig_acks = 0;
2489
2490 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2491 /* Consider checking local/global invalidity
2492 * as appropriate in the PCID case.
2493 */
2494 if ((cpus_to_respond & cpu_bit) != 0) {
2495 if (!cpu_datap(cpu)->cpu_running ||
2496 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
2497 !CPU_CR3_IS_ACTIVE(cpu)) {
2498 cpus_to_respond &= ~cpu_bit;
2499 }
2500 cpu_pause();
2501 }
2502 if (cpus_to_respond == 0)
2503 break;
2504 }
2505 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2506 if (machine_timeout_suspended())
2507 continue;
2508 if (TLBTimeOut == 0) {
2509 if (is_timeout_traced)
2510 continue;
2511 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2512 NULL, cpus_to_signal, cpus_to_respond, 0, 0);
2513 is_timeout_traced = TRUE;
2514 continue;
2515 }
2516 pmap_tlb_flush_timeout = TRUE;
2517 orig_acks = NMIPI_acks;
2518 mp_cpus_NMIPI(cpus_to_respond);
2519
2520 panic("TLB invalidation IPI timeout: "
2521 "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%llx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
2522 cpus_to_respond, orig_acks, NMIPI_acks);
2523 }
2524 }
2525 }
2526 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
2527 NULL, cpus_signaled, flush_self, 0, 0);
2528
2529 mp_enable_preemption();
2530 }
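/*
 * Editorial sketch (not in the original source): the delayed-flush pattern
 * implied by pmap_flush_context_init()/pmap_flush() and the
 * PMAP_DELAY_TLB_FLUSH option handled in pmap_flush_tlbs() below.
 * Shootdowns for several ranges are accumulated in the context and the
 * IPIs are issued once. The helper name is hypothetical; per the comment
 * on pmap_flush_tlbs(), the pmap is expected to be locked around those calls.
 */
#if 0 /* illustrative sketch only */
static void
pmap_flush_two_ranges_deferred(pmap_t map,
	vm_map_offset_t s0, vm_map_offset_t e0,
	vm_map_offset_t s1, vm_map_offset_t e1)
{
	pmap_flush_context pfc;

	pmap_flush_context_init(&pfc);
	/* Record which CPUs need invalidation; no IPIs are sent yet. */
	pmap_flush_tlbs(map, s0, e0, PMAP_DELAY_TLB_FLUSH, &pfc);
	pmap_flush_tlbs(map, s1, e1, PMAP_DELAY_TLB_FLUSH, &pfc);
	/* Signal the accumulated CPU set and wait for acknowledgement. */
	pmap_flush(&pfc);
}
#endif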
2531
2532
2533 static void
2534 invept(void *eptp)
2535 {
2536 struct {
2537 uint64_t eptp;
2538 uint64_t reserved;
2539 } __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};
2540
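	/*
	 * Editorial note (assumption, based on the Intel SDM rather than this
	 * file): INVEPT takes the invalidation type in a register -- here
	 * PMAP_INVEPT_SINGLE_CONTEXT via %rcx -- and a 16-byte descriptor in
	 * memory holding the target EPTP followed by a reserved quadword that
	 * must be zero.
	 */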
2541 __asm__ volatile("invept (%%rax), %%rcx"
2542 : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
2543 : "cc", "memory");
2544 }
2545
2546 /*
2547 * Called with pmap locked, we:
2548 * - scan through per-cpu data to see which other cpus need to flush
2549 * - send an IPI to each non-idle cpu to be flushed
2550 * - wait for all to signal back that they are inactive or we see that
2551 * they are at a safe point (idle).
2552 * - flush the local tlb if active for this pmap
2553 * - return ... the caller will unlock the pmap
2554 */
2555
2556 void
2557 pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2558 {
2559 unsigned int cpu;
2560 unsigned int cpu_bit;
2561 cpumask_t cpus_to_signal = 0;
2562 unsigned int my_cpu = cpu_number();
2563 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2564 boolean_t flush_self = FALSE;
2565 uint64_t deadline;
2566 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
2567 boolean_t need_global_flush = FALSE;
2568 uint32_t event_code;
2569 vm_map_offset_t event_startv, event_endv;
2570 boolean_t is_ept = is_ept_pmap(pmap);
2571
2572 assert((processor_avail_count < 2) ||
2573 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2574
2575 if (pmap == kernel_pmap) {
2576 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2577 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2578 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2579 } else if (is_ept) {
2580 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2581 event_startv = startv;
2582 event_endv = endv;
2583 } else {
2584 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2585 event_startv = startv;
2586 event_endv = endv;
2587 }
2588
2589 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
2590 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options, event_startv, event_endv, 0);
2591
2592 if (is_ept) {
2593 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2594 goto out;
2595 }
2596
2597 /*
2598 * Scan other cpus for matching active or task CR3.
2599 * For idle cpus (with no active map) we mark them invalid but
2600 * don't signal -- they'll check as they go busy.
2601 */
2602 if (pmap_pcid_ncpus) {
2603 if (pmap_is_shared)
2604 need_global_flush = TRUE;
2605 pmap_pcid_invalidate_all_cpus(pmap);
2606 mfence();
2607 }
2608 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2609 if (!cpu_datap(cpu)->cpu_running)
2610 continue;
2611 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2612 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2613
2614 if ((pmap_cr3 == cpu_task_cr3) ||
2615 (pmap_cr3 == cpu_active_cr3) ||
2616 (pmap_is_shared)) {
2617
2618 if (options & PMAP_DELAY_TLB_FLUSH) {
2619 if (need_global_flush == TRUE)
2620 pfc->pfc_invalid_global |= cpu_bit;
2621 pfc->pfc_cpus |= cpu_bit;
2622
2623 continue;
2624 }
2625 if (cpu == my_cpu) {
2626 flush_self = TRUE;
2627 continue;
2628 }
2629 if (need_global_flush == TRUE)
2630 cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
2631 else
2632 cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
2633 mfence();
2634
2635 /*
2636 * We don't need to signal processors which will flush
2637 * lazily at the idle state or kernel boundary.
2638 * For example, if we're invalidating the kernel pmap,
2639 * processors currently in userspace don't need to flush
2640 * their TLBs until the next time they enter the kernel.
2641 * Alterations to the address space of a task active
2642 * on a remote processor result in a signal, to
2643 * account for copy operations. (There may be room
2644 * for optimization in such cases).
2645 * The order of the loads below with respect
2646 * to the store to the "cpu_tlb_invalid" field above
2647 * is important--hence the barrier.
2648 */
2649 if (CPU_CR3_IS_ACTIVE(cpu) &&
2650 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
2651 pmap->pm_shared ||
2652 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
2653 cpus_to_signal |= cpu_bit;
2654 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2655 }
2656 }
2657 }
2658 if ((options & PMAP_DELAY_TLB_FLUSH))
2659 goto out;
2660
2661 /*
2662 * Flush local tlb if required.
2663 * Do this now to overlap with other processors responding.
2664 */
2665 if (flush_self) {
2666 if (pmap_pcid_ncpus) {
2667 pmap_pcid_validate_cpu(pmap, my_cpu);
2668 if (pmap_is_shared)
2669 tlb_flush_global();
2670 else
2671 flush_tlb_raw();
2672 }
2673 else
2674 flush_tlb_raw();
2675 }
2676
2677 if (cpus_to_signal) {
2678 cpumask_t cpus_to_respond = cpus_to_signal;
2679
2680 deadline = mach_absolute_time() +
2681 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
2682 boolean_t is_timeout_traced = FALSE;
2683
2684 /*
2685 * Wait for those other cpus to acknowledge
2686 */
2687 while (cpus_to_respond != 0) {
2688 long orig_acks = 0;
2689
2690 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2691 /* Consider checking local/global invalidity
2692 * as appropriate in the PCID case.
2693 */
2694 if ((cpus_to_respond & cpu_bit) != 0) {
2695 if (!cpu_datap(cpu)->cpu_running ||
2696 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
2697 !CPU_CR3_IS_ACTIVE(cpu)) {
2698 cpus_to_respond &= ~cpu_bit;
2699 }
2700 cpu_pause();
2701 }
2702 if (cpus_to_respond == 0)
2703 break;
2704 }
2705 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
2706 if (machine_timeout_suspended())
2707 continue;
2708 if (TLBTimeOut == 0) {
2709 /* emit a tracepoint but don't panic */
2710 if (is_timeout_traced)
2711 continue;
2712 PMAP_TRACE_CONSTANT(
2713 PMAP_CODE(PMAP__FLUSH_TLBS_TO),
2714 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal, cpus_to_respond, 0, 0);
2715 is_timeout_traced = TRUE;
2716 continue;
2717 }
2718 pmap_tlb_flush_timeout = TRUE;
2719 orig_acks = NMIPI_acks;
2720 mp_cpus_NMIPI(cpus_to_respond);
2721
2722 panic("TLB invalidation IPI timeout: "
2723 "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%llx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
2724 cpus_to_respond, orig_acks, NMIPI_acks);
2725 }
2726 }
2727 }
2728
2729 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
2730 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
2731 }
2732
2733 out:
2734 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
2735 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal, event_startv, event_endv, 0);
2736
2737 }
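/*
 * Editorial sketch (not in the original source): the immediate (non-delayed)
 * use of pmap_flush_tlbs() suggested by the comment above it -- PTEs are
 * modified with the pmap locked, the affected range is flushed, and the
 * caller then unlocks the pmap. The helper name and the specific PTE update
 * are hypothetical.
 */
#if 0 /* illustrative sketch only */
static void
pmap_clear_write_and_flush(pmap_t map, vm_map_offset_t sva, vm_map_offset_t eva,
	pt_entry_t *ptep)
{
	boolean_t is_ept = is_ept_pmap(map);

	PMAP_LOCK(map);
	pmap_store_pte(ptep, *ptep & ~PTE_WRITE(is_ept));
	pmap_flush_tlbs(map, sva, eva, 0, NULL);
	PMAP_UNLOCK(map);
}
#endif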
2738
2739 void
2740 process_pmap_updates(void)
2741 {
2742 int ccpu = cpu_number();
2743 pmap_assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
2744 if (pmap_pcid_ncpus) {
2745 pmap_pcid_validate_current();
2746 if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
2747 cpu_datap(ccpu)->cpu_tlb_invalid = FALSE;
2748 tlb_flush_global();
2749 }
2750 else {
2751 cpu_datap(ccpu)->cpu_tlb_invalid_local = FALSE;
2752 flush_tlb_raw();
2753 }
2754 }
2755 else {
2756 current_cpu_datap()->cpu_tlb_invalid = FALSE;
2757 flush_tlb_raw();
2758 }
2759
2760 mfence();
2761 }
2762
2763 void
2764 pmap_update_interrupt(void)
2765 {
2766 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
2767 0, 0, 0, 0, 0);
2768
2769 if (current_cpu_datap()->cpu_tlb_invalid)
2770 process_pmap_updates();
2771
2772 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
2773 0, 0, 0, 0, 0);
2774 }
2775
2776 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
2777 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
2778 * and identify ranges with mismatched VM permissions and PTE permissions
2779 */
2780 kern_return_t
2781 pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev) {
2782 vm_offset_t cv = sv;
2783 kern_return_t rv = KERN_SUCCESS;
2784 uint64_t skip4 = 0, skip2 = 0;
2785
2786 assert(!is_ept_pmap(ipmap));
2787
2788 sv &= ~PAGE_MASK_64;
2789 ev &= ~PAGE_MASK_64;
2790 while (cv < ev) {
2791 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
2792 (cv < 0xFFFF800000000000ULL))) {
2793 cv = 0xFFFF800000000000ULL;
2794 }
2795 /* Potential inconsistencies from not holding pmap lock
2796 * but harmless for the moment.
2797 */
2798 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
2799 if ((cv + NBPML4) > cv)
2800 cv += NBPML4;
2801 else
2802 break;
2803 skip4++;
2804 continue;
2805 }
2806 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
2807 if ((cv + NBPD) > cv)
2808 cv += NBPD;
2809 else
2810 break;
2811 skip2++;
2812 continue;
2813 }
2814
2815 pt_entry_t *ptep = pmap_pte(ipmap, cv);
2816 if (ptep && (*ptep & INTEL_PTE_VALID)) {
2817 if (*ptep & INTEL_PTE_WRITE) {
2818 if (!(*ptep & INTEL_PTE_NX)) {
2819 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap64_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
2820 rv = KERN_FAILURE;
2821 }
2822 }
2823 }
2824 cv += PAGE_SIZE;
2825 }
2826 kprintf("Completed pmap scan\n");
2827 cv = sv;
2828
2829 struct vm_region_submap_info_64 vbr;
2830 mach_msg_type_number_t vbrcount = 0;
2831 mach_vm_size_t vmsize;
2832 vm_prot_t prot;
2833 uint32_t nesting_depth = 0;
2834 kern_return_t kret;
2835
2836 while (cv < ev) {
2837
2838 for (;;) {
2839 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
2840 if((kret = mach_vm_region_recurse(ivmmap,
2841 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
2842 (vm_region_recurse_info_t)&vbr,
2843 &vbrcount)) != KERN_SUCCESS) {
2844 break;
2845 }
2846
2847 if(vbr.is_submap) {
2848 nesting_depth++;
2849 continue;
2850 } else {
2851 break;
2852 }
2853 }
2854
2855 if(kret != KERN_SUCCESS)
2856 break;
2857
2858 prot = vbr.protection;
2859
2860 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2861 kprintf("W+X map entry at address 0x%lx\n", cv);
2862 rv = KERN_FAILURE;
2863 }
2864
2865 if (prot) {
2866 vm_offset_t pcv;
2867 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
2868 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
2869 vm_prot_t tprot;
2870
2871 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID))
2872 continue;
2873 tprot = VM_PROT_READ;
2874 if (*ptep & INTEL_PTE_WRITE)
2875 tprot |= VM_PROT_WRITE;
2876 if ((*ptep & INTEL_PTE_NX) == 0)
2877 tprot |= VM_PROT_EXECUTE;
2878 if (tprot != prot) {
2879 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
2880 rv = KERN_FAILURE;
2881 }
2882 }
2883 }
2884 cv += vmsize;
2885 }
2886 return rv;
2887 }
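/*
 * Editorial sketch (not in the original source): a hypothetical debug-only
 * audit of the kernel address space for W+X mappings using
 * pmap_permissions_verify() above.
 */
#if 0 /* illustrative sketch only */
static void
pmap_audit_kernel_wx(void)
{
	if (pmap_permissions_verify(kernel_pmap, kernel_map,
	    VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS) != KERN_SUCCESS)
		kprintf("pmap_permissions_verify: W+X or mismatched mappings found\n");
}
#endif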
2888
2889 #if MACH_ASSERT
2890 extern int pmap_ledgers_panic;
2891 static void
2892 pmap_check_ledgers(
2893 pmap_t pmap)
2894 {
2895 ledger_amount_t bal;
2896 int pid;
2897 char *procname;
2898 boolean_t do_panic;
2899
2900 if (pmap->pmap_pid == 0) {
2901 /*
2902 * This pmap was not or is no longer fully associated
2903 * with a task (e.g. the old pmap after a fork()/exec() or
2904 * spawn()). Its "ledger" still points at a task that is
2905 * now using a different (and active) address space, so
2906 * we can't check that all the pmap ledgers are balanced here.
2907 *
2908 * If the "pid" is set, that means that we went through
2909 * pmap_set_process() in task_terminate_internal(), so
2910 * this task's ledger should not have been re-used and
2911 * all the pmap ledgers should be back to 0.
2912 */
2913 return;
2914 }
2915
2916 do_panic = FALSE;
2917 pid = pmap->pmap_pid;
2918 procname = pmap->pmap_procname;
2919
2920 pmap_ledgers_drift.num_pmaps_checked++;
2921
2922 ledger_get_balance(pmap->ledger,
2923 task_ledgers.phys_footprint,
2924 &bal);
2925 if (bal != 0) {
2926 do_panic = TRUE;
2927 printf("LEDGER BALANCE proc %d (%s) "
2928 "\"phys_footprint\" = %lld\n",
2929 pid, procname, bal);
2930 if (bal > 0) {
2931 pmap_ledgers_drift.phys_footprint_over++;
2932 pmap_ledgers_drift.phys_footprint_over_total += bal;
2933 if (bal > pmap_ledgers_drift.phys_footprint_over_max) {
2934 pmap_ledgers_drift.phys_footprint_over_max = bal;
2935 }
2936 } else {
2937 pmap_ledgers_drift.phys_footprint_under++;
2938 pmap_ledgers_drift.phys_footprint_under_total += bal;
2939 if (bal < pmap_ledgers_drift.phys_footprint_under_max) {
2940 pmap_ledgers_drift.phys_footprint_under_max = bal;
2941 }
2942 }
2943 }
2944 ledger_get_balance(pmap->ledger,
2945 task_ledgers.internal,
2946 &bal);
2947 if (bal != 0) {
2948 do_panic = TRUE;
2949 printf("LEDGER BALANCE proc %d (%s) "
2950 "\"internal\" = %lld\n",
2951 pid, procname, bal);
2952 if (bal > 0) {
2953 pmap_ledgers_drift.internal_over++;
2954 pmap_ledgers_drift.internal_over_total += bal;
2955 if (bal > pmap_ledgers_drift.internal_over_max) {
2956 pmap_ledgers_drift.internal_over_max = bal;
2957 }
2958 } else {
2959 pmap_ledgers_drift.internal_under++;
2960 pmap_ledgers_drift.internal_under_total += bal;
2961 if (bal < pmap_ledgers_drift.internal_under_max) {
2962 pmap_ledgers_drift.internal_under_max = bal;
2963 }
2964 }
2965 }
2966 ledger_get_balance(pmap->ledger,
2967 task_ledgers.internal_compressed,
2968 &bal);
2969 if (bal != 0) {
2970 do_panic = TRUE;
2971 printf("LEDGER BALANCE proc %d (%s) "
2972 "\"internal_compressed\" = %lld\n",
2973 pid, procname, bal);
2974 if (bal > 0) {
2975 pmap_ledgers_drift.internal_compressed_over++;
2976 pmap_ledgers_drift.internal_compressed_over_total += bal;
2977 if (bal > pmap_ledgers_drift.internal_compressed_over_max) {
2978 pmap_ledgers_drift.internal_compressed_over_max = bal;
2979 }
2980 } else {
2981 pmap_ledgers_drift.internal_compressed_under++;
2982 pmap_ledgers_drift.internal_compressed_under_total += bal;
2983 if (bal < pmap_ledgers_drift.internal_compressed_under_max) {
2984 pmap_ledgers_drift.internal_compressed_under_max = bal;
2985 }
2986 }
2987 }
2988 ledger_get_balance(pmap->ledger,
2989 task_ledgers.iokit_mapped,
2990 &bal);
2991 if (bal != 0) {
2992 do_panic = TRUE;
2993 printf("LEDGER BALANCE proc %d (%s) "
2994 "\"iokit_mapped\" = %lld\n",
2995 pid, procname, bal);
2996 if (bal > 0) {
2997 pmap_ledgers_drift.iokit_mapped_over++;
2998 pmap_ledgers_drift.iokit_mapped_over_total += bal;
2999 if (bal > pmap_ledgers_drift.iokit_mapped_over_max) {
3000 pmap_ledgers_drift.iokit_mapped_over_max = bal;
3001 }
3002 } else {
3003 pmap_ledgers_drift.iokit_mapped_under++;
3004 pmap_ledgers_drift.iokit_mapped_under_total += bal;
3005 if (bal < pmap_ledgers_drift.iokit_mapped_under_max) {
3006 pmap_ledgers_drift.iokit_mapped_under_max = bal;
3007 }
3008 }
3009 }
3010 ledger_get_balance(pmap->ledger,
3011 task_ledgers.alternate_accounting,
3012 &bal);
3013 if (bal != 0) {
3014 do_panic = TRUE;
3015 printf("LEDGER BALANCE proc %d (%s) "
3016 "\"alternate_accounting\" = %lld\n",
3017 pid, procname, bal);
3018 if (bal > 0) {
3019 pmap_ledgers_drift.alternate_accounting_over++;
3020 pmap_ledgers_drift.alternate_accounting_over_total += bal;
3021 if (bal > pmap_ledgers_drift.alternate_accounting_over_max) {
3022 pmap_ledgers_drift.alternate_accounting_over_max = bal;
3023 }
3024 } else {
3025 pmap_ledgers_drift.alternate_accounting_under++;
3026 pmap_ledgers_drift.alternate_accounting_under_total += bal;
3027 if (bal < pmap_ledgers_drift.alternate_accounting_under_max) {
3028 pmap_ledgers_drift.alternate_accounting_under_max = bal;
3029 }
3030 }
3031 }
3032 ledger_get_balance(pmap->ledger,
3033 task_ledgers.alternate_accounting_compressed,
3034 &bal);
3035 if (bal != 0) {
3036 do_panic = TRUE;
3037 printf("LEDGER BALANCE proc %d (%s) "
3038 "\"alternate_accounting_compressed\" = %lld\n",
3039 pid, procname, bal);
3040 if (bal > 0) {
3041 pmap_ledgers_drift.alternate_accounting_compressed_over++;
3042 pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal;
3043 if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) {
3044 pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal;
3045 }
3046 } else {
3047 pmap_ledgers_drift.alternate_accounting_compressed_under++;
3048 pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal;
3049 if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) {
3050 pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal;
3051 }
3052 }
3053 }
3054 ledger_get_balance(pmap->ledger,
3055 task_ledgers.page_table,
3056 &bal);
3057 if (bal != 0) {
3058 do_panic = TRUE;
3059 printf("LEDGER BALANCE proc %d (%s) "
3060 "\"page_table\" = %lld\n",
3061 pid, procname, bal);
3062 if (bal > 0) {
3063 pmap_ledgers_drift.page_table_over++;
3064 pmap_ledgers_drift.page_table_over_total += bal;
3065 if (bal > pmap_ledgers_drift.page_table_over_max) {
3066 pmap_ledgers_drift.page_table_over_max = bal;
3067 }
3068 } else {
3069 pmap_ledgers_drift.page_table_under++;
3070 pmap_ledgers_drift.page_table_under_total += bal;
3071 if (bal < pmap_ledgers_drift.page_table_under_max) {
3072 pmap_ledgers_drift.page_table_under_max = bal;
3073 }
3074 }
3075 }
3076 ledger_get_balance(pmap->ledger,
3077 task_ledgers.purgeable_volatile,
3078 &bal);
3079 if (bal != 0) {
3080 do_panic = TRUE;
3081 printf("LEDGER BALANCE proc %d (%s) "
3082 "\"purgeable_volatile\" = %lld\n",
3083 pid, procname, bal);
3084 if (bal > 0) {
3085 pmap_ledgers_drift.purgeable_volatile_over++;
3086 pmap_ledgers_drift.purgeable_volatile_over_total += bal;
3087 if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) {
3088 pmap_ledgers_drift.purgeable_volatile_over_max = bal;
3089 }
3090 } else {
3091 pmap_ledgers_drift.purgeable_volatile_under++;
3092 pmap_ledgers_drift.purgeable_volatile_under_total += bal;
3093 if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) {
3094 pmap_ledgers_drift.purgeable_volatile_under_max = bal;
3095 }
3096 }
3097 }
3098 ledger_get_balance(pmap->ledger,
3099 task_ledgers.purgeable_nonvolatile,
3100 &bal);
3101 if (bal != 0) {
3102 do_panic = TRUE;
3103 printf("LEDGER BALANCE proc %d (%s) "
3104 "\"purgeable_nonvolatile\" = %lld\n",
3105 pid, procname, bal);
3106 if (bal > 0) {
3107 pmap_ledgers_drift.purgeable_nonvolatile_over++;
3108 pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal;
3109 if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) {
3110 pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal;
3111 }
3112 } else {
3113 pmap_ledgers_drift.purgeable_nonvolatile_under++;
3114 pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal;
3115 if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) {
3116 pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal;
3117 }
3118 }
3119 }
3120 ledger_get_balance(pmap->ledger,
3121 task_ledgers.purgeable_volatile_compressed,
3122 &bal);
3123 if (bal != 0) {
3124 do_panic = TRUE;
3125 printf("LEDGER BALANCE proc %d (%s) "
3126 "\"purgeable_volatile_compressed\" = %lld\n",
3127 pid, procname, bal);
3128 if (bal > 0) {
3129 pmap_ledgers_drift.purgeable_volatile_compressed_over++;
3130 pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal;
3131 if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) {
3132 pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal;
3133 }
3134 } else {
3135 pmap_ledgers_drift.purgeable_volatile_compressed_under++;
3136 pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal;
3137 if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) {
3138 pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal;
3139 }
3140 }
3141 }
3142 ledger_get_balance(pmap->ledger,
3143 task_ledgers.purgeable_nonvolatile_compressed,
3144 &bal);
3145 if (bal != 0) {
3146 do_panic = TRUE;
3147 printf("LEDGER BALANCE proc %d (%s) "
3148 "\"purgeable_nonvolatile_compressed\" = %lld\n",
3149 pid, procname, bal);
3150 if (bal > 0) {
3151 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++;
3152 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal;
3153 if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) {
3154 pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal;
3155 }
3156 } else {
3157 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++;
3158 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal;
3159 if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) {
3160 pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal;
3161 }
3162 }
3163 }
3164
3165 if (do_panic) {
3166 if (pmap_ledgers_panic) {
3167 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
3168 pmap, pid, procname);
3169 } else {
3170 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
3171 pmap, pid, procname);
3172 }
3173 }
3174
3175 if (pmap->stats.resident_count != 0 ||
3176 pmap->stats.wired_count != 0 ||
3177 pmap->stats.device != 0 ||
3178 pmap->stats.internal != 0 ||
3179 pmap->stats.external != 0 ||
3180 pmap->stats.reusable != 0 ||
3181 pmap->stats.compressed != 0) {
3182 if (pmap_stats_assert) {
3183 panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3184 pmap, pid, procname,
3185 pmap->stats.resident_count,
3186 pmap->stats.wired_count,
3187 pmap->stats.device,
3188 pmap->stats.internal,
3189 pmap->stats.external,
3190 pmap->stats.reusable,
3191 pmap->stats.compressed);
3192 } else {
3193 printf("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
3194 pmap, pid, procname,
3195 pmap->stats.resident_count,
3196 pmap->stats.wired_count,
3197 pmap->stats.device,
3198 pmap->stats.internal,
3199 pmap->stats.external,
3200 pmap->stats.reusable,
3201 pmap->stats.compressed);
3202 }
3203 }
3204 }
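/*
 * Editorial sketch (not in the original source): the per-ledger blocks in
 * pmap_check_ledgers() above are identical apart from the ledger name; a
 * hypothetical helper macro could express each check in one line, e.g.
 * PMAP_CHECK_LEDGER(phys_footprint).
 */
#if 0 /* illustrative sketch only */
#define PMAP_CHECK_LEDGER(__L)						\
do {									\
	ledger_get_balance(pmap->ledger, task_ledgers.__L, &bal);	\
	if (bal != 0) {							\
		do_panic = TRUE;					\
		printf("LEDGER BALANCE proc %d (%s) \"" #__L "\" = %lld\n", \
		    pid, procname, bal);				\
		if (bal > 0) {						\
			pmap_ledgers_drift.__L ## _over++;		\
			pmap_ledgers_drift.__L ## _over_total += bal;	\
			if (bal > pmap_ledgers_drift.__L ## _over_max)	\
				pmap_ledgers_drift.__L ## _over_max = bal; \
		} else {						\
			pmap_ledgers_drift.__L ## _under++;		\
			pmap_ledgers_drift.__L ## _under_total += bal;	\
			if (bal < pmap_ledgers_drift.__L ## _under_max)	\
				pmap_ledgers_drift.__L ## _under_max = bal; \
		}							\
	}								\
} while (0)
#endif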
3205
3206 void
3207 pmap_set_process(
3208 pmap_t pmap,
3209 int pid,
3210 char *procname)
3211 {
3212 if (pmap == NULL)
3213 return;
3214
3215 pmap->pmap_pid = pid;
3216 strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname));
3217 }
3218 #endif /* MACH_ASSERT */
3219
3220
3221 #if DEVELOPMENT || DEBUG
3222 int pmap_pagezero_mitigation = 1;
3223 #endif
3224
3225 void pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound) {
3226 #if DEVELOPMENT || DEBUG
3227 if (pmap_pagezero_mitigation == 0) {
3228 lpmap->pagezero_accessible = FALSE;
3229 return;
3230 }
3231 #endif
3232 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3233 if (lpmap == current_pmap()) {
3234 mp_disable_preemption();
3235 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3236 mp_enable_preemption();
3237 }
3238 }
3239
3240 void pmap_verify_noncacheable(uintptr_t vaddr) {
3241 pt_entry_t *ptep = NULL;
3242 ptep = pmap_pte(kernel_pmap, vaddr);
3243 if (ptep == NULL) {
3244 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3245 }
3246 /* Non-cacheable OK */
3247 if (*ptep & (INTEL_PTE_NCACHE))
3248 return;
3249 /* Write-combined OK */
3250 if (*ptep & (INTEL_PTE_PTA))
3251 return;
3252 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3253 }