1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82  * In order to cope with hardware architectures which
83  * make virtual-to-physical map invalidations expensive,
84  * this module may delay invalidation or protection-reduction
85  * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <norma_vm.h>
93 #include <mach_kdb.h>
94 #include <mach_ldebug.h>
95
96 #include <libkern/OSAtomic.h>
97
98 #include <mach/machine/vm_types.h>
99
100 #include <mach/boolean.h>
101 #include <kern/thread.h>
102 #include <kern/zalloc.h>
103 #include <kern/queue.h>
104
105 #include <kern/lock.h>
106 #include <kern/kalloc.h>
107 #include <kern/spl.h>
108
109 #include <vm/pmap.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
116
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
122
123 #include <i386/cpuid.h>
124 #include <i386/cpu_data.h>
125 #include <i386/cpu_number.h>
126 #include <i386/machine_cpu.h>
127 #include <i386/seg.h>
128 #include <i386/serial_io.h>
129 #include <i386/cpu_capabilities.h>
130 #include <i386/machine_routines.h>
131 #include <i386/proc_reg.h>
132 #include <i386/tsc.h>
133 #include <i386/acpi.h>
134 #include <i386/pmap_internal.h>
135
136 #if MACH_KDB
137 #include <ddb/db_command.h>
138 #include <ddb/db_output.h>
139 #include <ddb/db_sym.h>
140 #include <ddb/db_print.h>
141 #endif /* MACH_KDB */
142
143 #include <vm/vm_protos.h>
144
145 #include <i386/mp.h>
146 #include <i386/mp_desc.h>
147 #include <i386/i386_lowmem.h>
148
149
150 /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151 #ifdef DEBUGINTERRUPTS
152 #define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
153 #else
154 #define pmap_intr_assert()
155 #endif
156
157 #ifdef IWANTTODEBUG
158 #undef DEBUG
159 #define DEBUG 1
160 #define POSTCODE_DELAY 1
161 #include <i386/postcode.h>
162 #endif /* IWANTTODEBUG */
163
164 /*
165 * Forward declarations for internal functions.
166 */
167
168 void pmap_remove_range(
169 pmap_t pmap,
170 vm_map_offset_t va,
171 pt_entry_t *spte,
172 pt_entry_t *epte);
173
174 void phys_attribute_clear(
175 ppnum_t phys,
176 int bits);
177
178 int phys_attribute_test(
179 ppnum_t phys,
180 int bits);
181
182 void phys_attribute_set(
183 ppnum_t phys,
184 int bits);
185
186 void pmap_set_reference(
187 ppnum_t pn);
188
189 boolean_t phys_page_exists(
190 ppnum_t pn);
191
192
193 #ifdef PMAP_DEBUG
194 void dump_pmap(pmap_t);
195 void dump_4GB_pdpt(pmap_t p);
196 void dump_4GB_pdpt_thread(thread_t tp);
197 #endif
198
199 int nx_enabled = 1; /* enable no-execute protection */
200 #ifdef CONFIG_EMBEDDED
201 int allow_data_exec = 0; /* no exec from data, embedded is hardcore like that */
202 #else
203 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
204 #endif
205 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
206
207 boolean_t cpu_64bit = FALSE;
208 boolean_t pmap_trace = FALSE;
209
210 /*
211 * when spinning through pmap_remove
212 * ensure that we don't spend too much
213 * time with preemption disabled.
214 * I'm setting the current threshold
215 * to 20us
216 */
217 #define MAX_PREEMPTION_LATENCY_NS 20000
218
219 uint64_t max_preemption_latency_tsc = 0;
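
/*
 * Illustrative sketch (not part of the build) of how the 20us budget above
 * is applied: pmap_init() converts MAX_PREEMPTION_LATENCY_NS into TSC ticks
 * via tmrCvt(..., tscFCvtn2t), and a long-running loop such as pmap_remove()
 * can then compare TSC deltas against max_preemption_latency_tsc and briefly
 * drop its lock to allow preemption, roughly:
 *
 *	uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;
 *	while ( ...more mappings to remove... ) {
 *		...remove one chunk...
 *		if (rdtsc64() > deadline) {
 *			PMAP_UNLOCK(pmap);	// window for preemption
 *			PMAP_LOCK(pmap);
 *			deadline = rdtsc64() + max_preemption_latency_tsc;
 *		}
 *	}
 */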
220
221
222 /*
223 * Private data structures.
224 */
225
226 /*
227 * For each vm_page_t, there is a list of all currently
228 * valid virtual mappings of that page. An entry is
229 * a pv_rooted_entry_t; the list is the pv_table.
230 *
231 * N.B. with the new combo rooted/hashed scheme it is
232  * only possible to remove individual non-rooted entries
233 * if they are found via the hashed chains as there is no
234 * way to unlink the singly linked hashed entries if navigated to
235 * via the queue list off the rooted entries. Think of it as
236 * hash/walk/pull, keeping track of the prev pointer while walking
237 * the singly linked hash list. All of this is to save memory and
238 * keep both types of pv_entries as small as possible.
239 */
240
241 /*
242
243 PV HASHING Changes - JK 1/2007
244
245 Pve's establish physical to virtual mappings. These are used for aliasing of a
246 physical page to (potentially many) virtual addresses within pmaps. In the previous
247 implementation the structure of the pv_entries (each 16 bytes in size) was
248
249 typedef struct pv_entry {
250 struct pv_entry *next;
251 pmap_t pmap;
252 vm_map_offset_t va;
253 } *pv_entry_t;
254
255 An initial array of these is created at boot time, one per physical page of memory,
256 indexed by the physical page number. Additionally, a pool of entries is created from a
257 pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
258 Originally, we kept this pool around because the code in pmap_enter() was unable to
259 block if it needed an entry and none were available - we'd panic. Some time ago I
260 restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
261 a pv structure and restart, removing a panic from the code (in the case of the kernel
262 pmap we cannot block and still panic, so we keep a separate hot pool for use only on
263 kernel pmaps). The pool has not been removed since there is a large performance gain
264 keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
265
266 As pmap_enter() created new mappings it linked the new pve's for them off the fixed
267 pv array for that ppn (off the next pointer). These pve's are accessed for several
268 operations, one of them being address space teardown. In that case, we basically do this
269
270 for (every page/pte in the space) {
271 calc pve_ptr from the ppn in the pte
272 for (every pv in the list for the ppn) {
273 if (this pv is for this pmap/vaddr) {
274 do housekeeping
275 unlink/free the pv
276 }
277 }
278 }
279
280 The problem arose when we were running, say 8000 (or even 2000) apache or other processes
281 and one or all terminate. The list hanging off each pv array entry could have thousands of
282 entries. We were continuously linearly searching each of these lists as we stepped through
283 the address space we were tearing down. Because of the locks we hold, likely taking a cache
284 the address space we were tearing down. Because of the locks we hold, the likely cache
285 miss for each node, and the interrupt disabling needed for MP safety, the system became completely
286
287 Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
288 for operations like pmap_page_protect and finding and modifying/removing a single pve as
289 part of pmap_enter processing) has led to modifying the pve structures and databases.
290
291 There are now two types of pve structures. A "rooted" structure which is basically the
292 original structure accessed in an array by ppn, and a "hashed" structure accessed on a
293 hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
294 minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
295 pages in the system are not aliased and hence represented by a single pv entry I've kept
296 the rooted entry size as small as possible because there is one of these dedicated for
297 every physical page of memory. The hashed pve's are larger due to the addition of the hash
298 link and the ppn entry needed for matching while running the hash list to find the entry we
299 are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
300 will pay the extra memory price. Both structures have the same first three fields allowing
301 some simplification in the code.
302
303 They have these shapes
304
305 typedef struct pv_rooted_entry {
306 queue_head_t qlink;
307 vm_map_offset_t va;
308 pmap_t pmap;
309 } *pv_rooted_entry_t;
310
311
312 typedef struct pv_hashed_entry {
313 queue_head_t qlink;
314 vm_map_offset_t va;
315 pmap_t pmap;
316 ppnum_t ppn;
317 struct pv_hashed_entry *nexth;
318 } *pv_hashed_entry_t;
319
320 The main flow difference is that the code is now aware of the rooted entry and the hashed
321 entries. Code that runs the pv list still starts with the rooted entry and then continues
322 down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
323 checks the rooted entry and then hashes and runs the hash list for the match. The hash list
324 lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
325
326 */
327
328 typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */
329 queue_head_t qlink;
330 vm_map_offset_t va; /* virtual address for mapping */
331 pmap_t pmap; /* pmap where mapping lies */
332 } *pv_rooted_entry_t;
333
334 #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
335
336 pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
337
338 typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */
339 queue_head_t qlink;
340 vm_map_offset_t va;
341 pmap_t pmap;
342 ppnum_t ppn;
343 struct pv_hashed_entry *nexth;
344 } *pv_hashed_entry_t;
345
346 #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
347
348 #define NPVHASH 4095 /* MUST BE 2^N - 1 */
349 pv_hashed_entry_t *pv_hash_table; /* hash lists */
350
351 uint32_t npvhash = 0;
352
353 /* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
354 #ifdef PV_DEBUG
355 #define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
356 #else
357 #define CHK_NPVHASH()
358 #endif
359
360 /*
361 * pv_list entries are kept on a list that can only be accessed
362 * with the pmap system locked (at SPLVM, not in the cpus_active set).
363 * The list is refilled from the pv_hashed_list_zone if it becomes empty.
364 */
365 pv_rooted_entry_t pv_free_list = PV_ROOTED_ENTRY_NULL; /* free list at SPLVM */
366 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
367 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
368 decl_simple_lock_data(,pv_hashed_free_list_lock)
369 decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
370 decl_simple_lock_data(,pv_hash_table_lock)
371
372 int pv_free_count = 0;
373 int pv_hashed_free_count = 0;
374 int pv_kern_free_count = 0;
375 int pv_hashed_kern_free_count = 0;
376 #define PV_HASHED_LOW_WATER_MARK 5000
377 #define PV_HASHED_KERN_LOW_WATER_MARK 100
378 #define PV_HASHED_ALLOC_CHUNK 2000
379 #define PV_HASHED_KERN_ALLOC_CHUNK 50
380 thread_call_t mapping_adjust_call;
381 static thread_call_data_t mapping_adjust_call_data;
382 uint32_t mappingrecurse = 0;
383
384 #define PV_HASHED_ALLOC(pvh_e) { \
385 simple_lock(&pv_hashed_free_list_lock); \
386 if ((pvh_e = pv_hashed_free_list) != 0) { \
387 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
388 pv_hashed_free_count--; \
389 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
390 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
391 thread_call_enter(mapping_adjust_call); \
392 } \
393 simple_unlock(&pv_hashed_free_list_lock); \
394 }
395
396 #define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
397 simple_lock(&pv_hashed_free_list_lock); \
398 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
399 pv_hashed_free_list = pvh_eh; \
400 pv_hashed_free_count += pv_cnt; \
401 simple_unlock(&pv_hashed_free_list_lock); \
402 }
403
404 #define PV_HASHED_KERN_ALLOC(pvh_e) { \
405 simple_lock(&pv_hashed_kern_free_list_lock); \
406 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
407 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
408 pv_hashed_kern_free_count--; \
409 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
410 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
411 thread_call_enter(mapping_adjust_call); \
412 } \
413 simple_unlock(&pv_hashed_kern_free_list_lock); \
414 }
415
416 #define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
417 simple_lock(&pv_hashed_kern_free_list_lock); \
418 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
419 pv_hashed_kern_free_list = pvh_eh; \
420 pv_hashed_kern_free_count += pv_cnt; \
421 simple_unlock(&pv_hashed_kern_free_list_lock); \
422 }
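
/*
 * Illustrative sketch (not part of the build): typical use of the allocation
 * macros above. A mapping path such as pmap_enter() grabs one entry, falling
 * back to the pv_hashed_list_zone (declared just below) if the free list is
 * empty, while teardown paths collect freed entries into a local chain
 * (head pvh_eh, tail pvh_et, count pv_cnt) and return it in one locked call:
 *
 *	pv_hashed_entry_t pvh_e;
 *
 *	PV_HASHED_ALLOC(pvh_e);
 *	if (PV_HASHED_ENTRY_NULL == pvh_e)
 *		pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *	...link pvh_e onto the rooted queue and its hash chain...
 *
 *	...later, after unlinking a batch of entries...
 *	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
 */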
423
424 zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
425
426 static zone_t pdpt_zone;
427
428 /*
429 * Each entry in the pv_head_table is locked by a bit in the
430 * pv_lock_table. The lock bits are accessed by the physical
431 * address of the page they lock.
432 */
433
434 char *pv_lock_table; /* pointer to array of bits */
435 #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
436
437 char *pv_hash_lock_table;
438 #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
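
/*
 * Sizing example: one lock bit per entry, rounded up to whole bytes. For 1GB
 * of physical memory (262,144 4K pages) pv_lock_table_size() yields
 * 262144/8 = 32KB; with the default NPVHASH of 4095 the hash lock table is
 * (4095+1)/8 = 512 bytes.
 */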
439
440 /*
441 * First and last physical addresses that we maintain any information
442 * for. Initialized to zero so that pmap operations done before
443 * pmap_init won't touch any non-existent structures.
444 */
445 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
446
447 static struct vm_object kptobj_object_store;
448 static vm_object_t kptobj;
449
450 /*
451 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
452 */
453
454 #define pa_index(pa) (i386_btop(pa))
455 #define ppn_to_pai(ppn) ((int)ppn)
456
457 #define pai_to_pvh(pai) (&pv_head_table[pai])
458 #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
459 #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
460
461 #define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
462 #define pvhash(idx) (&pv_hash_table[idx])
463
464 #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
465 #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
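
/*
 * Illustrative sketch (not part of the build): looking up a specific
 * [pmap, va] mapping on the hash chains with the macros above, as described
 * in the PV HASHING notes earlier in this file:
 *
 *	int pvhash_idx = pvhashidx(pmap, va);
 *	pv_hashed_entry_t pvh_e;
 *
 *	CHK_NPVHASH();
 *	lock_hash_hash(pvhash_idx);
 *	for (pvh_e = *pvhash(pvhash_idx);
 *	     pvh_e != PV_HASHED_ENTRY_NULL;
 *	     pvh_e = pvh_e->nexth) {
 *		if (pvh_e->pmap == pmap && pvh_e->va == va)
 *			break;			// found the hashed entry
 *	}
 *	unlock_hash_hash(pvhash_idx);
 */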
466
467 /*
468  * Array of physical page attributes for managed pages.
469 * One byte per physical page.
470 */
471 char *pmap_phys_attributes;
472 unsigned int last_managed_page = 0;
473
474 /*
475 * Physical page attributes. Copy bits from PTE definition.
476 */
477 #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
478 #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
479 #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
480
481 /*
482 * Amount of virtual memory mapped by one
483 * page-directory entry.
484 */
485 #define PDE_MAPPED_SIZE (pdetova(1))
486 uint64_t pde_mapped_size;
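
/*
 * Worked example: with PAE (this pmap always runs PAE; see the "PAE enabled"
 * message in pmap_bootstrap()), a page table page holds 512 8-byte entries,
 * so pdetova(1) == 512 * 4096 == 2MB. The 64-bit path in pmap_bootstrap()
 * later sets pde_mapped_size to the same 512*4096.
 */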
487
488 /*
489 * Locking and TLB invalidation
490 */
491
492 /*
493 * Locking Protocols: (changed 2/2007 JK)
494 *
495 * There are two structures in the pmap module that need locking:
496 * the pmaps themselves, and the per-page pv_lists (which are locked
497 * by locking the pv_lock_table entry that corresponds to the pv_head
498 * for the list in question.) Most routines want to lock a pmap and
499 * then do operations in it that require pv_list locking -- however
500 * pmap_remove_all and pmap_copy_on_write operate on a physical page
501 * basis and want to do the locking in the reverse order, i.e. lock
502 * a pv_list and then go through all the pmaps referenced by that list.
503 *
504 * The system wide pmap lock has been removed. Now, paths take a lock
505 * on the pmap before changing its 'shape' and the reverse order lockers
506 * (coming in by phys ppn) take a lock on the corresponding pv and then
507 * retest to be sure nothing changed during the window before they locked
508 * and can then run up/down the pv lists holding the list lock. This also
509 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
510  * lets the pmap layer run (nearly completely) with interrupts enabled, unlike
511 */
512
513
514 /*
515 * PV locking
516 */
517
518 #define LOCK_PVH(index) { \
519 mp_disable_preemption(); \
520 lock_pvh_pai(index); \
521 }
522
523 #define UNLOCK_PVH(index) { \
524 unlock_pvh_pai(index); \
525 mp_enable_preemption(); \
526 }
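
/*
 * Illustrative sketch (not part of the build): the reverse-order retest
 * pattern described in the Locking Protocols notes above. A physical-page
 * path (e.g. pmap_page_protect) locks the pv list first, then re-checks each
 * mapping before acting on it, since the pmap's shape could have changed in
 * the window before the pv lock was taken:
 *
 *	LOCK_PVH(pai);
 *	for ( ...each pv entry pv_e on the list for pai... ) {
 *		pt_entry_t *pte = pmap_pte(pv_e->pmap, pv_e->va);
 *		if (pte == PT_ENTRY_NULL ||
 *		    (*pte & PG_FRAME) != (pmap_paddr_t)i386_ptob(pai))
 *			continue;		// mapping changed in the window
 *		...modify or remove the mapping...
 *	}
 *	UNLOCK_PVH(pai);
 */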
527
528 /*
529 * PV hash locking
530 */
531
532 #define LOCK_PV_HASH(hash) lock_hash_hash(hash)
533
534 #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
535
536 #if USLOCK_DEBUG
537 extern int max_lock_loops;
538 #define LOOP_VAR \
539 unsigned int loop_count; \
540 loop_count = disable_serial_output ? max_lock_loops \
541 : max_lock_loops*100
542 #define LOOP_CHECK(msg, pmap) \
543 if (--loop_count == 0) { \
544 mp_disable_preemption(); \
545 kprintf("%s: cpu %d pmap %x\n", \
546 msg, cpu_number(), pmap); \
547 Debugger("deadlock detection"); \
548 mp_enable_preemption(); \
549 loop_count = max_lock_loops; \
550 }
551 #else /* USLOCK_DEBUG */
552 #define LOOP_VAR
553 #define LOOP_CHECK(msg, pmap)
554 #endif /* USLOCK_DEBUG */
555
556 unsigned pmap_memory_region_count;
557 unsigned pmap_memory_region_current;
558
559 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
560
561 /*
562 * Other useful macros.
563 */
564 #define current_pmap() (vm_map_pmap(current_thread()->map))
565
566 struct pmap kernel_pmap_store;
567 pmap_t kernel_pmap;
568
569 pd_entry_t high_shared_pde;
570 pd_entry_t commpage64_pde;
571
572 struct zone *pmap_zone; /* zone of pmap structures */
573
574 int pmap_debug = 0; /* flag for debugging prints */
575
576 unsigned int inuse_ptepages_count = 0;
577
578 addr64_t kernel64_cr3;
579 boolean_t no_shared_cr3 = FALSE; /* -no_shared_cr3 boot arg */
580
581
582 /*
583 * Pmap cache. Cache is threaded through ref_count field of pmap.
584 * Max will eventually be constant -- variable for experimentation.
585 */
586 int pmap_cache_max = 32;
587 int pmap_alloc_chunk = 8;
588 pmap_t pmap_cache_list;
589 int pmap_cache_count;
590 decl_simple_lock_data(,pmap_cache_lock)
591
592 extern char end;
593
594 static int nkpt;
595
596 pt_entry_t *DMAP1, *DMAP2;
597 caddr_t DADDR1;
598 caddr_t DADDR2;
599
600 static inline
601 void pmap_pvh_unlink(pv_hashed_entry_t pv);
602
603 /*
604  * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain,
605  * properly dealing with the anchor.
606  * Must be called with the hash locked; it does not unlock it.
607 */
608
609 static inline
610 void pmap_pvh_unlink(pv_hashed_entry_t pvh)
611 {
612 pv_hashed_entry_t curh;
613 pv_hashed_entry_t *pprevh;
614 int pvhash_idx;
615
616 CHK_NPVHASH();
617 pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
618
619 pprevh = pvhash(pvhash_idx);
620
621 #if PV_DEBUG
622 if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */
623 #endif
624 curh = *pprevh;
625
626 while (PV_HASHED_ENTRY_NULL != curh) {
627 if (pvh == curh)
628 break;
629 pprevh = &curh->nexth;
630 curh = curh->nexth;
631 }
632 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
633 *pprevh = pvh->nexth;
634 return;
635 }
636
637 /*
638  * For legacy (32-bit) pmaps, returns the address of the pde entry.
639  * For 64-bit pmaps, causes the pdpt page containing the pde entry to be mapped,
640  * then returns the mapped address of the pde entry in that page.
641 */
642 pd_entry_t *
643 pmap_pde(pmap_t m, vm_map_offset_t v)
644 {
645 pd_entry_t *pde;
646 if (!cpu_64bit || (m == kernel_pmap)) {
647 pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
648 } else {
649 assert(m);
650 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
651 pde = pmap64_pde(m, v);
652 }
653 return pde;
654 }
655
656
657 /*
658  * The single pml4 page per pmap is allocated at pmap create time and exists
659  * for the duration of the pmap. We allocate this page in kernel vm (to save us one
660  * level of dynamic page table mapping).
661  * This returns the address of the requested pml4 entry in the top-level page.
662 */
663 static inline
664 pml4_entry_t *
665 pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
666 {
667 return ((pml4_entry_t *)pmap->pm_hold + ((vm_offset_t)((vaddr>>PML4SHIFT)&(NPML4PG-1))));
668 }
669
670 /*
671 * maps in the pml4 page, if any, containing the pdpt entry requested
672 * and returns the address of the pdpt entry in that mapped page
673 */
674 pdpt_entry_t *
675 pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
676 {
677 pml4_entry_t newpf;
678 pml4_entry_t *pml4;
679 int i;
680
681 assert(pmap);
682 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
683 if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
684 return(0);
685 }
686
687 pml4 = pmap64_pml4(pmap, vaddr);
688
689 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
690
691 newpf = *pml4 & PG_FRAME;
692
693
694 for (i=PMAP_PDPT_FIRST_WINDOW; i < PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS; i++) {
695 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
696 return((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
697 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
698 }
699 }
700
701 current_cpu_datap()->cpu_pmap->pdpt_window_index++;
702 if (current_cpu_datap()->cpu_pmap->pdpt_window_index > (PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS-1))
703 current_cpu_datap()->cpu_pmap->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
704 pmap_store_pte(
705 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CMAP),
706 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
707 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR));
708 return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR) +
709 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
710 }
711
712 return (NULL);
713 }
714
715 /*
716 * maps in the pdpt page, if any, containing the pde entry requested
717 * and returns the address of the pde entry in that mapped page
718 */
719 pd_entry_t *
720 pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
721 {
722 pdpt_entry_t newpf;
723 pdpt_entry_t *pdpt;
724 int i;
725
726 assert(pmap);
727 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
728 if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
729 return(0);
730 }
731
732 /* if (vaddr & (1ULL << 63)) panic("neg addr");*/
733 pdpt = pmap64_pdpt(pmap, vaddr);
734
735 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
736
737 newpf = *pdpt & PG_FRAME;
738
739 for (i=PMAP_PDE_FIRST_WINDOW; i < PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS; i++) {
740 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
741 return((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
742 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
743 }
744 }
745
746 current_cpu_datap()->cpu_pmap->pde_window_index++;
747 if (current_cpu_datap()->cpu_pmap->pde_window_index > (PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS-1))
748 current_cpu_datap()->cpu_pmap->pde_window_index = PMAP_PDE_FIRST_WINDOW;
749 pmap_store_pte(
750 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CMAP),
751 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
752 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR));
753 return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR) +
754 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
755 }
756
757 return (NULL);
758 }
759
760 /*
761 * Because the page tables (top 3 levels) are mapped into per cpu windows,
762 * callers must either disable interrupts or disable preemption before calling
763 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
764 * is in one of those mapped windows and that cannot be allowed to change until
765 * the caller is done using the returned pte pointer. When done, the caller
766  * restores interrupts or preemption to its previous state, after which point the
767  * vaddr for the returned pte can no longer be used.
768 */
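
/*
 * Illustrative sketch (not part of the build): the calling convention
 * described above, using preemption disabling around a pte lookup:
 *
 *	pt_entry_t *ptep;
 *	pt_entry_t pte = 0;
 *
 *	mp_disable_preemption();
 *	ptep = pmap_pte(pmap, vaddr);
 *	if (ptep != PT_ENTRY_NULL)
 *		pte = *ptep;			// use it while still pinned
 *	mp_enable_preemption();
 *	// ptep must not be dereferenced past this point
 */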
769
770
771 /*
772 * return address of mapped pte for vaddr va in pmap pmap.
773 * must be called with pre-emption or interrupts disabled
774 * if targeted pmap is not the kernel pmap
775 * since we may be passing back a virtual address that is
776 * associated with this cpu... pre-emption or interrupts
777 * must remain disabled until the caller is done using
778  * the pointer that was passed back.
779  *
780  * Maps in the pde page, if any, containing the pte, and returns
781  * the address of the pte in that mapped page.
782 */
783 pt_entry_t *
784 pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
785 {
786 pd_entry_t *pde;
787 pd_entry_t newpf;
788 int i;
789
790 assert(pmap);
791 pde = pmap_pde(pmap,vaddr);
792
793 if (pde && ((*pde & INTEL_PTE_VALID))) {
794 if (*pde & INTEL_PTE_PS)
795 return pde;
796 if (pmap == kernel_pmap)
797 return (vtopte(vaddr)); /* compat kernel still has pte's mapped */
798 #if TESTING
799 if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
800 panic("pmap_pte: unsafe call");
801 #endif
802 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
803
804 newpf = *pde & PG_FRAME;
805
806 for (i=PMAP_PTE_FIRST_WINDOW; i < PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS; i++) {
807 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
808 return((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
809 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
810 }
811 }
812
813 current_cpu_datap()->cpu_pmap->pte_window_index++;
814 if (current_cpu_datap()->cpu_pmap->pte_window_index > (PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS-1))
815 current_cpu_datap()->cpu_pmap->pte_window_index = PMAP_PTE_FIRST_WINDOW;
816 pmap_store_pte(
817 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CMAP),
818 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
819 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR));
820 return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR) +
821 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
822 }
823
824 return(NULL);
825 }
826
827
828 /*
829 * Map memory at initialization. The physical addresses being
830 * mapped are not managed and are never unmapped.
831 *
832 * For now, VM is already on, we only need to map the
833 * specified memory.
834 */
835 vm_offset_t
836 pmap_map(
837 vm_offset_t virt,
838 vm_map_offset_t start_addr,
839 vm_map_offset_t end_addr,
840 vm_prot_t prot,
841 unsigned int flags)
842 {
843 int ps;
844
845 ps = PAGE_SIZE;
846 while (start_addr < end_addr) {
847 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
848 (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
849 virt += ps;
850 start_addr += ps;
851 }
852 return(virt);
853 }
854
855 /*
856 * Back-door routine for mapping kernel VM at initialization.
857 * Useful for mapping memory outside the range
858 * Sets no-cache, A, D.
859 * Otherwise like pmap_map.
860 */
861 vm_offset_t
862 pmap_map_bd(
863 vm_offset_t virt,
864 vm_map_offset_t start_addr,
865 vm_map_offset_t end_addr,
866 vm_prot_t prot,
867 unsigned int flags)
868 {
869 pt_entry_t template;
870 pt_entry_t *pte;
871 spl_t spl;
872
873 template = pa_to_pte(start_addr)
874 | INTEL_PTE_REF
875 | INTEL_PTE_MOD
876 | INTEL_PTE_WIRED
877 | INTEL_PTE_VALID;
878
879 if(flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
880 template |= INTEL_PTE_NCACHE;
881 if(!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
882 template |= INTEL_PTE_PTA;
883 }
884
885 if (prot & VM_PROT_WRITE)
886 template |= INTEL_PTE_WRITE;
887
888
889 while (start_addr < end_addr) {
890 spl = splhigh();
891 pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
892 if (pte == PT_ENTRY_NULL) {
893 panic("pmap_map_bd: Invalid kernel address\n");
894 }
895 pmap_store_pte(pte, template);
896 splx(spl);
897 pte_increment_pa(template);
898 virt += PAGE_SIZE;
899 start_addr += PAGE_SIZE;
900 }
901
902
903 flush_tlb();
904 return(virt);
905 }
906
907 extern char *first_avail;
908 extern vm_offset_t virtual_avail, virtual_end;
909 extern pmap_paddr_t avail_start, avail_end;
910
911 void
912 pmap_cpu_init(void)
913 {
914 /*
915 * Here early in the life of a processor (from cpu_mode_init()).
916 * If we're not in 64-bit mode, enable the global TLB feature.
917 * Note: regardless of mode we continue to set the global attribute
918 * bit in ptes for all (32-bit) global pages such as the commpage.
919 */
920 if (!cpu_64bit) {
921 set_cr4(get_cr4() | CR4_PGE);
922 }
923
924 /*
925 * Initialize the per-cpu, TLB-related fields.
926 */
927 current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
928 current_cpu_datap()->cpu_tlb_invalid = FALSE;
929 }
930
931 vm_offset_t
932 pmap_high_shared_remap(enum high_fixed_addresses e, vm_offset_t va, int sz)
933 {
934 vm_offset_t ve = pmap_index_to_virt(e);
935 pt_entry_t *ptep;
936 pmap_paddr_t pa;
937 int i;
938 spl_t s;
939
940 assert(0 == (va & PAGE_MASK)); /* expecting page aligned */
941 s = splhigh();
942 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ve);
943
944 for (i=0; i< sz; i++) {
945 pa = (pmap_paddr_t) kvtophys(va);
946 pmap_store_pte(ptep, (pa & PG_FRAME)
947 | INTEL_PTE_VALID
948 | INTEL_PTE_GLOBAL
949 | INTEL_PTE_RW
950 | INTEL_PTE_REF
951 | INTEL_PTE_MOD);
952 va+= PAGE_SIZE;
953 ptep++;
954 }
955 splx(s);
956 return ve;
957 }
958
959 vm_offset_t
960 pmap_cpu_high_shared_remap(int cpu, enum high_cpu_types e, vm_offset_t va, int sz)
961 {
962 enum high_fixed_addresses a = e + HIGH_CPU_END * cpu;
963 return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN + a, va, sz);
964 }
965
966 void pmap_init_high_shared(void);
967
968 extern vm_offset_t gdtptr, idtptr;
969
970 extern uint32_t low_intstack;
971
972 extern struct fake_descriptor ldt_desc_pattern;
973 extern struct fake_descriptor tss_desc_pattern;
974
975 extern char hi_remap_text, hi_remap_etext;
976 extern char t_zero_div;
977
978 pt_entry_t *pte_unique_base;
979
980 void
981 pmap_init_high_shared(void)
982 {
983
984 vm_offset_t haddr;
985 spl_t s;
986 #if MACH_KDB
987 struct i386_tss *ttss;
988 #endif
989
990 cpu_desc_index_t * cdi = &cpu_data_master.cpu_desc_index;
991
992 kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
993 HIGH_MEM_BASE,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
994 s = splhigh();
995 pte_unique_base = pmap_pte(kernel_pmap, (vm_map_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
996 splx(s);
997
998 if (i386_btop(&hi_remap_etext - &hi_remap_text + 1) >
999 HIGH_FIXED_TRAMPS_END - HIGH_FIXED_TRAMPS + 1)
1000 panic("tramps too large");
1001 haddr = pmap_high_shared_remap(HIGH_FIXED_TRAMPS,
1002 (vm_offset_t) &hi_remap_text, 3);
1003 kprintf("tramp: 0x%x, ",haddr);
1004 /* map gdt up high and update ptr for reload */
1005 haddr = pmap_high_shared_remap(HIGH_FIXED_GDT,
1006 (vm_offset_t) master_gdt, 1);
1007 cdi->cdi_gdt.ptr = (void *)haddr;
1008 kprintf("GDT: 0x%x, ",haddr);
1009 /* map ldt up high */
1010 haddr = pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN,
1011 (vm_offset_t) master_ldt,
1012 HIGH_FIXED_LDT_END - HIGH_FIXED_LDT_BEGIN + 1);
1013 cdi->cdi_ldt = (struct fake_descriptor *)haddr;
1014 kprintf("LDT: 0x%x, ",haddr);
1015 /* put new ldt addr into gdt */
1016 struct fake_descriptor temp_fake_desc;
1017 temp_fake_desc = ldt_desc_pattern;
1018 temp_fake_desc.offset = (vm_offset_t) haddr;
1019 fix_desc(&temp_fake_desc, 1);
1020
1021 *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_LDT)] = temp_fake_desc;
1022 *(struct fake_descriptor *) &master_gdt[sel_idx(USER_LDT)] = temp_fake_desc;
1023
1024 /* map idt up high */
1025 haddr = pmap_high_shared_remap(HIGH_FIXED_IDT,
1026 (vm_offset_t) master_idt, 1);
1027 cdi->cdi_idt.ptr = (void *)haddr;
1028 kprintf("IDT: 0x%x, ", haddr);
1029 /* remap ktss up high and put new high addr into gdt */
1030 haddr = pmap_high_shared_remap(HIGH_FIXED_KTSS,
1031 (vm_offset_t) &master_ktss, 1);
1032
1033 temp_fake_desc = tss_desc_pattern;
1034 temp_fake_desc.offset = (vm_offset_t) haddr;
1035 fix_desc(&temp_fake_desc, 1);
1036 *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = temp_fake_desc;
1037 kprintf("KTSS: 0x%x, ",haddr);
1038 #if MACH_KDB
1039 /* remap dbtss up high and put new high addr into gdt */
1040 haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS,
1041 (vm_offset_t) &master_dbtss, 1);
1042 temp_fake_desc = tss_desc_pattern;
1043 temp_fake_desc.offset = (vm_offset_t) haddr;
1044 fix_desc(&temp_fake_desc, 1);
1045 *(struct fake_descriptor *)&master_gdt[sel_idx(DEBUG_TSS)] = temp_fake_desc;
1046 ttss = (struct i386_tss *)haddr;
1047 kprintf("DBTSS: 0x%x, ",haddr);
1048 #endif /* MACH_KDB */
1049
1050 /* remap dftss up high and put new high addr into gdt */
1051 haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1052 (vm_offset_t) &master_dftss, 1);
1053 temp_fake_desc = tss_desc_pattern;
1054 temp_fake_desc.offset = (vm_offset_t) haddr;
1055 fix_desc(&temp_fake_desc, 1);
1056 *(struct fake_descriptor *) &master_gdt[sel_idx(DF_TSS)] = temp_fake_desc;
1057 kprintf("DFTSS: 0x%x\n",haddr);
1058
1059 /* remap mctss up high and put new high addr into gdt */
1060 haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1061 (vm_offset_t) &master_mctss, 1);
1062 temp_fake_desc = tss_desc_pattern;
1063 temp_fake_desc.offset = (vm_offset_t) haddr;
1064 fix_desc(&temp_fake_desc, 1);
1065 *(struct fake_descriptor *) &master_gdt[sel_idx(MC_TSS)] = temp_fake_desc;
1066 kprintf("MCTSS: 0x%x\n",haddr);
1067
1068 cpu_desc_load(&cpu_data_master);
1069 }
1070
1071
1072 /*
1073 * Bootstrap the system enough to run with virtual memory.
1074 * Map the kernel's code and data, and allocate the system page table.
1075 * Called with mapping OFF. Page_size must already be set.
1076 */
1077
1078 void
1079 pmap_bootstrap(
1080 __unused vm_offset_t load_start,
1081 boolean_t IA32e)
1082 {
1083 vm_offset_t va;
1084 pt_entry_t *pte;
1085 int i;
1086 pdpt_entry_t *pdpt;
1087 spl_t s;
1088
1089 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
1090 * known to VM */
1091 /*
1092 * The kernel's pmap is statically allocated so we don't
1093 * have to use pmap_create, which is unlikely to work
1094 * correctly at this part of the boot sequence.
1095 */
1096
1097
1098 kernel_pmap = &kernel_pmap_store;
1099 kernel_pmap->ref_count = 1;
1100 kernel_pmap->nx_enabled = FALSE;
1101 kernel_pmap->pm_task_map = TASK_MAP_32BIT;
1102 kernel_pmap->pm_obj = (vm_object_t) NULL;
1103 kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
1104 kernel_pmap->pdirbase = (pmap_paddr_t)((int)IdlePTD);
1105 pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE );
1106 kernel_pmap->pm_pdpt = pdpt;
1107 kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePDPT);
1108
1109
1110 va = (vm_offset_t)kernel_pmap->dirbase;
1111 /* setup self referential mapping(s) */
1112 for (i = 0; i< NPGPTD; i++, pdpt++) {
1113 pmap_paddr_t pa;
1114 pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
1115 pmap_store_pte(
1116 (pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i),
1117 (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
1118 INTEL_PTE_MOD | INTEL_PTE_WIRED) ;
1119 pmap_store_pte(pdpt, pa | INTEL_PTE_VALID);
1120 }
1121
1122 cpu_64bit = IA32e;
1123
1124 lo_kernel_cr3 = kernel_pmap->pm_cr3;
1125 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1126
1127 /* save the value we stuff into created pmaps to share the gdts etc */
1128 high_shared_pde = *pmap_pde(kernel_pmap, HIGH_MEM_BASE);
1129 /* make sure G bit is on for high shared pde entry */
1130 high_shared_pde |= INTEL_PTE_GLOBAL;
1131 s = splhigh();
1132 pmap_store_pte(pmap_pde(kernel_pmap, HIGH_MEM_BASE), high_shared_pde);
1133 splx(s);
1134
1135 nkpt = NKPT;
1136 OSAddAtomic(NKPT, &inuse_ptepages_count);
1137
1138 virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
1139 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1140
1141 /*
1142 * Reserve some special page table entries/VA space for temporary
1143 * mapping of pages.
1144 */
1145 #define SYSMAP(c, p, v, n) \
1146 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
1147
1148 va = virtual_avail;
1149 pte = vtopte(va);
1150
1151 for (i=0; i<PMAP_NWINDOWS; i++) {
1152 SYSMAP(caddr_t,
1153 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1154 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1155 1);
1156 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1157 }
1158
1159 /* DMAP use for debugger */
1160 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1161 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
1162
1163 virtual_avail = va;
1164
1165 if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
1166 if (0 != ((npvhash+1) & npvhash)) {
1167 kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash,NPVHASH);
1168 npvhash = NPVHASH;
1169 }
1170 } else {
1171 npvhash = NPVHASH;
1172 }
1173 printf("npvhash=%d\n",npvhash);
1174
1175 simple_lock_init(&kernel_pmap->lock, 0);
1176 simple_lock_init(&pv_hashed_free_list_lock, 0);
1177 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1178 simple_lock_init(&pv_hash_table_lock,0);
1179
1180 pmap_init_high_shared();
1181
1182 pde_mapped_size = PDE_MAPPED_SIZE;
1183
1184 if (cpu_64bit) {
1185 pdpt_entry_t *ppdpt = IdlePDPT;
1186 pdpt_entry_t *ppdpt64 = (pdpt_entry_t *)IdlePDPT64;
1187 pdpt_entry_t *ppml4 = (pdpt_entry_t *)IdlePML4;
1188 int istate = ml_set_interrupts_enabled(FALSE);
1189
1190 /*
1191 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1192 * with page bits set for the correct IA-32e operation and so that
1193 * the legacy-mode IdlePDPT is retained for slave processor start-up.
1194 * This is necessary due to the incompatible use of page bits between
1195 * 64-bit and legacy modes.
1196 */
1197 kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePML4); /* setup in start.s for us */
1198 kernel_pmap->pm_pml4 = IdlePML4;
1199 kernel_pmap->pm_pdpt = (pd_entry_t *)
1200 ((unsigned int)IdlePDPT64 | KERNBASE );
1201 #define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1202 pmap_store_pte(kernel_pmap->pm_pml4,
1203 (uint32_t)IdlePDPT64 | PAGE_BITS);
1204 pmap_store_pte((ppdpt64+0), *(ppdpt+0) | PAGE_BITS);
1205 pmap_store_pte((ppdpt64+1), *(ppdpt+1) | PAGE_BITS);
1206 pmap_store_pte((ppdpt64+2), *(ppdpt+2) | PAGE_BITS);
1207 pmap_store_pte((ppdpt64+3), *(ppdpt+3) | PAGE_BITS);
1208
1209 /*
1210  * The kernel is also mapped in the uber-space, the 4GB region starting at
1211  * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1212 */
1213 pmap_store_pte((ppml4+KERNEL_UBER_PML4_INDEX), *(ppml4+0));
1214
1215 kernel64_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1216
1217 /* Re-initialize descriptors and prepare to switch modes */
1218 cpu_desc_init64(&cpu_data_master);
1219 current_cpu_datap()->cpu_is64bit = TRUE;
1220 current_cpu_datap()->cpu_active_cr3 = kernel64_cr3;
1221
1222 pde_mapped_size = 512*4096 ;
1223
1224 ml_set_interrupts_enabled(istate);
1225 }
1226
1227 /* Sets 64-bit mode if required. */
1228 cpu_mode_init(&cpu_data_master);
1229 /* Update in-kernel CPUID information if we're now in 64-bit mode */
1230 if (IA32e)
1231 cpuid_set_info();
1232
1233 kernel_pmap->pm_hold = (vm_offset_t)kernel_pmap->pm_pml4;
1234
1235 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1236 VADDR(KPTDI,0), virtual_end);
1237 printf("PAE enabled\n");
1238 if (cpu_64bit){
1239 printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1240
1241 kprintf("Available physical space from 0x%llx to 0x%llx\n",
1242 avail_start, avail_end);
1243
1244 /*
1245 * By default for 64-bit users loaded at 4GB, share kernel mapping.
1246 * But this may be overridden by the -no_shared_cr3 boot-arg.
1247 */
1248 if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3, sizeof (no_shared_cr3))) {
1249 kprintf("Shared kernel address space disabled\n");
1250 }
1251
1252 #ifdef PMAP_TRACES
1253 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
1254 kprintf("Kernel traces for pmap operations enabled\n");
1255 }
1256 #endif /* PMAP_TRACES */
1257 }
1258
1259 void
1260 pmap_virtual_space(
1261 vm_offset_t *startp,
1262 vm_offset_t *endp)
1263 {
1264 *startp = virtual_avail;
1265 *endp = virtual_end;
1266 }
1267
1268 /*
1269 * Initialize the pmap module.
1270 * Called by vm_init, to initialize any structures that the pmap
1271 * system needs to map virtual memory.
1272 */
1273 void
1274 pmap_init(void)
1275 {
1276 register long npages;
1277 vm_offset_t addr;
1278 register vm_size_t s;
1279 vm_map_offset_t vaddr;
1280 ppnum_t ppn;
1281
1282 /*
1283 * Allocate memory for the pv_head_table and its lock bits,
1284 * the modify bit array, and the pte_page table.
1285 */
1286
1287 /*
1288 * zero bias all these arrays now instead of off avail_start
1289 * so we cover all memory
1290 */
1291
1292 npages = (long)i386_btop(avail_end);
1293 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1294 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
1295 + pv_lock_table_size(npages)
1296 + pv_hash_lock_table_size((npvhash+1))
1297 + npages);
1298
1299 s = round_page(s);
1300 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
1301 KMA_KOBJECT | KMA_PERMANENT)
1302 != KERN_SUCCESS)
1303 panic("pmap_init");
1304
1305 memset((char *)addr, 0, s);
1306
1307 #if PV_DEBUG
1308 if (0 == npvhash) panic("npvhash not initialized");
1309 #endif
1310
1311 /*
1312 * Allocate the structures first to preserve word-alignment.
1313 */
1314 pv_head_table = (pv_rooted_entry_t) addr;
1315 addr = (vm_offset_t) (pv_head_table + npages);
1316
1317 pv_hash_table = (pv_hashed_entry_t *)addr;
1318 addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1319
1320 pv_lock_table = (char *) addr;
1321 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1322
1323 pv_hash_lock_table = (char *) addr;
1324 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1325
1326 pmap_phys_attributes = (char *) addr;
1327 {
1328 unsigned int i;
1329 unsigned int pn;
1330 ppnum_t last_pn;
1331 pmap_memory_region_t *pmptr = pmap_memory_regions;
1332
1333 last_pn = (ppnum_t)i386_btop(avail_end);
1334
1335 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1336 if (pmptr->type == kEfiConventionalMemory) {
1337
1338 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1339 if (pn < last_pn) {
1340 pmap_phys_attributes[pn] |= PHYS_MANAGED;
1341
1342 if (pn > last_managed_page)
1343 last_managed_page = pn;
1344 }
1345 }
1346 }
1347 }
1348 }
1349
1350 /*
1351 * Create the zone of physical maps,
1352 * and of the physical-to-virtual entries.
1353 */
1354 s = (vm_size_t) sizeof(struct pmap);
1355 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
1356 s = (vm_size_t) sizeof(struct pv_hashed_entry);
1357 pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
1358 s = 63;
1359 pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
1360
1361 kptobj = &kptobj_object_store;
1362 _vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG), kptobj);
1363 kernel_pmap->pm_obj = kptobj;
1364
1365 /* create pv entries for kernel pages mapped by low level
1366 startup code. these have to exist so we can pmap_remove()
1367 e.g. kext pages from the middle of our addr space */
1368
1369 vaddr = (vm_map_offset_t)0;
1370 for (ppn = 0; ppn < i386_btop(avail_start) ; ppn++ ) {
1371 pv_rooted_entry_t pv_e;
1372
1373 pv_e = pai_to_pvh(ppn);
1374 pv_e->va = vaddr;
1375 vaddr += PAGE_SIZE;
1376 pv_e->pmap = kernel_pmap;
1377 queue_init(&pv_e->qlink);
1378 }
1379
1380 pmap_initialized = TRUE;
1381
1382 /*
1383 * Initialize pmap cache.
1384 */
1385 pmap_cache_list = PMAP_NULL;
1386 pmap_cache_count = 0;
1387 simple_lock_init(&pmap_cache_lock, 0);
1388
1389 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1390
1391 }
1392
1393
1394 #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1395
1396 /*
1397  * this function is only used for debugging from the vm layer
1398 */
1399 boolean_t
1400 pmap_verify_free(
1401 ppnum_t pn)
1402 {
1403 pv_rooted_entry_t pv_h;
1404 int pai;
1405 boolean_t result;
1406
1407 assert(pn != vm_page_fictitious_addr);
1408
1409 if (!pmap_initialized)
1410 return(TRUE);
1411
1412 if (pn == vm_page_guard_addr)
1413 return TRUE;
1414
1415 pai = ppn_to_pai(pn);
1416 if (!managed_page(pai))
1417 return(FALSE);
1418 pv_h = pai_to_pvh(pn);
1419 result = (pv_h->pmap == PMAP_NULL);
1420 return(result);
1421 }
1422
1423 boolean_t
1424 pmap_is_empty(
1425 pmap_t pmap,
1426 vm_map_offset_t va_start,
1427 vm_map_offset_t va_end)
1428 {
1429 vm_map_offset_t offset;
1430 ppnum_t phys_page;
1431
1432 if (pmap == PMAP_NULL) {
1433 return TRUE;
1434 }
1435
1436 /*
1437 * Check the resident page count
1438 * - if it's zero, the pmap is completely empty.
1439 * This short-circuit test prevents a virtual address scan which is
1440 * painfully slow for 64-bit spaces.
1441  * This assumes the count is correct;
1442  * the debug kernel ought perhaps to verify this with a page table walk.
1443 */
1444 if (pmap->stats.resident_count == 0)
1445 return TRUE;
1446
1447 for (offset = va_start;
1448 offset < va_end;
1449 offset += PAGE_SIZE_64) {
1450 phys_page = pmap_find_phys(pmap, offset);
1451 if (phys_page) {
1452 if (pmap != kernel_pmap &&
1453 pmap->pm_task_map == TASK_MAP_32BIT &&
1454 offset >= HIGH_MEM_BASE) {
1455 /*
1456 * The "high_shared_pde" is used to share
1457 * the entire top-most 2MB of address space
1458 * between the kernel and all 32-bit tasks.
1459 * So none of this can be removed from 32-bit
1460 * tasks.
1461 * Let's pretend there's nothing up
1462 * there...
1463 */
1464 return TRUE;
1465 }
1466 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1467 "page %d at 0x%llx\n",
1468 pmap, va_start, va_end, phys_page, offset);
1469 return FALSE;
1470 }
1471 }
1472
1473 return TRUE;
1474 }
1475
1476
1477 /*
1478 * Create and return a physical map.
1479 *
1480 * If the size specified for the map
1481 * is zero, the map is an actual physical
1482 * map, and may be referenced by the
1483 * hardware.
1484 *
1485 * If the size specified is non-zero,
1486 * the map will be used in software only, and
1487 * is bounded by that size.
1488 */
1489 pmap_t
1490 pmap_create(
1491 vm_map_size_t sz,
1492 boolean_t is_64bit)
1493 {
1494 pmap_t p;
1495 int i;
1496 vm_offset_t va;
1497 vm_size_t size;
1498 pdpt_entry_t *pdpt;
1499 pml4_entry_t *pml4p;
1500 pd_entry_t *pdp;
1501 int template;
1502 spl_t s;
1503
1504 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1505 (int) (sz>>32), (int) sz, (int) is_64bit, 0, 0);
1506
1507 size = (vm_size_t) sz;
1508
1509 /*
1510 * A software use-only map doesn't even need a map.
1511 */
1512
1513 if (size != 0) {
1514 return(PMAP_NULL);
1515 }
1516
1517 p = (pmap_t) zalloc(pmap_zone);
1518 if (PMAP_NULL == p)
1519 panic("pmap_create zalloc");
1520
1521 /* init counts now since we'll be bumping some */
1522 simple_lock_init(&p->lock, 0);
1523 p->stats.resident_count = 0;
1524 p->stats.resident_max = 0;
1525 p->stats.wired_count = 0;
1526 p->ref_count = 1;
1527 p->nx_enabled = 1;
1528 p->pm_shared = FALSE;
1529
1530 assert(!is_64bit || cpu_64bit);
1531 p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;
1532
1533 if (!cpu_64bit) {
1534 /* legacy 32 bit setup */
1535 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1536 * entry covers 1GB of addr space */
1537 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
1538 panic("pmap_create kmem_alloc_kobject");
1539 p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
1540 if ((vm_offset_t)NULL == p->pm_hold) {
1541 panic("pdpt zalloc");
1542 }
1543 pdpt = (pdpt_entry_t *) (( p->pm_hold + 31) & ~31);
1544 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)pdpt);
1545 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG))))
1546 panic("pmap_create vm_object_allocate");
1547
1548 memset((char *)p->dirbase, 0, NBPTD);
1549
1550 va = (vm_offset_t)p->dirbase;
1551 p->pdirbase = kvtophys(va);
1552
1553 template = cpu_64bit ? INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID;
1554 for (i = 0; i< NPGPTD; i++, pdpt++ ) {
1555 pmap_paddr_t pa;
1556 pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
1557 pmap_store_pte(pdpt, pa | template);
1558 }
1559
1560 /* map the high shared pde */
1561 s = splhigh();
1562 pmap_store_pte(pmap_pde(p, HIGH_MEM_BASE), high_shared_pde);
1563 splx(s);
1564
1565 } else {
1566 /* 64 bit setup */
1567
1568 /* alloc the pml4 page in kernel vm */
1569 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_hold), PAGE_SIZE))
1570 panic("pmap_create kmem_alloc_kobject pml4");
1571
1572 memset((char *)p->pm_hold, 0, PAGE_SIZE);
1573 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold);
1574
1575 OSAddAtomic(1, &inuse_ptepages_count);
1576
1577 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1578
1579 if (NULL == (p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS))))
1580 panic("pmap_create pml4 obj");
1581
1582 if (NULL == (p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS))))
1583 panic("pmap_create pdpt obj");
1584
1585 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS))))
1586 panic("pmap_create pte obj");
1587
1588 /* uber space points to uber mapped kernel */
1589 s = splhigh();
1590 pml4p = pmap64_pml4(p, 0ULL);
1591 pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4);
1592
1593
1594 if (!is_64bit) {
1595 while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) {
1596 splx(s);
1597 pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another pde entry */
1598 s = splhigh();
1599 }
1600 pmap_store_pte(pdp, high_shared_pde);
1601 }
1602 splx(s);
1603 }
1604
1605 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1606 (int) p, is_64bit, 0, 0, 0);
1607
1608 return(p);
1609 }
1610
1611 /*
1612  * The following routines implement the shared address optimization for 64-bit
1613 * users with a 4GB page zero.
1614 *
1615 * pmap_set_4GB_pagezero()
1616 * is called in the exec and fork paths to mirror the kernel's
1617 * mapping in the bottom 4G of the user's pmap. The task mapping changes
1618 * from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1619 * without doing anything if the -no_shared_cr3 boot-arg is set.
1620 *
1621 * pmap_clear_4GB_pagezero()
1622 * is called in the exec/exit paths to undo this mirror. The task mapping
1623 * reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1624 * CR3 by calling pmap_load_kernel_cr3().
1625 *
1626 * pmap_load_kernel_cr3()
1627 * loads cr3 with the kernel's page table. In addition to being called
1628 * by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1629 * when we go idle in the context of a shared map.
1630 *
1631 * Further notes on per-cpu data used:
1632 *
1633 * cpu_kernel_cr3 is the cr3 for the kernel's pmap.
1634 * This is loaded in a trampoline on entering the kernel
1635 * from a 32-bit user (or non-shared-cr3 64-bit user).
1636 * cpu_task_cr3 is the cr3 for the current thread.
1637 * This is loaded in a trampoline as we exit the kernel.
1638 * cpu_active_cr3 reflects the cr3 currently loaded.
1639 * However, the low order bit is set when the
1640 * processor is idle or interrupts are disabled
1641 * while the system pmap lock is held. It is used by
1642 * tlb shoot-down.
1643 * cpu_task_map indicates whether the task cr3 belongs to
1644 * a 32-bit, a 64-bit or a 64-bit shared map.
1645 * The latter allows the avoidance of the cr3 load
1646 * on kernel entry and exit.
1647 * cpu_tlb_invalid set TRUE when a tlb flush is requested.
1648 * If the cr3 is "inactive" (the cpu is idle or the
1649  * system-wide pmap lock is held) this is not serviced by
1650  * an IPI but at the time when the cr3 becomes "active".
1651 */
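
/*
 * Illustrative sketch (not part of the build): how a deferred flush noted in
 * cpu_tlb_invalid (see above) is picked up once a cr3 becomes "active" again
 * rather than by IPI; compare pmap_load_kernel_cr3() below, which maintains
 * the same per-cpu fields for the kernel cr3:
 *
 *	if (current_cpu_datap()->cpu_tlb_invalid) {
 *		current_cpu_datap()->cpu_tlb_invalid = FALSE;
 *		flush_tlb();			// reload cr3, flushing the TLB
 *	}
 */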
1652
1653 void
1654 pmap_set_4GB_pagezero(pmap_t p)
1655 {
1656 pdpt_entry_t *user_pdptp;
1657 pdpt_entry_t *kern_pdptp;
1658
1659 assert(p->pm_task_map != TASK_MAP_32BIT);
1660
1661 /* Kernel-shared cr3 may be disabled by boot arg. */
1662 if (no_shared_cr3)
1663 return;
1664
1665 /*
1666 * Set the bottom 4 3rd-level pte's to be the kernel's.
1667 */
1668 PMAP_LOCK(p);
1669 while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
1670 PMAP_UNLOCK(p);
1671 pmap_expand_pml4(p, 0x0);
1672 PMAP_LOCK(p);
1673 }
1674 kern_pdptp = kernel_pmap->pm_pdpt;
1675 pmap_store_pte(user_pdptp+0, *(kern_pdptp+0));
1676 pmap_store_pte(user_pdptp+1, *(kern_pdptp+1));
1677 pmap_store_pte(user_pdptp+2, *(kern_pdptp+2));
1678 pmap_store_pte(user_pdptp+3, *(kern_pdptp+3));
1679 p->pm_task_map = TASK_MAP_64BIT_SHARED;
1680 PMAP_UNLOCK(p);
1681 }
1682
1683 void
1684 pmap_clear_4GB_pagezero(pmap_t p)
1685 {
1686 pdpt_entry_t *user_pdptp;
1687
1688 if (p->pm_task_map != TASK_MAP_64BIT_SHARED)
1689 return;
1690
1691 PMAP_LOCK(p);
1692
1693 p->pm_task_map = TASK_MAP_64BIT;
1694
1695 pmap_load_kernel_cr3();
1696
1697 user_pdptp = pmap64_pdpt(p, 0x0);
1698 pmap_store_pte(user_pdptp+0, 0);
1699 pmap_store_pte(user_pdptp+1, 0);
1700 pmap_store_pte(user_pdptp+2, 0);
1701 pmap_store_pte(user_pdptp+3, 0);
1702
1703 PMAP_UNLOCK(p);
1704 }
1705
1706 void
1707 pmap_load_kernel_cr3(void)
1708 {
1709 uint64_t kernel_cr3;
1710
1711 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1712
1713 /*
1714 * Reload cr3 with the true kernel cr3.
1715 */
1716 kernel_cr3 = current_cpu_datap()->cpu_kernel_cr3;
1717 set64_cr3(kernel_cr3);
1718 current_cpu_datap()->cpu_active_cr3 = kernel_cr3;
1719 current_cpu_datap()->cpu_tlb_invalid = FALSE;
1720 __asm__ volatile("mfence");
1721 }
1722
1723 /*
1724 * Retire the given physical map from service.
1725 * Should only be called if the map contains
1726 * no valid mappings.
1727 */
1728
1729 void
1730 pmap_destroy(
1731 register pmap_t p)
1732 {
1733 register int c;
1734
1735 if (p == PMAP_NULL)
1736 return;
1737
1738 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1739 (int) p, 0, 0, 0, 0);
1740
1741 PMAP_LOCK(p);
1742
1743 c = --p->ref_count;
1744
1745 if (c == 0) {
1746 /*
1747 * If some cpu is not using the physical pmap pointer that it
1748 * is supposed to be (see set_dirbase), we might be using the
1749 * pmap that is being destroyed! Make sure we are
1750 * physically on the right pmap:
1751 */
1752 PMAP_UPDATE_TLBS(p,
1753 0x0ULL,
1754 0xFFFFFFFFFFFFF000ULL);
1755 }
1756
1757 PMAP_UNLOCK(p);
1758
1759 if (c != 0) {
1760 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1761 (int) p, 1, 0, 0, 0);
1762 return; /* still in use */
1763 }
1764
1765 /*
1766 * Free the memory maps, then the
1767 * pmap structure.
1768 */
1769 if (!cpu_64bit) {
1770 OSAddAtomic(-p->pm_obj->resident_page_count, &inuse_ptepages_count);
1771
1772 kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
1773 zfree(pdpt_zone, (void *)p->pm_hold);
1774
1775 vm_object_deallocate(p->pm_obj);
1776 } else {
1777 /* 64 bit */
1778 int inuse_ptepages = 0;
1779
1780 /* free 64 bit mode structs */
1781 inuse_ptepages++;
1782 kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE);
1783
1784 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1785 vm_object_deallocate(p->pm_obj_pml4);
1786
1787 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1788 vm_object_deallocate(p->pm_obj_pdpt);
1789
1790 inuse_ptepages += p->pm_obj->resident_page_count;
1791 vm_object_deallocate(p->pm_obj);
1792
1793 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1794 }
1795 zfree(pmap_zone, p);
1796
1797 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1798 0, 0, 0, 0, 0);
1799
1800 }
1801
1802 /*
1803 * Add a reference to the specified pmap.
1804 */
1805
1806 void
1807 pmap_reference(
1808 register pmap_t p)
1809 {
1810
1811 if (p != PMAP_NULL) {
1812 PMAP_LOCK(p);
1813 p->ref_count++;
1814 PMAP_UNLOCK(p);
1815 }
1816 }
1817
1818 /*
1819 * Remove a range of hardware page-table entries.
1820 * The entries given are the first (inclusive)
1821 * and last (exclusive) entries for the VM pages.
1822 * The virtual address is the va for the first pte.
1823 *
1824 * The pmap must be locked.
1825 * If the pmap is not the kernel pmap, the range must lie
1826 * entirely within one pte-page. This is NOT checked.
1827 * Assumes that the pte-page exists.
1828 */
1829
1830 void
1831 pmap_remove_range(
1832 pmap_t pmap,
1833 vm_map_offset_t start_vaddr,
1834 pt_entry_t *spte,
1835 pt_entry_t *epte)
1836 {
1837 register pt_entry_t *cpte;
1838 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1839 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1840 pv_hashed_entry_t pvh_e;
1841 int pvh_cnt = 0;
1842 int num_removed, num_unwired, num_found;
1843 int pai;
1844 pmap_paddr_t pa;
1845 vm_map_offset_t vaddr;
1846 int pvhash_idx;
1847 uint32_t pv_cnt;
1848
1849 num_removed = 0;
1850 num_unwired = 0;
1851 num_found = 0;
1852
1853 if (pmap != kernel_pmap &&
1854 pmap->pm_task_map == TASK_MAP_32BIT &&
1855 start_vaddr >= HIGH_MEM_BASE) {
1856 /*
1857 * The range is in the "high_shared_pde" which is shared
1858 * between the kernel and all 32-bit tasks. It holds
1859 * the 32-bit commpage but also the trampolines, GDT, etc...
1860 * so we can't let user tasks remove anything from it.
1861 */
1862 return;
1863 }
1864
1865 /* invalidate the PTEs first to "freeze" them */
1866 for (cpte = spte, vaddr = start_vaddr;
1867 cpte < epte;
1868 cpte++, vaddr += PAGE_SIZE_64) {
1869
1870 pa = pte_to_pa(*cpte);
1871 if (pa == 0)
1872 continue;
1873 num_found++;
1874
1875 if (iswired(*cpte))
1876 num_unwired++;
1877
1878 pai = pa_index(pa);
1879
1880 if (!managed_page(pai)) {
1881 /*
1882 * Outside range of managed physical memory.
1883 * Just remove the mappings.
1884 */
1885 pmap_store_pte(cpte, 0);
1886 continue;
1887 }
1888
1889 /* invalidate the PTE */
1890 pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1891 }
1892
1893 if (num_found == 0) {
1894 /* nothing was changed: we're done */
1895 goto update_counts;
1896 }
1897
1898 /* propagate the invalidates to other CPUs */
1899
1900 PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1901
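/*
 * Second pass: for each mapping frozen above, accumulate its ref/mod
 * bits, clear the PTE and unlink its pv entry; removed pv entries are
 * chained locally and returned to the free list in one batch below.
 */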
1902 for (cpte = spte, vaddr = start_vaddr;
1903 cpte < epte;
1904 cpte++, vaddr += PAGE_SIZE_64) {
1905
1906 pa = pte_to_pa(*cpte);
1907 if (pa == 0)
1908 continue;
1909
1910 pai = pa_index(pa);
1911
1912 LOCK_PVH(pai);
1913
1914 pa = pte_to_pa(*cpte);
1915 if (pa == 0) {
1916 UNLOCK_PVH(pai);
1917 continue;
1918 }
1919
1920 num_removed++;
1921
1922 /*
1923 * Get the modify and reference bits, then
1924 * nuke the entry in the page table
1925 */
1926 /* remember reference and change */
1927 pmap_phys_attributes[pai] |=
1928 (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
1929 /* completely invalidate the PTE */
1930 pmap_store_pte(cpte, 0);
1931
1932 /*
1933 * Remove the mapping from the pvlist for
1934 * this physical page.
1935 */
1936 {
1937 pv_rooted_entry_t pv_h;
1938 pv_hashed_entry_t *pprevh;
1939 ppnum_t ppn = (ppnum_t)pai;
1940
1941 pv_h = pai_to_pvh(pai);
1942 pvh_e = PV_HASHED_ENTRY_NULL;
1943 if (pv_h->pmap == PMAP_NULL)
1944 panic("pmap_remove_range: null pv_list!");
1945
1946 if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */
1947 /*
1948 * Header is the pv_rooted_entry; we can't free that. If there is a
1949 * queued entry after this one, we remove it from the ppn queue and
1950 * from the hash chain, copy it to the rooted entry, and then free it
1951 * instead of the header.
1952 */
1953
1954 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
1955 if (pv_h != (pv_rooted_entry_t)pvh_e) { /* any queued after rooted? */
1956 CHK_NPVHASH();
1957 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
1958 LOCK_PV_HASH(pvhash_idx);
1959 remque(&pvh_e->qlink);
1960 {
1961 pprevh = pvhash(pvhash_idx);
1962 if (PV_HASHED_ENTRY_NULL == *pprevh) {
1963 panic("pmap_remove_range empty hash removing rooted pv");
1964 }
1965 }
1966 pmap_pvh_unlink(pvh_e);
1967 UNLOCK_PV_HASH(pvhash_idx);
1968 pv_h->pmap = pvh_e->pmap;
1969 pv_h->va = pvh_e->va; /* dispose of pvh_e */
1970 } else { /* none queued after rooted */
1971 pv_h->pmap = PMAP_NULL;
1972 pvh_e = PV_HASHED_ENTRY_NULL;
1973 } /* any queued after rooted */
1974
1975 } else { /* rooted or not */
1976 /* Not removing the rooted pv. Find it on the hash chain, remove it
1977 * from the ppn queue and the hash chain, and free it. */
1978 CHK_NPVHASH();
1979 pvhash_idx = pvhashidx(pmap,vaddr);
1980 LOCK_PV_HASH(pvhash_idx);
1981 pprevh = pvhash(pvhash_idx);
1982 if (PV_HASHED_ENTRY_NULL == *pprevh) {
1983 panic("pmap_remove_range empty hash removing hashed pv");
1984 }
1985 pvh_e = *pprevh;
1986 pmap_pv_hashlist_walks++;
1987 pv_cnt = 0;
1988 while (PV_HASHED_ENTRY_NULL != pvh_e) {
1989 pv_cnt++;
1990 if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break;
1991 pprevh = &pvh_e->nexth;
1992 pvh_e = pvh_e->nexth;
1993 }
1994 pmap_pv_hashlist_cnts += pv_cnt;
1995 if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
1996 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash");
1997 *pprevh = pvh_e->nexth;
1998 remque(&pvh_e->qlink);
1999 UNLOCK_PV_HASH(pvhash_idx);
2000
2001 } /* rooted or not */
2002
2003 UNLOCK_PVH(pai);
2004
2005 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2006 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2007 pvh_eh = pvh_e;
2008
2009 if (pvh_et == PV_HASHED_ENTRY_NULL) {
2010 pvh_et = pvh_e;
2011 }
2012
2013 pvh_cnt++;
2014 }
2015
2016 } /* removing mappings for this phy page */
2017 } /* for loop */
2018
2019 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2020 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2021 }
2022
2023 update_counts:
2024 /*
2025 * Update the counts
2026 */
2027 #if TESTING
2028 if (pmap->stats.resident_count < num_removed)
2029 panic("pmap_remove_range: resident_count");
2030 #endif
2031 assert(pmap->stats.resident_count >= num_removed);
2032 OSAddAtomic(-num_removed, &pmap->stats.resident_count);
2033
2034 #if TESTING
2035 if (pmap->stats.wired_count < num_unwired)
2036 panic("pmap_remove_range: wired_count");
2037 #endif
2038 assert(pmap->stats.wired_count >= num_unwired);
2039 OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
2040
2041 return;
2042 }
2043
2044 /*
2045 * Remove phys addr if mapped in specified map
2046 *
2047 */
2048 void
2049 pmap_remove_some_phys(
2050 __unused pmap_t map,
2051 __unused ppnum_t pn)
2052 {
2053
2054 /* Implement to support working set code */
2055
2056 }
2057
2058 /*
2059 * Remove the given range of addresses
2060 * from the specified map.
2061 *
2062 * It is assumed that the start and end are properly
2063 * rounded to the hardware page size.
2064 */
2065
2066
2067 void
2068 pmap_remove(
2069 pmap_t map,
2070 addr64_t s64,
2071 addr64_t e64)
2072 {
2073 pt_entry_t *pde;
2074 pt_entry_t *spte, *epte;
2075 addr64_t l64;
2076 addr64_t orig_s64;
2077 uint64_t deadline;
2078
2079 pmap_intr_assert();
2080
2081 if (map == PMAP_NULL || s64 == e64)
2082 return;
2083
2084 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
2085 (int) map,
2086 (int) (s64>>32), (int) s64,
2087 (int) (e64>>32), (int) e64);
2088
2089 PMAP_LOCK(map);
2090
2091 #if 0
2092 /*
2093 * Check that address range in the kernel does not overlap the stacks.
2094 * We initialize local static min/max variables once to avoid making
2095 * 2 function calls for every remove. Note also that these functions
2096 * both return 0 before kernel stacks have been initialized, and hence
2097 * the panic is not triggered in this case.
2098 */
2099 if (map == kernel_pmap) {
2100 static vm_offset_t kernel_stack_min = 0;
2101 static vm_offset_t kernel_stack_max = 0;
2102
2103 if (kernel_stack_min == 0) {
2104 kernel_stack_min = min_valid_stack_address();
2105 kernel_stack_max = max_valid_stack_address();
2106 }
2107 if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
2108 (kernel_stack_min < e64 && e64 <= kernel_stack_max))
2109 panic("pmap_remove() attempted in kernel stack");
2110 }
2111 #else
2112
2113 /*
2114 * The values of kernel_stack_min and kernel_stack_max are no longer
2115 * relevant now that we allocate kernel stacks anywhere in the kernel map,
2116 * so the old code above no longer applies. If we wanted to check that
2117 * we weren't removing a mapping of a page in a kernel stack we'd have to
2118 * mark the PTE with an unused bit and check that here.
2119 */
2120
2121 #endif
2122
2123 deadline = rdtsc64() + max_preemption_latency_tsc;
2124
2125 orig_s64 = s64;
2126
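/*
 * Remove the range one pde's worth at a time; when the TSC deadline
 * passes, drop and retake the pmap lock to bound how long we hold it
 * and keep preemption latency in check.
 */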
2127 while (s64 < e64) {
2128 l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1);
2129 if (l64 > e64)
2130 l64 = e64;
2131 pde = pmap_pde(map, s64);
2132
2133 if (pde && (*pde & INTEL_PTE_VALID)) {
2134 spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1)));
2135 spte = &spte[ptenum(s64)];
2136 epte = &spte[intel_btop(l64-s64)];
2137
2138 pmap_remove_range(map, s64, spte, epte);
2139 }
2140 s64 = l64;
2141 pde++;
2142
2143 if (s64 < e64 && rdtsc64() >= deadline) {
2144 PMAP_UNLOCK(map)
2145 PMAP_LOCK(map)
2146
2147 deadline = rdtsc64() + max_preemption_latency_tsc;
2148 }
2149
2150 }
2151
2152 PMAP_UNLOCK(map);
2153
2154 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
2155 (int) map, 0, 0, 0, 0);
2156
2157 }
2158
2159 /*
2160 * Routine: pmap_page_protect
2161 *
2162 * Function:
2163 * Lower the permission for all mappings to a given
2164 * page.
2165 */
2166 void
2167 pmap_page_protect(
2168 ppnum_t pn,
2169 vm_prot_t prot)
2170 {
2171 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
2172 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
2173 pv_hashed_entry_t nexth;
2174 int pvh_cnt = 0;
2175 pv_rooted_entry_t pv_h;
2176 pv_rooted_entry_t pv_e;
2177 pv_hashed_entry_t pvh_e;
2178 pt_entry_t *pte;
2179 int pai;
2180 register pmap_t pmap;
2181 boolean_t remove;
2182 int pvhash_idx;
2183
2184 pmap_intr_assert();
2185 assert(pn != vm_page_fictitious_addr);
2186 if (pn == vm_page_guard_addr)
2187 return;
2188
2189 pai = ppn_to_pai(pn);
2190
2191 if (!managed_page(pai)) {
2192 /*
2193 * Not a managed page.
2194 */
2195 return;
2196 }
2197
2198 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
2199 (int) pn, (int) prot, 0, 0, 0);
2200
2201 /*
2202 * Determine the new protection.
2203 */
2204 switch (prot) {
2205 case VM_PROT_READ:
2206 case VM_PROT_READ|VM_PROT_EXECUTE:
2207 remove = FALSE;
2208 break;
2209 case VM_PROT_ALL:
2210 return; /* nothing to do */
2211 default:
2212 remove = TRUE;
2213 break;
2214 }
2215
2216 pv_h = pai_to_pvh(pai);
2217
2218 LOCK_PVH(pai);
2219
2220
2221 /*
2222 * Walk down PV list, changing or removing all mappings.
2223 */
2224 if (pv_h->pmap != PMAP_NULL) {
2225
2226 pv_e = pv_h;
2227 pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */
2228
2229 do {
2230 register vm_map_offset_t vaddr;
2231 pmap = pv_e->pmap;
2232
2233 vaddr = pv_e->va;
2234 pte = pmap_pte(pmap, vaddr);
2235
2236 if (0 == pte) {
2237 panic("pmap_page_protect: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx, prot: %d kernel_pmap: %p", pmap, pn, vaddr, prot, kernel_pmap);
2238 }
2239
2240 nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); /* if there is one */
2241
2242 /*
2243 * Remove the mapping if new protection is NONE
2244 * or if write-protecting a kernel mapping.
2245 */
2246 if (remove || pmap == kernel_pmap) {
2247 /*
2248 * Remove the mapping, collecting any modify bits.
2249 */
2250 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2251
2252 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2253
2254 pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
2255
2256 pmap_store_pte(pte, 0);
2257
2258 #if TESTING
2259 if (pmap->stats.resident_count < 1)
2260 panic("pmap_page_protect: resident_count");
2261 #endif
2262 assert(pmap->stats.resident_count >= 1);
2263 OSAddAtomic(-1, &pmap->stats.resident_count);
2264
2265 /*
2266 * Deal with the pv_rooted_entry.
2267 */
2268
2269 if (pv_e == pv_h) {
2270 /*
2271 * Fix up head later.
2272 */
2273 pv_h->pmap = PMAP_NULL;
2274 }
2275 else {
2276 /*
2277 * Delete this entry.
2278 */
2279 CHK_NPVHASH();
2280 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2281 LOCK_PV_HASH(pvhash_idx);
2282 remque(&pvh_e->qlink);
2283 pmap_pvh_unlink(pvh_e);
2284 UNLOCK_PV_HASH(pvhash_idx);
2285
2286 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2287 pvh_eh = pvh_e;
2288
2289 if (pvh_et == PV_HASHED_ENTRY_NULL)
2290 pvh_et = pvh_e;
2291 pvh_cnt++;
2292 }
2293 } else {
2294 /*
2295 * Write-protect.
2296 */
2297 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
2298 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2299 }
2300
2301 pvh_e = nexth;
2302 } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
2303
2304
2305 /*
2306 * If pv_head mapping was removed, fix it up.
2307 */
2308
2309 if (pv_h->pmap == PMAP_NULL) {
2310 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2311
2312 if (pvh_e != (pv_hashed_entry_t)pv_h) {
2313 CHK_NPVHASH();
2314 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2315 LOCK_PV_HASH(pvhash_idx);
2316 remque(&pvh_e->qlink);
2317 pmap_pvh_unlink(pvh_e);
2318 UNLOCK_PV_HASH(pvhash_idx);
2319 pv_h->pmap = pvh_e->pmap;
2320 pv_h->va = pvh_e->va;
2321 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2322 pvh_eh = pvh_e;
2323
2324 if (pvh_et == PV_HASHED_ENTRY_NULL)
2325 pvh_et = pvh_e;
2326 pvh_cnt++;
2327 }
2328 }
2329 }
2330 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2331 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2332 }
2333
2334 UNLOCK_PVH(pai);
2335
2336 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
2337 0, 0, 0, 0, 0);
2338
2339 }
2340
2341
2342 /*
2343 * Routine:
2344 * pmap_disconnect
2345 *
2346 * Function:
2347 * Disconnect all mappings for this page and return reference and change status
2348 * in generic format.
2349 *
2350 */
2351 unsigned int pmap_disconnect(
2352 ppnum_t pa)
2353 {
2354 pmap_page_protect(pa, 0); /* disconnect the page */
2355 return (pmap_get_refmod(pa)); /* return ref/chg status */
2356 }
2357
2358 /*
2359 * Set the physical protection on the
2360 * specified range of this map as requested.
2361 * Will not increase permissions.
2362 */
2363 void
2364 pmap_protect(
2365 pmap_t map,
2366 vm_map_offset_t sva,
2367 vm_map_offset_t eva,
2368 vm_prot_t prot)
2369 {
2370 register pt_entry_t *pde;
2371 register pt_entry_t *spte, *epte;
2372 vm_map_offset_t lva;
2373 vm_map_offset_t orig_sva;
2374 boolean_t set_NX;
2375 int num_found = 0;
2376
2377 pmap_intr_assert();
2378
2379 if (map == PMAP_NULL)
2380 return;
2381
2382 if (prot == VM_PROT_NONE) {
2383 pmap_remove(map, sva, eva);
2384 return;
2385 }
2386
2387 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
2388 (int) map,
2389 (int) (sva>>32), (int) sva,
2390 (int) (eva>>32), (int) eva);
2391
2392 if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled )
2393 set_NX = FALSE;
2394 else
2395 set_NX = TRUE;
2396
2397 PMAP_LOCK(map);
2398
2399 orig_sva = sva;
2400 while (sva < eva) {
2401 lva = (sva + pde_mapped_size) & ~(pde_mapped_size-1);
2402 if (lva > eva)
2403 lva = eva;
2404 pde = pmap_pde(map, sva);
2405 if (pde && (*pde & INTEL_PTE_VALID)) {
2406 spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size-1)));
2407 spte = &spte[ptenum(sva)];
2408 epte = &spte[intel_btop(lva-sva)];
2409
2410 while (spte < epte) {
2411
2412 if (*spte & INTEL_PTE_VALID) {
2413
2414 if (prot & VM_PROT_WRITE)
2415 pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
2416 else
2417 pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));
2418
2419 if (set_NX == TRUE)
2420 pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
2421 else
2422 pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));
2423
2424 num_found++;
2425 }
2426 spte++;
2427 }
2428 }
2429 sva = lva;
2430 }
2431 if (num_found)
2432 PMAP_UPDATE_TLBS(map, orig_sva, eva);
2433
2434 PMAP_UNLOCK(map);
2435
2436 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
2437 0, 0, 0, 0, 0);
2438
2439 }
2440
2441 /* Map a (possibly) autogenned block */
2442 void
2443 pmap_map_block(
2444 pmap_t pmap,
2445 addr64_t va,
2446 ppnum_t pa,
2447 uint32_t size,
2448 vm_prot_t prot,
2449 int attr,
2450 __unused unsigned int flags)
2451 {
2452 uint32_t page;
2453
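/*
 * 'size' is a page count: enter each 4K page individually, wired,
 * with the caller's protection and cache attributes.
 */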
2454 for (page = 0; page < size; page++) {
2455 pmap_enter(pmap, va, pa, prot, attr, TRUE);
2456 va += PAGE_SIZE;
2457 pa++;
2458 }
2459 }
2460
2461
2462 /*
2463 * Insert the given physical page (p) at
2464 * the specified virtual address (v) in the
2465 * target physical map with the protection requested.
2466 *
2467 * If specified, the page will be wired down, meaning
2468 * that the related pte cannot be reclaimed.
2469 *
2470 * NB: This is the only routine which MAY NOT lazy-evaluate
2471 * or lose information. That is, this routine must actually
2472 * insert this page into the given map NOW.
2473 */
2474 void
2475 pmap_enter(
2476 register pmap_t pmap,
2477 vm_map_offset_t vaddr,
2478 ppnum_t pn,
2479 vm_prot_t prot,
2480 unsigned int flags,
2481 boolean_t wired)
2482 {
2483 register pt_entry_t *pte;
2484 register pv_rooted_entry_t pv_h;
2485 register int pai;
2486 pv_hashed_entry_t pvh_e;
2487 pv_hashed_entry_t pvh_new;
2488 pv_hashed_entry_t *hashp;
2489 pt_entry_t template;
2490 pmap_paddr_t old_pa;
2491 pmap_paddr_t pa = (pmap_paddr_t)i386_ptob(pn);
2492 boolean_t need_tlbflush = FALSE;
2493 boolean_t set_NX;
2494 char oattr;
2495 int pvhash_idx;
2496 uint32_t pv_cnt;
2497 boolean_t old_pa_locked;
2498
2499 pmap_intr_assert();
2500 assert(pn != vm_page_fictitious_addr);
2501 if (pmap_debug)
2502 printf("pmap(%qx, %x)\n", vaddr, pn);
2503 if (pmap == PMAP_NULL)
2504 return;
2505 if (pn == vm_page_guard_addr)
2506 return;
2507
2508 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2509 (int) pmap,
2510 (int) (vaddr>>32), (int) vaddr,
2511 (int) pn, prot);
2512
2513 if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled )
2514 set_NX = FALSE;
2515 else
2516 set_NX = TRUE;
2517
2518 /*
2519 * Must allocate a new pvlist entry while we're unlocked;
2520 * zalloc may cause pageout (which will lock the pmap system).
2521 * If we determine we need a pvlist entry, we will unlock
2522 * and allocate one. Then we will retry, throwing away
2523 * the allocated entry later (if we no longer need it).
2524 */
2525
2526 pvh_new = PV_HASHED_ENTRY_NULL;
2527 Retry:
2528 pvh_e = PV_HASHED_ENTRY_NULL;
2529
2530 PMAP_LOCK(pmap);
2531
2532 /*
2533 * Expand pmap to include this pte. Assume that
2534 * pmap is always expanded to include enough hardware
2535 * pages to map one VM page.
2536 */
2537
2538 while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
2539 /*
2540 * Must unlock to expand the pmap.
2541 */
2542 PMAP_UNLOCK(pmap);
2543 pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */
2544 PMAP_LOCK(pmap);
2545 }
2546
2547 old_pa = pte_to_pa(*pte);
2548 pai = pa_index(old_pa);
2549 old_pa_locked = FALSE;
2550
2551 /*
2552 * If we have a previous managed page, lock the pv entry now. After
2553 * we lock it, check whether the mapping was removed while we waited
2554 * for the lock and, if so, drop the lock.
2555 */
2556
2557 if ((0 != old_pa) && managed_page(pai)) {
2558 LOCK_PVH(pai);
2559 old_pa_locked = TRUE;
2560 old_pa = pte_to_pa(*pte);
2561 if (0 == old_pa) {
2562 UNLOCK_PVH(pai); /* some other path beat us to it */
2563 old_pa_locked = FALSE;
2564 }
2565 }
2566
2567
2568 /*
2569 * Special case if the incoming physical page is already mapped
2570 * at this address.
2571 */
2572 if (old_pa == pa) {
2573
2574 /*
2575 * May be changing its wired attribute or protection
2576 */
2577
2578 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2579
2580 if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
2581 if(!(flags & VM_MEM_GUARDED))
2582 template |= INTEL_PTE_PTA;
2583 template |= INTEL_PTE_NCACHE;
2584 }
2585
2586 if (pmap != kernel_pmap)
2587 template |= INTEL_PTE_USER;
2588 if (prot & VM_PROT_WRITE)
2589 template |= INTEL_PTE_WRITE;
2590
2591 if (set_NX == TRUE)
2592 template |= INTEL_PTE_NX;
2593
2594 if (wired) {
2595 template |= INTEL_PTE_WIRED;
2596 if (!iswired(*pte))
2597 OSAddAtomic(+1, &pmap->stats.wired_count);
2598 }
2599 else {
2600 if (iswired(*pte)) {
2601 assert(pmap->stats.wired_count >= 1);
2602 OSAddAtomic(-1, &pmap->stats.wired_count);
2603 }
2604 }
2605
2606 /* store modified PTE and preserve RC bits */
2607 pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2608 if (old_pa_locked) {
2609 UNLOCK_PVH(pai);
2610 old_pa_locked = FALSE;
2611 }
2612 need_tlbflush = TRUE;
2613 goto Done;
2614 }
2615
2616 /*
2617 * Outline of code from here:
2618 * 1) If va was mapped, update TLBs, remove the mapping
2619 * and remove old pvlist entry.
2620 * 2) Add pvlist entry for new mapping
2621 * 3) Enter new mapping.
2622 *
2623 * If the old physical page is not managed step 1) is skipped
2624 * (except for updating the TLBs), and the mapping is
2625 * overwritten at step 3). If the new physical page is not
2626 * managed, step 2) is skipped.
2627 */
2628
2629 if (old_pa != (pmap_paddr_t) 0) {
2630
2631 /*
2632 * Don't do anything to pages outside valid memory here.
2633 * Instead convince the code that enters a new mapping
2634 * to overwrite the old one.
2635 */
2636
2637 /* invalidate the PTE */
2638 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2639 /* propagate invalidate everywhere */
2640 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2641 /* remember reference and change */
2642 oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2643 /* completely invalidate the PTE */
2644 pmap_store_pte(pte, 0);
2645
2646 if (managed_page(pai)) {
2647 #if TESTING
2648 if (pmap->stats.resident_count < 1)
2649 panic("pmap_enter: resident_count");
2650 #endif
2651 assert(pmap->stats.resident_count >= 1);
2652 OSAddAtomic(-1, &pmap->stats.resident_count);
2653
2654 if (iswired(*pte)) {
2655
2656 #if TESTING
2657 if (pmap->stats.wired_count < 1)
2658 panic("pmap_enter: wired_count");
2659 #endif
2660 assert(pmap->stats.wired_count >= 1);
2661 OSAddAtomic(-1, &pmap->stats.wired_count);
2662 }
2663
2664 pmap_phys_attributes[pai] |= oattr;
2665 /*
2666 * Remove the mapping from the pvlist for
2667 * this physical page.
2668 * We'll end up with either a rooted pv or a
2669 * hashed pv
2670 */
2671 {
2672
2673 pv_h = pai_to_pvh(pai);
2674
2675 if (pv_h->pmap == PMAP_NULL) {
2676 panic("pmap_enter: null pv_list!");
2677 }
2678
2679 if (pv_h->va == vaddr && pv_h->pmap == pmap) {
2680 /*
2681 * Header is the pv_rooted_entry.
2682 * If there is a next one, copy it to the
2683 * header and free the next one (we cannot
2684 * free the header)
2685 */
2686 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2687 if (pvh_e != (pv_hashed_entry_t)pv_h) {
2688 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
2689 LOCK_PV_HASH(pvhash_idx);
2690 remque(&pvh_e->qlink);
2691 pmap_pvh_unlink(pvh_e);
2692 UNLOCK_PV_HASH(pvhash_idx);
2693 pv_h->pmap = pvh_e->pmap;
2694 pv_h->va = pvh_e->va;
2695 }
2696 else {
2697 pv_h->pmap = PMAP_NULL;
2698 pvh_e = PV_HASHED_ENTRY_NULL;
2699 }
2700 }
2701 else {
2702 pv_hashed_entry_t *pprevh;
2703 ppnum_t old_ppn;
2704 /* It wasn't the rooted pv: hash it, find it on the chain, and unlink it */
2705 old_ppn = (ppnum_t)pa_index(old_pa);
2706 CHK_NPVHASH();
2707 pvhash_idx = pvhashidx(pmap,vaddr);
2708 LOCK_PV_HASH(pvhash_idx);
2709 pprevh = pvhash(pvhash_idx);
2710 #if PV_DEBUG
2711 if (NULL==pprevh)panic("pmap enter 1");
2712 #endif
2713 pvh_e = *pprevh;
2714 pmap_pv_hashlist_walks++;
2715 pv_cnt = 0;
2716 while (PV_HASHED_ENTRY_NULL != pvh_e) {
2717 pv_cnt++;
2718 if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break;
2719 pprevh = &pvh_e->nexth;
2720 pvh_e = pvh_e->nexth;
2721 }
2722 pmap_pv_hashlist_cnts += pv_cnt;
2723 if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2724 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list");
2725 if(NULL==pprevh)panic("pmap enter 2");
2726 *pprevh = pvh_e->nexth;
2727 remque(&pvh_e->qlink);
2728 UNLOCK_PV_HASH(pvhash_idx);
2729 }
2730 }
2731 }
2732 else {
2733 /*
2734 * old_pa is not managed.
2735 * Do removal part of accounting.
2736 */
2737
2738 if (iswired(*pte)) {
2739 assert(pmap->stats.wired_count >= 1);
2740 OSAddAtomic(-1, &pmap->stats.wired_count);
2741 }
2742 }
2743 }
2744
2745 /*
2746 * If we had a previously managed page locked, unlock it now.
2747 */
2748
2749 if (old_pa_locked) {
2750 UNLOCK_PVH(pai);
2751 old_pa_locked = FALSE;
2752 }
2753
2754 pai = pa_index(pa); /* now working with new incoming phys page */
2755 if (managed_page(pai)) {
2756
2757 /*
2758 * Step 2) Enter the mapping in the PV list for this
2759 * physical page.
2760 */
2761 pv_h = pai_to_pvh(pai);
2762
2763 LOCK_PVH(pai);
2764
2765 if (pv_h->pmap == PMAP_NULL) {
2766 /*
2767 * No mappings yet, use rooted pv
2768 */
2769 pv_h->va = vaddr;
2770 pv_h->pmap = pmap;
2771 queue_init(&pv_h->qlink);
2772 }
2773 else {
2774 /*
2775 * Add new pv_hashed_entry after header.
2776 */
2777 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2778 pvh_e = pvh_new;
2779 pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */
2780 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2781 PV_HASHED_ALLOC(pvh_e);
2782 if (PV_HASHED_ENTRY_NULL == pvh_e) {
2783 /* The pv free list is empty.
2784 * If we are on the kernel pmap we'll use one of the special private
2785 * kernel pv_e's; otherwise, we need to unlock everything, zalloc a
2786 * pv_e, and restart, bringing the new pv_e in with us.
2787 */
2788 if (kernel_pmap == pmap) {
2789 PV_HASHED_KERN_ALLOC(pvh_e);
2790 } else {
2791 UNLOCK_PVH(pai);
2792 PMAP_UNLOCK(pmap);
2793 pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2794 goto Retry;
2795 }
2796 }
2797 }
2798
2799 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion");
2800 pvh_e->va = vaddr;
2801 pvh_e->pmap = pmap;
2802 pvh_e->ppn = pn;
2803 CHK_NPVHASH();
2804 pvhash_idx = pvhashidx(pmap,vaddr);
2805 LOCK_PV_HASH(pvhash_idx);
2806 insque(&pvh_e->qlink, &pv_h->qlink);
2807 hashp = pvhash(pvhash_idx);
2808 #if PV_DEBUG
2809 if(NULL==hashp)panic("pmap_enter 4");
2810 #endif
2811 pvh_e->nexth = *hashp;
2812 *hashp = pvh_e;
2813 UNLOCK_PV_HASH(pvhash_idx);
2814
2815 /*
2816 * Remember that we used the pvlist entry.
2817 */
2818 pvh_e = PV_HASHED_ENTRY_NULL;
2819 }
2820
2821 /*
2822 * only count the mapping
2823 * for 'managed memory'
2824 */
2825 OSAddAtomic(+1, &pmap->stats.resident_count);
2826 if (pmap->stats.resident_count > pmap->stats.resident_max) {
2827 pmap->stats.resident_max = pmap->stats.resident_count;
2828 }
2829 }
2830
2831 /*
2832 * Step 3) Enter the mapping.
2833 *
2834 * Build a template to speed up entering -
2835 * only the pfn changes.
2836 */
2837 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2838
2839 if (flags & VM_MEM_NOT_CACHEABLE) {
2840 if(!(flags & VM_MEM_GUARDED))
2841 template |= INTEL_PTE_PTA;
2842 template |= INTEL_PTE_NCACHE;
2843 }
2844
2845 if (pmap != kernel_pmap)
2846 template |= INTEL_PTE_USER;
2847 if (prot & VM_PROT_WRITE)
2848 template |= INTEL_PTE_WRITE;
2849
2850 if (set_NX == TRUE)
2851 template |= INTEL_PTE_NX;
2852
2853 if (wired) {
2854 template |= INTEL_PTE_WIRED;
2855 OSAddAtomic(+1, &pmap->stats.wired_count);
2856 }
2857 pmap_store_pte(pte, template);
2858
2859 /* If this was a managed page, we delayed unlocking the pv until here
2860 * to prevent pmap_page_protect et al. from finding it until the pte
2861 * has been stored. */
2862
2863 if (managed_page(pai)) {
2864 UNLOCK_PVH(pai);
2865 }
2866
2867 Done:
2868 if (need_tlbflush == TRUE)
2869 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2870
2871 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2872 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
2873 }
2874
2875 if (pvh_new != PV_HASHED_ENTRY_NULL) {
2876 PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2877 }
2878
2879 PMAP_UNLOCK(pmap);
2880 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
2881 }
2882
2883 /*
2884 * Routine: pmap_change_wiring
2885 * Function: Change the wiring attribute for a map/virtual-address
2886 * pair.
2887 * In/out conditions:
2888 * The mapping must already exist in the pmap.
2889 */
2890 void
2891 pmap_change_wiring(
2892 register pmap_t map,
2893 vm_map_offset_t vaddr,
2894 boolean_t wired)
2895 {
2896 register pt_entry_t *pte;
2897
2898 /*
2899 * We must grab the pmap system lock because we may
2900 * change a pte_page queue.
2901 */
2902 PMAP_LOCK(map);
2903
2904 if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
2905 panic("pmap_change_wiring: pte missing");
2906
2907 if (wired && !iswired(*pte)) {
2908 /*
2909 * wiring down mapping
2910 */
2911 OSAddAtomic(+1, &map->stats.wired_count);
2912 pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
2913 }
2914 else if (!wired && iswired(*pte)) {
2915 /*
2916 * unwiring mapping
2917 */
2918 assert(map->stats.wired_count >= 1);
2919 OSAddAtomic(-1, &map->stats.wired_count);
2920 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
2921 }
2922
2923 PMAP_UNLOCK(map);
2924 }
2925
2926
2927 /*
2928 * Routine: pmap_extract
2929 * Function:
2930 * Extract the physical page address associated
2931 * with the given map/virtual_address pair.
2932 * Changed to a shim for backwards compatibility; it will not
2933 * work for 64-bit systems. Some old drivers that we cannot
2934 * change need this.
2935 */
2936
2937 vm_offset_t
2938 pmap_extract(
2939 register pmap_t pmap,
2940 vm_map_offset_t vaddr)
2941 {
2942 ppnum_t ppn;
2943 vm_offset_t paddr;
2944
2945 paddr = (vm_offset_t)0;
2946 ppn = pmap_find_phys(pmap, vaddr);
2947
2948 if (ppn) {
2949 paddr = ((vm_offset_t)i386_ptob(ppn)) | ((vm_offset_t)vaddr & INTEL_OFFMASK);
2950 }
2951 return (paddr);
2952 }
2953
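/*
 * Allocate, zero and wire a new pdpt page and install it in the pml4
 * slot covering 'vaddr'; backs out if another thread expanded this
 * pmap first.
 */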
2954 void
2955 pmap_expand_pml4(
2956 pmap_t map,
2957 vm_map_offset_t vaddr)
2958 {
2959 register vm_page_t m;
2960 register pmap_paddr_t pa;
2961 uint64_t i;
2962 spl_t spl;
2963 ppnum_t pn;
2964 pml4_entry_t *pml4p;
2965
2966 if (kernel_pmap == map) panic("expand kernel pml4");
2967
2968 spl = splhigh();
2969 pml4p = pmap64_pml4(map, vaddr);
2970 splx(spl);
2971 if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");
2972
2973 /*
2974 * Allocate a VM page for the pml4 page
2975 */
2976 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2977 VM_PAGE_WAIT();
2978
2979 /*
2980 * put the page into the pmap's obj list so it
2981 * can be found later.
2982 */
2983 pn = m->phys_page;
2984 pa = i386_ptob(pn);
2985 i = pml4idx(map, vaddr);
2986
2987 /*
2988 * Zero the page.
2989 */
2990 pmap_zero_page(pn);
2991
2992 vm_page_lockspin_queues();
2993 vm_page_wire(m);
2994 vm_page_unlock_queues();
2995
2996 OSAddAtomic(1, &inuse_ptepages_count);
2997
2998 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2999 vm_object_lock(map->pm_obj_pml4);
3000
3001 PMAP_LOCK(map);
3002 /*
3003 * See if someone else expanded us first
3004 */
3005 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
3006 PMAP_UNLOCK(map);
3007 vm_object_unlock(map->pm_obj_pml4);
3008
3009 VM_PAGE_FREE(m);
3010
3011 OSAddAtomic(-1, &inuse_ptepages_count);
3012 return;
3013 }
3014
3015 #if 0 /* DEBUG */
3016 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
3017 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3018 map, map->pm_obj_pml4, vaddr, i);
3019 }
3020 #endif
3021 vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
3022 vm_object_unlock(map->pm_obj_pml4);
3023
3024 /*
3025 * Set the page directory entry for this page table.
3026 */
3027 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
3028
3029 pmap_store_pte(pml4p, pa_to_pte(pa)
3030 | INTEL_PTE_VALID
3031 | INTEL_PTE_USER
3032 | INTEL_PTE_WRITE);
3033
3034 PMAP_UNLOCK(map);
3035
3036 return;
3037
3038 }
3039
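/*
 * Allocate, zero and wire a new page-directory page and install it in
 * the pdpt slot covering 'vaddr', expanding the pml4 level first if
 * necessary; backs out if another thread got there first.
 */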
3040 void
3041 pmap_expand_pdpt(
3042 pmap_t map,
3043 vm_map_offset_t vaddr)
3044 {
3045 register vm_page_t m;
3046 register pmap_paddr_t pa;
3047 uint64_t i;
3048 spl_t spl;
3049 ppnum_t pn;
3050 pdpt_entry_t *pdptp;
3051
3052 if (kernel_pmap == map) panic("expand kernel pdpt");
3053
3054 spl = splhigh();
3055 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
3056 splx(spl);
3057 pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */
3058 spl = splhigh();
3059 }
3060 splx(spl);
3061
3062 /*
3063 * Allocate a VM page for the pdpt page
3064 */
3065 while ((m = vm_page_grab()) == VM_PAGE_NULL)
3066 VM_PAGE_WAIT();
3067
3068 /*
3069 * put the page into the pmap's obj list so it
3070 * can be found later.
3071 */
3072 pn = m->phys_page;
3073 pa = i386_ptob(pn);
3074 i = pdptidx(map, vaddr);
3075
3076 /*
3077 * Zero the page.
3078 */
3079 pmap_zero_page(pn);
3080
3081 vm_page_lockspin_queues();
3082 vm_page_wire(m);
3083 vm_page_unlock_queues();
3084
3085 OSAddAtomic(1, &inuse_ptepages_count);
3086
3087 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3088 vm_object_lock(map->pm_obj_pdpt);
3089
3090 PMAP_LOCK(map);
3091 /*
3092 * See if someone else expanded us first
3093 */
3094 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
3095 PMAP_UNLOCK(map);
3096 vm_object_unlock(map->pm_obj_pdpt);
3097
3098 VM_PAGE_FREE(m);
3099
3100 OSAddAtomic(-1, &inuse_ptepages_count);
3101 return;
3102 }
3103
3104 #if 0 /* DEBUG */
3105 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
3106 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3107 map, map->pm_obj_pdpt, vaddr, i);
3108 }
3109 #endif
3110 vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
3111 vm_object_unlock(map->pm_obj_pdpt);
3112
3113 /*
3114 * Set the page directory entry for this page table.
3115 */
3116 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
3117
3118 pmap_store_pte(pdptp, pa_to_pte(pa)
3119 | INTEL_PTE_VALID
3120 | INTEL_PTE_USER
3121 | INTEL_PTE_WRITE);
3122
3123 PMAP_UNLOCK(map);
3124
3125 return;
3126
3127 }
3128
3129
3130
3131 /*
3132 * Routine: pmap_expand
3133 *
3134 * Expands a pmap to be able to map the specified virtual address.
3135 *
3136 * Allocates new virtual memory for the P0 or P1 portion of the
3137 * pmap, then re-maps the physical pages that were in the old
3138 * pmap to be in the new pmap.
3139 *
3140 * Must be called with the pmap system and the pmap unlocked,
3141 * since these must be unlocked to use vm_allocate or vm_deallocate.
3142 * Thus it must be called in a loop that checks whether the map
3143 * has been expanded enough.
3144 * (We won't loop forever, since page tables aren't shrunk.)
3145 */
3146 void
3147 pmap_expand(
3148 pmap_t map,
3149 vm_map_offset_t vaddr)
3150 {
3151 pt_entry_t *pdp;
3152 register vm_page_t m;
3153 register pmap_paddr_t pa;
3154 uint64_t i;
3155 spl_t spl;
3156 ppnum_t pn;
3157
3158 /*
3159 * If this is not the kernel map (while we are still in compat kernel
3160 * mode) and we are 64-bit, propagate the expansion upwards.
3161 */
3162
3163 if (cpu_64bit && (map != kernel_pmap)) {
3164 spl = splhigh();
3165 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
3166 splx(spl);
3167 pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */
3168 spl = splhigh();
3169 }
3170 splx(spl);
3171 }
3172
3173 /*
3174 * Allocate a VM page for the pde entries.
3175 */
3176 while ((m = vm_page_grab()) == VM_PAGE_NULL)
3177 VM_PAGE_WAIT();
3178
3179 /*
3180 * put the page into the pmap's obj list so it
3181 * can be found later.
3182 */
3183 pn = m->phys_page;
3184 pa = i386_ptob(pn);
3185 i = pdeidx(map, vaddr);
3186
3187 /*
3188 * Zero the page.
3189 */
3190 pmap_zero_page(pn);
3191
3192 vm_page_lockspin_queues();
3193 vm_page_wire(m);
3194 vm_page_unlock_queues();
3195
3196 OSAddAtomic(1, &inuse_ptepages_count);
3197
3198 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3199 vm_object_lock(map->pm_obj);
3200
3201 PMAP_LOCK(map);
3202 /*
3203 * See if someone else expanded us first
3204 */
3205
3206 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
3207 PMAP_UNLOCK(map);
3208 vm_object_unlock(map->pm_obj);
3209
3210 VM_PAGE_FREE(m);
3211
3212 OSAddAtomic(-1, &inuse_ptepages_count);
3213 return;
3214 }
3215
3216 #if 0 /* DEBUG */
3217 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
3218 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
3219 map, map->pm_obj, vaddr, i);
3220 }
3221 #endif
3222 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
3223 vm_object_unlock(map->pm_obj);
3224
3225 /*
3226 * refetch while locked
3227 */
3228
3229 pdp = pmap_pde(map, vaddr);
3230
3231 /*
3232 * Set the page directory entry for this page table.
3233 */
3234 pmap_store_pte(pdp, pa_to_pte(pa)
3235 | INTEL_PTE_VALID
3236 | INTEL_PTE_USER
3237 | INTEL_PTE_WRITE);
3238
3239 PMAP_UNLOCK(map);
3240
3241 return;
3242 }
3243
3244
3245 /*
3246 * pmap_sync_page_data_phys(ppnum_t pa)
3247 *
3248 * Invalidates all of the instruction cache on a physical page and
3249 * pushes any dirty data from the data cache for the same physical page
3250 * Not required in i386.
3251 */
3252 void
3253 pmap_sync_page_data_phys(__unused ppnum_t pa)
3254 {
3255 return;
3256 }
3257
3258 /*
3259 * pmap_sync_page_attributes_phys(ppnum_t pa)
3260 *
3261 * Write back and invalidate all cachelines on a physical page.
3262 */
3263 void
3264 pmap_sync_page_attributes_phys(ppnum_t pa)
3265 {
3266 cache_flush_page_phys(pa);
3267 }
3268
3269
3270
3271 #ifdef CURRENTLY_UNUSED_AND_UNTESTED
3272
3273 int collect_ref;
3274 int collect_unref;
3275
3276 /*
3277 * Routine: pmap_collect
3278 * Function:
3279 * Garbage collects the physical map system for
3280 * pages which are no longer used.
3281 * Success need not be guaranteed -- that is, there
3282 * may well be pages which are not referenced, but
3283 * others may be collected.
3284 * Usage:
3285 * Called by the pageout daemon when pages are scarce.
3286 */
3287 void
3288 pmap_collect(
3289 pmap_t p)
3290 {
3291 register pt_entry_t *pdp, *ptp;
3292 pt_entry_t *eptp;
3293 int wired;
3294
3295 if (p == PMAP_NULL)
3296 return;
3297
3298 if (p == kernel_pmap)
3299 return;
3300
3301 /*
3302 * Garbage collect map.
3303 */
3304 PMAP_LOCK(p);
3305
3306 for (pdp = (pt_entry_t *)p->dirbase;
3307 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
3308 pdp++)
3309 {
3310 if (*pdp & INTEL_PTE_VALID) {
3311 if(*pdp & INTEL_PTE_REF) {
3312 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
3313 collect_ref++;
3314 } else {
3315 collect_unref++;
3316 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
3317 eptp = ptp + NPTEPG;
3318
3319 /*
3320 * If the pte page has any wired mappings, we cannot
3321 * free it.
3322 */
3323 wired = 0;
3324 {
3325 register pt_entry_t *ptep;
3326 for (ptep = ptp; ptep < eptp; ptep++) {
3327 if (iswired(*ptep)) {
3328 wired = 1;
3329 break;
3330 }
3331 }
3332 }
3333 if (!wired) {
3334 /*
3335 * Remove the virtual addresses mapped by this pte page.
3336 */
3337 pmap_remove_range(p,
3338 pdetova(pdp - (pt_entry_t *)p->dirbase),
3339 ptp,
3340 eptp);
3341
3342 /*
3343 * Invalidate the page directory pointer.
3344 */
3345 pmap_store_pte(pdp, 0x0);
3346
3347 PMAP_UNLOCK(p);
3348
3349 /*
3350 * And free the pte page itself.
3351 */
3352 {
3353 register vm_page_t m;
3354
3355 vm_object_lock(p->pm_obj);
3356
3357 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
3358 if (m == VM_PAGE_NULL)
3359 panic("pmap_collect: pte page not in object");
3360
3361 VM_PAGE_FREE(m);
3362
3363 OSAddAtomic(-1, &inuse_ptepages_count);
3364
3365 vm_object_unlock(p->pm_obj);
3366 }
3367
3368 PMAP_LOCK(p);
3369 }
3370 }
3371 }
3372 }
3373
3374 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
3375 PMAP_UNLOCK(p);
3376 return;
3377
3378 }
3379 #endif
3380
3381
3382 void
3383 pmap_copy_page(ppnum_t src, ppnum_t dst)
3384 {
3385 bcopy_phys((addr64_t)i386_ptob(src),
3386 (addr64_t)i386_ptob(dst),
3387 PAGE_SIZE);
3388 }
3389
3390
3391 /*
3392 * Routine: pmap_pageable
3393 * Function:
3394 * Make the specified pages (by pmap, offset)
3395 * pageable (or not) as requested.
3396 *
3397 * A page which is not pageable may not take
3398 * a fault; therefore, its page table entry
3399 * must remain valid for the duration.
3400 *
3401 * This routine is merely advisory; pmap_enter
3402 * will specify that these pages are to be wired
3403 * down (or not) as appropriate.
3404 */
3405 void
3406 pmap_pageable(
3407 __unused pmap_t pmap,
3408 __unused vm_map_offset_t start_addr,
3409 __unused vm_map_offset_t end_addr,
3410 __unused boolean_t pageable)
3411 {
3412 #ifdef lint
3413 pmap++; start_addr++; end_addr++; pageable++;
3414 #endif /* lint */
3415 }
3416
3417 /*
3418 * Clear specified attribute bits.
3419 */
3420 void
3421 phys_attribute_clear(
3422 ppnum_t pn,
3423 int bits)
3424 {
3425 pv_rooted_entry_t pv_h;
3426 register pv_hashed_entry_t pv_e;
3427 register pt_entry_t *pte;
3428 int pai;
3429 register pmap_t pmap;
3430
3431 pmap_intr_assert();
3432 assert(pn != vm_page_fictitious_addr);
3433 if (pn == vm_page_guard_addr)
3434 return;
3435
3436 pai = ppn_to_pai(pn);
3437
3438 if (!managed_page(pai)) {
3439 /*
3440 * Not a managed page.
3441 */
3442 return;
3443 }
3444
3445
3446 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
3447 (int) pn, bits, 0, 0, 0);
3448
3449 pv_h = pai_to_pvh(pai);
3450
3451 LOCK_PVH(pai);
3452
3453 /*
3454 * Walk down PV list, clearing all modify or reference bits.
3455 * We do not have to lock the pv_list because we have
3456 * the entire pmap system locked.
3457 */
3458 if (pv_h->pmap != PMAP_NULL) {
3459 /*
3460 * There are some mappings.
3461 */
3462
3463 pv_e = (pv_hashed_entry_t)pv_h;
3464
3465 do {
3466 pmap = pv_e->pmap;
3467
3468 {
3469 vm_map_offset_t va;
3470
3471 va = pv_e->va;
3472
3473 /*
3474 * Clear modify and/or reference bits.
3475 */
3476
3477 pte = pmap_pte(pmap, va);
3478 pmap_update_pte(pte, *pte, (*pte & ~bits));
3479 /* Ensure all processors using this translation
3480 * invalidate this TLB entry. The invalidation *must* follow
3481 * the PTE update, to ensure that the TLB shadow of the
3482 * 'D' bit (in particular) is synchronized with the
3483 * updated PTE.
3484 */
3485 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3486 }
3487
3488 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3489
3490 } while (pv_e != (pv_hashed_entry_t)pv_h);
3491 }
3492 pmap_phys_attributes[pai] &= ~bits;
3493
3494 UNLOCK_PVH(pai);
3495
3496 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3497 0, 0, 0, 0, 0);
3498
3499 }
3500
3501 /*
3502 * Check specified attribute bits.
3503 */
3504 int
3505 phys_attribute_test(
3506 ppnum_t pn,
3507 int bits)
3508 {
3509 pv_rooted_entry_t pv_h;
3510 register pv_hashed_entry_t pv_e;
3511 register pt_entry_t *pte;
3512 int pai;
3513 register pmap_t pmap;
3514 int attributes = 0;
3515
3516 pmap_intr_assert();
3517 assert(pn != vm_page_fictitious_addr);
3518 if (pn == vm_page_guard_addr)
3519 return 0;
3520
3521 pai = ppn_to_pai(pn);
3522
3523 if (!managed_page(pai)) {
3524 /*
3525 * Not a managed page.
3526 */
3527 return (0);
3528 }
3529
3530 /*
3531 * Super-fast check: if the bits have already been collected,
3532 * there is no need to take any locks.
3533 * If they are not set, we need to recheck after taking
3534 * the lock, in case they got pulled in while
3535 * we were waiting for it.
3536 */
3537 if ( (pmap_phys_attributes[pai] & bits) == bits)
3538 return (bits);
3539
3540 pv_h = pai_to_pvh(pai);
3541
3542 LOCK_PVH(pai);
3543
3544 attributes = pmap_phys_attributes[pai] & bits;
3545
3546
3547 /*
3548 * Walk down PV list, checking the mappings until we
3549 * reach the end or we've found the attributes we've asked for
3550 * We do not have to lock the pv_list because we have
3551 * the entire pmap system locked.
3552 */
3553 if (pv_h->pmap != PMAP_NULL) {
3554 /*
3555 * There are some mappings.
3556 */
3557 pv_e = (pv_hashed_entry_t)pv_h;
3558 if (attributes != bits) do {
3559
3560 pmap = pv_e->pmap;
3561
3562 {
3563 vm_map_offset_t va;
3564
3565 va = pv_e->va;
3566 /*
3567 * first make sure any processor actively
3568 * using this pmap, flushes its TLB state
3569 */
3570 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3571
3572 /*
3573 * pick up modify and/or reference bits from this mapping
3574 */
3575 pte = pmap_pte(pmap, va);
3576 attributes |= (int)(*pte & bits);
3577
3578 }
3579
3580 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3581
3582 } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h));
3583 }
3584
3585 UNLOCK_PVH(pai);
3586 return (attributes);
3587 }
3588
3589 /*
3590 * Set specified attribute bits.
3591 */
3592 void
3593 phys_attribute_set(
3594 ppnum_t pn,
3595 int bits)
3596 {
3597 int pai;
3598
3599 pmap_intr_assert();
3600 assert(pn != vm_page_fictitious_addr);
3601 if (pn == vm_page_guard_addr)
3602 return;
3603
3604 pai = ppn_to_pai(pn);
3605
3606 if (!managed_page(pai)) {
3607 /*
3608 * Not a managed page.
3609 */
3610 return;
3611 }
3612
3613 LOCK_PVH(pai);
3614
3615 pmap_phys_attributes[pai] |= bits;
3616
3617 UNLOCK_PVH(pai);
3618 }
3619
3620 /*
3621 * Set the modify bit on the specified physical page.
3622 */
3623
3624 void pmap_set_modify(
3625 ppnum_t pn)
3626 {
3627 phys_attribute_set(pn, PHYS_MODIFIED);
3628 }
3629
3630 /*
3631 * Clear the modify bits on the specified physical page.
3632 */
3633
3634 void
3635 pmap_clear_modify(
3636 ppnum_t pn)
3637 {
3638 phys_attribute_clear(pn, PHYS_MODIFIED);
3639 }
3640
3641 /*
3642 * pmap_is_modified:
3643 *
3644 * Return whether or not the specified physical page is modified
3645 * by any physical maps.
3646 */
3647
3648 boolean_t
3649 pmap_is_modified(
3650 ppnum_t pn)
3651 {
3652 if (phys_attribute_test(pn, PHYS_MODIFIED))
3653 return TRUE;
3654
3655 return FALSE;
3656 }
3657
3658 /*
3659 * pmap_clear_reference:
3660 *
3661 * Clear the reference bit on the specified physical page.
3662 */
3663
3664 void
3665 pmap_clear_reference(
3666 ppnum_t pn)
3667 {
3668 phys_attribute_clear(pn, PHYS_REFERENCED);
3669 }
3670
3671 void
3672 pmap_set_reference(ppnum_t pn)
3673 {
3674 phys_attribute_set(pn, PHYS_REFERENCED);
3675 }
3676
3677 /*
3678 * pmap_is_referenced:
3679 *
3680 * Return whether or not the specified physical page is referenced
3681 * by any physical maps.
3682 */
3683
3684 boolean_t
3685 pmap_is_referenced(
3686 ppnum_t pn)
3687 {
3688 if (phys_attribute_test(pn, PHYS_REFERENCED))
3689 return TRUE;
3690
3691 return FALSE;
3692 }
3693
3694 /*
3695 * pmap_get_refmod(phys)
3696 * returns the referenced and modified bits of the specified
3697 * physical page.
3698 */
3699 unsigned int
3700 pmap_get_refmod(ppnum_t pa)
3701 {
3702 int refmod;
3703 unsigned int retval = 0;
3704
3705 refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);
3706
3707 if (refmod & PHYS_MODIFIED)
3708 retval |= VM_MEM_MODIFIED;
3709 if (refmod & PHYS_REFERENCED)
3710 retval |= VM_MEM_REFERENCED;
3711
3712 return (retval);
3713 }
3714
3715 /*
3716 * pmap_clear_refmod(phys, mask)
3717 * clears the referenced and modified bits as specified by the mask
3718 * of the specified physical page.
3719 */
3720 void
3721 pmap_clear_refmod(ppnum_t pa, unsigned int mask)
3722 {
3723 unsigned int x86Mask;
3724
3725 x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
3726 | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3727 phys_attribute_clear(pa, x86Mask);
3728 }
3729
3730 void
3731 invalidate_icache(__unused vm_offset_t addr,
3732 __unused unsigned cnt,
3733 __unused int phys)
3734 {
3735 return;
3736 }
3737 void
3738 flush_dcache(__unused vm_offset_t addr,
3739 __unused unsigned count,
3740 __unused int phys)
3741 {
3742 return;
3743 }
3744
3745 #if CONFIG_DTRACE
3746 /*
3747 * Constrain DTrace copyin/copyout actions
3748 */
3749 extern kern_return_t dtrace_copyio_preflight(addr64_t);
3750 extern kern_return_t dtrace_copyio_postflight(addr64_t);
3751
3752 kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3753 {
3754 thread_t thread = current_thread();
3755
3756 if (current_map() == kernel_map)
3757 return KERN_FAILURE;
3758 else if (thread->machine.specFlags & CopyIOActive)
3759 return KERN_FAILURE;
3760 else
3761 return KERN_SUCCESS;
3762 }
3763
3764 kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3765 {
3766 return KERN_SUCCESS;
3767 }
3768 #endif /* CONFIG_DTRACE */
3769
3770 #if MACH_KDB
3771
3772 /* show phys page mappings and attributes */
3773
3774 extern void db_show_page(pmap_paddr_t pa);
3775
3776 #if 0
3777 void
3778 db_show_page(pmap_paddr_t pa)
3779 {
3780 pv_entry_t pv_h;
3781 int pai;
3782 char attr;
3783
3784 pai = pa_index(pa);
3785 pv_h = pai_to_pvh(pai);
3786
3787 attr = pmap_phys_attributes[pai];
3788 printf("phys page %llx ", pa);
3789 if (attr & PHYS_MODIFIED)
3790 printf("modified, ");
3791 if (attr & PHYS_REFERENCED)
3792 printf("referenced, ");
3793 if (pv_h->pmap || pv_h->next)
3794 printf(" mapped at\n");
3795 else
3796 printf(" not mapped\n");
3797 for (; pv_h; pv_h = pv_h->next)
3798 if (pv_h->pmap)
3799 printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
3800 }
3801 #endif
3802
3803 #endif /* MACH_KDB */
3804
3805 #if MACH_KDB
3806 #if 0
3807 void db_kvtophys(vm_offset_t);
3808 void db_show_vaddrs(pt_entry_t *);
3809
3810 /*
3811 * print out the results of kvtophys(arg)
3812 */
3813 void
3814 db_kvtophys(
3815 vm_offset_t vaddr)
3816 {
3817 db_printf("0x%qx", kvtophys(vaddr));
3818 }
3819
3820 /*
3821 * Walk the page tables.
3822 */
3823 void
3824 db_show_vaddrs(
3825 pt_entry_t *dirbase)
3826 {
3827 pt_entry_t *ptep, *pdep, tmp;
3828 unsigned int x, y, pdecnt, ptecnt;
3829
3830 if (dirbase == 0) {
3831 dirbase = kernel_pmap->dirbase;
3832 }
3833 if (dirbase == 0) {
3834 db_printf("need a dirbase...\n");
3835 return;
3836 }
3837 dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);
3838
3839 db_printf("dirbase: 0x%x\n", dirbase);
3840
3841 pdecnt = ptecnt = 0;
3842 pdep = &dirbase[0];
3843 for (y = 0; y < NPDEPG; y++, pdep++) {
3844 if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
3845 continue;
3846 }
3847 pdecnt++;
3848 ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
3849 db_printf("dir[%4d]: 0x%x\n", y, *pdep);
3850 for (x = 0; x < NPTEPG; x++, ptep++) {
3851 if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
3852 continue;
3853 }
3854 ptecnt++;
3855 db_printf(" tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
3856 x,
3857 *ptep,
3858 (y << 22) | (x << 12),
3859 *ptep & ~INTEL_OFFMASK);
3860 }
3861 }
3862
3863 db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
3864
3865 }
3866 #endif
3867 #endif /* MACH_KDB */
3868
3869 #include <mach_vm_debug.h>
3870 #if MACH_VM_DEBUG
3871 #include <vm/vm_debug.h>
3872
3873 int
3874 pmap_list_resident_pages(
3875 __unused pmap_t pmap,
3876 __unused vm_offset_t *listp,
3877 __unused int space)
3878 {
3879 return 0;
3880 }
3881 #endif /* MACH_VM_DEBUG */
3882
3883
3884
3885 /* temporary workaround */
3886 boolean_t
3887 coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
3888 {
3889 #if 0
3890 pt_entry_t *ptep;
3891
3892 ptep = pmap_pte(map->pmap, va);
3893 if (0 == ptep)
3894 return FALSE;
3895 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
3896 #else
3897 return TRUE;
3898 #endif
3899 }
3900
3901
3902 boolean_t
3903 phys_page_exists(
3904 ppnum_t pn)
3905 {
3906 assert(pn != vm_page_fictitious_addr);
3907
3908 if (!pmap_initialized)
3909 return (TRUE);
3910
3911 if (pn == vm_page_guard_addr)
3912 return FALSE;
3913
3914 if (!managed_page(ppn_to_pai(pn)))
3915 return (FALSE);
3916
3917 return TRUE;
3918 }
3919
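/*
 * Pre-allocate pv_hashed_entry's at startup: a large chunk for the
 * general free list and a smaller one for the kernel-private list,
 * priming the lists that pmap_enter() draws from.
 */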
3920 void
3921 mapping_free_prime(void)
3922 {
3923 int i;
3924 pv_hashed_entry_t pvh_e;
3925 pv_hashed_entry_t pvh_eh;
3926 pv_hashed_entry_t pvh_et;
3927 int pv_cnt;
3928
3929 pv_cnt = 0;
3930 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3931 for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
3932 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3933
3934 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3935 pvh_eh = pvh_e;
3936
3937 if (pvh_et == PV_HASHED_ENTRY_NULL)
3938 pvh_et = pvh_e;
3939 pv_cnt++;
3940 }
3941 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3942
3943 pv_cnt = 0;
3944 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3945 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3946 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3947
3948 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3949 pvh_eh = pvh_e;
3950
3951 if (pvh_et == PV_HASHED_ENTRY_NULL)
3952 pvh_et = pvh_e;
3953 pv_cnt++;
3954 }
3955 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3956
3957 }
3958
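/*
 * Replenish the kernel and general pv_hashed_entry free lists when
 * they drop below their low-water marks; runs from a thread call,
 * which is set up here on first use.
 */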
3959 void
3960 mapping_adjust(void)
3961 {
3962 pv_hashed_entry_t pvh_e;
3963 pv_hashed_entry_t pvh_eh;
3964 pv_hashed_entry_t pvh_et;
3965 int pv_cnt;
3966 int i;
3967
3968 if (mapping_adjust_call == NULL) {
3969 thread_call_setup(&mapping_adjust_call_data,
3970 (thread_call_func_t) mapping_adjust,
3971 (thread_call_param_t) NULL);
3972 mapping_adjust_call = &mapping_adjust_call_data;
3973 }
3974
3975 pv_cnt = 0;
3976 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3977 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
3978 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3979 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3980
3981 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3982 pvh_eh = pvh_e;
3983
3984 if (pvh_et == PV_HASHED_ENTRY_NULL)
3985 pvh_et = pvh_e;
3986 pv_cnt++;
3987 }
3988 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3989 }
3990
3991 pv_cnt = 0;
3992 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3993 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
3994 for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
3995 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3996
3997 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3998 pvh_eh = pvh_e;
3999
4000 if (pvh_et == PV_HASHED_ENTRY_NULL)
4001 pvh_et = pvh_e;
4002 pv_cnt++;
4003 }
4004 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4005 }
4006 mappingrecurse = 0;
4007 }
4008
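/*
 * Mirror the 32-bit commpage: copy the kernel's commpage PTEs to the
 * user-visible alias, marked user-accessible, global and read-only.
 */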
4009 void
4010 pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
4011 {
4012 int i;
4013 pt_entry_t *opte, *npte;
4014 pt_entry_t pte;
4015 spl_t s;
4016
4017 for (i = 0; i < cnt; i++) {
4018 s = splhigh();
4019 opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
4020 if (0 == opte)
4021 panic("kernel_commpage");
4022 pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
4023 pte &= ~INTEL_PTE_WRITE; // ensure read only
4024 npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
4025 if (0 == npte)
4026 panic("user_commpage");
4027 pmap_store_pte(npte, pte);
4028 splx(s);
4029 kernel_commpage += INTEL_PGBYTES;
4030 user_commpage += INTEL_PGBYTES;
4031 }
4032 }
4033
4034
4035 #define PMAP_COMMPAGE64_CNT (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
4036 pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];
4037
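/*
 * pmap_commpage64_init() records, in pmap_commpage64_ptes[], a read-only,
 * user-accessible copy of each kernel PTE backing the 64-bit commpage,
 * so those template PTEs can be installed when the commpage is mapped
 * into 64-bit user address spaces elsewhere in the pmap code.
 */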
4038 void
4039 pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
4040 {
4041 int i;
4042 pt_entry_t *kptep;
4043
4044 PMAP_LOCK(kernel_pmap);
4045
4046 for (i = 0; i < cnt; i++) {
4047 kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
4048 if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
4049 panic("pmap_commpage64_init pte");
4050 pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
4051 }
4052 PMAP_UNLOCK(kernel_pmap);
4053 }
4054
4055
4056 static cpu_pmap_t cpu_pmap_master;
4057
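/*
 * pmap_cpu_alloc() returns the per-cpu pmap data for a processor: the
 * boot cpu uses the statically allocated cpu_pmap_master, while other
 * cpus get a zeroed cpu_pmap_t from kernel_map plus a PMAP_NWINDOWS-page
 * kernel virtual range whose PTEs back the cpu-private copy/zero
 * map windows.
 */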
4058 struct cpu_pmap *
4059 pmap_cpu_alloc(boolean_t is_boot_cpu)
4060 {
4061 int ret;
4062 int i;
4063 cpu_pmap_t *cp;
4064 vm_offset_t address;
4065 vm_map_address_t mapaddr;
4066 vm_map_entry_t entry;
4067 pt_entry_t *pte;
4068
4069 if (is_boot_cpu) {
4070 cp = &cpu_pmap_master;
4071 } else {
4072 /*
4073 * The per-cpu pmap data structure itself.
4074 */
4075 ret = kmem_alloc(kernel_map,
4076 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
4077 if (ret != KERN_SUCCESS) {
4078 printf("pmap_cpu_alloc() failed ret=%d\n", ret);
4079 return NULL;
4080 }
4081 bzero((void *)cp, sizeof(cpu_pmap_t));
4082
4083 /*
4084 * The temporary windows used for copy/zero - see loose_ends.c
4085 */
4086 ret = vm_map_find_space(kernel_map,
4087 &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
4088 if (ret != KERN_SUCCESS) {
4089 printf("pmap_cpu_alloc() "
4090 "vm_map_find_space ret=%d\n", ret);
4091 pmap_cpu_free(cp);
4092 return NULL;
4093 }
4094 address = (vm_offset_t)mapaddr;
4095
4096 for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
4097 spl_t s;
4098 s = splhigh();
4099 while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
4100 pmap_expand(kernel_pmap, (vm_map_offset_t)address);
4101 *(int *) pte = 0; /* window starts out invalid */
4102 cp->mapwindow[i].prv_CADDR = (caddr_t) address;
4103 cp->mapwindow[i].prv_CMAP = pte;
4104 splx(s);
4105 }
4106 vm_map_unlock(kernel_map);
4107 }
4108
4109 cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
4110 cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
4111 cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;
4112
4113 return cp;
4114 }
4115
4116 void
4117 pmap_cpu_free(struct cpu_pmap *cp)
4118 {
4119 if (cp != NULL && cp != &cpu_pmap_master) {
4120 kfree((void *) cp, sizeof(cpu_pmap_t));
4121 }
4122 }
4123
4124
4125 mapwindow_t *
4126 pmap_get_mapwindow(pt_entry_t pentry)
4127 {
4128 mapwindow_t *mp;
4129 int i;
4130
4131 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4132
4133 /*
4134 * Note: 0th map reserved for pmap_pte()
4135 */
4136 for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
4137 mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];
4138
4139 if (*mp->prv_CMAP == 0) {
4140 pmap_store_pte(mp->prv_CMAP, pentry);
4141
4142 invlpg((uintptr_t)mp->prv_CADDR);
4143
4144 return (mp);
4145 }
4146 }
4147 panic("pmap_get_mapwindow: no windows available");
4148
4149 return NULL;
4150 }
4151
4152
4153 void
4154 pmap_put_mapwindow(mapwindow_t *mp)
4155 {
4156 pmap_store_pte(mp->prv_CMAP, 0);
4157 }
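
/*
 * Typical map-window usage (an illustrative sketch, not code from this
 * file -- see the copy/zero routines in i386/loose_ends.c; pn stands
 * for some ppnum_t of interest): with preemption disabled, install a
 * PTE for the physical page, access it through the window's virtual
 * address, then release the window.
 *
 *	mapwindow_t *map;
 *
 *	mp_disable_preemption();
 *	map = pmap_get_mapwindow((pt_entry_t)(i386_ptob(pn) |
 *			INTEL_PTE_VALID | INTEL_PTE_RW |
 *			INTEL_PTE_REF | INTEL_PTE_MOD));
 *	bzero((void *) map->prv_CADDR, PAGE_SIZE);
 *	pmap_put_mapwindow(map);
 *	mp_enable_preemption();
 */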
4158
4159 void
4160 pmap_switch(pmap_t tpmap)
4161 {
4162 spl_t s;
4163
4164 s = splhigh(); /* Make sure interrupts are disabled */
4165
4166 set_dirbase(tpmap, current_thread());
4167
4168 splx(s);
4169 }
4170
4171
4172 /*
4173 * disable no-execute capability on
4174 * the specified pmap
4175 */
4176 void pmap_disable_NX(pmap_t pmap)
4177 {
4178 pmap->nx_enabled = 0;
4179 }
4180
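/*
 * pt_fake_zone_info() reports page-table page consumption in the same
 * shape as a zone: inuse_ptepages_count elements of PAGE_SIZE each,
 * with the maximum scaled by the pages that could still be drawn from
 * the active, inactive and free queues.
 */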
4181 void
4182 pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
4183 vm_size_t *alloc_size, int *collectable, int *exhaustable)
4184 {
4185 *count = inuse_ptepages_count;
4186 *cur_size = PAGE_SIZE * inuse_ptepages_count;
4187 *max_size = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
4188 *elem_size = PAGE_SIZE;
4189 *alloc_size = PAGE_SIZE;
4190
4191 *collectable = 1;
4192 *exhaustable = 0;
4193 }
4194
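/*
 * The pmap_*high_map* helpers manage the per-cpu slots in the high
 * fixed-address mapping area: pmap_cpu_high_map_vaddr() and
 * pmap_high_map_vaddr() translate a (cpu, high_cpu_types) pair into its
 * reserved virtual address, and pmap_high_map() additionally installs
 * the supplied PTE there and invalidates the TLB entry for that page.
 */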
4195 vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
4196 {
4197 enum high_fixed_addresses a;
4198 a = e + HIGH_CPU_END * cpu;
4199 return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4200 }
4201
4202 vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
4203 {
4204 return pmap_cpu_high_map_vaddr(cpu_number(), e);
4205 }
4206
4207 vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
4208 {
4209 enum high_fixed_addresses a;
4210 vm_offset_t vaddr;
4211
4212 a = e + HIGH_CPU_END * cpu_number();
4213 vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4214 pmap_store_pte(pte_unique_base + a, pte);
4215
4216 /* TLB flush for this page for this cpu */
4217 invlpg((uintptr_t)vaddr);
4218
4219 return vaddr;
4220 }
4221
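/*
 * pmap_cpuset_NMIPI() sends an NMI to every cpu in the given mask (used
 * below when a TLB-flush request times out) and then spins for
 * LockTimeOut so the targets have a chance to react before the caller
 * panics.
 */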
4222 static inline void
4223 pmap_cpuset_NMIPI(cpu_set cpu_mask) {
4224 unsigned int cpu, cpu_bit;
4225 uint64_t deadline;
4226
4227 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4228 if (cpu_mask & cpu_bit)
4229 cpu_NMI_interrupt(cpu);
4230 }
4231 deadline = mach_absolute_time() + (LockTimeOut);
4232 while (mach_absolute_time() < deadline)
4233 cpu_pause();
4234 }
4235
4236 /*
4237 * Called with pmap locked, we:
4238 * - scan through per-cpu data to see which other cpus need to be flushed
4239 * - send an IPI to each non-idle cpu to be flushed
4240 * - wait for each to acknowledge, or until we see that it is inactive,
4241 * in an interrupt handler, or otherwise at a safe point
4242 * - flush the local tlb if it is active for this pmap
4243 * - return ... the caller will unlock the pmap
4244 */
4245 void
4246 pmap_flush_tlbs(pmap_t pmap)
4247 {
4248 unsigned int cpu;
4249 unsigned int cpu_bit;
4250 cpu_set cpus_to_signal;
4251 unsigned int my_cpu = cpu_number();
4252 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
4253 boolean_t flush_self = FALSE;
4254 uint64_t deadline;
4255
4256 assert((processor_avail_count < 2) ||
4257 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
4258
4259 /*
4260 * Scan other cpus for matching active or task CR3.
4261 * For idle cpus (with no active map) we mark them invalid but
4262 * don't signal -- they'll check as they go busy.
4263 * Note: for the kernel pmap we look for 64-bit shared address maps.
4264 */
4265 cpus_to_signal = 0;
4266 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4267 if (!cpu_datap(cpu)->cpu_running)
4268 continue;
4269 if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
4270 (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
4271 (pmap->pm_shared) ||
4272 ((pmap == kernel_pmap) &&
4273 (!CPU_CR3_IS_ACTIVE(cpu) ||
4274 cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
4275 if (cpu == my_cpu) {
4276 flush_self = TRUE;
4277 continue;
4278 }
4279 cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
4280 __asm__ volatile("mfence");
4281
4282 if (CPU_CR3_IS_ACTIVE(cpu)) {
4283 cpus_to_signal |= cpu_bit;
4284 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
4285 }
4286 }
4287 }
4288
4289 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
4290 (int) pmap, cpus_to_signal, flush_self, 0, 0);
4291
4292 if (cpus_to_signal) {
4293 cpu_set cpus_to_respond = cpus_to_signal;
4294
4295 deadline = mach_absolute_time() + LockTimeOut;
4296 /*
4297 * Wait for those other cpus to acknowledge
4298 */
4299 while (cpus_to_respond != 0) {
4300 if (mach_absolute_time() > deadline) {
4301 if (mp_recent_debugger_activity())
4302 continue;
4303 if (!panic_active()) {
4304 pmap_tlb_flush_timeout = TRUE;
4305 pmap_cpuset_NMIPI(cpus_to_respond);
4306 }
4307 panic("pmap_flush_tlbs() timeout: "
4308 "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
4309 pmap, cpus_to_respond);
4310 }
4311
4312 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4313 if ((cpus_to_respond & cpu_bit) != 0) {
4314 if (!cpu_datap(cpu)->cpu_running ||
4315 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
4316 !CPU_CR3_IS_ACTIVE(cpu)) {
4317 cpus_to_respond &= ~cpu_bit;
4318 }
4319 cpu_pause();
4320 }
4321 if (cpus_to_respond == 0)
4322 break;
4323 }
4324 }
4325 }
4326 /*
4327 * Flush local tlb if required.
4328 * We need this flush even if the pmap being changed
4329 * is the user map... in case we do a copyin/out
4330 * before returning to user mode.
4331 */
4332 if (flush_self)
4333 flush_tlb();
4334
4335 if ((pmap == kernel_pmap) && (flush_self != TRUE)) {
4336 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
4337 }
4338
4339 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
4340 (int) pmap, cpus_to_signal, flush_self, 0, 0);
4341 }
4342
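/*
 * process_pmap_updates() is the receiving side of the TLB shootdown: it
 * runs on a cpu that was signalled (or that noticed cpu_tlb_invalid on
 * its own), flushes the local TLB, and clears the flag behind a fence so
 * the initiator in pmap_flush_tlbs() can observe the acknowledgement.
 */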
4343 void
4344 process_pmap_updates(void)
4345 {
4346 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4347
4348 flush_tlb();
4349
4350 current_cpu_datap()->cpu_tlb_invalid = FALSE;
4351 __asm__ volatile("mfence");
4352 }
4353
4354 void
4355 pmap_update_interrupt(void)
4356 {
4357 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
4358 0, 0, 0, 0, 0);
4359
4360 process_pmap_updates();
4361
4362 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
4363 0, 0, 0, 0, 0);
4364 }
4365
4366
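/*
 * pmap_cache_attributes(): pages the pmap does not manage (device or
 * otherwise unmanaged physical addresses) are treated as uncached I/O
 * space; everything else defaults to write-back cacheable.
 */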
4367 unsigned int pmap_cache_attributes(ppnum_t pn)
4368 {
4369 if (!managed_page(ppn_to_pai(pn)))
4370 return (VM_WIMG_IO);
4371
4372 return (VM_WIMG_COPYBACK);
4373 }
4374
4375 #ifdef PMAP_DEBUG
4376 void
4377 pmap_dump(pmap_t p)
4378 {
4379 int i;
4380
4381 kprintf("pmap %p\n", p);
4382
4383 kprintf(" pm_cr3 0x%llx\n", p->pm_cr3);
4384 kprintf(" pm_pml4 %p\n", p->pm_pml4);
4385 kprintf(" pm_pdpt %p\n", p->pm_pdpt);
4386
4387 kprintf(" pml4[0] 0x%llx\n",*p->pm_pml4);
4388 for (i=0;i<8;i++)
4389 kprintf(" pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
4390 }
4391
4392 void pmap_dump_wrap(void)
4393 {
4394 pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
4395 }
4396
4397 void
4398 dump_4GB_pdpt(pmap_t p)
4399 {
4400 spl_t spl;
4401 pdpt_entry_t *user_pdptp;
4402 pdpt_entry_t *kern_pdptp;
4403 pdpt_entry_t *pml4p;
4404
4405 spl = splhigh();
4406 while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
4407 splx(spl);
4408 pmap_expand_pml4(p, 0x0);
4409 spl = splhigh();
4410 }
4411 kern_pdptp = kernel_pmap->pm_pdpt;
4412 if (kern_pdptp == NULL)
4413 panic("kern_pdptp == NULL");
4414 kprintf("dump_4GB_pdpt(%p)\n"
4415 "kern_pdptp=%p (phys=0x%016llx)\n"
4416 "\t 0x%08x: 0x%016llx\n"
4417 "\t 0x%08x: 0x%016llx\n"
4418 "\t 0x%08x: 0x%016llx\n"
4419 "\t 0x%08x: 0x%016llx\n"
4420 "\t 0x%08x: 0x%016llx\n"
4421 "user_pdptp=%p (phys=0x%016llx)\n"
4422 "\t 0x%08x: 0x%016llx\n"
4423 "\t 0x%08x: 0x%016llx\n"
4424 "\t 0x%08x: 0x%016llx\n"
4425 "\t 0x%08x: 0x%016llx\n"
4426 "\t 0x%08x: 0x%016llx\n",
4427 p, kern_pdptp, kvtophys(kern_pdptp),
4428 kern_pdptp+0, *(kern_pdptp+0),
4429 kern_pdptp+1, *(kern_pdptp+1),
4430 kern_pdptp+2, *(kern_pdptp+2),
4431 kern_pdptp+3, *(kern_pdptp+3),
4432 kern_pdptp+4, *(kern_pdptp+4),
4433 user_pdptp, kvtophys(user_pdptp),
4434 user_pdptp+0, *(user_pdptp+0),
4435 user_pdptp+1, *(user_pdptp+1),
4436 user_pdptp+2, *(user_pdptp+2),
4437 user_pdptp+3, *(user_pdptp+3),
4438 user_pdptp+4, *(user_pdptp+4));
4439 kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4440 p->pm_cr3, p->pm_hold, p->pm_pml4);
4441 pml4p = (pdpt_entry_t *)p->pm_hold;
4442 if (pml4p == NULL)
4443 panic("user pml4p == NULL");
4444 kprintf("\t 0x%08x: 0x%016llx\n"
4445 "\t 0x%08x: 0x%016llx\n",
4446 pml4p+0, *(pml4p),
4447 pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
4448 kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4449 kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
4450 pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
4451 if (pml4p == NULL)
4452 panic("kern pml4p == NULL");
4453 kprintf("\t 0x%08x: 0x%016llx\n"
4454 "\t 0x%08x: 0x%016llx\n",
4455 pml4p+0, *(pml4p),
4456 pml4p+511, *(pml4p+511));
4457 splx(spl);
4458 }
4459
4460 void dump_4GB_pdpt_thread(thread_t tp)
4461 {
4462 dump_4GB_pdpt(tp->map->pmap);
4463 }
4464
4465
4466 #endif
4467