apple/xnu (xnu-1486.2.11) blame view of osfmk/i386/pmap.c
1c79356b 1/*
c910b4d9 2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58
59/*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
1c79356b
A
91#include <string.h>
92#include <norma_vm.h>
93#include <mach_kdb.h>
94#include <mach_ldebug.h>
95
2d21ac55
A
96#include <libkern/OSAtomic.h>
97
1c79356b
A
98#include <mach/machine/vm_types.h>
99
100#include <mach/boolean.h>
101#include <kern/thread.h>
102#include <kern/zalloc.h>
2d21ac55 103#include <kern/queue.h>
1c79356b
A
104
105#include <kern/lock.h>
91447636 106#include <kern/kalloc.h>
1c79356b
A
107#include <kern/spl.h>
108
109#include <vm/pmap.h>
110#include <vm/vm_map.h>
111#include <vm/vm_kern.h>
112#include <mach/vm_param.h>
113#include <mach/vm_prot.h>
114#include <vm/vm_object.h>
115#include <vm/vm_page.h>
116
117#include <mach/machine/vm_param.h>
118#include <machine/thread.h>
119
120#include <kern/misc_protos.h> /* prototyping */
121#include <i386/misc_protos.h>
122
123#include <i386/cpuid.h>
91447636 124#include <i386/cpu_data.h>
55e303ae
A
125#include <i386/cpu_number.h>
126#include <i386/machine_cpu.h>
0c530ab8 127#include <i386/seg.h>
2d21ac55 128#include <i386/serial_io.h>
0c530ab8 129#include <i386/cpu_capabilities.h>
2d21ac55
A
130#include <i386/machine_routines.h>
131#include <i386/proc_reg.h>
132#include <i386/tsc.h>
b0d623f7
A
133#include <i386/acpi.h>
134#include <i386/pmap_internal.h>
1c79356b
A
135
136#if MACH_KDB
137#include <ddb/db_command.h>
138#include <ddb/db_output.h>
139#include <ddb/db_sym.h>
140#include <ddb/db_print.h>
141#endif /* MACH_KDB */
142
91447636
A
143#include <vm/vm_protos.h>
144
145#include <i386/mp.h>
0c530ab8 146#include <i386/mp_desc.h>
b0d623f7 147#include <i386/i386_lowmem.h>
0c530ab8 148
0c530ab8 149
2d21ac55
A
150/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151#ifdef DEBUGINTERRUPTS
152#define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
153#else
154#define pmap_intr_assert()
155#endif
156
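/*
 * Illustrative sketch (hypothetical routine, not part of this file): when
 * DEBUGINTERRUPTS is defined, a pmap entry point is expected to assert
 * before doing any work.
 *
 *	void
 *	pmap_example_op(pmap_t pmap, vm_map_offset_t va)
 *	{
 *		pmap_intr_assert();	// panics if the caller left interrupts
 *					// disabled on a multi-processor system
 *		...
 *	}
 */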
0c530ab8
A
157#ifdef IWANTTODEBUG
158#undef DEBUG
159#define DEBUG 1
160#define POSTCODE_DELAY 1
161#include <i386/postcode.h>
162#endif /* IWANTTODEBUG */
1c79356b
A
163
164/*
165 * Forward declarations for internal functions.
166 */
0c530ab8 167
b0d623f7 168void pmap_remove_range(
1c79356b 169 pmap_t pmap,
0c530ab8 170 vm_map_offset_t va,
1c79356b
A
171 pt_entry_t *spte,
172 pt_entry_t *epte);
173
91447636 174void phys_attribute_clear(
2d21ac55 175 ppnum_t phys,
1c79356b
A
176 int bits);
177
2d21ac55
A
178int phys_attribute_test(
179 ppnum_t phys,
1c79356b
A
180 int bits);
181
91447636 182void phys_attribute_set(
2d21ac55 183 ppnum_t phys,
1c79356b
A
184 int bits);
185
91447636
A
186void pmap_set_reference(
187 ppnum_t pn);
188
91447636
A
189boolean_t phys_page_exists(
190 ppnum_t pn);
1c79356b 191
2d21ac55 192
0c530ab8
A
193#ifdef PMAP_DEBUG
194void dump_pmap(pmap_t);
195void dump_4GB_pdpt(pmap_t p);
196void dump_4GB_pdpt_thread(thread_t tp);
197#endif
1c79356b 198
0c530ab8 199int nx_enabled = 1; /* enable no-execute protection */
4a3eedf9
A
200#ifdef CONFIG_EMBEDDED
201int allow_data_exec = 0; /* no exec from data, embedded is hardcore like that */
202#else
2d21ac55 203int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
4a3eedf9 204#endif
2d21ac55 205int allow_stack_exec = 0; /* No apps may execute from the stack by default */
0c530ab8 206
b0d623f7
A
207boolean_t cpu_64bit = FALSE;
208boolean_t pmap_trace = FALSE;
1c79356b 209
2d21ac55
A
210/*
211 * when spinning through pmap_remove
212 * ensure that we don't spend too much
213 * time with preemption disabled.
214 * I'm setting the current threshold
215 * to 20us
216 */
217#define MAX_PREEMPTION_LATENCY_NS 20000
218
219uint64_t max_preemption_latency_tsc = 0;
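/*
 * Illustrative sketch (assumption; the real check lives in pmap_remove(),
 * which is outside this excerpt): max_preemption_latency_tsc is initialized
 * in pmap_init() via tmrCvt(MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t), and a
 * long-running removal loop is expected to bound its preemption-disabled
 * time against it roughly as follows:
 *
 *	uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;
 *
 *	while (work remains) {
 *		... remove a chunk of mappings ...
 *		if (rdtsc64() > deadline) {
 *			PMAP_UNLOCK(pmap);	// let preemption/interrupts in
 *			PMAP_LOCK(pmap);
 *			deadline = rdtsc64() + max_preemption_latency_tsc;
 *		}
 *	}
 */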
220
55e303ae 221
1c79356b
A
222/*
223 * Private data structures.
224 */
225
226/*
227 * For each vm_page_t, there is a list of all currently
228 * valid virtual mappings of that page. An entry is
2d21ac55
A
229 * a pv_rooted_entry_t; the list is the pv_table.
230 *
231 * N.B. with the new combo rooted/hashed scheme it is
 232 * only possible to remove individual non-rooted entries
233 * if they are found via the hashed chains as there is no
234 * way to unlink the singly linked hashed entries if navigated to
235 * via the queue list off the rooted entries. Think of it as
236 * hash/walk/pull, keeping track of the prev pointer while walking
237 * the singly linked hash list. All of this is to save memory and
238 * keep both types of pv_entries as small as possible.
1c79356b
A
239 */
240
2d21ac55
A
241/*
242
243PV HASHING Changes - JK 1/2007
244
245Pve's establish physical to virtual mappings. These are used for aliasing of a
246physical page to (potentially many) virtual addresses within pmaps. In the previous
247implementation the structure of the pv_entries (each 16 bytes in size) was
248
1c79356b 249typedef struct pv_entry {
2d21ac55
A
250 struct pv_entry_t next;
251 pmap_t pmap;
252 vm_map_offset_t va;
1c79356b
A
253} *pv_entry_t;
254
2d21ac55
A
255An initial array of these is created at boot time, one per physical page of memory,
256indexed by the physical page number. Additionally, a pool of entries is created from a
257pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
258Originally, we kept this pool around because the code in pmap_enter() was unable to
259block if it needed an entry and none were available - we'd panic. Some time ago I
260restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
261a pv structure and restart, removing a panic from the code (in the case of the kernel
 262pmap we cannot block and would still panic, so we keep a separate hot pool for use only on
263kernel pmaps). The pool has not been removed since there is a large performance gain
264keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
265
266As pmap_enter() created new mappings it linked the new pve's for them off the fixed
267pv array for that ppn (off the next pointer). These pve's are accessed for several
268operations, one of them being address space teardown. In that case, we basically do this
269
270 for (every page/pte in the space) {
271 calc pve_ptr from the ppn in the pte
272 for (every pv in the list for the ppn) {
273 if (this pv is for this pmap/vaddr) {
274 do housekeeping
275 unlink/free the pv
276 }
277 }
278 }
279
 280The problem arose when we were running, say, 8000 (or even 2000) apache or other processes
 281and one or all terminated. The list hanging off each pv array entry could have thousands of
 282entries. We were continuously linearly searching each of these lists as we stepped through
 283the address space we were tearing down. Because of the locks we hold, the likely cache
 284miss for each node, and the interrupt disabling needed for MP correctness, the system became
 285completely unresponsive for many seconds while we did this.
286
287Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
288for operations like pmap_page_protect and finding and modifying/removing a single pve as
289part of pmap_enter processing) has led to modifying the pve structures and databases.
290
291There are now two types of pve structures. A "rooted" structure which is basically the
 292original structure accessed in an array by ppn, and a "hashed" structure accessed on a
293hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
294minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
295pages in the system are not aliased and hence represented by a single pv entry I've kept
296the rooted entry size as small as possible because there is one of these dedicated for
297every physical page of memory. The hashed pve's are larger due to the addition of the hash
298link and the ppn entry needed for matching while running the hash list to find the entry we
299are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
300will pay the extra memory price. Both structures have the same first three fields allowing
301some simplification in the code.
302
303They have these shapes
304
305typedef struct pv_rooted_entry {
306 queue_head_t qlink;
307 vm_map_offset_t va;
308 pmap_t pmap;
309} *pv_rooted_entry_t;
310
311
312typedef struct pv_hashed_entry {
313 queue_head_t qlink;
314 vm_map_offset_t va;
315 pmap_t pmap;
316 ppnum_t ppn;
317 struct pv_hashed_entry *nexth;
318} *pv_hashed_entry_t;
319
320The main flow difference is that the code is now aware of the rooted entry and the hashed
321entries. Code that runs the pv list still starts with the rooted entry and then continues
322down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
323checks the rooted entry and then hashes and runs the hash list for the match. The hash list
324lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
1c79356b 325
2d21ac55
A
326*/
327
328typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */
329 queue_head_t qlink;
330 vm_map_offset_t va; /* virtual address for mapping */
331 pmap_t pmap; /* pmap where mapping lies */
332} *pv_rooted_entry_t;
333
334#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
335
336pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
337
338typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */
339 queue_head_t qlink;
340 vm_map_offset_t va;
341 pmap_t pmap;
342 ppnum_t ppn;
343 struct pv_hashed_entry *nexth;
344} *pv_hashed_entry_t;
345
346#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
347
348#define NPVHASH 4095 /* MUST BE 2^N - 1 */
349pv_hashed_entry_t *pv_hash_table; /* hash lists */
350
351uint32_t npvhash = 0;
352
353/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
354#ifdef PV_DEBUG
355#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
356#else
357#define CHK_NPVHASH()
358#endif
1c79356b
A
359
360/*
361 * pv_list entries are kept on a list that can only be accessed
362 * with the pmap system locked (at SPLVM, not in the cpus_active set).
2d21ac55 363 * The list is refilled from the pv_hashed_list_zone if it becomes empty.
1c79356b 364 */
2d21ac55
A
365pv_rooted_entry_t pv_free_list = PV_ROOTED_ENTRY_NULL; /* free list at SPLVM */
366pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
367pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
368decl_simple_lock_data(,pv_hashed_free_list_lock)
369decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
370decl_simple_lock_data(,pv_hash_table_lock)
371
91447636 372int pv_free_count = 0;
2d21ac55
A
373int pv_hashed_free_count = 0;
374int pv_kern_free_count = 0;
375int pv_hashed_kern_free_count = 0;
376#define PV_HASHED_LOW_WATER_MARK 5000
377#define PV_HASHED_KERN_LOW_WATER_MARK 100
378#define PV_HASHED_ALLOC_CHUNK 2000
379#define PV_HASHED_KERN_ALLOC_CHUNK 50
91447636
A
380thread_call_t mapping_adjust_call;
381static thread_call_data_t mapping_adjust_call_data;
2d21ac55
A
382uint32_t mappingrecurse = 0;
383
384#define PV_HASHED_ALLOC(pvh_e) { \
385 simple_lock(&pv_hashed_free_list_lock); \
386 if ((pvh_e = pv_hashed_free_list) != 0) { \
387 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
388 pv_hashed_free_count--; \
389 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
390 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
391 thread_call_enter(mapping_adjust_call); \
392 } \
393 simple_unlock(&pv_hashed_free_list_lock); \
394}
395
396#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
397 simple_lock(&pv_hashed_free_list_lock); \
398 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
399 pv_hashed_free_list = pvh_eh; \
400 pv_hashed_free_count += pv_cnt; \
401 simple_unlock(&pv_hashed_free_list_lock); \
402}
403
404#define PV_HASHED_KERN_ALLOC(pvh_e) { \
405 simple_lock(&pv_hashed_kern_free_list_lock); \
406 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
407 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
408 pv_hashed_kern_free_count--; \
409 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
0c530ab8 410 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
91447636 411 thread_call_enter(mapping_adjust_call); \
1c79356b 412 } \
2d21ac55 413 simple_unlock(&pv_hashed_kern_free_list_lock); \
1c79356b
A
414}
415
2d21ac55
A
416#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
417 simple_lock(&pv_hashed_kern_free_list_lock); \
418 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
419 pv_hashed_kern_free_list = pvh_eh; \
420 pv_hashed_kern_free_count += pv_cnt; \
421 simple_unlock(&pv_hashed_kern_free_list_lock); \
1c79356b
A
422}
423
2d21ac55 424zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
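/*
 * Illustrative sketch (assumption; the real consumer is pmap_enter(), which is
 * outside this excerpt) of how the free-list macros above are meant to be used:
 *
 *	pv_hashed_entry_t pvh_e;
 *
 *	PV_HASHED_ALLOC(pvh_e);			// try the cached free list first
 *	if (pvh_e == PV_HASHED_ENTRY_NULL) {
 *		// user pmaps may block here; the kernel pmap path uses
 *		// PV_HASHED_KERN_ALLOC() instead and panics rather than block
 *		pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *	}
 *	...
 *	PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);	// (head, tail, count) of a chain
 */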
1c79356b 425
91447636 426static zone_t pdpt_zone;
91447636 427
1c79356b
A
428/*
429 * Each entry in the pv_head_table is locked by a bit in the
430 * pv_lock_table. The lock bits are accessed by the physical
431 * address of the page they lock.
432 */
433
434char *pv_lock_table; /* pointer to array of bits */
435#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
436
2d21ac55
A
437char *pv_hash_lock_table;
438#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
439
1c79356b
A
440/*
441 * First and last physical addresses that we maintain any information
442 * for. Initialized to zero so that pmap operations done before
443 * pmap_init won't touch any non-existent structures.
444 */
1c79356b
A
445boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
446
91447636
A
447static struct vm_object kptobj_object_store;
448static vm_object_t kptobj;
91447636 449
1c79356b 450/*
2d21ac55 451 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
1c79356b
A
452 */
453
2d21ac55
A
454#define pa_index(pa) (i386_btop(pa))
455#define ppn_to_pai(ppn) ((int)ppn)
1c79356b
A
456
457#define pai_to_pvh(pai) (&pv_head_table[pai])
458#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
459#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
460
2d21ac55
A
461#define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
462#define pvhash(idx) (&pv_hash_table[idx])
463
464#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
465#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
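/*
 * Illustrative sketch (assumption, not a routine in this file) of the lookup
 * flow the rooted/hashed scheme is designed for: check the rooted entry for
 * the page first, then hash [pmap, va] and walk the (short) chain.
 * LOCK_PV_HASH()/UNLOCK_PV_HASH() are defined further below.
 *
 *	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *
 *	if (pv_h->pmap == pmap && pv_h->va == va) {
 *		... found the rooted entry ...
 *	} else {
 *		int pvhash_idx = pvhashidx(pmap, va);
 *		pv_hashed_entry_t pvh_e;
 *
 *		LOCK_PV_HASH(pvhash_idx);
 *		for (pvh_e = *pvhash(pvhash_idx); pvh_e != PV_HASHED_ENTRY_NULL;
 *		     pvh_e = pvh_e->nexth) {
 *			if (pvh_e->pmap == pmap && pvh_e->va == va && pvh_e->ppn == ppn)
 *				break;		// found the hashed entry
 *		}
 *		UNLOCK_PV_HASH(pvhash_idx);
 *	}
 */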
466
1c79356b
A
467/*
 468 * Array of physical page attributes for managed pages.
469 * One byte per physical page.
470 */
471char *pmap_phys_attributes;
2d21ac55 472unsigned int last_managed_page = 0;
1c79356b
A
473
474/*
475 * Physical page attributes. Copy bits from PTE definition.
476 */
477#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
478#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
2d21ac55 479#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
1c79356b
A
480
481/*
482 * Amount of virtual memory mapped by one
483 * page-directory entry.
484 */
485#define PDE_MAPPED_SIZE (pdetova(1))
0c530ab8 486uint64_t pde_mapped_size;
1c79356b 487
1c79356b
A
488/*
489 * Locking and TLB invalidation
490 */
491
492/*
2d21ac55 493 * Locking Protocols: (changed 2/2007 JK)
1c79356b
A
494 *
495 * There are two structures in the pmap module that need locking:
496 * the pmaps themselves, and the per-page pv_lists (which are locked
497 * by locking the pv_lock_table entry that corresponds to the pv_head
498 * for the list in question.) Most routines want to lock a pmap and
499 * then do operations in it that require pv_list locking -- however
500 * pmap_remove_all and pmap_copy_on_write operate on a physical page
501 * basis and want to do the locking in the reverse order, i.e. lock
502 * a pv_list and then go through all the pmaps referenced by that list.
1c79356b 503 *
2d21ac55
A
 504 * The system-wide pmap lock has been removed. Now, paths take a lock
 505 * on the pmap before changing its 'shape', while the reverse-order lockers
 506 * (coming in by phys ppn) take a lock on the corresponding pv, retest to be
 507 * sure nothing changed during the window before they locked, and can then
 508 * run up/down the pv lists holding the list lock. This also lets the pmap
 509 * layer run (nearly completely) with interrupts enabled, unlike
 510 * previously.
1c79356b 511 */
1c79356b 512
1c79356b 513
2d21ac55
A
514/*
515 * PV locking
516 */
517
518#define LOCK_PVH(index) { \
519 mp_disable_preemption(); \
520 lock_pvh_pai(index); \
1c79356b
A
521}
522
2d21ac55
A
523#define UNLOCK_PVH(index) { \
524 unlock_pvh_pai(index); \
525 mp_enable_preemption(); \
1c79356b
A
526}
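/*
 * Illustrative sketch (assumption, not a routine in this file): physical-page
 * paths are expected to bracket pv-list and attribute-byte access like this.
 *
 *	int pai = ppn_to_pai(pn);
 *
 *	LOCK_PVH(pai);		// per-page bit lock taken, preemption disabled
 *	... walk pai_to_pvh(pai), touch pmap_phys_attributes[pai] ...
 *	UNLOCK_PVH(pai);	// bit lock dropped, preemption re-enabled
 */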
527
2d21ac55
A
528/*
529 * PV hash locking
530 */
1c79356b 531
2d21ac55
A
532#define LOCK_PV_HASH(hash) lock_hash_hash(hash)
533
534#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
1c79356b 535
55e303ae
A
536#if USLOCK_DEBUG
537extern int max_lock_loops;
91447636
A
538#define LOOP_VAR \
539 unsigned int loop_count; \
2d21ac55 540 loop_count = disable_serial_output ? max_lock_loops \
91447636 541 : max_lock_loops*100
55e303ae 542#define LOOP_CHECK(msg, pmap) \
91447636 543 if (--loop_count == 0) { \
55e303ae 544 mp_disable_preemption(); \
0c530ab8
A
545 kprintf("%s: cpu %d pmap %x\n", \
546 msg, cpu_number(), pmap); \
55e303ae
A
547 Debugger("deadlock detection"); \
548 mp_enable_preemption(); \
91447636 549 loop_count = max_lock_loops; \
55e303ae
A
550 }
551#else /* USLOCK_DEBUG */
552#define LOOP_VAR
553#define LOOP_CHECK(msg, pmap)
554#endif /* USLOCK_DEBUG */
1c79356b 555
b0d623f7
A
556unsigned pmap_memory_region_count;
557unsigned pmap_memory_region_current;
1c79356b 558
91447636 559pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
1c79356b
A
560
561/*
562 * Other useful macros.
563 */
91447636 564#define current_pmap() (vm_map_pmap(current_thread()->map))
1c79356b
A
565
566struct pmap kernel_pmap_store;
567pmap_t kernel_pmap;
568
0c530ab8
A
569pd_entry_t high_shared_pde;
570pd_entry_t commpage64_pde;
91447636 571
1c79356b
A
572struct zone *pmap_zone; /* zone of pmap structures */
573
574int pmap_debug = 0; /* flag for debugging prints */
91447636 575
2d21ac55 576unsigned int inuse_ptepages_count = 0;
1c79356b 577
0c530ab8
A
578addr64_t kernel64_cr3;
579boolean_t no_shared_cr3 = FALSE; /* -no_shared_cr3 boot arg */
580
b0d623f7 581
1c79356b
A
582/*
583 * Pmap cache. Cache is threaded through ref_count field of pmap.
584 * Max will eventually be constant -- variable for experimentation.
585 */
586int pmap_cache_max = 32;
587int pmap_alloc_chunk = 8;
588pmap_t pmap_cache_list;
589int pmap_cache_count;
590decl_simple_lock_data(,pmap_cache_lock)
591
1c79356b
A
592extern char end;
593
91447636
A
594static int nkpt;
595
596pt_entry_t *DMAP1, *DMAP2;
597caddr_t DADDR1;
598caddr_t DADDR2;
1c79356b 599
2d21ac55
A
600static inline
601void pmap_pvh_unlink(pv_hashed_entry_t pv);
602
603/*
604 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
605 * properly deals with the anchor.
606 * must be called with the hash locked, does not unlock it
607 */
608
609static inline
610void pmap_pvh_unlink(pv_hashed_entry_t pvh)
611{
612 pv_hashed_entry_t curh;
613 pv_hashed_entry_t *pprevh;
614 int pvhash_idx;
615
616 CHK_NPVHASH();
617 pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
1c79356b 618
2d21ac55
A
619 pprevh = pvhash(pvhash_idx);
620
621#if PV_DEBUG
622 if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */
623#endif
624 curh = *pprevh;
625
626 while (PV_HASHED_ENTRY_NULL != curh) {
627 if (pvh == curh)
628 break;
629 pprevh = &curh->nexth;
630 curh = curh->nexth;
631 }
632 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
633 *pprevh = pvh->nexth;
634 return;
635}
1c79356b 636
0c530ab8
A
637/*
638 * for legacy, returns the address of the pde entry.
639 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
640 * then returns the mapped address of the pde entry in that page
641 */
642pd_entry_t *
643pmap_pde(pmap_t m, vm_map_offset_t v)
4452a7af 644{
0c530ab8
A
645 pd_entry_t *pde;
646 if (!cpu_64bit || (m == kernel_pmap)) {
647 pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
648 } else {
649 assert(m);
650 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
651 pde = pmap64_pde(m, v);
652 }
653 return pde;
4452a7af
A
654}
655
89b3af67 656
4452a7af 657/*
0c530ab8
A
658 * the single pml4 page per pmap is allocated at pmap create time and exists
659 * for the duration of the pmap. we allocate this page in kernel vm (to save us one
 660 * level of page table dynamic mapping).
661 * this returns the address of the requested pml4 entry in the top level page.
4452a7af 662 */
0c530ab8
A
663static inline
664pml4_entry_t *
665pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
666{
667 return ((pml4_entry_t *)pmap->pm_hold + ((vm_offset_t)((vaddr>>PML4SHIFT)&(NPML4PG-1))));
668}
669
670/*
671 * maps in the pml4 page, if any, containing the pdpt entry requested
672 * and returns the address of the pdpt entry in that mapped page
673 */
674pdpt_entry_t *
675pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
676{
677 pml4_entry_t newpf;
678 pml4_entry_t *pml4;
679 int i;
680
681 assert(pmap);
682 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
683 if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
684 return(0);
4452a7af 685 }
0c530ab8
A
686
687 pml4 = pmap64_pml4(pmap, vaddr);
688
689 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
690
691 newpf = *pml4 & PG_FRAME;
692
693
694 for (i=PMAP_PDPT_FIRST_WINDOW; i < PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS; i++) {
695 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
696 return((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
697 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
698 }
699 }
700
701 current_cpu_datap()->cpu_pmap->pdpt_window_index++;
702 if (current_cpu_datap()->cpu_pmap->pdpt_window_index > (PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS-1))
703 current_cpu_datap()->cpu_pmap->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
704 pmap_store_pte(
705 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CMAP),
706 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
707 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR));
708 return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR) +
709 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
710 }
711
2d21ac55 712 return (NULL);
4452a7af
A
713}
714
0c530ab8
A
715/*
716 * maps in the pdpt page, if any, containing the pde entry requested
717 * and returns the address of the pde entry in that mapped page
718 */
719pd_entry_t *
720pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
4452a7af 721{
0c530ab8
A
722 pdpt_entry_t newpf;
723 pdpt_entry_t *pdpt;
724 int i;
4452a7af 725
0c530ab8
A
726 assert(pmap);
727 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
728 if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
729 return(0);
730 }
731
732 /* if (vaddr & (1ULL << 63)) panic("neg addr");*/
733 pdpt = pmap64_pdpt(pmap, vaddr);
734
735 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
736
737 newpf = *pdpt & PG_FRAME;
738
739 for (i=PMAP_PDE_FIRST_WINDOW; i < PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS; i++) {
740 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
741 return((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
742 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
743 }
4452a7af 744 }
0c530ab8
A
745
746 current_cpu_datap()->cpu_pmap->pde_window_index++;
747 if (current_cpu_datap()->cpu_pmap->pde_window_index > (PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS-1))
748 current_cpu_datap()->cpu_pmap->pde_window_index = PMAP_PDE_FIRST_WINDOW;
749 pmap_store_pte(
750 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CMAP),
751 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
752 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR));
753 return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR) +
754 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
21362eb3 755 }
4452a7af 756
2d21ac55 757 return (NULL);
0c530ab8
A
758}
759
2d21ac55
A
760/*
761 * Because the page tables (top 3 levels) are mapped into per cpu windows,
762 * callers must either disable interrupts or disable preemption before calling
763 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
764 * is in one of those mapped windows and that cannot be allowed to change until
765 * the caller is done using the returned pte pointer. When done, the caller
766 * restores interrupts or preemption to its previous state after which point the
767 * vaddr for the returned pte can no longer be used
768 */
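/*
 * Illustrative sketch (assumption, not a routine in this file) of the usage
 * contract described above, for a non-kernel pmap:
 *
 *	pt_entry_t *ptep;
 *	pt_entry_t pte = 0;
 *
 *	mp_disable_preemption();		// or disable interrupts
 *	ptep = pmap_pte(user_pmap, vaddr);	// may return a per-cpu window address
 *	if (ptep != PT_ENTRY_NULL)
 *		pte = *ptep;			// valid only while preemption stays off
 *	mp_enable_preemption();			// the window may be recycled after this
 */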
0c530ab8
A
769
770
771/*
772 * return address of mapped pte for vaddr va in pmap pmap.
773 * must be called with pre-emption or interrupts disabled
774 * if targeted pmap is not the kernel pmap
775 * since we may be passing back a virtual address that is
776 * associated with this cpu... pre-emption or interrupts
777 * must remain disabled until the caller is done using
 778 * the pointer that was passed back.
779 *
780 * maps the pde page, if any, containing the pte in and returns
781 * the address of the pte in that mapped page
782 */
783pt_entry_t *
784pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
785{
786 pd_entry_t *pde;
787 pd_entry_t newpf;
788 int i;
789
790 assert(pmap);
791 pde = pmap_pde(pmap,vaddr);
792
793 if (pde && ((*pde & INTEL_PTE_VALID))) {
b0d623f7
A
794 if (*pde & INTEL_PTE_PS)
795 return pde;
2d21ac55
A
796 if (pmap == kernel_pmap)
797 return (vtopte(vaddr)); /* compat kernel still has pte's mapped */
798#if TESTING
799 if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
800 panic("pmap_pte: unsafe call");
801#endif
0c530ab8
A
802 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
803
804 newpf = *pde & PG_FRAME;
805
806 for (i=PMAP_PTE_FIRST_WINDOW; i < PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS; i++) {
807 if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
808 return((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
809 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
810 }
811 }
812
813 current_cpu_datap()->cpu_pmap->pte_window_index++;
814 if (current_cpu_datap()->cpu_pmap->pte_window_index > (PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS-1))
815 current_cpu_datap()->cpu_pmap->pte_window_index = PMAP_PTE_FIRST_WINDOW;
816 pmap_store_pte(
817 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CMAP),
818 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
819 invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR));
820 return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR) +
821 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
6601e61a 822 }
0c530ab8 823
2d21ac55 824 return(NULL);
1c79356b 825}
2d21ac55 826
1c79356b
A
827
828/*
829 * Map memory at initialization. The physical addresses being
830 * mapped are not managed and are never unmapped.
831 *
832 * For now, VM is already on, we only need to map the
833 * specified memory.
834 */
835vm_offset_t
836pmap_map(
0c530ab8
A
837 vm_offset_t virt,
838 vm_map_offset_t start_addr,
839 vm_map_offset_t end_addr,
840 vm_prot_t prot,
841 unsigned int flags)
1c79356b 842{
0c530ab8 843 int ps;
1c79356b
A
844
845 ps = PAGE_SIZE;
91447636 846 while (start_addr < end_addr) {
0c530ab8
A
847 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
848 (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
1c79356b 849 virt += ps;
91447636 850 start_addr += ps;
1c79356b
A
851 }
852 return(virt);
853}
854
855/*
856 * Back-door routine for mapping kernel VM at initialization.
857 * Useful for mapping memory outside the range
858 * Sets no-cache, A, D.
1c79356b
A
859 * Otherwise like pmap_map.
860 */
861vm_offset_t
862pmap_map_bd(
0c530ab8
A
863 vm_offset_t virt,
864 vm_map_offset_t start_addr,
865 vm_map_offset_t end_addr,
866 vm_prot_t prot,
867 unsigned int flags)
1c79356b 868{
0c530ab8 869 pt_entry_t template;
b0d623f7 870 pt_entry_t *pte;
2d21ac55 871 spl_t spl;
1c79356b 872
91447636 873 template = pa_to_pte(start_addr)
1c79356b
A
874 | INTEL_PTE_REF
875 | INTEL_PTE_MOD
876 | INTEL_PTE_WIRED
877 | INTEL_PTE_VALID;
0c530ab8
A
878
879 if(flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
880 template |= INTEL_PTE_NCACHE;
881 if(!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
882 template |= INTEL_PTE_PTA;
883 }
884
1c79356b
A
885 if (prot & VM_PROT_WRITE)
886 template |= INTEL_PTE_WRITE;
887
b0d623f7 888
91447636 889 while (start_addr < end_addr) {
2d21ac55 890 spl = splhigh();
0c530ab8 891 pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
91447636 892 if (pte == PT_ENTRY_NULL) {
1c79356b 893 panic("pmap_map_bd: Invalid kernel address\n");
91447636 894 }
0c530ab8 895 pmap_store_pte(pte, template);
2d21ac55 896 splx(spl);
1c79356b
A
897 pte_increment_pa(template);
898 virt += PAGE_SIZE;
91447636 899 start_addr += PAGE_SIZE;
b0d623f7
A
900 }
901
1c79356b 902
55e303ae 903 flush_tlb();
1c79356b
A
904 return(virt);
905}
906
b0d623f7
A
907extern char *first_avail;
908extern vm_offset_t virtual_avail, virtual_end;
909extern pmap_paddr_t avail_start, avail_end;
1c79356b 910
2d21ac55
A
911void
912pmap_cpu_init(void)
913{
914 /*
915 * Here early in the life of a processor (from cpu_mode_init()).
916 * If we're not in 64-bit mode, enable the global TLB feature.
917 * Note: regardless of mode we continue to set the global attribute
918 * bit in ptes for all (32-bit) global pages such as the commpage.
919 */
920 if (!cpu_64bit) {
921 set_cr4(get_cr4() | CR4_PGE);
922 }
923
924 /*
925 * Initialize the per-cpu, TLB-related fields.
926 */
927 current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
928 current_cpu_datap()->cpu_tlb_invalid = FALSE;
929}
0c530ab8
A
930
931vm_offset_t
932pmap_high_shared_remap(enum high_fixed_addresses e, vm_offset_t va, int sz)
933{
934 vm_offset_t ve = pmap_index_to_virt(e);
935 pt_entry_t *ptep;
936 pmap_paddr_t pa;
937 int i;
2d21ac55 938 spl_t s;
0c530ab8
A
939
940 assert(0 == (va & PAGE_MASK)); /* expecting page aligned */
2d21ac55 941 s = splhigh();
0c530ab8
A
942 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ve);
943
944 for (i=0; i< sz; i++) {
945 pa = (pmap_paddr_t) kvtophys(va);
946 pmap_store_pte(ptep, (pa & PG_FRAME)
947 | INTEL_PTE_VALID
948 | INTEL_PTE_GLOBAL
949 | INTEL_PTE_RW
950 | INTEL_PTE_REF
951 | INTEL_PTE_MOD);
952 va+= PAGE_SIZE;
953 ptep++;
954 }
2d21ac55 955 splx(s);
0c530ab8
A
956 return ve;
957}
958
959vm_offset_t
960pmap_cpu_high_shared_remap(int cpu, enum high_cpu_types e, vm_offset_t va, int sz)
961{
962 enum high_fixed_addresses a = e + HIGH_CPU_END * cpu;
963 return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN + a, va, sz);
964}
965
966void pmap_init_high_shared(void);
967
968extern vm_offset_t gdtptr, idtptr;
969
970extern uint32_t low_intstack;
971
972extern struct fake_descriptor ldt_desc_pattern;
973extern struct fake_descriptor tss_desc_pattern;
974
975extern char hi_remap_text, hi_remap_etext;
976extern char t_zero_div;
977
978pt_entry_t *pte_unique_base;
979
980void
981pmap_init_high_shared(void)
982{
983
984 vm_offset_t haddr;
2d21ac55 985 spl_t s;
0c530ab8
A
986#if MACH_KDB
987 struct i386_tss *ttss;
988#endif
989
b0d623f7
A
990 cpu_desc_index_t * cdi = &cpu_data_master.cpu_desc_index;
991
0c530ab8
A
992 kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
993 HIGH_MEM_BASE,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
2d21ac55 994 s = splhigh();
0c530ab8 995 pte_unique_base = pmap_pte(kernel_pmap, (vm_map_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
2d21ac55 996 splx(s);
0c530ab8
A
997
998 if (i386_btop(&hi_remap_etext - &hi_remap_text + 1) >
999 HIGH_FIXED_TRAMPS_END - HIGH_FIXED_TRAMPS + 1)
1000 panic("tramps too large");
1001 haddr = pmap_high_shared_remap(HIGH_FIXED_TRAMPS,
1002 (vm_offset_t) &hi_remap_text, 3);
1003 kprintf("tramp: 0x%x, ",haddr);
0c530ab8
A
1004 /* map gdt up high and update ptr for reload */
1005 haddr = pmap_high_shared_remap(HIGH_FIXED_GDT,
1006 (vm_offset_t) master_gdt, 1);
b0d623f7 1007 cdi->cdi_gdt.ptr = (void *)haddr;
0c530ab8
A
1008 kprintf("GDT: 0x%x, ",haddr);
1009 /* map ldt up high */
1010 haddr = pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN,
1011 (vm_offset_t) master_ldt,
1012 HIGH_FIXED_LDT_END - HIGH_FIXED_LDT_BEGIN + 1);
b0d623f7 1013 cdi->cdi_ldt = (struct fake_descriptor *)haddr;
0c530ab8
A
1014 kprintf("LDT: 0x%x, ",haddr);
1015 /* put new ldt addr into gdt */
b0d623f7
A
1016 struct fake_descriptor temp_fake_desc;
1017 temp_fake_desc = ldt_desc_pattern;
1018 temp_fake_desc.offset = (vm_offset_t) haddr;
1019 fix_desc(&temp_fake_desc, 1);
1020
1021 *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_LDT)] = temp_fake_desc;
1022 *(struct fake_descriptor *) &master_gdt[sel_idx(USER_LDT)] = temp_fake_desc;
0c530ab8
A
1023
1024 /* map idt up high */
1025 haddr = pmap_high_shared_remap(HIGH_FIXED_IDT,
1026 (vm_offset_t) master_idt, 1);
b0d623f7 1027 cdi->cdi_idt.ptr = (void *)haddr;
0c530ab8
A
1028 kprintf("IDT: 0x%x, ", haddr);
1029 /* remap ktss up high and put new high addr into gdt */
1030 haddr = pmap_high_shared_remap(HIGH_FIXED_KTSS,
1031 (vm_offset_t) &master_ktss, 1);
b0d623f7
A
1032
1033 temp_fake_desc = tss_desc_pattern;
1034 temp_fake_desc.offset = (vm_offset_t) haddr;
1035 fix_desc(&temp_fake_desc, 1);
1036 *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = temp_fake_desc;
0c530ab8
A
1037 kprintf("KTSS: 0x%x, ",haddr);
1038#if MACH_KDB
1039 /* remap dbtss up high and put new high addr into gdt */
1040 haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS,
1041 (vm_offset_t) &master_dbtss, 1);
b0d623f7
A
1042 temp_fake_desc = tss_desc_pattern;
1043 temp_fake_desc.offset = (vm_offset_t) haddr;
1044 fix_desc(&temp_fake_desc, 1);
1045 *(struct fake_descriptor *)&master_gdt[sel_idx(DEBUG_TSS)] = temp_fake_desc;
0c530ab8
A
1046 ttss = (struct i386_tss *)haddr;
1047 kprintf("DBTSS: 0x%x, ",haddr);
1048#endif /* MACH_KDB */
1049
1050 /* remap dftss up high and put new high addr into gdt */
1051 haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1052 (vm_offset_t) &master_dftss, 1);
b0d623f7
A
1053 temp_fake_desc = tss_desc_pattern;
1054 temp_fake_desc.offset = (vm_offset_t) haddr;
1055 fix_desc(&temp_fake_desc, 1);
1056 *(struct fake_descriptor *) &master_gdt[sel_idx(DF_TSS)] = temp_fake_desc;
0c530ab8
A
1057 kprintf("DFTSS: 0x%x\n",haddr);
1058
1059 /* remap mctss up high and put new high addr into gdt */
1060 haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1061 (vm_offset_t) &master_mctss, 1);
b0d623f7
A
1062 temp_fake_desc = tss_desc_pattern;
1063 temp_fake_desc.offset = (vm_offset_t) haddr;
1064 fix_desc(&temp_fake_desc, 1);
1065 *(struct fake_descriptor *) &master_gdt[sel_idx(MC_TSS)] = temp_fake_desc;
0c530ab8
A
1066 kprintf("MCTSS: 0x%x\n",haddr);
1067
b0d623f7 1068 cpu_desc_load(&cpu_data_master);
0c530ab8
A
1069}
1070
1071
1c79356b
A
1072/*
1073 * Bootstrap the system enough to run with virtual memory.
1074 * Map the kernel's code and data, and allocate the system page table.
1075 * Called with mapping OFF. Page_size must already be set.
1c79356b
A
1076 */
1077
1078void
1079pmap_bootstrap(
0c530ab8
A
1080 __unused vm_offset_t load_start,
1081 boolean_t IA32e)
1c79356b 1082{
91447636
A
1083 vm_offset_t va;
1084 pt_entry_t *pte;
1085 int i;
0c530ab8 1086 pdpt_entry_t *pdpt;
2d21ac55 1087 spl_t s;
1c79356b 1088
91447636
A
1089 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
1090 * known to VM */
1c79356b
A
1091 /*
1092 * The kernel's pmap is statically allocated so we don't
1093 * have to use pmap_create, which is unlikely to work
1094 * correctly at this part of the boot sequence.
1095 */
1096
0c530ab8 1097
1c79356b 1098 kernel_pmap = &kernel_pmap_store;
91447636 1099 kernel_pmap->ref_count = 1;
0c530ab8 1100 kernel_pmap->nx_enabled = FALSE;
2d21ac55 1101 kernel_pmap->pm_task_map = TASK_MAP_32BIT;
91447636
A
1102 kernel_pmap->pm_obj = (vm_object_t) NULL;
1103 kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
0c530ab8
A
1104 kernel_pmap->pdirbase = (pmap_paddr_t)((int)IdlePTD);
1105 pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE );
1106 kernel_pmap->pm_pdpt = pdpt;
1107 kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePDPT);
1c79356b 1108
b0d623f7 1109
91447636
A
1110 va = (vm_offset_t)kernel_pmap->dirbase;
1111 /* setup self referential mapping(s) */
0c530ab8 1112 for (i = 0; i< NPGPTD; i++, pdpt++) {
91447636 1113 pmap_paddr_t pa;
b0d623f7 1114 pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
0c530ab8
A
1115 pmap_store_pte(
1116 (pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i),
91447636 1117 (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
0c530ab8
A
1118 INTEL_PTE_MOD | INTEL_PTE_WIRED) ;
1119 pmap_store_pte(pdpt, pa | INTEL_PTE_VALID);
91447636 1120 }
1c79356b 1121
0c530ab8
A
1122 cpu_64bit = IA32e;
1123
1124 lo_kernel_cr3 = kernel_pmap->pm_cr3;
1125 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1126
1127 /* save the value we stuff into created pmaps to share the gdts etc */
1128 high_shared_pde = *pmap_pde(kernel_pmap, HIGH_MEM_BASE);
1129 /* make sure G bit is on for high shared pde entry */
1130 high_shared_pde |= INTEL_PTE_GLOBAL;
2d21ac55 1131 s = splhigh();
0c530ab8 1132 pmap_store_pte(pmap_pde(kernel_pmap, HIGH_MEM_BASE), high_shared_pde);
2d21ac55 1133 splx(s);
0c530ab8 1134
91447636 1135 nkpt = NKPT;
b0d623f7 1136 OSAddAtomic(NKPT, &inuse_ptepages_count);
1c79356b 1137
91447636
A
1138 virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
1139 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1c79356b
A
1140
1141 /*
91447636
A
1142 * Reserve some special page table entries/VA space for temporary
1143 * mapping of pages.
1c79356b 1144 */
91447636 1145#define SYSMAP(c, p, v, n) \
0c530ab8 1146 v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
91447636
A
1147
1148 va = virtual_avail;
0c530ab8 1149 pte = vtopte(va);
6601e61a 1150
0c530ab8
A
1151 for (i=0; i<PMAP_NWINDOWS; i++) {
1152 SYSMAP(caddr_t,
1153 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1154 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1155 1);
1156 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1157 }
1c79356b 1158
91447636
A
1159 /* DMAP user for debugger */
1160 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1161 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
1c79356b 1162
91447636 1163 virtual_avail = va;
1c79356b 1164
593a1d5f 1165 if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
2d21ac55
A
1166 if (0 != ((npvhash+1) & npvhash)) {
1167 kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash,NPVHASH);
1168 npvhash = NPVHASH;
1169 }
1170 } else {
1171 npvhash = NPVHASH;
1172 }
1173 printf("npvhash=%d\n",npvhash);
1174
91447636 1175 simple_lock_init(&kernel_pmap->lock, 0);
2d21ac55
A
1176 simple_lock_init(&pv_hashed_free_list_lock, 0);
1177 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1178 simple_lock_init(&pv_hash_table_lock,0);
1c79356b 1179
2d21ac55 1180 pmap_init_high_shared();
0c530ab8
A
1181
1182 pde_mapped_size = PDE_MAPPED_SIZE;
1183
1184 if (cpu_64bit) {
b0d623f7 1185 pdpt_entry_t *ppdpt = IdlePDPT;
0c530ab8
A
1186 pdpt_entry_t *ppdpt64 = (pdpt_entry_t *)IdlePDPT64;
1187 pdpt_entry_t *ppml4 = (pdpt_entry_t *)IdlePML4;
1188 int istate = ml_set_interrupts_enabled(FALSE);
1189
1190 /*
1191 * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1192 * with page bits set for the correct IA-32e operation and so that
1193 * the legacy-mode IdlePDPT is retained for slave processor start-up.
1194 * This is necessary due to the incompatible use of page bits between
1195 * 64-bit and legacy modes.
1196 */
1197 kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePML4); /* setup in start.s for us */
1198 kernel_pmap->pm_pml4 = IdlePML4;
1199 kernel_pmap->pm_pdpt = (pd_entry_t *)
1200 ((unsigned int)IdlePDPT64 | KERNBASE );
1201#define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1202 pmap_store_pte(kernel_pmap->pm_pml4,
1203 (uint32_t)IdlePDPT64 | PAGE_BITS);
1204 pmap_store_pte((ppdpt64+0), *(ppdpt+0) | PAGE_BITS);
1205 pmap_store_pte((ppdpt64+1), *(ppdpt+1) | PAGE_BITS);
1206 pmap_store_pte((ppdpt64+2), *(ppdpt+2) | PAGE_BITS);
1207 pmap_store_pte((ppdpt64+3), *(ppdpt+3) | PAGE_BITS);
1208
1209 /*
 1210 * The kernel is also mapped in the uber-space, in the 4GB region starting at
 1211 * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1212 */
1213 pmap_store_pte((ppml4+KERNEL_UBER_PML4_INDEX), *(ppml4+0));
1214
1215 kernel64_cr3 = (addr64_t) kernel_pmap->pm_cr3;
0c530ab8 1216
2d21ac55 1217 /* Re-initialize descriptors and prepare to switch modes */
b0d623f7 1218 cpu_desc_init64(&cpu_data_master);
2d21ac55
A
1219 current_cpu_datap()->cpu_is64bit = TRUE;
1220 current_cpu_datap()->cpu_active_cr3 = kernel64_cr3;
0c530ab8
A
1221
1222 pde_mapped_size = 512*4096 ;
1223
1224 ml_set_interrupts_enabled(istate);
0c530ab8 1225 }
2d21ac55 1226
b0d623f7 1227 /* Sets 64-bit mode if required. */
2d21ac55 1228 cpu_mode_init(&cpu_data_master);
b0d623f7
A
1229 /* Update in-kernel CPUID information if we're now in 64-bit mode */
1230 if (IA32e)
1231 cpuid_set_info();
2d21ac55 1232
0c530ab8 1233 kernel_pmap->pm_hold = (vm_offset_t)kernel_pmap->pm_pml4;
1c79356b 1234
91447636
A
1235 kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1236 VADDR(KPTDI,0), virtual_end);
6601e61a 1237 printf("PAE enabled\n");
0c530ab8
A
1238 if (cpu_64bit){
1239 printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1240
1241 kprintf("Available physical space from 0x%llx to 0x%llx\n",
6601e61a 1242 avail_start, avail_end);
0c530ab8
A
1243
1244 /*
1245 * By default for 64-bit users loaded at 4GB, share kernel mapping.
1246 * But this may be overridden by the -no_shared_cr3 boot-arg.
1247 */
593a1d5f 1248 if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3, sizeof (no_shared_cr3))) {
0c530ab8 1249 kprintf("Shared kernel address space disabled\n");
2d21ac55
A
1250 }
1251
1252#ifdef PMAP_TRACES
593a1d5f 1253 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
2d21ac55
A
1254 kprintf("Kernel traces for pmap operations enabled\n");
1255 }
1256#endif /* PMAP_TRACES */
1c79356b
A
1257}
1258
1259void
1260pmap_virtual_space(
1261 vm_offset_t *startp,
1262 vm_offset_t *endp)
1263{
1264 *startp = virtual_avail;
1265 *endp = virtual_end;
1266}
1267
1268/*
1269 * Initialize the pmap module.
1270 * Called by vm_init, to initialize any structures that the pmap
1271 * system needs to map virtual memory.
1272 */
1273void
1274pmap_init(void)
1275{
1276 register long npages;
1277 vm_offset_t addr;
1278 register vm_size_t s;
0c530ab8 1279 vm_map_offset_t vaddr;
2d21ac55 1280 ppnum_t ppn;
1c79356b
A
1281
1282 /*
1283 * Allocate memory for the pv_head_table and its lock bits,
1284 * the modify bit array, and the pte_page table.
1285 */
1286
2d21ac55
A
1287 /*
 1288 * zero-bias all these arrays (index from physical page 0 rather than from
 1289 * avail_start) so we cover all memory
1290 */
1291
b0d623f7 1292 npages = (long)i386_btop(avail_end);
2d21ac55
A
1293 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1294 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
1295 + pv_lock_table_size(npages)
1296 + pv_hash_lock_table_size((npvhash+1))
1c79356b
A
1297 + npages);
1298
1299 s = round_page(s);
b0d623f7
A
1300 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
1301 KMA_KOBJECT | KMA_PERMANENT)
1302 != KERN_SUCCESS)
1c79356b
A
1303 panic("pmap_init");
1304
1305 memset((char *)addr, 0, s);
1306
2d21ac55
A
1307#if PV_DEBUG
1308 if (0 == npvhash) panic("npvhash not initialized");
1309#endif
1310
1c79356b
A
1311 /*
1312 * Allocate the structures first to preserve word-alignment.
1313 */
2d21ac55 1314 pv_head_table = (pv_rooted_entry_t) addr;
1c79356b
A
1315 addr = (vm_offset_t) (pv_head_table + npages);
1316
2d21ac55
A
1317 pv_hash_table = (pv_hashed_entry_t *)addr;
1318 addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1319
1c79356b
A
1320 pv_lock_table = (char *) addr;
1321 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1322
2d21ac55
A
1323 pv_hash_lock_table = (char *) addr;
1324 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1325
1c79356b 1326 pmap_phys_attributes = (char *) addr;
2d21ac55
A
1327 {
1328 unsigned int i;
1329 unsigned int pn;
1330 ppnum_t last_pn;
1331 pmap_memory_region_t *pmptr = pmap_memory_regions;
1332
b0d623f7 1333 last_pn = (ppnum_t)i386_btop(avail_end);
2d21ac55
A
1334
1335 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1336 if (pmptr->type == kEfiConventionalMemory) {
b0d623f7 1337
2d21ac55
A
1338 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1339 if (pn < last_pn) {
1340 pmap_phys_attributes[pn] |= PHYS_MANAGED;
1341
1342 if (pn > last_managed_page)
1343 last_managed_page = pn;
1344 }
1345 }
1346 }
1347 }
1348 }
1c79356b
A
1349
1350 /*
1351 * Create the zone of physical maps,
1352 * and of the physical-to-virtual entries.
1353 */
1354 s = (vm_size_t) sizeof(struct pmap);
1355 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
2d21ac55
A
1356 s = (vm_size_t) sizeof(struct pv_hashed_entry);
1357 pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
91447636
A
1358 s = 63;
1359 pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
55e303ae 1360
91447636 1361 kptobj = &kptobj_object_store;
2d21ac55 1362 _vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG), kptobj);
91447636 1363 kernel_pmap->pm_obj = kptobj;
91447636
A
1364
1365 /* create pv entries for kernel pages mapped by low level
1366 startup code. these have to exist so we can pmap_remove()
1367 e.g. kext pages from the middle of our addr space */
1368
0c530ab8 1369 vaddr = (vm_map_offset_t)0;
91447636 1370 for (ppn = 0; ppn < i386_btop(avail_start) ; ppn++ ) {
2d21ac55 1371 pv_rooted_entry_t pv_e;
91447636
A
1372
1373 pv_e = pai_to_pvh(ppn);
1374 pv_e->va = vaddr;
1375 vaddr += PAGE_SIZE;
1376 pv_e->pmap = kernel_pmap;
2d21ac55 1377 queue_init(&pv_e->qlink);
91447636
A
1378 }
1379
1c79356b
A
1380 pmap_initialized = TRUE;
1381
1382 /*
2d21ac55 1383 * Initialize pmap cache.
1c79356b
A
1384 */
1385 pmap_cache_list = PMAP_NULL;
1386 pmap_cache_count = 0;
91447636 1387 simple_lock_init(&pmap_cache_lock, 0);
2d21ac55
A
1388
1389 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1390
1c79356b
A
1391}
1392
1c79356b 1393
2d21ac55 1394#define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1c79356b 1395
2d21ac55
A
1396/*
 1397 * this function is only used for debugging from the vm layer
1398 */
1c79356b
A
1399boolean_t
1400pmap_verify_free(
55e303ae 1401 ppnum_t pn)
1c79356b 1402{
2d21ac55 1403 pv_rooted_entry_t pv_h;
1c79356b 1404 int pai;
1c79356b
A
1405 boolean_t result;
1406
55e303ae 1407 assert(pn != vm_page_fictitious_addr);
2d21ac55 1408
1c79356b
A
1409 if (!pmap_initialized)
1410 return(TRUE);
1411
2d21ac55
A
1412 if (pn == vm_page_guard_addr)
1413 return TRUE;
1c79356b 1414
2d21ac55
A
1415 pai = ppn_to_pai(pn);
1416 if (!managed_page(pai))
1417 return(FALSE);
1418 pv_h = pai_to_pvh(pn);
1419 result = (pv_h->pmap == PMAP_NULL);
1420 return(result);
1421}
1c79356b 1422
2d21ac55
A
1423boolean_t
1424pmap_is_empty(
1425 pmap_t pmap,
b0d623f7
A
1426 vm_map_offset_t va_start,
1427 vm_map_offset_t va_end)
2d21ac55
A
1428{
1429 vm_map_offset_t offset;
1430 ppnum_t phys_page;
1c79356b 1431
2d21ac55
A
1432 if (pmap == PMAP_NULL) {
1433 return TRUE;
1434 }
b0d623f7
A
1435
1436 /*
1437 * Check the resident page count
1438 * - if it's zero, the pmap is completely empty.
1439 * This short-circuit test prevents a virtual address scan which is
1440 * painfully slow for 64-bit spaces.
1441 * This assumes the count is correct
 1442 * .. the debug kernel ought to check this, perhaps by a page table walk.
1443 */
1444 if (pmap->stats.resident_count == 0)
1445 return TRUE;
1446
1447 for (offset = va_start;
1448 offset < va_end;
2d21ac55
A
1449 offset += PAGE_SIZE_64) {
1450 phys_page = pmap_find_phys(pmap, offset);
1451 if (phys_page) {
1452 if (pmap != kernel_pmap &&
1453 pmap->pm_task_map == TASK_MAP_32BIT &&
1454 offset >= HIGH_MEM_BASE) {
1455 /*
1456 * The "high_shared_pde" is used to share
1457 * the entire top-most 2MB of address space
1458 * between the kernel and all 32-bit tasks.
1459 * So none of this can be removed from 32-bit
1460 * tasks.
1461 * Let's pretend there's nothing up
1462 * there...
1463 */
1464 return TRUE;
1465 }
1466 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1467 "page %d at 0x%llx\n",
b0d623f7 1468 pmap, va_start, va_end, phys_page, offset);
2d21ac55
A
1469 return FALSE;
1470 }
1471 }
1c79356b 1472
2d21ac55 1473 return TRUE;
1c79356b
A
1474}
1475
2d21ac55 1476
1c79356b
A
1477/*
1478 * Create and return a physical map.
1479 *
1480 * If the size specified for the map
1481 * is zero, the map is an actual physical
1482 * map, and may be referenced by the
1483 * hardware.
1484 *
1485 * If the size specified is non-zero,
1486 * the map will be used in software only, and
1487 * is bounded by that size.
1488 */
1489pmap_t
1490pmap_create(
0c530ab8 1491 vm_map_size_t sz,
2d21ac55 1492 boolean_t is_64bit)
1c79356b 1493{
2d21ac55 1494 pmap_t p;
0c530ab8
A
1495 int i;
1496 vm_offset_t va;
1497 vm_size_t size;
1498 pdpt_entry_t *pdpt;
1499 pml4_entry_t *pml4p;
0c530ab8 1500 pd_entry_t *pdp;
2d21ac55 1501 int template;
0c530ab8
A
1502 spl_t s;
1503
2d21ac55
A
1504 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1505 (int) (sz>>32), (int) sz, (int) is_64bit, 0, 0);
1506
0c530ab8 1507 size = (vm_size_t) sz;
1c79356b
A
1508
1509 /*
1510 * A software use-only map doesn't even need a map.
1511 */
1512
1513 if (size != 0) {
1514 return(PMAP_NULL);
1515 }
1516
91447636
A
1517 p = (pmap_t) zalloc(pmap_zone);
1518 if (PMAP_NULL == p)
2d21ac55 1519 panic("pmap_create zalloc");
6601e61a 1520
0c530ab8
A
1521 /* init counts now since we'll be bumping some */
1522 simple_lock_init(&p->lock, 0);
1c79356b 1523 p->stats.resident_count = 0;
2d21ac55 1524 p->stats.resident_max = 0;
1c79356b 1525 p->stats.wired_count = 0;
1c79356b 1526 p->ref_count = 1;
0c530ab8 1527 p->nx_enabled = 1;
0c530ab8
A
1528 p->pm_shared = FALSE;
1529
2d21ac55
A
1530 assert(!is_64bit || cpu_64bit);
 1531 p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;
1532
0c530ab8 1533 if (!cpu_64bit) {
2d21ac55
A
1534 /* legacy 32 bit setup */
1535 /* in the legacy case the pdpt layer is hardwired to 4 entries and each
1536 * entry covers 1GB of addr space */
b0d623f7
A
1537 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
1538 panic("pmap_create kmem_alloc_kobject");
2d21ac55
A
1539 p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
1540 if ((vm_offset_t)NULL == p->pm_hold) {
1541 panic("pdpt zalloc");
1542 }
1543 pdpt = (pdpt_entry_t *) (( p->pm_hold + 31) & ~31);
1544 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)pdpt);
1545 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG))))
1546 panic("pmap_create vm_object_allocate");
0c530ab8 1547
2d21ac55 1548 memset((char *)p->dirbase, 0, NBPTD);
0c530ab8 1549
2d21ac55
A
1550 va = (vm_offset_t)p->dirbase;
1551 p->pdirbase = kvtophys(va);
0c530ab8 1552
2d21ac55
A
1553 template = cpu_64bit ? INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID;
1554 for (i = 0; i< NPGPTD; i++, pdpt++ ) {
1555 pmap_paddr_t pa;
b0d623f7 1556 pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
2d21ac55
A
1557 pmap_store_pte(pdpt, pa | template);
1558 }
0c530ab8 1559
2d21ac55
A
1560 /* map the high shared pde */
1561 s = splhigh();
1562 pmap_store_pte(pmap_pde(p, HIGH_MEM_BASE), high_shared_pde);
1563 splx(s);
4452a7af 1564
0c530ab8 1565 } else {
2d21ac55 1566 /* 64 bit setup */
4452a7af 1567
2d21ac55 1568 /* alloc the pml4 page in kernel vm */
b0d623f7
A
1569 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_hold), PAGE_SIZE))
1570 panic("pmap_create kmem_alloc_kobject pml4");
4452a7af 1571
2d21ac55
A
1572 memset((char *)p->pm_hold, 0, PAGE_SIZE);
1573 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold);
0c530ab8 1574
b0d623f7 1575 OSAddAtomic(1, &inuse_ptepages_count);
0c530ab8 1576
2d21ac55 1577 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
0c530ab8 1578
2d21ac55
A
1579 if (NULL == (p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS))))
1580	        panic("pmap_create pml4 obj");
0c530ab8 1581
2d21ac55
A
1582 if (NULL == (p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS))))
1583 panic("pmap_create pdpt obj");
0c530ab8 1584
2d21ac55
A
1585 if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS))))
1586 panic("pmap_create pte obj");
0c530ab8 1587
2d21ac55
A
1588 /* uber space points to uber mapped kernel */
1589 s = splhigh();
1590 pml4p = pmap64_pml4(p, 0ULL);
1591 pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4);
0c530ab8 1592
0c530ab8 1593
2d21ac55
A
1594 if (!is_64bit) {
1595 while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) {
1596 splx(s);
1597 pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another pde entry */
1598 s = splhigh();
1599 }
1600 pmap_store_pte(pdp, high_shared_pde);
1601 }
1602 splx(s);
0c530ab8 1603 }
1c79356b 1604
2d21ac55
A
1605	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1606 (int) p, is_64bit, 0, 0, 0);
1607
1c79356b
A
1608 return(p);
1609}
1610
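/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * how a caller might exercise the pmap lifecycle implemented above, assuming
 * a hypothetical helper that is handed an already-wired physical page `pn'
 * and a page-aligned user virtual address `uva'.  Kept under #if 0 because it
 * exists only to illustrate the calling convention.
 */
#if 0
static void
pmap_lifecycle_example(ppnum_t pn, vm_map_offset_t uva, boolean_t is_64bit)
{
	pmap_t	p;

	p = pmap_create(0, is_64bit);	/* size 0 => a real, hardware-visible map */
	if (p == PMAP_NULL)
		return;

	/* establish one wired, writable mapping */
	pmap_enter(p, uva, pn, VM_PROT_READ | VM_PROT_WRITE,
		   VM_WIMG_USE_DEFAULT, TRUE);

	/* ... use the mapping ... */

	pmap_remove(p, (addr64_t)uva, (addr64_t)(uva + PAGE_SIZE));
	pmap_destroy(p);		/* drops the reference taken by pmap_create() */
}
#endif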
2d21ac55
A
1611/*
1612 * The following routines implement the shared address optimization for 64-bit
1613 * users with a 4GB page zero.
1614 *
1615 * pmap_set_4GB_pagezero()
1616 * is called in the exec and fork paths to mirror the kernel's
1617 * mapping in the bottom 4G of the user's pmap. The task mapping changes
1618 * from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1619 * without doing anything if the -no_shared_cr3 boot-arg is set.
1620 *
1621 * pmap_clear_4GB_pagezero()
1622 * is called in the exec/exit paths to undo this mirror. The task mapping
1623 * reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1624 * CR3 by calling pmap_load_kernel_cr3().
1625 *
1626 * pmap_load_kernel_cr3()
1627 * loads cr3 with the kernel's page table. In addition to being called
1628 * by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1629 * when we go idle in the context of a shared map.
1630 *
1631 * Further notes on per-cpu data used:
1632 *
1633 * cpu_kernel_cr3 is the cr3 for the kernel's pmap.
1634 * This is loaded in a trampoline on entering the kernel
1635 * from a 32-bit user (or non-shared-cr3 64-bit user).
1636 * cpu_task_cr3 is the cr3 for the current thread.
1637 * This is loaded in a trampoline as we exit the kernel.
1638 * cpu_active_cr3 reflects the cr3 currently loaded.
1639 * However, the low order bit is set when the
1640 * processor is idle or interrupts are disabled
1641 * while the system pmap lock is held. It is used by
1642 * tlb shoot-down.
1643 * cpu_task_map indicates whether the task cr3 belongs to
1644 * a 32-bit, a 64-bit or a 64-bit shared map.
1645 * The latter allows the avoidance of the cr3 load
1646 * on kernel entry and exit.
1647 *	cpu_tlb_invalid	is set TRUE when a tlb flush is requested.
1648 *			If the cr3 is "inactive" (the cpu is idle or the
1649 *			system-wide pmap lock is held) this is not serviced by
1650 *			an IPI but at the time when the cr3 becomes "active".
1651 */
1652
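/*
 * Editor's illustrative sketch -- not part of the original source.  It spells
 * out, in C, the cr3 selection policy the comment above describes for the
 * kernel-exit path.  The real work is done in assembly trampolines; the
 * helper name is hypothetical, while the per-cpu fields are the ones listed
 * above.
 */
#if 0
static void
example_load_cr3_on_kernel_exit(void)
{
	uint64_t	task_cr3;

	if (current_cpu_datap()->cpu_task_map == TASK_MAP_64BIT_SHARED) {
		/* shared cr3: the task's map also maps the kernel, so skip the reload */
		return;
	}
	task_cr3 = current_cpu_datap()->cpu_task_cr3;
	set64_cr3(task_cr3);
	current_cpu_datap()->cpu_active_cr3 = task_cr3;
	current_cpu_datap()->cpu_tlb_invalid = FALSE;	/* the cr3 load flushes the TLB */
}
#endif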
0c530ab8
A
1653void
1654pmap_set_4GB_pagezero(pmap_t p)
1655{
0c530ab8
A
1656 pdpt_entry_t *user_pdptp;
1657 pdpt_entry_t *kern_pdptp;
1658
2d21ac55 1659 assert(p->pm_task_map != TASK_MAP_32BIT);
0c530ab8
A
1660
1661 /* Kernel-shared cr3 may be disabled by boot arg. */
1662 if (no_shared_cr3)
1663 return;
1664
1665 /*
1666 * Set the bottom 4 3rd-level pte's to be the kernel's.
1667 */
2d21ac55 1668 PMAP_LOCK(p);
0c530ab8 1669 while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
2d21ac55 1670 PMAP_UNLOCK(p);
0c530ab8 1671 pmap_expand_pml4(p, 0x0);
2d21ac55 1672 PMAP_LOCK(p);
0c530ab8
A
1673 }
1674 kern_pdptp = kernel_pmap->pm_pdpt;
1675 pmap_store_pte(user_pdptp+0, *(kern_pdptp+0));
1676 pmap_store_pte(user_pdptp+1, *(kern_pdptp+1));
1677 pmap_store_pte(user_pdptp+2, *(kern_pdptp+2));
1678 pmap_store_pte(user_pdptp+3, *(kern_pdptp+3));
2d21ac55
A
1679 p->pm_task_map = TASK_MAP_64BIT_SHARED;
1680 PMAP_UNLOCK(p);
0c530ab8
A
1681}
1682
1683void
1684pmap_clear_4GB_pagezero(pmap_t p)
1685{
0c530ab8
A
1686 pdpt_entry_t *user_pdptp;
1687
2d21ac55 1688 if (p->pm_task_map != TASK_MAP_64BIT_SHARED)
0c530ab8
A
1689 return;
1690
2d21ac55
A
1691 PMAP_LOCK(p);
1692
1693 p->pm_task_map = TASK_MAP_64BIT;
1694
1695 pmap_load_kernel_cr3();
1696
0c530ab8
A
1697 user_pdptp = pmap64_pdpt(p, 0x0);
1698 pmap_store_pte(user_pdptp+0, 0);
1699 pmap_store_pte(user_pdptp+1, 0);
1700 pmap_store_pte(user_pdptp+2, 0);
1701 pmap_store_pte(user_pdptp+3, 0);
1702
2d21ac55
A
1703 PMAP_UNLOCK(p);
1704}
0c530ab8 1705
2d21ac55
A
1706void
1707pmap_load_kernel_cr3(void)
1708{
1709 uint64_t kernel_cr3;
0c530ab8 1710
2d21ac55
A
1711 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1712
1713 /*
1714 * Reload cr3 with the true kernel cr3.
1715 */
1716 kernel_cr3 = current_cpu_datap()->cpu_kernel_cr3;
1717 set64_cr3(kernel_cr3);
1718 current_cpu_datap()->cpu_active_cr3 = kernel_cr3;
1719 current_cpu_datap()->cpu_tlb_invalid = FALSE;
1720 __asm__ volatile("mfence");
0c530ab8
A
1721}
1722
1c79356b
A
1723/*
1724 * Retire the given physical map from service.
1725 * Should only be called if the map contains
1726 * no valid mappings.
1727 */
1728
1729void
1730pmap_destroy(
1731 register pmap_t p)
1732{
1c79356b 1733 register int c;
1c79356b
A
1734
1735 if (p == PMAP_NULL)
1736 return;
2d21ac55
A
1737
1738 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1739 (int) p, 0, 0, 0, 0);
1740
1741 PMAP_LOCK(p);
1742
1c79356b 1743 c = --p->ref_count;
2d21ac55 1744
1c79356b 1745 if (c == 0) {
1c79356b
A
1746 /*
1747 * If some cpu is not using the physical pmap pointer that it
1748 * is supposed to be (see set_dirbase), we might be using the
1749 * pmap that is being destroyed! Make sure we are
1750 * physically on the right pmap:
1751 */
55e303ae 1752 PMAP_UPDATE_TLBS(p,
2d21ac55
A
1753 0x0ULL,
1754 0xFFFFFFFFFFFFF000ULL);
1c79356b 1755 }
2d21ac55
A
1756
1757 PMAP_UNLOCK(p);
1c79356b
A
1758
1759 if (c != 0) {
2d21ac55
A
1760 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1761 (int) p, 1, 0, 0, 0);
1762 return; /* still in use */
1c79356b
A
1763 }
1764
1765 /*
1766 * Free the memory maps, then the
1767 * pmap structure.
1768 */
0c530ab8 1769 if (!cpu_64bit) {
b0d623f7 1770 OSAddAtomic(-p->pm_obj->resident_page_count, &inuse_ptepages_count);
91447636 1771
2d21ac55
A
1772 kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
1773 zfree(pdpt_zone, (void *)p->pm_hold);
0c530ab8 1774
2d21ac55
A
1775 vm_object_deallocate(p->pm_obj);
1776 } else {
1777 /* 64 bit */
1778 int inuse_ptepages = 0;
0c530ab8 1779
2d21ac55
A
1780 /* free 64 bit mode structs */
1781 inuse_ptepages++;
1782 kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE);
1783
1784 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1785 vm_object_deallocate(p->pm_obj_pml4);
1786
1787 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1788 vm_object_deallocate(p->pm_obj_pdpt);
0c530ab8 1789
2d21ac55
A
1790 inuse_ptepages += p->pm_obj->resident_page_count;
1791 vm_object_deallocate(p->pm_obj);
1792
b0d623f7 1793 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
2d21ac55
A
1794 }
1795 zfree(pmap_zone, p);
1c79356b 1796
2d21ac55
A
1797 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1798 0, 0, 0, 0, 0);
0c530ab8 1799
1c79356b
A
1800}
1801
1802/*
1803 * Add a reference to the specified pmap.
1804 */
1805
1806void
1807pmap_reference(
1808 register pmap_t p)
1809{
1c79356b
A
1810
1811 if (p != PMAP_NULL) {
2d21ac55 1812 PMAP_LOCK(p);
1c79356b 1813 p->ref_count++;
2d21ac55 1814 PMAP_UNLOCK(p);;
1c79356b
A
1815 }
1816}
1817
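/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * the reference-counting discipline implied by pmap_reference() and
 * pmap_destroy(): every extra reference must be balanced by a pmap_destroy(),
 * which only frees the map once ref_count reaches zero.  The cached pointer
 * and helper names are hypothetical.
 */
#if 0
static pmap_t example_cached_pmap = PMAP_NULL;

static void
example_cache_pmap(pmap_t p)
{
	pmap_reference(p);		/* keep it alive while we hold the pointer */
	example_cached_pmap = p;
}

static void
example_uncache_pmap(void)
{
	pmap_t	p = example_cached_pmap;

	example_cached_pmap = PMAP_NULL;
	pmap_destroy(p);		/* balances the pmap_reference() above */
}
#endif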
1818/*
1819 * Remove a range of hardware page-table entries.
1820 * The entries given are the first (inclusive)
1821 * and last (exclusive) entries for the VM pages.
1822 * The virtual address is the va for the first pte.
1823 *
1824 * The pmap must be locked.
1825 * If the pmap is not the kernel pmap, the range must lie
1826 * entirely within one pte-page. This is NOT checked.
1827 * Assumes that the pte-page exists.
1828 */
1829
2d21ac55 1830void
1c79356b
A
1831pmap_remove_range(
1832 pmap_t pmap,
0c530ab8 1833 vm_map_offset_t start_vaddr,
1c79356b
A
1834 pt_entry_t *spte,
1835 pt_entry_t *epte)
1836{
1837 register pt_entry_t *cpte;
2d21ac55
A
1838 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1839 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1840 pv_hashed_entry_t pvh_e;
1841 int pvh_cnt = 0;
0c530ab8 1842 int num_removed, num_unwired, num_found;
1c79356b 1843 int pai;
91447636 1844 pmap_paddr_t pa;
2d21ac55
A
1845 vm_map_offset_t vaddr;
1846 int pvhash_idx;
1847 uint32_t pv_cnt;
1c79356b 1848
1c79356b
A
1849 num_removed = 0;
1850 num_unwired = 0;
2d21ac55
A
1851 num_found = 0;
1852
1853 if (pmap != kernel_pmap &&
1854 pmap->pm_task_map == TASK_MAP_32BIT &&
1855 start_vaddr >= HIGH_MEM_BASE) {
1856 /*
1857 * The range is in the "high_shared_pde" which is shared
1858 * between the kernel and all 32-bit tasks. It holds
1859 * the 32-bit commpage but also the trampolines, GDT, etc...
1860 * so we can't let user tasks remove anything from it.
1861 */
1862 return;
1863 }
1c79356b 1864
0c530ab8
A
1865 /* invalidate the PTEs first to "freeze" them */
1866 for (cpte = spte, vaddr = start_vaddr;
1867 cpte < epte;
1868 cpte++, vaddr += PAGE_SIZE_64) {
1c79356b
A
1869
1870 pa = pte_to_pa(*cpte);
1871 if (pa == 0)
1872 continue;
0c530ab8 1873 num_found++;
1c79356b 1874
1c79356b
A
1875 if (iswired(*cpte))
1876 num_unwired++;
1877
2d21ac55 1878 pai = pa_index(pa);
1c79356b 1879
2d21ac55 1880 if (!managed_page(pai)) {
1c79356b
A
1881 /*
1882 * Outside range of managed physical memory.
1883 * Just remove the mappings.
1884 */
0c530ab8 1885 pmap_store_pte(cpte, 0);
1c79356b
A
1886 continue;
1887 }
1888
2d21ac55 1889 /* invalidate the PTE */
0c530ab8
A
1890 pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1891 }
1c79356b 1892
2d21ac55
A
1893 if (num_found == 0) {
1894 /* nothing was changed: we're done */
1895 goto update_counts;
0c530ab8 1896 }
1c79356b 1897
0c530ab8 1898 /* propagate the invalidates to other CPUs */
91447636 1899
0c530ab8
A
1900 PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1901
1902 for (cpte = spte, vaddr = start_vaddr;
1903 cpte < epte;
1904 cpte++, vaddr += PAGE_SIZE_64) {
1905
1906 pa = pte_to_pa(*cpte);
1907 if (pa == 0)
1908 continue;
1909
1910 pai = pa_index(pa);
1911
1912 LOCK_PVH(pai);
1913
2d21ac55
A
1914 pa = pte_to_pa(*cpte);
1915 if (pa == 0) {
1916 UNLOCK_PVH(pai);
1917 continue;
1918 }
1919
0c530ab8
A
1920 num_removed++;
1921
1922 /*
1923 * Get the modify and reference bits, then
1924 * nuke the entry in the page table
1925 */
1926 /* remember reference and change */
1927 pmap_phys_attributes[pai] |=
2d21ac55 1928 (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
0c530ab8
A
1929 /* completely invalidate the PTE */
1930 pmap_store_pte(cpte, 0);
1c79356b
A
1931
1932 /*
1933 * Remove the mapping from the pvlist for
1934 * this physical page.
1935 */
1936 {
2d21ac55
A
1937 pv_rooted_entry_t pv_h;
1938 pv_hashed_entry_t *pprevh;
1939 ppnum_t ppn = (ppnum_t)pai;
1c79356b
A
1940
1941 pv_h = pai_to_pvh(pai);
2d21ac55
A
1942 pvh_e = PV_HASHED_ENTRY_NULL;
1943 if (pv_h->pmap == PMAP_NULL)
1944 panic("pmap_remove_range: null pv_list!");
1945
1946 if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */
1c79356b 1947 /*
2d21ac55
A
1948			 * Header is the pv_rooted_entry. We can't free that. If there is a queued
1949			 * entry after this one, we remove it
1950			 * from the ppn queue and from the hash chain,
1951			 * and copy it to the rooted entry. Then we free it instead.
1c79356b 1952 */
2d21ac55
A
1953
1954 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
1955 if (pv_h != (pv_rooted_entry_t)pvh_e) { /* any queued after rooted? */
1956 CHK_NPVHASH();
1957 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
1958 LOCK_PV_HASH(pvhash_idx);
1959 remque(&pvh_e->qlink);
1960 {
1961 pprevh = pvhash(pvhash_idx);
1962 if (PV_HASHED_ENTRY_NULL == *pprevh) {
1963 panic("pmap_remove_range empty hash removing rooted pv");
1964 }
1c79356b 1965 }
2d21ac55
A
1966 pmap_pvh_unlink(pvh_e);
1967 UNLOCK_PV_HASH(pvhash_idx);
1968 pv_h->pmap = pvh_e->pmap;
1969 pv_h->va = pvh_e->va; /* dispose of pvh_e */
1970 } else { /* none queued after rooted */
1971 pv_h->pmap = PMAP_NULL;
1972 pvh_e = PV_HASHED_ENTRY_NULL;
1973 } /* any queued after rooted */
1974
1975 } else { /* rooted or not */
1976 /* not removing rooted pv. find it on hash chain, remove from ppn queue and
1977 * hash chain and free it */
1978 CHK_NPVHASH();
1979 pvhash_idx = pvhashidx(pmap,vaddr);
1980 LOCK_PV_HASH(pvhash_idx);
1981 pprevh = pvhash(pvhash_idx);
1982 if (PV_HASHED_ENTRY_NULL == *pprevh) {
1983 panic("pmap_remove_range empty hash removing hashed pv");
1c79356b 1984 }
2d21ac55
A
1985 pvh_e = *pprevh;
1986 pmap_pv_hashlist_walks++;
1987 pv_cnt = 0;
1988 while (PV_HASHED_ENTRY_NULL != pvh_e) {
1989 pv_cnt++;
1990 if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break;
1991 pprevh = &pvh_e->nexth;
1992 pvh_e = pvh_e->nexth;
1993 }
1994 pmap_pv_hashlist_cnts += pv_cnt;
1995 if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
1996 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash");
1997 *pprevh = pvh_e->nexth;
1998 remque(&pvh_e->qlink);
1999 UNLOCK_PV_HASH(pvhash_idx);
2000
2001 } /* rooted or not */
2002
1c79356b 2003 UNLOCK_PVH(pai);
2d21ac55
A
2004
2005 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2006 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2007 pvh_eh = pvh_e;
2008
2009 if (pvh_et == PV_HASHED_ENTRY_NULL) {
2010 pvh_et = pvh_e;
2011 }
2012
2013 pvh_cnt++;
2014 }
2015
2016 } /* removing mappings for this phy page */
2017 } /* for loop */
2018
2019 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2020 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1c79356b
A
2021 }
2022
2d21ac55 2023update_counts:
1c79356b
A
2024 /*
2025 * Update the counts
2026 */
2d21ac55
A
2027#if TESTING
2028 if (pmap->stats.resident_count < num_removed)
2029 panic("pmap_remove_range: resident_count");
2030#endif
1c79356b 2031 assert(pmap->stats.resident_count >= num_removed);
b0d623f7 2032 OSAddAtomic(-num_removed, &pmap->stats.resident_count);
2d21ac55
A
2033
2034#if TESTING
2035 if (pmap->stats.wired_count < num_unwired)
2036 panic("pmap_remove_range: wired_count");
2037#endif
1c79356b 2038 assert(pmap->stats.wired_count >= num_unwired);
b0d623f7 2039 OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
2d21ac55 2040
0c530ab8 2041 return;
1c79356b
A
2042}
2043
0b4e3aa0
A
2044/*
2045 * Remove phys addr if mapped in specified map
2046 *
2047 */
2048void
2049pmap_remove_some_phys(
91447636
A
2050 __unused pmap_t map,
2051 __unused ppnum_t pn)
0b4e3aa0
A
2052{
2053
2054/* Implement to support working set code */
2055
2056}
2057
1c79356b
A
2058/*
2059 * Remove the given range of addresses
2060 * from the specified map.
2061 *
2062 * It is assumed that the start and end are properly
2063 * rounded to the hardware page size.
2064 */
2065
55e303ae 2066
1c79356b
A
2067void
2068pmap_remove(
2069 pmap_t map,
55e303ae
A
2070 addr64_t s64,
2071 addr64_t e64)
1c79356b 2072{
2d21ac55
A
2073 pt_entry_t *pde;
2074 pt_entry_t *spte, *epte;
2075 addr64_t l64;
2076 addr64_t orig_s64;
2077 uint64_t deadline;
2078
2079 pmap_intr_assert();
1c79356b 2080
0c530ab8 2081 if (map == PMAP_NULL || s64 == e64)
1c79356b 2082 return;
2d21ac55
A
2083
2084 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
2085 (int) map,
2086 (int) (s64>>32), (int) s64,
2087 (int) (e64>>32), (int) e64);
1c79356b 2088
2d21ac55
A
2089 PMAP_LOCK(map);
2090
2091#if 0
2092 /*
2093 * Check that address range in the kernel does not overlap the stacks.
2094 * We initialize local static min/max variables once to avoid making
2095 * 2 function calls for every remove. Note also that these functions
2096 * both return 0 before kernel stacks have been initialized, and hence
2097 * the panic is not triggered in this case.
2098 */
2099 if (map == kernel_pmap) {
2100 static vm_offset_t kernel_stack_min = 0;
2101 static vm_offset_t kernel_stack_max = 0;
2102
2103 if (kernel_stack_min == 0) {
2104 kernel_stack_min = min_valid_stack_address();
2105 kernel_stack_max = max_valid_stack_address();
2106 }
2107 if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
2108 (kernel_stack_min < e64 && e64 <= kernel_stack_max))
2109 panic("pmap_remove() attempted in kernel stack");
2110 }
2111#else
2112
2113 /*
2114 * The values of kernel_stack_min and kernel_stack_max are no longer
2115 * relevant now that we allocate kernel stacks anywhere in the kernel map,
2116 * so the old code above no longer applies. If we wanted to check that
2117 * we weren't removing a mapping of a page in a kernel stack we'd have to
2118 * mark the PTE with an unused bit and check that here.
2119 */
2120
2121#endif
2122
2123 deadline = rdtsc64() + max_preemption_latency_tsc;
1c79356b 2124
0c530ab8
A
2125 orig_s64 = s64;
2126
2127 while (s64 < e64) {
2128 l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1);
2129 if (l64 > e64)
2130 l64 = e64;
2131 pde = pmap_pde(map, s64);
2d21ac55 2132
0c530ab8
A
2133 if (pde && (*pde & INTEL_PTE_VALID)) {
2134 spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1)));
2135 spte = &spte[ptenum(s64)];
2136 epte = &spte[intel_btop(l64-s64)];
2d21ac55 2137
0c530ab8 2138 pmap_remove_range(map, s64, spte, epte);
1c79356b 2139 }
0c530ab8 2140 s64 = l64;
1c79356b 2141 pde++;
2d21ac55
A
2142
2143 if (s64 < e64 && rdtsc64() >= deadline) {
2144			PMAP_UNLOCK(map);
2145			PMAP_LOCK(map);
2146
2147 deadline = rdtsc64() + max_preemption_latency_tsc;
2148 }
2149
1c79356b 2150 }
91447636 2151
2d21ac55
A
2152 PMAP_UNLOCK(map);
2153
2154 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
2155 (int) map, 0, 0, 0, 0);
2156
1c79356b
A
2157}
2158
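/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * the alignment contract pmap_remove() above expects from its callers: both
 * ends of the range must be rounded to the hardware page size.  The helper
 * name and its parameters are hypothetical.
 */
#if 0
static void
example_remove_pages(pmap_t p, vm_map_offset_t va, unsigned int npages)
{
	addr64_t	start = (addr64_t)va;
	addr64_t	end   = start + (addr64_t)npages * PAGE_SIZE_64;

	assert((start & (PAGE_SIZE_64 - 1)) == 0);	/* caller must page-align */
	pmap_remove(p, start, end);
}
#endif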
2159/*
2160 * Routine: pmap_page_protect
2161 *
2162 * Function:
2163 * Lower the permission for all mappings to a given
2164 * page.
2165 */
2166void
2167pmap_page_protect(
55e303ae 2168 ppnum_t pn,
1c79356b
A
2169 vm_prot_t prot)
2170{
2d21ac55
A
2171 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
2172 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
2173 pv_hashed_entry_t nexth;
2174 int pvh_cnt = 0;
2175 pv_rooted_entry_t pv_h;
2176 pv_rooted_entry_t pv_e;
2177 pv_hashed_entry_t pvh_e;
2178 pt_entry_t *pte;
1c79356b
A
2179 int pai;
2180 register pmap_t pmap;
1c79356b 2181 boolean_t remove;
2d21ac55 2182 int pvhash_idx;
1c79356b 2183
2d21ac55 2184 pmap_intr_assert();
55e303ae 2185 assert(pn != vm_page_fictitious_addr);
2d21ac55
A
2186 if (pn == vm_page_guard_addr)
2187 return;
2188
2189 pai = ppn_to_pai(pn);
0c530ab8 2190
2d21ac55 2191 if (!managed_page(pai)) {
1c79356b
A
2192 /*
2193 * Not a managed page.
2194 */
2195 return;
2196 }
2197
2d21ac55
A
2198 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
2199 (int) pn, (int) prot, 0, 0, 0);
2200
1c79356b
A
2201 /*
2202 * Determine the new protection.
2203 */
2204 switch (prot) {
2205 case VM_PROT_READ:
2206 case VM_PROT_READ|VM_PROT_EXECUTE:
2207 remove = FALSE;
2208 break;
2209 case VM_PROT_ALL:
2210 return; /* nothing to do */
2211 default:
2212 remove = TRUE;
2213 break;
2214 }
0c530ab8 2215
2d21ac55 2216 pv_h = pai_to_pvh(pai);
1c79356b 2217
2d21ac55 2218 LOCK_PVH(pai);
1c79356b 2219
b0d623f7 2220
1c79356b
A
2221 /*
2222 * Walk down PV list, changing or removing all mappings.
1c79356b
A
2223 */
2224 if (pv_h->pmap != PMAP_NULL) {
2225
2d21ac55
A
2226 pv_e = pv_h;
2227 pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */
4452a7af 2228
2d21ac55
A
2229 do {
2230 register vm_map_offset_t vaddr;
2231 pmap = pv_e->pmap;
0c530ab8 2232
2d21ac55
A
2233 vaddr = pv_e->va;
2234 pte = pmap_pte(pmap, vaddr);
2235
2236 if (0 == pte) {
b0d623f7 2237 panic("pmap_page_protect: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx, prot: %d kernel_pmap: %p", pmap, pn, vaddr, prot, kernel_pmap);
2d21ac55 2238 }
0c530ab8 2239
2d21ac55 2240 nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); /* if there is one */
4452a7af 2241
2d21ac55
A
2242 /*
2243 * Remove the mapping if new protection is NONE
2244 * or if write-protecting a kernel mapping.
2245 */
2246 if (remove || pmap == kernel_pmap) {
2247 /*
2248 * Remove the mapping, collecting any modify bits.
2249 */
2250 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
4452a7af 2251
2d21ac55 2252 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
4452a7af 2253
2d21ac55 2254 pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
4452a7af 2255
2d21ac55 2256 pmap_store_pte(pte, 0);
0c530ab8 2257
2d21ac55
A
2258#if TESTING
2259 if (pmap->stats.resident_count < 1)
2260 panic("pmap_page_protect: resident_count");
2261#endif
2262 assert(pmap->stats.resident_count >= 1);
b0d623f7 2263 OSAddAtomic(-1, &pmap->stats.resident_count);
0c530ab8 2264
2d21ac55
A
2265 /*
2266 * Deal with the pv_rooted_entry.
2267 */
0c530ab8 2268
2d21ac55
A
2269 if (pv_e == pv_h) {
2270 /*
2271 * Fix up head later.
2272 */
2273 pv_h->pmap = PMAP_NULL;
2274 }
2275 else {
2276 /*
2277 * Delete this entry.
2278 */
2279 CHK_NPVHASH();
2280 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2281 LOCK_PV_HASH(pvhash_idx);
2282 remque(&pvh_e->qlink);
2283 pmap_pvh_unlink(pvh_e);
2284 UNLOCK_PV_HASH(pvhash_idx);
2285
2286 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2287 pvh_eh = pvh_e;
2288
2289 if (pvh_et == PV_HASHED_ENTRY_NULL)
2290 pvh_et = pvh_e;
2291 pvh_cnt++;
2292 }
2293 } else {
2294 /*
2295 * Write-protect.
2296 */
2297 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
2298 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2299 }
0c530ab8 2300
2d21ac55
A
2301 pvh_e = nexth;
2302 } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
0c530ab8 2303
b0d623f7 2304
2d21ac55
A
2305 /*
2306 * If pv_head mapping was removed, fix it up.
2307 */
0c530ab8 2308
2d21ac55
A
2309 if (pv_h->pmap == PMAP_NULL) {
2310 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2311
2312 if (pvh_e != (pv_hashed_entry_t)pv_h) {
2313 CHK_NPVHASH();
2314 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2315 LOCK_PV_HASH(pvhash_idx);
2316 remque(&pvh_e->qlink);
2317 pmap_pvh_unlink(pvh_e);
2318 UNLOCK_PV_HASH(pvhash_idx);
2319 pv_h->pmap = pvh_e->pmap;
2320 pv_h->va = pvh_e->va;
2321 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2322 pvh_eh = pvh_e;
2323
2324 if (pvh_et == PV_HASHED_ENTRY_NULL)
2325 pvh_et = pvh_e;
2326 pvh_cnt++;
1c79356b 2327 }
2d21ac55
A
2328 }
2329 }
2330 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2331 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1c79356b 2332 }
2d21ac55
A
2333
2334 UNLOCK_PVH(pai);
2335
2336 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
2337 0, 0, 0, 0, 0);
2338
1c79356b
A
2339}
2340
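/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * the two ways pmap_page_protect() above is normally used, mirroring the
 * protection switch at the top of the routine.  The page number is
 * hypothetical.
 */
#if 0
static void
example_page_protect(ppnum_t pn)
{
	/* write-protect every mapping of the page (kernel mappings are removed) */
	pmap_page_protect(pn, VM_PROT_READ);

	/* remove every mapping of the page; VM_PROT_NONE falls into "remove" */
	pmap_page_protect(pn, VM_PROT_NONE);
}
#endif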
2d21ac55 2341
91447636
A
2342/*
2343 * Routine:
2344 * pmap_disconnect
2345 *
2346 * Function:
2347 * Disconnect all mappings for this page and return reference and change status
2348 * in generic format.
2349 *
2350 */
2351unsigned int pmap_disconnect(
2352 ppnum_t pa)
2353{
2d21ac55 2354 pmap_page_protect(pa, 0); /* disconnect the page */
91447636
A
2355 return (pmap_get_refmod(pa)); /* return ref/chg status */
2356}
2357
1c79356b
A
2358/*
2359 * Set the physical protection on the
2360 * specified range of this map as requested.
2361 * Will not increase permissions.
2362 */
2363void
2364pmap_protect(
2365 pmap_t map,
0c530ab8
A
2366 vm_map_offset_t sva,
2367 vm_map_offset_t eva,
1c79356b
A
2368 vm_prot_t prot)
2369{
2370 register pt_entry_t *pde;
2371 register pt_entry_t *spte, *epte;
0c530ab8
A
2372 vm_map_offset_t lva;
2373 vm_map_offset_t orig_sva;
0c530ab8 2374 boolean_t set_NX;
2d21ac55
A
2375 int num_found = 0;
2376
2377 pmap_intr_assert();
1c79356b
A
2378
2379 if (map == PMAP_NULL)
2380 return;
2381
0c530ab8
A
2382 if (prot == VM_PROT_NONE) {
2383 pmap_remove(map, sva, eva);
1c79356b
A
2384 return;
2385 }
2386
2d21ac55
A
2387 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
2388 (int) map,
2389 (int) (sva>>32), (int) sva,
2390 (int) (eva>>32), (int) eva);
2391
0c530ab8
A
2392 if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled )
2393 set_NX = FALSE;
2394 else
2395 set_NX = TRUE;
2396
2d21ac55 2397 PMAP_LOCK(map);
1c79356b 2398
0c530ab8
A
2399 orig_sva = sva;
2400 while (sva < eva) {
2401 lva = (sva + pde_mapped_size) & ~(pde_mapped_size-1);
2402 if (lva > eva)
2403 lva = eva;
2404 pde = pmap_pde(map, sva);
2405 if (pde && (*pde & INTEL_PTE_VALID)) {
2406 spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size-1)));
2407 spte = &spte[ptenum(sva)];
2408 epte = &spte[intel_btop(lva-sva)];
1c79356b
A
2409
2410 while (spte < epte) {
2d21ac55 2411
0c530ab8
A
2412 if (*spte & INTEL_PTE_VALID) {
2413
2414 if (prot & VM_PROT_WRITE)
2d21ac55 2415 pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
0c530ab8 2416 else
2d21ac55 2417 pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));
0c530ab8
A
2418
2419 if (set_NX == TRUE)
2d21ac55 2420 pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
0c530ab8 2421 else
2d21ac55 2422 pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));
0c530ab8
A
2423
2424 num_found++;
0c530ab8 2425 }
1c79356b
A
2426 spte++;
2427 }
2428 }
0c530ab8 2429 sva = lva;
1c79356b 2430 }
0c530ab8 2431 if (num_found)
2d21ac55
A
2432 PMAP_UPDATE_TLBS(map, orig_sva, eva);
2433
2434 PMAP_UNLOCK(map);
2435
2436 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
2437 0, 0, 0, 0, 0);
91447636 2438
1c79356b
A
2439}
2440
0c530ab8
A
2441/* Map a (possibly) autogenned block */
2442void
2443pmap_map_block(
2444 pmap_t pmap,
2445 addr64_t va,
2446 ppnum_t pa,
2447 uint32_t size,
2448 vm_prot_t prot,
2449 int attr,
2450 __unused unsigned int flags)
2451{
2d21ac55 2452 uint32_t page;
0c530ab8 2453
2d21ac55
A
2454 for (page = 0; page < size; page++) {
2455 pmap_enter(pmap, va, pa, prot, attr, TRUE);
2456 va += PAGE_SIZE;
2457 pa++;
2458 }
0c530ab8 2459}
1c79356b
A
2460
2461
2462/*
2463 * Insert the given physical page (p) at
2464 * the specified virtual address (v) in the
2465 * target physical map with the protection requested.
2466 *
2467 * If specified, the page will be wired down, meaning
2468 * that the related pte cannot be reclaimed.
2469 *
2470 * NB: This is the only routine which MAY NOT lazy-evaluate
2471 * or lose information. That is, this routine must actually
2472 * insert this page into the given map NOW.
2473 */
2474void
2475pmap_enter(
2476 register pmap_t pmap,
2d21ac55 2477 vm_map_offset_t vaddr,
55e303ae 2478 ppnum_t pn,
1c79356b 2479 vm_prot_t prot,
9bccf70c 2480 unsigned int flags,
1c79356b
A
2481 boolean_t wired)
2482{
2483 register pt_entry_t *pte;
2d21ac55 2484 register pv_rooted_entry_t pv_h;
91447636 2485 register int pai;
2d21ac55
A
2486 pv_hashed_entry_t pvh_e;
2487 pv_hashed_entry_t pvh_new;
2488 pv_hashed_entry_t *hashp;
1c79356b 2489 pt_entry_t template;
91447636 2490 pmap_paddr_t old_pa;
2d21ac55 2491 pmap_paddr_t pa = (pmap_paddr_t)i386_ptob(pn);
0c530ab8
A
2492 boolean_t need_tlbflush = FALSE;
2493 boolean_t set_NX;
2d21ac55
A
2494 char oattr;
2495 int pvhash_idx;
2496 uint32_t pv_cnt;
2497 boolean_t old_pa_locked;
1c79356b 2498
2d21ac55 2499 pmap_intr_assert();
55e303ae 2500 assert(pn != vm_page_fictitious_addr);
1c79356b 2501 if (pmap_debug)
0c530ab8 2502 printf("pmap(%qx, %x)\n", vaddr, pn);
1c79356b
A
2503 if (pmap == PMAP_NULL)
2504 return;
2d21ac55
A
2505 if (pn == vm_page_guard_addr)
2506 return;
2507
2508 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2509 (int) pmap,
2510 (int) (vaddr>>32), (int) vaddr,
2511 (int) pn, prot);
1c79356b 2512
0c530ab8
A
2513 if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled )
2514 set_NX = FALSE;
2515 else
2516 set_NX = TRUE;
2517
1c79356b
A
2518 /*
2519 * Must allocate a new pvlist entry while we're unlocked;
2520 * zalloc may cause pageout (which will lock the pmap system).
2521 * If we determine we need a pvlist entry, we will unlock
2522	 *	and allocate one.  Then we will retry, throwing away
2523 * the allocated entry later (if we no longer need it).
2524 */
91447636 2525
2d21ac55
A
2526 pvh_new = PV_HASHED_ENTRY_NULL;
2527Retry:
2528 pvh_e = PV_HASHED_ENTRY_NULL;
2529
2530 PMAP_LOCK(pmap);
1c79356b
A
2531
2532 /*
2533 * Expand pmap to include this pte. Assume that
2534 * pmap is always expanded to include enough hardware
2535 * pages to map one VM page.
2536 */
2537
0c530ab8 2538 while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
1c79356b
A
2539 /*
2540 * Must unlock to expand the pmap.
2541 */
2d21ac55 2542 PMAP_UNLOCK(pmap);
0c530ab8 2543 pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */
2d21ac55
A
2544 PMAP_LOCK(pmap);
2545 }
2546
2547 old_pa = pte_to_pa(*pte);
2548 pai = pa_index(old_pa);
2549 old_pa_locked = FALSE;
1c79356b 2550
2d21ac55
A
2551 /*
2552 * if we have a previous managed page, lock the pv entry now. after
2553 * we lock it, check to see if someone beat us to the lock and if so
2554 * drop the lock
2555 */
2556
2557 if ((0 != old_pa) && managed_page(pai)) {
2558 LOCK_PVH(pai);
2559 old_pa_locked = TRUE;
2560 old_pa = pte_to_pa(*pte);
2561 if (0 == old_pa) {
2562 UNLOCK_PVH(pai); /* some other path beat us to it */
2563 old_pa_locked = FALSE;
2564 }
1c79356b 2565 }
2d21ac55
A
2566
2567
1c79356b 2568 /*
2d21ac55 2569 * Special case if the incoming physical page is already mapped
1c79356b
A
2570 * at this address.
2571 */
1c79356b 2572 if (old_pa == pa) {
2d21ac55 2573
1c79356b
A
2574 /*
2575 * May be changing its wired attribute or protection
2576 */
2d21ac55 2577
1c79356b 2578 template = pa_to_pte(pa) | INTEL_PTE_VALID;
55e303ae 2579
0c530ab8 2580 if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
55e303ae
A
2581 if(!(flags & VM_MEM_GUARDED))
2582 template |= INTEL_PTE_PTA;
2583 template |= INTEL_PTE_NCACHE;
2584 }
2585
1c79356b
A
2586 if (pmap != kernel_pmap)
2587 template |= INTEL_PTE_USER;
2588 if (prot & VM_PROT_WRITE)
2589 template |= INTEL_PTE_WRITE;
0c530ab8
A
2590
2591 if (set_NX == TRUE)
2592 template |= INTEL_PTE_NX;
2593
1c79356b
A
2594 if (wired) {
2595 template |= INTEL_PTE_WIRED;
2596 if (!iswired(*pte))
b0d623f7 2597 OSAddAtomic(+1, &pmap->stats.wired_count);
1c79356b
A
2598 }
2599 else {
2600 if (iswired(*pte)) {
2601 assert(pmap->stats.wired_count >= 1);
b0d623f7 2602 OSAddAtomic(-1, &pmap->stats.wired_count);
1c79356b
A
2603 }
2604 }
2605
2d21ac55 2606 /* store modified PTE and preserve RC bits */
0c530ab8 2607 pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2d21ac55
A
2608 if (old_pa_locked) {
2609 UNLOCK_PVH(pai);
2610 old_pa_locked = FALSE;
2611 }
0c530ab8 2612 need_tlbflush = TRUE;
1c79356b
A
2613 goto Done;
2614 }
2615
2616 /*
2617 * Outline of code from here:
2618 * 1) If va was mapped, update TLBs, remove the mapping
2619 * and remove old pvlist entry.
2620 * 2) Add pvlist entry for new mapping
2621 * 3) Enter new mapping.
2622 *
1c79356b
A
2623 * If the old physical page is not managed step 1) is skipped
2624 * (except for updating the TLBs), and the mapping is
2625 * overwritten at step 3). If the new physical page is not
2626 * managed, step 2) is skipped.
2627 */
2628
91447636 2629 if (old_pa != (pmap_paddr_t) 0) {
1c79356b 2630
1c79356b
A
2631 /*
2632 * Don't do anything to pages outside valid memory here.
2633 * Instead convince the code that enters a new mapping
2634 * to overwrite the old one.
2635 */
2636
2d21ac55
A
2637 /* invalidate the PTE */
2638 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2639 /* propagate invalidate everywhere */
2640 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2641 /* remember reference and change */
2642 oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2643 /* completely invalidate the PTE */
2644 pmap_store_pte(pte, 0);
1c79356b 2645
2d21ac55 2646 if (managed_page(pai)) {
2d21ac55
A
2647#if TESTING
2648 if (pmap->stats.resident_count < 1)
2649 panic("pmap_enter: resident_count");
2650#endif
1c79356b 2651 assert(pmap->stats.resident_count >= 1);
b0d623f7 2652 OSAddAtomic(-1, &pmap->stats.resident_count);
2d21ac55 2653
1c79356b 2654 if (iswired(*pte)) {
2d21ac55
A
2655
2656#if TESTING
2657 if (pmap->stats.wired_count < 1)
2658 panic("pmap_enter: wired_count");
2659#endif
1c79356b 2660 assert(pmap->stats.wired_count >= 1);
b0d623f7 2661 OSAddAtomic(-1, &pmap->stats.wired_count);
1c79356b 2662 }
91447636 2663
0c530ab8 2664 pmap_phys_attributes[pai] |= oattr;
1c79356b
A
2665 /*
2666 * Remove the mapping from the pvlist for
2667 * this physical page.
2d21ac55
A
2668 * We'll end up with either a rooted pv or a
2669 * hashed pv
1c79356b
A
2670 */
2671 {
1c79356b
A
2672
2673 pv_h = pai_to_pvh(pai);
2d21ac55 2674
1c79356b
A
2675 if (pv_h->pmap == PMAP_NULL) {
2676 panic("pmap_enter: null pv_list!");
2677 }
0c530ab8
A
2678
2679 if (pv_h->va == vaddr && pv_h->pmap == pmap) {
1c79356b 2680 /*
2d21ac55
A
2681 * Header is the pv_rooted_entry.
2682 * If there is a next one, copy it to the
2683 * header and free the next one (we cannot
1c79356b
A
2684 * free the header)
2685 */
2d21ac55
A
2686 pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2687 if (pvh_e != (pv_hashed_entry_t)pv_h) {
2688 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
2689 LOCK_PV_HASH(pvhash_idx);
2690 remque(&pvh_e->qlink);
2691 pmap_pvh_unlink(pvh_e);
2692 UNLOCK_PV_HASH(pvhash_idx);
2693 pv_h->pmap = pvh_e->pmap;
2694 pv_h->va = pvh_e->va;
1c79356b 2695 }
2d21ac55
A
2696 else {
2697 pv_h->pmap = PMAP_NULL;
2698 pvh_e = PV_HASHED_ENTRY_NULL;
2699 }
1c79356b
A
2700 }
2701 else {
2d21ac55
A
2702 pv_hashed_entry_t *pprevh;
2703 ppnum_t old_ppn;
2704 /* wasn't the rooted pv - hash, find it, and unlink it */
2705 old_ppn = (ppnum_t)pa_index(old_pa);
2706 CHK_NPVHASH();
2707 pvhash_idx = pvhashidx(pmap,vaddr);
2708 LOCK_PV_HASH(pvhash_idx);
2709 pprevh = pvhash(pvhash_idx);
2710#if PV_DEBUG
2711 if (NULL==pprevh)panic("pmap enter 1");
2712#endif
2713 pvh_e = *pprevh;
2714 pmap_pv_hashlist_walks++;
2715 pv_cnt = 0;
2716 while (PV_HASHED_ENTRY_NULL != pvh_e) {
2717 pv_cnt++;
2718 if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break;
2719 pprevh = &pvh_e->nexth;
2720 pvh_e = pvh_e->nexth;
2721 }
2722 pmap_pv_hashlist_cnts += pv_cnt;
2723 if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2724 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list");
2725 if(NULL==pprevh)panic("pmap enter 2");
2726 *pprevh = pvh_e->nexth;
2727 remque(&pvh_e->qlink);
2728 UNLOCK_PV_HASH(pvhash_idx);
1c79356b
A
2729 }
2730 }
1c79356b
A
2731 }
2732 else {
1c79356b 2733 /*
2d21ac55
A
2734 * old_pa is not managed.
2735 * Do removal part of accounting.
1c79356b 2736 */
0c530ab8 2737
1c79356b
A
2738 if (iswired(*pte)) {
2739 assert(pmap->stats.wired_count >= 1);
b0d623f7 2740 OSAddAtomic(-1, &pmap->stats.wired_count);
1c79356b
A
2741 }
2742 }
2743 }
2744
2d21ac55
A
2745 /*
2746	 * if we had a previously managed page locked, unlock it now
2747 */
2748
2749 if (old_pa_locked) {
2750 UNLOCK_PVH(pai);
2751 old_pa_locked = FALSE;
2752 }
2753
2754 pai = pa_index(pa); /* now working with new incoming phys page */
2755 if (managed_page(pai)) {
1c79356b
A
2756
2757 /*
2758 * Step 2) Enter the mapping in the PV list for this
2759 * physical page.
2760 */
2d21ac55 2761 pv_h = pai_to_pvh(pai);
1c79356b 2762
1c79356b 2763 LOCK_PVH(pai);
1c79356b
A
2764
2765 if (pv_h->pmap == PMAP_NULL) {
2766 /*
2d21ac55 2767 * No mappings yet, use rooted pv
1c79356b 2768 */
0c530ab8 2769 pv_h->va = vaddr;
1c79356b 2770 pv_h->pmap = pmap;
2d21ac55 2771 queue_init(&pv_h->qlink);
1c79356b
A
2772 }
2773 else {
1c79356b 2774 /*
2d21ac55 2775 * Add new pv_hashed_entry after header.
1c79356b 2776 */
2d21ac55
A
2777 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2778 pvh_e = pvh_new;
2779 pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */
2780 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2781 PV_HASHED_ALLOC(pvh_e);
2782 if (PV_HASHED_ENTRY_NULL == pvh_e) {
2783				/* the pv free list is empty.
2784				 * if we are on the kernel pmap we'll use one of the special private
2785				 * kernel pv_e's; otherwise, we need to unlock everything, zalloc a pv_e,
2786				 * and restart, bringing the new pv_e in with us.
2787 */
2788 if (kernel_pmap == pmap) {
2789 PV_HASHED_KERN_ALLOC(pvh_e);
2790 } else {
2791 UNLOCK_PVH(pai);
2792 PMAP_UNLOCK(pmap);
2793 pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2794 goto Retry;
1c79356b 2795 }
2d21ac55 2796 }
1c79356b 2797 }
2d21ac55
A
2798
2799 if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion");
2800 pvh_e->va = vaddr;
2801 pvh_e->pmap = pmap;
2802 pvh_e->ppn = pn;
2803 CHK_NPVHASH();
2804 pvhash_idx = pvhashidx(pmap,vaddr);
2805 LOCK_PV_HASH(pvhash_idx);
2806 insque(&pvh_e->qlink, &pv_h->qlink);
2807 hashp = pvhash(pvhash_idx);
2808#if PV_DEBUG
2809 if(NULL==hashp)panic("pmap_enter 4");
2810#endif
2811 pvh_e->nexth = *hashp;
2812 *hashp = pvh_e;
2813 UNLOCK_PV_HASH(pvhash_idx);
2814
1c79356b
A
2815 /*
2816 * Remember that we used the pvlist entry.
2817 */
2d21ac55 2818 pvh_e = PV_HASHED_ENTRY_NULL;
1c79356b 2819 }
0c530ab8
A
2820
2821 /*
2822 * only count the mapping
2823 * for 'managed memory'
2824 */
b0d623f7 2825 OSAddAtomic(+1, &pmap->stats.resident_count);
2d21ac55
A
2826 if (pmap->stats.resident_count > pmap->stats.resident_max) {
2827 pmap->stats.resident_max = pmap->stats.resident_count;
2828 }
1c79356b
A
2829 }
2830
2831 /*
0c530ab8 2832 * Step 3) Enter the mapping.
2d21ac55 2833 *
1c79356b
A
2834 * Build a template to speed up entering -
2835 * only the pfn changes.
2836 */
2837 template = pa_to_pte(pa) | INTEL_PTE_VALID;
55e303ae 2838
2d21ac55 2839 if (flags & VM_MEM_NOT_CACHEABLE) {
55e303ae
A
2840 if(!(flags & VM_MEM_GUARDED))
2841 template |= INTEL_PTE_PTA;
2842 template |= INTEL_PTE_NCACHE;
2843 }
2844
1c79356b
A
2845 if (pmap != kernel_pmap)
2846 template |= INTEL_PTE_USER;
2847 if (prot & VM_PROT_WRITE)
2848 template |= INTEL_PTE_WRITE;
0c530ab8
A
2849
2850 if (set_NX == TRUE)
2851 template |= INTEL_PTE_NX;
2852
1c79356b
A
2853 if (wired) {
2854 template |= INTEL_PTE_WIRED;
b0d623f7 2855 OSAddAtomic(+1, &pmap->stats.wired_count);
1c79356b 2856 }
0c530ab8 2857 pmap_store_pte(pte, template);
91447636 2858
2d21ac55
A
2859 /* if this was a managed page we delayed unlocking the pv until here
2860 * to prevent pmap_page_protect et al from finding it until the pte
2861 * has been stored */
2862
2863 if (managed_page(pai)) {
2864 UNLOCK_PVH(pai);
2865 }
2866
1c79356b 2867Done:
0c530ab8
A
2868 if (need_tlbflush == TRUE)
2869 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
91447636 2870
2d21ac55
A
2871 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2872 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
1c79356b
A
2873 }
2874
2d21ac55
A
2875 if (pvh_new != PV_HASHED_ENTRY_NULL) {
2876 PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2877 }
2878
2879 PMAP_UNLOCK(pmap);
2880 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
1c79356b
A
2881}
2882
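/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * how the caching flags tested in pmap_enter() above combine: with
 * VM_MEM_NOT_CACHEABLE alone the PTE template gets INTEL_PTE_PTA |
 * INTEL_PTE_NCACHE (a PAT-selected, typically write-combined encoding);
 * adding VM_MEM_GUARDED leaves only INTEL_PTE_NCACHE set.  The page numbers
 * are hypothetical.
 */
#if 0
static void
example_enter_device_mappings(pmap_t p, vm_map_offset_t va, ppnum_t device_pn)
{
	/* PAT-selected (write-combined) mapping: PTA | NCACHE in the template */
	pmap_enter(p, va, device_pn, VM_PROT_READ | VM_PROT_WRITE,
		   VM_MEM_NOT_CACHEABLE, FALSE);

	/* strictly uncached mapping: only NCACHE in the template */
	pmap_enter(p, va + PAGE_SIZE, device_pn + 1, VM_PROT_READ | VM_PROT_WRITE,
		   VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED, FALSE);
}
#endif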
2883/*
2884 * Routine: pmap_change_wiring
2885 * Function: Change the wiring attribute for a map/virtual-address
2886 * pair.
2887 * In/out conditions:
2888 * The mapping must already exist in the pmap.
2889 */
2890void
2891pmap_change_wiring(
2892 register pmap_t map,
0c530ab8 2893 vm_map_offset_t vaddr,
1c79356b
A
2894 boolean_t wired)
2895{
2896 register pt_entry_t *pte;
1c79356b
A
2897
2898 /*
2899 * We must grab the pmap system lock because we may
2900 * change a pte_page queue.
2901 */
2d21ac55 2902 PMAP_LOCK(map);
1c79356b 2903
0c530ab8 2904 if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
1c79356b
A
2905 panic("pmap_change_wiring: pte missing");
2906
2907 if (wired && !iswired(*pte)) {
2908 /*
2909 * wiring down mapping
2910 */
b0d623f7 2911 OSAddAtomic(+1, &map->stats.wired_count);
0c530ab8 2912 pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
1c79356b
A
2913 }
2914 else if (!wired && iswired(*pte)) {
2915 /*
2916 * unwiring mapping
2917 */
2918 assert(map->stats.wired_count >= 1);
b0d623f7 2919 OSAddAtomic(-1, &map->stats.wired_count);
0c530ab8 2920 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
1c79356b
A
2921 }
2922
2d21ac55 2923 PMAP_UNLOCK(map);
1c79356b
A
2924}
2925
55e303ae 2926
1c79356b
A
2927/*
2928 * Routine: pmap_extract
2929 * Function:
2930 * Extract the physical page address associated
2931 * with the given map/virtual_address pair.
91447636
A
2932 *	Changed to a shim for backwards compatibility, but it will not
2933 *	work for 64 bit systems.  Some old drivers that we cannot
2934 *	change still need this.
1c79356b
A
2935 */
2936
2937vm_offset_t
2938pmap_extract(
2939 register pmap_t pmap,
0c530ab8 2940 vm_map_offset_t vaddr)
1c79356b 2941{
0c530ab8
A
2942 ppnum_t ppn;
2943 vm_offset_t paddr;
91447636 2944
0c530ab8
A
2945 paddr = (vm_offset_t)0;
2946 ppn = pmap_find_phys(pmap, vaddr);
2d21ac55 2947
0c530ab8 2948 if (ppn) {
b0d623f7 2949 paddr = ((vm_offset_t)i386_ptob(ppn)) | ((vm_offset_t)vaddr & INTEL_OFFMASK);
0c530ab8
A
2950 }
2951 return (paddr);
1c79356b
A
2952}
2953
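/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * the 64-bit-safe way to do what the pmap_extract() shim above does: use
 * pmap_find_phys() and keep the result in an addr64_t so the physical address
 * is never truncated to a 32-bit vm_offset_t.  The helper name is
 * hypothetical.
 */
#if 0
static addr64_t
example_extract64(pmap_t pmap, vm_map_offset_t vaddr)
{
	ppnum_t	ppn;

	ppn = pmap_find_phys(pmap, vaddr);
	if (ppn == 0)
		return (0);
	return (((addr64_t)i386_ptob(ppn)) | ((addr64_t)vaddr & INTEL_OFFMASK));
}
#endif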
1c79356b 2954void
0c530ab8
A
2955pmap_expand_pml4(
2956 pmap_t map,
2957 vm_map_offset_t vaddr)
1c79356b 2958{
1c79356b 2959 register vm_page_t m;
91447636 2960 register pmap_paddr_t pa;
0c530ab8 2961 uint64_t i;
1c79356b 2962 spl_t spl;
55e303ae 2963 ppnum_t pn;
0c530ab8 2964 pml4_entry_t *pml4p;
89b3af67 2965
0c530ab8
A
2966 if (kernel_pmap == map) panic("expand kernel pml4");
2967
2968 spl = splhigh();
2d21ac55
A
2969 pml4p = pmap64_pml4(map, vaddr);
2970 splx(spl);
2971 if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");
1c79356b
A
2972
2973 /*
0c530ab8 2974 * Allocate a VM page for the pml4 page
1c79356b
A
2975 */
2976 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2977 VM_PAGE_WAIT();
2978
2979 /*
91447636 2980 * put the page into the pmap's obj list so it
1c79356b
A
2981 * can be found later.
2982 */
55e303ae
A
2983 pn = m->phys_page;
2984 pa = i386_ptob(pn);
0c530ab8
A
2985 i = pml4idx(map, vaddr);
2986
2d21ac55
A
2987 /*
2988 * Zero the page.
2989 */
2990 pmap_zero_page(pn);
0c530ab8 2991
b0d623f7 2992 vm_page_lockspin_queues();
1c79356b 2993 vm_page_wire(m);
2d21ac55 2994 vm_page_unlock_queues();
1c79356b 2995
b0d623f7
A
2996 OSAddAtomic(1, &inuse_ptepages_count);
2997
2d21ac55
A
2998	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2999 vm_object_lock(map->pm_obj_pml4);
1c79356b 3000
2d21ac55 3001 PMAP_LOCK(map);
1c79356b
A
3002 /*
3003 * See if someone else expanded us first
3004 */
0c530ab8 3005 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
2d21ac55
A
3006 PMAP_UNLOCK(map);
3007 vm_object_unlock(map->pm_obj_pml4);
3008
b0d623f7 3009 VM_PAGE_FREE(m);
2d21ac55 3010
b0d623f7 3011 OSAddAtomic(-1, &inuse_ptepages_count);
1c79356b
A
3012 return;
3013 }
3014
2d21ac55
A
3015#if 0 /* DEBUG */
3016 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
3017 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3018 map, map->pm_obj_pml4, vaddr, i);
3019 }
3020#endif
3021 vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
3022 vm_object_unlock(map->pm_obj_pml4);
3023
1c79356b
A
3024 /*
3025 * Set the page directory entry for this page table.
1c79356b 3026 */
0c530ab8 3027 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
c0fea474 3028
0c530ab8
A
3029 pmap_store_pte(pml4p, pa_to_pte(pa)
3030 | INTEL_PTE_VALID
3031 | INTEL_PTE_USER
3032 | INTEL_PTE_WRITE);
5d5c5d0d 3033
2d21ac55 3034 PMAP_UNLOCK(map);
89b3af67 3035
6601e61a 3036 return;
0c530ab8 3037
6601e61a 3038}
89b3af67 3039
6601e61a 3040void
0c530ab8
A
3041pmap_expand_pdpt(
3042 pmap_t map,
3043 vm_map_offset_t vaddr)
6601e61a 3044{
0c530ab8
A
3045 register vm_page_t m;
3046 register pmap_paddr_t pa;
3047 uint64_t i;
3048 spl_t spl;
3049 ppnum_t pn;
3050 pdpt_entry_t *pdptp;
89b3af67 3051
0c530ab8 3052 if (kernel_pmap == map) panic("expand kernel pdpt");
89b3af67 3053
0c530ab8 3054 spl = splhigh();
2d21ac55
A
3055 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
3056 splx(spl);
3057 pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */
3058 spl = splhigh();
3059 }
3060 splx(spl);
4452a7af 3061
0c530ab8
A
3062 /*
3063 * Allocate a VM page for the pdpt page
3064 */
3065 while ((m = vm_page_grab()) == VM_PAGE_NULL)
3066 VM_PAGE_WAIT();
4452a7af 3067
4452a7af 3068 /*
0c530ab8
A
3069 * put the page into the pmap's obj list so it
3070 * can be found later.
4452a7af 3071 */
0c530ab8
A
3072 pn = m->phys_page;
3073 pa = i386_ptob(pn);
3074 i = pdptidx(map, vaddr);
4452a7af 3075
2d21ac55
A
3076 /*
3077 * Zero the page.
3078 */
3079 pmap_zero_page(pn);
0c530ab8 3080
b0d623f7 3081 vm_page_lockspin_queues();
0c530ab8 3082 vm_page_wire(m);
2d21ac55 3083 vm_page_unlock_queues();
0c530ab8 3084
b0d623f7
A
3085 OSAddAtomic(1, &inuse_ptepages_count);
3086
2d21ac55
A
3087	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3088 vm_object_lock(map->pm_obj_pdpt);
0c530ab8 3089
2d21ac55 3090 PMAP_LOCK(map);
0c530ab8
A
3091 /*
3092 * See if someone else expanded us first
3093 */
3094 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
2d21ac55
A
3095 PMAP_UNLOCK(map);
3096 vm_object_unlock(map->pm_obj_pdpt);
3097
b0d623f7 3098 VM_PAGE_FREE(m);
2d21ac55 3099
b0d623f7 3100 OSAddAtomic(-1, &inuse_ptepages_count);
0c530ab8
A
3101 return;
3102 }
3103
2d21ac55
A
3104#if 0 /* DEBUG */
3105 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
3106 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3107 map, map->pm_obj_pdpt, vaddr, i);
3108 }
3109#endif
3110 vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
3111 vm_object_unlock(map->pm_obj_pdpt);
3112
0c530ab8
A
3113 /*
3114 * Set the page directory entry for this page table.
0c530ab8 3115 */
0c530ab8
A
3116 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
3117
3118 pmap_store_pte(pdptp, pa_to_pte(pa)
3119 | INTEL_PTE_VALID
3120 | INTEL_PTE_USER
3121 | INTEL_PTE_WRITE);
3122
2d21ac55 3123 PMAP_UNLOCK(map);
0c530ab8
A
3124
3125 return;
3126
3127}
3128
3129
3130
3131/*
3132 * Routine: pmap_expand
3133 *
3134 * Expands a pmap to be able to map the specified virtual address.
3135 *
3136 * Allocates new virtual memory for the P0 or P1 portion of the
3137 * pmap, then re-maps the physical pages that were in the old
3138 * pmap to be in the new pmap.
3139 *
3140 * Must be called with the pmap system and the pmap unlocked,
3141 * since these must be unlocked to use vm_allocate or vm_deallocate.
3142 * Thus it must be called in a loop that checks whether the map
3143 * has been expanded enough.
3144 * (We won't loop forever, since page tables aren't shrunk.)
3145 */
3146void
3147pmap_expand(
3148 pmap_t map,
3149 vm_map_offset_t vaddr)
3150{
3151 pt_entry_t *pdp;
3152 register vm_page_t m;
3153 register pmap_paddr_t pa;
3154 uint64_t i;
3155 spl_t spl;
3156 ppnum_t pn;
3157
3158 /*
3159 * if not the kernel map (while we are still compat kernel mode)
3160 * and we are 64 bit, propagate expand upwards
3161 */
3162
3163 if (cpu_64bit && (map != kernel_pmap)) {
2d21ac55
A
3164 spl = splhigh();
3165 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
3166 splx(spl);
3167 pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */
3168 spl = splhigh();
3169 }
3170 splx(spl);
0c530ab8
A
3171 }
3172
0c530ab8
A
3173 /*
3174 * Allocate a VM page for the pde entries.
3175 */
3176 while ((m = vm_page_grab()) == VM_PAGE_NULL)
3177 VM_PAGE_WAIT();
3178
3179 /*
3180 * put the page into the pmap's obj list so it
3181 * can be found later.
3182 */
3183 pn = m->phys_page;
3184 pa = i386_ptob(pn);
3185 i = pdeidx(map, vaddr);
3186
2d21ac55
A
3187 /*
3188 * Zero the page.
3189 */
3190 pmap_zero_page(pn);
0c530ab8 3191
b0d623f7 3192 vm_page_lockspin_queues();
0c530ab8 3193 vm_page_wire(m);
0c530ab8 3194 vm_page_unlock_queues();
0c530ab8 3195
b0d623f7
A
3196 OSAddAtomic(1, &inuse_ptepages_count);
3197
2d21ac55
A
3198	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3199 vm_object_lock(map->pm_obj);
0c530ab8 3200
2d21ac55 3201 PMAP_LOCK(map);
0c530ab8
A
3202 /*
3203 * See if someone else expanded us first
3204 */
2d21ac55 3205
0c530ab8 3206 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2d21ac55
A
3207 PMAP_UNLOCK(map);
3208 vm_object_unlock(map->pm_obj);
0c530ab8 3209
b0d623f7 3210 VM_PAGE_FREE(m);
2d21ac55 3211
b0d623f7 3212 OSAddAtomic(-1, &inuse_ptepages_count);
0c530ab8
A
3213 return;
3214 }
3215
2d21ac55
A
3216#if 0 /* DEBUG */
3217 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
3218 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
3219 map, map->pm_obj, vaddr, i);
3220 }
3221#endif
3222 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
3223 vm_object_unlock(map->pm_obj);
0c530ab8
A
3224
3225 /*
2d21ac55 3226 * refetch while locked
0c530ab8
A
3227 */
3228
2d21ac55
A
3229 pdp = pmap_pde(map, vaddr);
3230
3231 /*
3232 * Set the page directory entry for this page table.
3233 */
0c530ab8
A
3234 pmap_store_pte(pdp, pa_to_pte(pa)
3235 | INTEL_PTE_VALID
3236 | INTEL_PTE_USER
3237 | INTEL_PTE_WRITE);
0c530ab8 3238
2d21ac55 3239 PMAP_UNLOCK(map);
0c530ab8
A
3240
3241 return;
3242}
3243
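/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * the retry convention that callers of the expansion routines above follow
 * (compare the loop in pmap_enter()): keep expanding until the leaf PTE for
 * the address exists, since each paging level may need to be grown and
 * another thread may race us at any level.  The helper name is hypothetical,
 * and it assumes the caller does not already hold the pmap lock, which
 * pmap_expand() takes internally.
 */
#if 0
static pt_entry_t *
example_pte_expand_loop(pmap_t map, vm_map_offset_t vaddr)
{
	pt_entry_t	*ptep;

	while ((ptep = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
		pmap_expand(map, vaddr);	/* grows pde/pdpt/pml4 levels as needed */
	}
	return (ptep);
}
#endif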
3244
3245/*
3246 * pmap_sync_page_data_phys(ppnum_t pa)
3247 *
3248 * Invalidates all of the instruction cache on a physical page and
3249 * pushes any dirty data from the data cache for the same physical page
3250 * Not required in i386.
3251 */
3252void
3253pmap_sync_page_data_phys(__unused ppnum_t pa)
3254{
3255 return;
3256}
3257
3258/*
3259 * pmap_sync_page_attributes_phys(ppnum_t pa)
3260 *
3261 * Write back and invalidate all cachelines on a physical page.
3262 */
3263void
3264pmap_sync_page_attributes_phys(ppnum_t pa)
3265{
3266 cache_flush_page_phys(pa);
3267}
3268
2d21ac55
A
3269
3270
3271#ifdef CURRENTLY_UNUSED_AND_UNTESTED
3272
0c530ab8
A
3273int collect_ref;
3274int collect_unref;
3275
3276/*
3277 * Routine: pmap_collect
3278 * Function:
3279 * Garbage collects the physical map system for
3280 * pages which are no longer used.
3281 * Success need not be guaranteed -- that is, there
3282 * may well be pages which are not referenced, but
3283 * others may be collected.
3284 * Usage:
3285 * Called by the pageout daemon when pages are scarce.
3286 */
3287void
3288pmap_collect(
3289 pmap_t p)
3290{
3291 register pt_entry_t *pdp, *ptp;
3292 pt_entry_t *eptp;
3293 int wired;
0c530ab8
A
3294
3295 if (p == PMAP_NULL)
3296 return;
3297
3298 if (p == kernel_pmap)
3299 return;
3300
3301 /*
3302 * Garbage collect map.
3303 */
2d21ac55 3304 PMAP_LOCK(p);
0c530ab8
A
3305
3306 for (pdp = (pt_entry_t *)p->dirbase;
4452a7af
A
3307 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
3308 pdp++)
3309 {
3310 if (*pdp & INTEL_PTE_VALID) {
3311 if(*pdp & INTEL_PTE_REF) {
0c530ab8 3312 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
4452a7af
A
3313 collect_ref++;
3314 } else {
3315 collect_unref++;
3316 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
3317 eptp = ptp + NPTEPG;
3318
3319 /*
3320 * If the pte page has any wired mappings, we cannot
3321 * free it.
3322 */
3323 wired = 0;
3324 {
3325 register pt_entry_t *ptep;
3326 for (ptep = ptp; ptep < eptp; ptep++) {
3327 if (iswired(*ptep)) {
3328 wired = 1;
5d5c5d0d 3329 break;
1c79356b
A
3330 }
3331 }
3332 }
3333 if (!wired) {
3334 /*
3335 * Remove the virtual addresses mapped by this pte page.
3336 */
3337 pmap_remove_range(p,
91447636 3338 pdetova(pdp - (pt_entry_t *)p->dirbase),
1c79356b
A
3339 ptp,
3340 eptp);
3341
3342 /*
3343 * Invalidate the page directory pointer.
3344 */
0c530ab8 3345 pmap_store_pte(pdp, 0x0);
91447636 3346
2d21ac55 3347 PMAP_UNLOCK(p);
1c79356b
A
3348
3349 /*
3350 * And free the pte page itself.
3351 */
3352 {
3353 register vm_page_t m;
3354
91447636 3355 vm_object_lock(p->pm_obj);
2d21ac55 3356
91447636 3357 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
1c79356b
A
3358 if (m == VM_PAGE_NULL)
3359 panic("pmap_collect: pte page not in object");
2d21ac55 3360
b0d623f7
A
3361 VM_PAGE_FREE(m);
3362
3363 OSAddAtomic(-1, &inuse_ptepages_count);
2d21ac55 3364
91447636 3365 vm_object_unlock(p->pm_obj);
1c79356b
A
3366 }
3367
2d21ac55 3368 PMAP_LOCK(p);
1c79356b 3369 }
91447636
A
3370 }
3371 }
1c79356b 3372 }
0c530ab8 3373
2d21ac55
A
3374 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
3375 PMAP_UNLOCK(p);
1c79356b
A
3376 return;
3377
3378}
2d21ac55 3379#endif
1c79356b 3380
1c79356b 3381
1c79356b 3382void
2d21ac55 3383pmap_copy_page(ppnum_t src, ppnum_t dst)
1c79356b 3384{
2d21ac55
A
3385 bcopy_phys((addr64_t)i386_ptob(src),
3386 (addr64_t)i386_ptob(dst),
3387 PAGE_SIZE);
1c79356b 3388}
1c79356b 3389
1c79356b
A
3390
3391/*
3392 * Routine: pmap_pageable
3393 * Function:
3394 * Make the specified pages (by pmap, offset)
3395 * pageable (or not) as requested.
3396 *
3397 * A page which is not pageable may not take
3398 * a fault; therefore, its page table entry
3399 * must remain valid for the duration.
3400 *
3401 * This routine is merely advisory; pmap_enter
3402 * will specify that these pages are to be wired
3403 * down (or not) as appropriate.
3404 */
3405void
3406pmap_pageable(
91447636 3407 __unused pmap_t pmap,
0c530ab8
A
3408 __unused vm_map_offset_t start_addr,
3409 __unused vm_map_offset_t end_addr,
91447636 3410 __unused boolean_t pageable)
1c79356b
A
3411{
3412#ifdef lint
91447636 3413 pmap++; start_addr++; end_addr++; pageable++;
1c79356b
A
3414#endif /* lint */
3415}
3416
3417/*
3418 * Clear specified attribute bits.
3419 */
3420void
3421phys_attribute_clear(
2d21ac55 3422 ppnum_t pn,
1c79356b
A
3423 int bits)
3424{
2d21ac55
A
3425 pv_rooted_entry_t pv_h;
3426 register pv_hashed_entry_t pv_e;
1c79356b
A
3427 register pt_entry_t *pte;
3428 int pai;
3429 register pmap_t pmap;
1c79356b 3430
2d21ac55 3431 pmap_intr_assert();
91447636 3432 assert(pn != vm_page_fictitious_addr);
2d21ac55
A
3433 if (pn == vm_page_guard_addr)
3434 return;
3435
3436 pai = ppn_to_pai(pn);
3437
3438 if (!managed_page(pai)) {
1c79356b
A
3439 /*
3440 * Not a managed page.
3441 */
3442 return;
3443 }
3444
b0d623f7 3445
2d21ac55
A
3446 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
3447 (int) pn, bits, 0, 0, 0);
1c79356b 3448
1c79356b
A
3449 pv_h = pai_to_pvh(pai);
3450
2d21ac55
A
3451 LOCK_PVH(pai);
3452
1c79356b
A
3453 /*
3454 * Walk down PV list, clearing all modify or reference bits.
3455 * We do not have to lock the pv_list because we have
3456 * the entire pmap system locked.
3457 */
3458 if (pv_h->pmap != PMAP_NULL) {
3459 /*
3460 * There are some mappings.
3461 */
1c79356b 3462
2d21ac55
A
3463 pv_e = (pv_hashed_entry_t)pv_h;
3464
3465 do {
1c79356b 3466 pmap = pv_e->pmap;
1c79356b
A
3467
3468 {
2d21ac55 3469 vm_map_offset_t va;
1c79356b
A
3470
3471 va = pv_e->va;
1c79356b 3472
2d21ac55
A
3473 /*
3474 * Clear modify and/or reference bits.
3475 */
91447636 3476
0c530ab8
A
3477 pte = pmap_pte(pmap, va);
3478 pmap_update_pte(pte, *pte, (*pte & ~bits));
c910b4d9
A
3479 /* Ensure all processors using this translation
3480 * invalidate this TLB entry. The invalidation *must* follow
3481 * the PTE update, to ensure that the TLB shadow of the
3482 * 'D' bit (in particular) is synchronized with the
3483 * updated PTE.
3484 */
3485 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
1c79356b 3486 }
91447636 3487
2d21ac55 3488 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
1c79356b 3489
2d21ac55
A
3490 } while (pv_e != (pv_hashed_entry_t)pv_h);
3491 }
1c79356b
A
3492 pmap_phys_attributes[pai] &= ~bits;
3493
2d21ac55
A
3494 UNLOCK_PVH(pai);
3495
3496 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3497 0, 0, 0, 0, 0);
3498
1c79356b
A
3499}
3500
3501/*
3502 * Check specified attribute bits.
3503 */
2d21ac55 3504int
1c79356b 3505phys_attribute_test(
2d21ac55 3506 ppnum_t pn,
1c79356b
A
3507 int bits)
3508{
2d21ac55
A
3509 pv_rooted_entry_t pv_h;
3510 register pv_hashed_entry_t pv_e;
1c79356b
A
3511 register pt_entry_t *pte;
3512 int pai;
3513 register pmap_t pmap;
2d21ac55 3514 int attributes = 0;
1c79356b 3515
2d21ac55 3516 pmap_intr_assert();
91447636 3517 assert(pn != vm_page_fictitious_addr);
2d21ac55
A
3518 if (pn == vm_page_guard_addr)
3519 return 0;
3520
3521 pai = ppn_to_pai(pn);
3522
3523 if (!managed_page(pai)) {
1c79356b
A
3524 /*
3525 * Not a managed page.
3526 */
2d21ac55 3527 return (0);
1c79356b
A
3528 }
3529
0c530ab8
A
3530 /*
3531 * super fast check... if bits already collected
3532 * no need to take any locks...
3533 * if not set, we need to recheck after taking
3534 * the lock in case they got pulled in while
3535 * we were waiting for the lock
3536 */
2d21ac55
A
3537 if ( (pmap_phys_attributes[pai] & bits) == bits)
3538 return (bits);
3539
0c530ab8
A
3540 pv_h = pai_to_pvh(pai);
3541
2d21ac55 3542 LOCK_PVH(pai);
1c79356b 3543
2d21ac55 3544 attributes = pmap_phys_attributes[pai] & bits;
1c79356b 3545
b0d623f7 3546
1c79356b 3547 /*
2d21ac55
A
3548 * Walk down PV list, checking the mappings until we
3549 * reach the end or we've found the attributes we've asked for
1c79356b
A
 3550 * The PV list cannot change beneath us here: LOCK_PVH(pai),
 3551 * taken above, protects it while we walk the mappings.
3552 */
3553 if (pv_h->pmap != PMAP_NULL) {
3554 /*
3555 * There are some mappings.
3556 */
2d21ac55
A
3557 pv_e = (pv_hashed_entry_t)pv_h;
3558 if (attributes != bits) do {
1c79356b 3559
2d21ac55 3560 pmap = pv_e->pmap;
1c79356b
A
3561
3562 {
2d21ac55 3563 vm_map_offset_t va;
1c79356b
A
3564
3565 va = pv_e->va;
2d21ac55
A
3566 /*
 3567 * First make sure any processor actively using
 3568 * this pmap flushes its TLB state.
3569 */
3570 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
1c79356b 3571
1c79356b 3572 /*
2d21ac55 3573 * pick up modify and/or reference bits from this mapping
1c79356b 3574 */
2d21ac55 3575 pte = pmap_pte(pmap, va);
b0d623f7 3576 attributes |= (int)(*pte & bits);
2d21ac55 3577
1c79356b 3578 }
2d21ac55
A
3579
3580 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3581
3582 } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h));
1c79356b 3583 }
2d21ac55
A
3584
3585 UNLOCK_PVH(pai);
3586 return (attributes);
1c79356b
A
3587}
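/*
 * Illustrative sketch, not part of this file: phys_attribute_test first
 * consults the cached attribute byte without locking and only falls back
 * to the locked PV walk when some of the requested bits are still missing.
 * The simplified model below keeps that check/recheck shape; page_lock,
 * page_unlock, slow_collect and attr_table are hypothetical stand-ins for
 * LOCK_PVH, UNLOCK_PVH, the PV walk and pmap_phys_attributes.
 */
static unsigned char attr_table[4096];		/* models pmap_phys_attributes */

static void page_lock(int pai)   { (void)pai; }	/* stub for LOCK_PVH */
static void page_unlock(int pai) { (void)pai; }	/* stub for UNLOCK_PVH */
static int  slow_collect(int pai, int bits)	/* stub for the locked PV walk */
{ (void)pai; return bits; }

static int
attr_test_example(int pai, int bits)
{
	int attributes;

	/* fast path: everything asked for is already recorded */
	if ((attr_table[pai] & bits) == bits)
		return bits;

	/* slow path: recheck under the lock, then walk the mappings */
	page_lock(pai);
	attributes = attr_table[pai] & bits;
	if (attributes != bits)
		attributes |= slow_collect(pai, bits);
	page_unlock(pai);

	return attributes;
}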
3588
3589/*
3590 * Set specified attribute bits.
3591 */
3592void
3593phys_attribute_set(
2d21ac55 3594 ppnum_t pn,
1c79356b
A
3595 int bits)
3596{
2d21ac55 3597 int pai;
1c79356b 3598
2d21ac55 3599 pmap_intr_assert();
91447636 3600 assert(pn != vm_page_fictitious_addr);
2d21ac55
A
3601 if (pn == vm_page_guard_addr)
3602 return;
3603
3604 pai = ppn_to_pai(pn);
3605
3606 if (!managed_page(pai)) {
1c79356b
A
3607 /*
3608 * Not a managed page.
3609 */
3610 return;
3611 }
3612
2d21ac55
A
3613 LOCK_PVH(pai);
3614
3615 pmap_phys_attributes[pai] |= bits;
3616
3617 UNLOCK_PVH(pai);
1c79356b
A
3618}
3619
3620/*
3621 * Set the modify bit on the specified physical page.
3622 */
3623
3624void pmap_set_modify(
55e303ae 3625 ppnum_t pn)
1c79356b 3626{
91447636 3627 phys_attribute_set(pn, PHYS_MODIFIED);
1c79356b
A
3628}
3629
3630/*
3631 * Clear the modify bits on the specified physical page.
3632 */
3633
3634void
3635pmap_clear_modify(
55e303ae 3636 ppnum_t pn)
1c79356b 3637{
91447636 3638 phys_attribute_clear(pn, PHYS_MODIFIED);
1c79356b
A
3639}
3640
3641/*
3642 * pmap_is_modified:
3643 *
3644 * Return whether or not the specified physical page is modified
3645 * by any physical maps.
3646 */
3647
3648boolean_t
3649pmap_is_modified(
55e303ae 3650 ppnum_t pn)
1c79356b 3651{
2d21ac55
A
3652 if (phys_attribute_test(pn, PHYS_MODIFIED))
3653 return TRUE;
3654
3655 return FALSE;
1c79356b
A
3656}
3657
3658/*
3659 * pmap_clear_reference:
3660 *
3661 * Clear the reference bit on the specified physical page.
3662 */
3663
3664void
3665pmap_clear_reference(
55e303ae 3666 ppnum_t pn)
1c79356b 3667{
91447636
A
3668 phys_attribute_clear(pn, PHYS_REFERENCED);
3669}
3670
3671void
3672pmap_set_reference(ppnum_t pn)
3673{
3674 phys_attribute_set(pn, PHYS_REFERENCED);
1c79356b
A
3675}
3676
3677/*
3678 * pmap_is_referenced:
3679 *
3680 * Return whether or not the specified physical page is referenced
3681 * by any physical maps.
3682 */
3683
3684boolean_t
3685pmap_is_referenced(
55e303ae 3686 ppnum_t pn)
1c79356b 3687{
2d21ac55
A
3688 if (phys_attribute_test(pn, PHYS_REFERENCED))
3689 return TRUE;
3690
3691 return FALSE;
91447636
A
3692}
3693
3694/*
3695 * pmap_get_refmod(phys)
3696 * returns the referenced and modified bits of the specified
3697 * physical page.
3698 */
3699unsigned int
3700pmap_get_refmod(ppnum_t pa)
3701{
2d21ac55
A
3702 int refmod;
3703 unsigned int retval = 0;
3704
3705 refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);
3706
3707 if (refmod & PHYS_MODIFIED)
3708 retval |= VM_MEM_MODIFIED;
3709 if (refmod & PHYS_REFERENCED)
3710 retval |= VM_MEM_REFERENCED;
3711
3712 return (retval);
91447636
A
3713}
3714
3715/*
3716 * pmap_clear_refmod(phys, mask)
3717 * clears the referenced and modified bits as specified by the mask
3718 * of the specified physical page.
3719 */
3720void
3721pmap_clear_refmod(ppnum_t pa, unsigned int mask)
3722{
3723 unsigned int x86Mask;
3724
3725 x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
3726 | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3727 phys_attribute_clear(pa, x86Mask);
1c79356b
A
3728}
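/*
 * Illustrative sketch, not part of this file: pmap_get_refmod and
 * pmap_clear_refmod only translate between the VM layer's VM_MEM_* flags
 * and the pmap layer's PHYS_* attribute bits. The two translations are
 * shown below with arbitrary example bit values (the real constants live
 * in the pmap and vm headers).
 */
#define EX_PHYS_MODIFIED	0x01	/* example values only */
#define EX_PHYS_REFERENCED	0x02
#define EX_VM_MEM_MODIFIED	0x04
#define EX_VM_MEM_REFERENCED	0x08

static unsigned int
refmod_to_vm(int refmod)			/* pmap bits -> VM flags */
{
	unsigned int retval = 0;

	if (refmod & EX_PHYS_MODIFIED)
		retval |= EX_VM_MEM_MODIFIED;
	if (refmod & EX_PHYS_REFERENCED)
		retval |= EX_VM_MEM_REFERENCED;
	return retval;
}

static unsigned int
vm_to_refmod(unsigned int mask)			/* VM flags -> pmap bits */
{
	return ((mask & EX_VM_MEM_MODIFIED)   ? EX_PHYS_MODIFIED   : 0)
	     | ((mask & EX_VM_MEM_REFERENCED) ? EX_PHYS_REFERENCED : 0);
}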
3729
1c79356b 3730void
91447636
A
3731invalidate_icache(__unused vm_offset_t addr,
3732 __unused unsigned cnt,
3733 __unused int phys)
1c79356b
A
3734{
3735 return;
3736}
3737void
91447636
A
3738flush_dcache(__unused vm_offset_t addr,
3739 __unused unsigned count,
3740 __unused int phys)
1c79356b
A
3741{
3742 return;
3743}
3744
2d21ac55
A
3745#if CONFIG_DTRACE
3746/*
3747 * Constrain DTrace copyin/copyout actions
3748 */
3749extern kern_return_t dtrace_copyio_preflight(addr64_t);
3750extern kern_return_t dtrace_copyio_postflight(addr64_t);
3751
3752kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3753{
3754 thread_t thread = current_thread();
3755
3756 if (current_map() == kernel_map)
3757 return KERN_FAILURE;
3758 else if (thread->machine.specFlags & CopyIOActive)
3759 return KERN_FAILURE;
3760 else
3761 return KERN_SUCCESS;
3762}
3763
3764kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3765{
3766 return KERN_SUCCESS;
3767}
3768#endif /* CONFIG_DTRACE */
3769
0c530ab8 3770#if MACH_KDB
6601e61a 3771
0c530ab8 3772/* show phys page mappings and attributes */
6601e61a 3773
0c530ab8 3774extern void db_show_page(pmap_paddr_t pa);
6601e61a 3775
2d21ac55 3776#if 0
6601e61a 3777void
0c530ab8 3778db_show_page(pmap_paddr_t pa)
6601e61a 3779{
0c530ab8
A
3780 pv_entry_t pv_h;
3781 int pai;
3782 char attr;
3783
3784 pai = pa_index(pa);
3785 pv_h = pai_to_pvh(pai);
1c79356b
A
3786
3787 attr = pmap_phys_attributes[pai];
2d21ac55 3788 printf("phys page %llx ", pa);
1c79356b
A
3789 if (attr & PHYS_MODIFIED)
3790 printf("modified, ");
3791 if (attr & PHYS_REFERENCED)
3792 printf("referenced, ");
3793 if (pv_h->pmap || pv_h->next)
3794 printf(" mapped at\n");
3795 else
3796 printf(" not mapped\n");
3797 for (; pv_h; pv_h = pv_h->next)
3798 if (pv_h->pmap)
2d21ac55 3799 printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
1c79356b 3800}
2d21ac55 3801#endif
1c79356b
A
3802
3803#endif /* MACH_KDB */
3804
3805#if MACH_KDB
2d21ac55 3806#if 0
1c79356b
A
3807void db_kvtophys(vm_offset_t);
3808void db_show_vaddrs(pt_entry_t *);
3809
3810/*
3811 * print out the results of kvtophys(arg)
3812 */
3813void
3814db_kvtophys(
3815 vm_offset_t vaddr)
3816{
0c530ab8 3817 db_printf("0x%qx", kvtophys(vaddr));
1c79356b
A
3818}
3819
3820/*
 3821 * Walk the page tables.
3822 */
3823void
3824db_show_vaddrs(
3825 pt_entry_t *dirbase)
3826{
3827 pt_entry_t *ptep, *pdep, tmp;
0c530ab8 3828 unsigned int x, y, pdecnt, ptecnt;
1c79356b
A
3829
3830 if (dirbase == 0) {
3831 dirbase = kernel_pmap->dirbase;
3832 }
3833 if (dirbase == 0) {
3834 db_printf("need a dirbase...\n");
3835 return;
3836 }
0c530ab8 3837 dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);
1c79356b
A
3838
3839 db_printf("dirbase: 0x%x\n", dirbase);
3840
3841 pdecnt = ptecnt = 0;
3842 pdep = &dirbase[0];
91447636 3843 for (y = 0; y < NPDEPG; y++, pdep++) {
1c79356b
A
3844 if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
3845 continue;
3846 }
3847 pdecnt++;
2d21ac55 3848 ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
1c79356b 3849 db_printf("dir[%4d]: 0x%x\n", y, *pdep);
91447636 3850 for (x = 0; x < NPTEPG; x++, ptep++) {
1c79356b
A
3851 if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
3852 continue;
3853 }
3854 ptecnt++;
3855 db_printf(" tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
3856 x,
3857 *ptep,
3858 (y << 22) | (x << 12),
3859 *ptep & ~INTEL_OFFMASK);
3860 }
3861 }
3862
3863 db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
3864
3865}
2d21ac55 3866#endif
1c79356b
A
3867#endif /* MACH_KDB */
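/*
 * Illustrative sketch, not part of this file: db_show_vaddrs above rebuilds
 * a virtual address from its page-directory index y and page-table index x
 * as (y << 22) | (x << 12), i.e. the classic 32-bit non-PAE 10/10/12 split.
 * The helpers below show that encode/decode arithmetic on plain integers.
 */
static unsigned int
va_from_indexes(unsigned int pde_idx, unsigned int pte_idx, unsigned int off)
{
	return (pde_idx << 22) | (pte_idx << 12) | (off & 0xfff);
}

static unsigned int va_pde_index(unsigned int va) { return (va >> 22) & 0x3ff; }
static unsigned int va_pte_index(unsigned int va) { return (va >> 12) & 0x3ff; }
static unsigned int va_page_off(unsigned int va)  { return va & 0xfff; }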
3868
3869#include <mach_vm_debug.h>
3870#if MACH_VM_DEBUG
3871#include <vm/vm_debug.h>
3872
3873int
3874pmap_list_resident_pages(
91447636
A
3875 __unused pmap_t pmap,
3876 __unused vm_offset_t *listp,
3877 __unused int space)
1c79356b
A
3878{
3879 return 0;
3880}
3881#endif /* MACH_VM_DEBUG */
3882
6601e61a 3883
1c79356b 3884
91447636
A
3885/* temporary workaround */
3886boolean_t
0c530ab8 3887coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
91447636 3888{
0c530ab8 3889#if 0
91447636 3890 pt_entry_t *ptep;
1c79356b 3891
91447636
A
3892 ptep = pmap_pte(map->pmap, va);
3893 if (0 == ptep)
3894 return FALSE;
3895 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
0c530ab8
A
3896#else
3897 return TRUE;
1c79356b 3898#endif
1c79356b
A
3899}
3900
1c79356b 3901
9bccf70c 3902boolean_t
91447636
A
3903phys_page_exists(
3904 ppnum_t pn)
9bccf70c 3905{
91447636
A
3906 assert(pn != vm_page_fictitious_addr);
3907
3908 if (!pmap_initialized)
3909 return (TRUE);
2d21ac55
A
3910
3911 if (pn == vm_page_guard_addr)
3912 return FALSE;
3913
3914 if (!managed_page(ppn_to_pai(pn)))
91447636
A
3915 return (FALSE);
3916
3917 return TRUE;
3918}
3919
3920void
2d21ac55 3921mapping_free_prime(void)
91447636
A
3922{
3923 int i;
2d21ac55
A
3924 pv_hashed_entry_t pvh_e;
3925 pv_hashed_entry_t pvh_eh;
3926 pv_hashed_entry_t pvh_et;
3927 int pv_cnt;
3928
3929 pv_cnt = 0;
3930 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3931 for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
3932 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3933
3934 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3935 pvh_eh = pvh_e;
3936
3937 if (pvh_et == PV_HASHED_ENTRY_NULL)
3938 pvh_et = pvh_e;
3939 pv_cnt++;
3940 }
3941 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
91447636 3942
2d21ac55
A
3943 pv_cnt = 0;
3944 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3945 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3946 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3947
3948 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3949 pvh_eh = pvh_e;
3950
3951 if (pvh_et == PV_HASHED_ENTRY_NULL)
3952 pvh_et = pvh_e;
3953 pv_cnt++;
91447636 3954 }
2d21ac55
A
3955 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3956
91447636
A
3957}
3958
3959void
2d21ac55 3960mapping_adjust(void)
91447636 3961{
2d21ac55
A
3962 pv_hashed_entry_t pvh_e;
3963 pv_hashed_entry_t pvh_eh;
3964 pv_hashed_entry_t pvh_et;
3965 int pv_cnt;
91447636 3966 int i;
91447636
A
3967
3968 if (mapping_adjust_call == NULL) {
3969 thread_call_setup(&mapping_adjust_call_data,
3970 (thread_call_func_t) mapping_adjust,
3971 (thread_call_param_t) NULL);
3972 mapping_adjust_call = &mapping_adjust_call_data;
3973 }
2d21ac55
A
3974
3975 pv_cnt = 0;
3976 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3977 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
3978 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3979 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3980
3981 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3982 pvh_eh = pvh_e;
3983
3984 if (pvh_et == PV_HASHED_ENTRY_NULL)
3985 pvh_et = pvh_e;
3986 pv_cnt++;
3987 }
3988 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3989 }
3990
3991 pv_cnt = 0;
3992 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3993 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
3994 for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
3995 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3996
3997 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3998 pvh_eh = pvh_e;
3999
4000 if (pvh_et == PV_HASHED_ENTRY_NULL)
4001 pvh_et = pvh_e;
4002 pv_cnt++;
91447636 4003 }
2d21ac55 4004 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
91447636
A
4005 }
4006 mappingrecurse = 0;
4007}
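/*
 * Illustrative sketch, not part of this file: mapping_free_prime and
 * mapping_adjust both build a chunk of freshly allocated PV entries into a
 * private singly linked chain (tracking head, tail and count) and then
 * splice the whole chain onto a free list once the free count drops below
 * a low-water mark. The user-space model below keeps that shape; the
 * watermark, chunk size and node type are hypothetical.
 */
#include <stdlib.h>

struct ex_node { struct ex_node *next; };

static struct ex_node	*ex_free_head;		/* models the PV free list */
static int		ex_free_count;
#define EX_LOW_WATER	16			/* example watermark only */
#define EX_ALLOC_CHUNK	64			/* example chunk size only */

static void
replenish_if_low(void)
{
	struct ex_node *head = 0, *tail = 0, *n;
	int cnt = 0, i;

	if (ex_free_count >= EX_LOW_WATER)
		return;
	/* build the whole chunk privately, then splice it on in one step */
	for (i = 0; i < EX_ALLOC_CHUNK; i++) {
		n = malloc(sizeof(*n));
		if (n == 0)
			break;
		n->next = head;
		head = n;
		if (tail == 0)
			tail = n;
		cnt++;
	}
	if (head != 0) {
		tail->next = ex_free_head;	/* splice chunk onto the free list */
		ex_free_head = head;
		ex_free_count += cnt;
	}
}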
4008
4009void
0c530ab8 4010pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
91447636 4011{
2d21ac55
A
4012 int i;
4013 pt_entry_t *opte, *npte;
4014 pt_entry_t pte;
4015 spl_t s;
4016
4017 for (i = 0; i < cnt; i++) {
4018 s = splhigh();
4019 opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
4020 if (0 == opte)
4021 panic("kernel_commpage");
4022 pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
4023 pte &= ~INTEL_PTE_WRITE; // ensure read only
4024 npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
4025 if (0 == npte)
4026 panic("user_commpage");
4027 pmap_store_pte(npte, pte);
4028 splx(s);
4029 kernel_commpage += INTEL_PGBYTES;
4030 user_commpage += INTEL_PGBYTES;
4031 }
91447636
A
4032}
4033
2d21ac55 4034
0c530ab8
A
4035#define PMAP_COMMPAGE64_CNT (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
4036pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];
4037
4038void
4039pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
4040{
2d21ac55
A
4041 int i;
4042 pt_entry_t *kptep;
0c530ab8 4043
2d21ac55 4044 PMAP_LOCK(kernel_pmap);
0c530ab8 4045
2d21ac55
A
4046 for (i = 0; i < cnt; i++) {
4047 kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
4048 if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
4049 panic("pmap_commpage64_init pte");
4050 pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
4051 }
4052 PMAP_UNLOCK(kernel_pmap);
0c530ab8
A
4053}
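/*
 * Illustrative sketch, not part of this file: pmap_commpage64_init captures
 * each kernel comm-page PTE as a template with the write bit cleared and the
 * user bit set, so the same physical pages can later be mapped read-only
 * into user space. The mask arithmetic is shown below; EX_PTE_WRITE and
 * EX_PTE_USER are stand-ins for the INTEL_PTE_* constants.
 */
typedef unsigned long long ex_pte_t;

#define EX_PTE_WRITE	0x002ULL	/* example bit positions only */
#define EX_PTE_USER	0x004ULL

/* Derive a read-only, user-accessible template from a kernel PTE. */
static ex_pte_t
make_user_ro_template(ex_pte_t kernel_pte)
{
	return (kernel_pte & ~EX_PTE_WRITE) | EX_PTE_USER;
}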
4054
0c530ab8 4055
91447636 4056static cpu_pmap_t cpu_pmap_master;
91447636
A
4057
4058struct cpu_pmap *
4059pmap_cpu_alloc(boolean_t is_boot_cpu)
4060{
4061 int ret;
4062 int i;
4063 cpu_pmap_t *cp;
91447636 4064 vm_offset_t address;
0c530ab8 4065 vm_map_address_t mapaddr;
91447636 4066 vm_map_entry_t entry;
0c530ab8 4067 pt_entry_t *pte;
91447636
A
4068
4069 if (is_boot_cpu) {
4070 cp = &cpu_pmap_master;
91447636
A
4071 } else {
4072 /*
4073 * The per-cpu pmap data structure itself.
4074 */
4075 ret = kmem_alloc(kernel_map,
4076 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
4077 if (ret != KERN_SUCCESS) {
4078 printf("pmap_cpu_alloc() failed ret=%d\n", ret);
4079 return NULL;
4080 }
4081 bzero((void *)cp, sizeof(cpu_pmap_t));
4082
4083 /*
0c530ab8 4084 * The temporary windows used for copy/zero - see loose_ends.c
91447636 4085 */
0c530ab8
A
4086 ret = vm_map_find_space(kernel_map,
4087 &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
91447636 4088 if (ret != KERN_SUCCESS) {
0c530ab8
A
4089 printf("pmap_cpu_alloc() "
4090 "vm_map_find_space ret=%d\n", ret);
91447636
A
4091 pmap_cpu_free(cp);
4092 return NULL;
4093 }
0c530ab8 4094 address = (vm_offset_t)mapaddr;
4452a7af 4095
0c530ab8 4096 for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
2d21ac55
A
4097 spl_t s;
4098 s = splhigh();
0c530ab8
A
4099 while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
4100 pmap_expand(kernel_pmap, (vm_map_offset_t)address);
4101 * (int *) pte = 0;
6601e61a 4102 cp->mapwindow[i].prv_CADDR = (caddr_t) address;
0c530ab8 4103 cp->mapwindow[i].prv_CMAP = pte;
2d21ac55 4104 splx(s);
4452a7af 4105 }
0c530ab8 4106 vm_map_unlock(kernel_map);
4452a7af
A
4107 }
4108
0c530ab8
A
4109 cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
4110 cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
4111 cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;
4452a7af 4112
6601e61a 4113 return cp;
4452a7af
A
4114}
4115
4116void
6601e61a 4117pmap_cpu_free(struct cpu_pmap *cp)
4452a7af 4118{
6601e61a 4119 if (cp != NULL && cp != &cpu_pmap_master) {
6601e61a 4120 kfree((void *) cp, sizeof(cpu_pmap_t));
4452a7af 4121 }
4452a7af 4122}
0c530ab8
A
4123
4124
4125mapwindow_t *
4126pmap_get_mapwindow(pt_entry_t pentry)
4127{
4128 mapwindow_t *mp;
4129 int i;
0c530ab8 4130
2d21ac55 4131 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
0c530ab8
A
4132
4133 /*
4134 * Note: 0th map reserved for pmap_pte()
4135 */
4136 for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
4137 mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];
4138
4139 if (*mp->prv_CMAP == 0) {
2d21ac55
A
4140 pmap_store_pte(mp->prv_CMAP, pentry);
4141
4142 invlpg((uintptr_t)mp->prv_CADDR);
4143
4144 return (mp);
0c530ab8
A
4145 }
4146 }
2d21ac55
A
4147 panic("pmap_get_mapwindow: no windows available");
4148
4149 return NULL;
4150}
4151
4152
4153void
4154pmap_put_mapwindow(mapwindow_t *mp)
4155{
4156 pmap_store_pte(mp->prv_CMAP, 0);
0c530ab8
A
4157}
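/*
 * Illustrative sketch, not part of this file: pmap_get_mapwindow scans the
 * per-cpu window array for a slot whose CMAP entry is zero, claims it by
 * storing the new PTE, and pmap_put_mapwindow releases it by storing zero
 * again. The stand-alone model below shows that claim/release pattern in a
 * single-threaded setting (the real code relies on preemption being
 * disabled on the owning cpu); names and sizes are hypothetical.
 */
#define EX_NWINDOWS	8
#define EX_FIRSTFREE	1	/* slot 0 reserved, as in the real code */

struct ex_window {
	unsigned long long	cmap;	/* 0 means "free" */
	void			*caddr;	/* fixed VA backing this window */
};

static struct ex_window ex_windows[EX_NWINDOWS];

static struct ex_window *
window_get(unsigned long long pentry)
{
	int i;

	for (i = EX_FIRSTFREE; i < EX_NWINDOWS; i++) {
		if (ex_windows[i].cmap == 0) {
			ex_windows[i].cmap = pentry;	/* claim the slot */
			/* the real code also invalidates the old TLB entry here */
			return &ex_windows[i];
		}
	}
	return 0;				/* the real code panics instead */
}

static void
window_put(struct ex_window *w)
{
	w->cmap = 0;				/* mark the slot free again */
}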
4158
0c530ab8
A
4159void
4160pmap_switch(pmap_t tpmap)
4161{
4162 spl_t s;
0c530ab8
A
4163
4164 s = splhigh(); /* Make sure interruptions are disabled */
0c530ab8 4165
b0d623f7 4166 set_dirbase(tpmap, current_thread());
0c530ab8
A
4167
4168 splx(s);
4169}
4170
4171
4172/*
4173 * disable no-execute capability on
4174 * the specified pmap
4175 */
4176void pmap_disable_NX(pmap_t pmap) {
4177
4178 pmap->nx_enabled = 0;
4179}
4180
4181void
4182pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
4183 vm_size_t *alloc_size, int *collectable, int *exhaustable)
4184{
4185 *count = inuse_ptepages_count;
4186 *cur_size = PAGE_SIZE * inuse_ptepages_count;
4187 *max_size = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
4188 *elem_size = PAGE_SIZE;
4189 *alloc_size = PAGE_SIZE;
4190
4191 *collectable = 1;
4192 *exhaustable = 0;
4193}
4194
4195vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
4196{
4197 enum high_fixed_addresses a;
4198 a = e + HIGH_CPU_END * cpu;
4199 return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4200}
4201
4202vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
4203{
4204 return pmap_cpu_high_map_vaddr(cpu_number(), e);
4205}
4206
4207vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
4208{
4209 enum high_fixed_addresses a;
4210 vm_offset_t vaddr;
4211
4212 a = e + HIGH_CPU_END * cpu_number();
4213 vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
2d21ac55 4214 pmap_store_pte(pte_unique_base + a, pte);
0c530ab8
A
4215
4216 /* TLB flush for this page for this cpu */
4217 invlpg((uintptr_t)vaddr);
4218
4219 return vaddr;
4220}
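/*
 * Illustrative sketch, not part of this file: pmap_cpu_high_map_vaddr and
 * pmap_high_map locate a fixed per-cpu slot by computing
 * index = type + HIGH_CPU_END * cpu, so each cpu owns a contiguous group of
 * HIGH_CPU_END slots. The arithmetic is shown below with hypothetical sizes.
 */
#define EX_HIGH_CPU_END	4	/* number of per-cpu slot types (example) */
#define EX_SLOT_SIZE	4096	/* one page per slot (example) */

/* Index of slot 'type' belonging to 'cpu' in the flat slot array. */
static unsigned int
high_slot_index(unsigned int type, unsigned int cpu)
{
	return type + EX_HIGH_CPU_END * cpu;
}

/* Virtual address of that slot, given the base of the slot region. */
static unsigned long
high_slot_vaddr(unsigned long region_base, unsigned int type, unsigned int cpu)
{
	return region_base + (unsigned long)high_slot_index(type, cpu) * EX_SLOT_SIZE;
}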
4221
935ed37a
A
4222static inline void
4223pmap_cpuset_NMIPI(cpu_set cpu_mask) {
4224 unsigned int cpu, cpu_bit;
4225 uint64_t deadline;
4226
4227 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4228 if (cpu_mask & cpu_bit)
4229 cpu_NMI_interrupt(cpu);
4230 }
b0d623f7 4231 deadline = mach_absolute_time() + (LockTimeOut);
935ed37a
A
4232 while (mach_absolute_time() < deadline)
4233 cpu_pause();
4234}
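/*
 * Illustrative sketch, not part of this file: pmap_cpuset_NMIPI (and
 * pmap_flush_tlbs below) walk a cpu_set by pairing a cpu index with a
 * one-bit mask that is shifted left on each iteration. The loop shape,
 * stripped of the NMI and timing details:
 */
typedef unsigned long ex_cpu_set;

static void
for_each_cpu_in_set(ex_cpu_set cpu_mask, unsigned int ncpus,
		    void (*fn)(unsigned int cpu))
{
	unsigned int	cpu;
	ex_cpu_set	cpu_bit;

	for (cpu = 0, cpu_bit = 1; cpu < ncpus; cpu++, cpu_bit <<= 1) {
		if (cpu_mask & cpu_bit)
			fn(cpu);	/* e.g. cpu_NMI_interrupt(cpu) */
	}
}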
4235
0c530ab8
A
4236/*
4237 * Called with pmap locked, we:
4238 * - scan through per-cpu data to see which other cpus need to flush
4239 * - send an IPI to each non-idle cpu to be flushed
4240 * - wait for all to signal back that they are inactive or we see that
4241 * they are in an interrupt handler or at a safe point
 4242 * - flush the local tlb if it is active for this pmap
4243 * - return ... the caller will unlock the pmap
4244 */
4245void
4246pmap_flush_tlbs(pmap_t pmap)
4247{
4248 unsigned int cpu;
4249 unsigned int cpu_bit;
4250 cpu_set cpus_to_signal;
4251 unsigned int my_cpu = cpu_number();
4252 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
4253 boolean_t flush_self = FALSE;
4254 uint64_t deadline;
4255
2d21ac55
A
4256 assert((processor_avail_count < 2) ||
4257 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
0c530ab8
A
4258
4259 /*
4260 * Scan other cpus for matching active or task CR3.
4261 * For idle cpus (with no active map) we mark them invalid but
4262 * don't signal -- they'll check as they go busy.
4263 * Note: for the kernel pmap we look for 64-bit shared address maps.
4264 */
4265 cpus_to_signal = 0;
4266 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4267 if (!cpu_datap(cpu)->cpu_running)
4268 continue;
2d21ac55
A
4269 if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
4270 (CPU_GET_ACTIVE_CR3(cpu) == pmap_cr3) ||
0c530ab8
A
4271 (pmap->pm_shared) ||
4272 ((pmap == kernel_pmap) &&
4273 (!CPU_CR3_IS_ACTIVE(cpu) ||
4274 cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
4275 if (cpu == my_cpu) {
4276 flush_self = TRUE;
4277 continue;
4278 }
4279 cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
4280 __asm__ volatile("mfence");
4281
4282 if (CPU_CR3_IS_ACTIVE(cpu)) {
4283 cpus_to_signal |= cpu_bit;
4284 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
4285 }
4286 }
4287 }
4288
2d21ac55
A
4289 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
4290 (int) pmap, cpus_to_signal, flush_self, 0, 0);
0c530ab8 4291
2d21ac55 4292 if (cpus_to_signal) {
935ed37a
A
4293 cpu_set cpus_to_respond = cpus_to_signal;
4294
0c530ab8
A
4295 deadline = mach_absolute_time() + LockTimeOut;
4296 /*
4297 * Wait for those other cpus to acknowledge
4298 */
935ed37a
A
4299 while (cpus_to_respond != 0) {
4300 if (mach_absolute_time() > deadline) {
b0d623f7
A
4301 if (mp_recent_debugger_activity())
4302 continue;
593a1d5f
A
4303 if (!panic_active()) {
4304 pmap_tlb_flush_timeout = TRUE;
4305 pmap_cpuset_NMIPI(cpus_to_respond);
4306 }
935ed37a
A
4307 panic("pmap_flush_tlbs() timeout: "
4308 "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
4309 pmap, cpus_to_respond);
4310 }
4311
4312 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4313 if ((cpus_to_respond & cpu_bit) != 0) {
4314 if (!cpu_datap(cpu)->cpu_running ||
4315 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
4316 !CPU_CR3_IS_ACTIVE(cpu)) {
4317 cpus_to_respond &= ~cpu_bit;
4318 }
4319 cpu_pause();
2d21ac55 4320 }
935ed37a
A
4321 if (cpus_to_respond == 0)
4322 break;
0c530ab8 4323 }
0c530ab8 4324 }
0c530ab8 4325 }
0c530ab8
A
4326 /*
4327 * Flush local tlb if required.
4328 * We need this flush even if the pmap being changed
4329 * is the user map... in case we do a copyin/out
4330 * before returning to user mode.
4331 */
4332 if (flush_self)
4333 flush_tlb();
4334
b0d623f7
A
4335 if ((pmap == kernel_pmap) && (flush_self != TRUE)) {
4336 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
4337 }
4338
2d21ac55
A
4339 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
4340 (int) pmap, cpus_to_signal, flush_self, 0, 0);
0c530ab8
A
4341}
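/*
 * Illustrative sketch, not part of this file: the heart of pmap_flush_tlbs
 * is a bounded wait in which each responding cpu is removed from a pending
 * bitmask, and the initiator gives up (and panics) once a deadline passes.
 * The simplified, single-threaded model below substitutes a polling
 * callback and an iteration budget for mach_absolute_time()/LockTimeOut.
 */
/* Returns 0 when every pending cpu has acknowledged, -1 on timeout.
 * has_acked(cpu) stands in for the cpu_running / cpu_tlb_invalid /
 * CPU_CR3_IS_ACTIVE checks made by the real loop. */
static int
wait_for_acks(unsigned long pending, unsigned int ncpus,
	      int (*has_acked)(unsigned int cpu), unsigned long budget)
{
	unsigned int	cpu;
	unsigned long	cpu_bit;

	while (pending != 0) {
		if (budget-- == 0)
			return -1;		/* the real code panics here */
		for (cpu = 0, cpu_bit = 1; cpu < ncpus; cpu++, cpu_bit <<= 1) {
			if ((pending & cpu_bit) && has_acked(cpu))
				pending &= ~cpu_bit;	/* this cpu has responded */
		}
	}
	return 0;
}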
4342
4343void
4344process_pmap_updates(void)
4345{
2d21ac55
A
4346 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4347
0c530ab8
A
4348 flush_tlb();
4349
4350 current_cpu_datap()->cpu_tlb_invalid = FALSE;
4351 __asm__ volatile("mfence");
4352}
4353
4354void
4355pmap_update_interrupt(void)
4356{
2d21ac55
A
4357 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
4358 0, 0, 0, 0, 0);
0c530ab8
A
4359
4360 process_pmap_updates();
4361
2d21ac55
A
4362 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
4363 0, 0, 0, 0, 0);
0c530ab8
A
4364}
4365
4366
4367unsigned int pmap_cache_attributes(ppnum_t pn) {
4368
2d21ac55 4369 if (!managed_page(ppn_to_pai(pn)))
0c530ab8
A
4370 return (VM_WIMG_IO);
4371
4372 return (VM_WIMG_COPYBACK);
4373}
4374
4375#ifdef PMAP_DEBUG
4376void
4377pmap_dump(pmap_t p)
4378{
4379 int i;
4380
4381 kprintf("pmap 0x%x\n",p);
4382
4383 kprintf(" pm_cr3 0x%llx\n",p->pm_cr3);
4384 kprintf(" pm_pml4 0x%x\n",p->pm_pml4);
4385 kprintf(" pm_pdpt 0x%x\n",p->pm_pdpt);
4386
4387 kprintf(" pml4[0] 0x%llx\n",*p->pm_pml4);
4388 for (i=0;i<8;i++)
4389 kprintf(" pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
4390}
4391
4392void pmap_dump_wrap(void)
4393{
4394 pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
4395}
4396
4397void
4398dump_4GB_pdpt(pmap_t p)
4399{
4400 int spl;
4401 pdpt_entry_t *user_pdptp;
4402 pdpt_entry_t *kern_pdptp;
4403 pdpt_entry_t *pml4p;
4404
4405 spl = splhigh();
4406 while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
4407 splx(spl);
4408 pmap_expand_pml4(p, 0x0);
4409 spl = splhigh();
4410 }
4411 kern_pdptp = kernel_pmap->pm_pdpt;
4412 if (kern_pdptp == NULL)
4413 panic("kern_pdptp == NULL");
4414 kprintf("dump_4GB_pdpt(%p)\n"
4415 "kern_pdptp=%p (phys=0x%016llx)\n"
4416 "\t 0x%08x: 0x%016llx\n"
4417 "\t 0x%08x: 0x%016llx\n"
4418 "\t 0x%08x: 0x%016llx\n"
4419 "\t 0x%08x: 0x%016llx\n"
4420 "\t 0x%08x: 0x%016llx\n"
4421 "user_pdptp=%p (phys=0x%016llx)\n"
4422 "\t 0x%08x: 0x%016llx\n"
4423 "\t 0x%08x: 0x%016llx\n"
4424 "\t 0x%08x: 0x%016llx\n"
4425 "\t 0x%08x: 0x%016llx\n"
4426 "\t 0x%08x: 0x%016llx\n",
4427 p, kern_pdptp, kvtophys(kern_pdptp),
4428 kern_pdptp+0, *(kern_pdptp+0),
4429 kern_pdptp+1, *(kern_pdptp+1),
4430 kern_pdptp+2, *(kern_pdptp+2),
4431 kern_pdptp+3, *(kern_pdptp+3),
4432 kern_pdptp+4, *(kern_pdptp+4),
4433 user_pdptp, kvtophys(user_pdptp),
4434 user_pdptp+0, *(user_pdptp+0),
4435 user_pdptp+1, *(user_pdptp+1),
4436 user_pdptp+2, *(user_pdptp+2),
4437 user_pdptp+3, *(user_pdptp+3),
4438 user_pdptp+4, *(user_pdptp+4));
4439 kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4440 p->pm_cr3, p->pm_hold, p->pm_pml4);
4441 pml4p = (pdpt_entry_t *)p->pm_hold;
4442 if (pml4p == NULL)
4443 panic("user pml4p == NULL");
4444 kprintf("\t 0x%08x: 0x%016llx\n"
4445 "\t 0x%08x: 0x%016llx\n",
4446 pml4p+0, *(pml4p),
4447 pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
4448 kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4449 kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
4450 pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
4451 if (pml4p == NULL)
4452 panic("kern pml4p == NULL");
4453 kprintf("\t 0x%08x: 0x%016llx\n"
4454 "\t 0x%08x: 0x%016llx\n",
4455 pml4p+0, *(pml4p),
4456 pml4p+511, *(pml4p+511));
4457 splx(spl);
4458}
4459
4460void dump_4GB_pdpt_thread(thread_t tp)
4461{
4462 dump_4GB_pdpt(tp->map->pmap);
4463}
4464
4465
4466#endif
b0d623f7 4467