1
2/*
3 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
4 *
5 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 *
7 * This file contains Original Code and/or Modifications of Original Code
8 * as defined in and that are subject to the Apple Public Source License
9 * Version 2.0 (the 'License'). You may not use this file except in
10 * compliance with the License. The rights granted to you under the License
11 * may not be used to create, or enable the creation or redistribution of,
12 * unlawful or unlicensed copies of an Apple operating system, or to
13 * circumvent, violate, or enable the circumvention or violation of, any
14 * terms of an Apple operating system software license agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 *
19 * The Original Code and all software distributed under the License are
20 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
21 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
22 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
24 * Please see the License for the specific language governing rights and
25 * limitations under the License.
26 *
27 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 */
29/*
30 * @OSF_COPYRIGHT@
31 */
32/*
33 * Mach Operating System
34 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
35 * All Rights Reserved.
36 *
37 * Permission to use, copy, modify and distribute this software and its
38 * documentation is hereby granted, provided that both the copyright
39 * notice and this permission notice appear in all copies of the
40 * software, derivative works or modified versions, and any portions
41 * thereof, and that both notices appear in supporting documentation.
42 *
43 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
44 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
45 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 *
47 * Carnegie Mellon requests users of this software to return to
48 *
49 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
50 * School of Computer Science
51 * Carnegie Mellon University
52 * Pittsburgh PA 15213-3890
53 *
54 * any improvements or extensions that they make and grant Carnegie Mellon
55 * the rights to redistribute these changes.
56 */
57/*
58 */
59
60/*
61 * File: pmap.c
62 * Author: Avadis Tevanian, Jr., Michael Wayne Young
63 * (These guys wrote the Vax version)
64 *
65 * Physical Map management code for Intel i386, i486, and i860.
66 *
67 * Manages physical address maps.
68 *
69 * In addition to hardware address maps, this
70 * module is called upon to provide software-use-only
71 * maps which may or may not be stored in the same
72 * form as hardware maps. These pseudo-maps are
73 * used to store intermediate results from copy
74 * operations to and from address spaces.
75 *
76 * Since the information managed by this module is
77 * also stored by the logical address mapping module,
78 * this module may throw away valid virtual-to-physical
79 * mappings at almost any time. However, invalidations
80 * of virtual-to-physical mappings must be done as
81 * requested.
82 *
83 * In order to cope with hardware architectures which
84 * make virtual-to-physical map invalidates expensive,
85 * this module may delay invalidate or reduced protection
86 * operations until such time as they are actually
87 * necessary. This module is given full information as
88 * to which processors are currently using which maps,
89 * and to when physical maps must be made correct.
90 */
91
92#include <string.h>
93#include <norma_vm.h>
94#include <mach_kdb.h>
95#include <mach_ldebug.h>
96
97#include <libkern/OSAtomic.h>
98
99#include <mach/machine/vm_types.h>
100
101#include <mach/boolean.h>
102#include <kern/thread.h>
103#include <kern/zalloc.h>
104#include <kern/queue.h>
105
106#include <kern/lock.h>
107#include <kern/kalloc.h>
108#include <kern/spl.h>
109
110#include <vm/pmap.h>
111#include <vm/vm_map.h>
112#include <vm/vm_kern.h>
113#include <mach/vm_param.h>
114#include <mach/vm_prot.h>
115#include <vm/vm_object.h>
116#include <vm/vm_page.h>
117
118#include <mach/machine/vm_param.h>
119#include <machine/thread.h>
120
121#include <kern/misc_protos.h> /* prototyping */
122#include <i386/misc_protos.h>
123#include <x86_64/lowglobals.h>
124
125#include <i386/cpuid.h>
126#include <i386/cpu_data.h>
127#include <i386/cpu_number.h>
128#include <i386/machine_cpu.h>
129#include <i386/seg.h>
130#include <i386/serial_io.h>
131#include <i386/cpu_capabilities.h>
132#include <i386/machine_routines.h>
133#include <i386/proc_reg.h>
134#include <i386/tsc.h>
135#include <i386/pmap_internal.h>
136
137#if MACH_KDB
138#include <ddb/db_command.h>
139#include <ddb/db_output.h>
140#include <ddb/db_sym.h>
141#include <ddb/db_print.h>
142#endif /* MACH_KDB */
143
144#include <vm/vm_protos.h>
145
146#include <i386/mp.h>
147#include <i386/mp_desc.h>
148
149
150/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
151#ifdef DEBUGINTERRUPTS
152#define pmap_intr_assert() { \
153 if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
154 panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
155}
156#else
157#define pmap_intr_assert()
158#endif
159
160#ifdef IWANTTODEBUG
161#undef DEBUG
162#define DEBUG 1
163#define POSTCODE_DELAY 1
164#include <i386/postcode.h>
165#endif /* IWANTTODEBUG */
166
167boolean_t pmap_trace = FALSE;
168
169#if PMAP_DBG
170#define DBG(x...) kprintf("DBG: " x)
171#else
172#define DBG(x...)
173#endif
174
175boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
176
177/*
178 * Forward declarations for internal functions.
179 */
180
181void pmap_remove_range(
182 pmap_t pmap,
183 vm_map_offset_t va,
184 pt_entry_t *spte,
185 pt_entry_t *epte);
186
187void phys_attribute_clear(
188 ppnum_t phys,
189 int bits);
190
191int phys_attribute_test(
192 ppnum_t phys,
193 int bits);
194
195void phys_attribute_set(
196 ppnum_t phys,
197 int bits);
198
199void pmap_set_reference(
200 ppnum_t pn);
201
202boolean_t phys_page_exists(
203 ppnum_t pn);
204
205
206int nx_enabled = 1; /* enable no-execute protection */
207int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
208int allow_stack_exec = 0; /* No apps may execute from the stack by default */
209
210const boolean_t cpu_64bit = TRUE; /* Mais oui! */
211
212/*
213 * when spinning through pmap_remove
214 * ensure that we don't spend too much
215 * time with preemption disabled.
216 * I'm setting the current threshold
217 * to 20us
218 */
219#define MAX_PREEMPTION_LATENCY_NS 20000
220
221uint64_t max_preemption_latency_tsc = 0;
222
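/*
 * Illustrative sketch, not part of the original source and kept out of
 * the build: MAX_PREEMPTION_LATENCY_NS above is in nanoseconds; pmap_init()
 * converts it once to TSC ticks with tmrCvt(..., tscFCvtn2t) and stores
 * the result in max_preemption_latency_tsc.  Long-running teardown loops
 * such as pmap_remove() then bound their lock-hold time like this
 * (example_more_work() is a hypothetical placeholder for the loop test):
 */
#if 0
static void
preemption_bounded_loop_example(pmap_t map)
{
	uint64_t deadline = rdtsc64() + max_preemption_latency_tsc;

	PMAP_LOCK(map);
	while (example_more_work(map)) {	/* hypothetical: work remaining? */
		/* ... process one chunk with the pmap locked ... */
		if (rdtsc64() >= deadline) {
			PMAP_UNLOCK(map);	/* open a preemption window */
			PMAP_LOCK(map);
			deadline = rdtsc64() + max_preemption_latency_tsc;
		}
	}
	PMAP_UNLOCK(map);
}
#endif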
223
224/*
225 * Private data structures.
226 */
227
228/*
229 * For each vm_page_t, there is a list of all currently
230 * valid virtual mappings of that page. An entry is
231 * a pv_rooted_entry_t; the list is the pv_table.
232 *
233 * N.B. with the new combo rooted/hashed scheme it is
234 * only possible to remove individual non-rooted entries
235 * if they are found via the hashed chains as there is no
236 * way to unlink the singly linked hashed entries if navigated to
237 * via the queue list off the rooted entries. Think of it as
238 * hash/walk/pull, keeping track of the prev pointer while walking
239 * the singly linked hash list. All of this is to save memory and
240 * keep both types of pv_entries as small as possible.
241 */
242
243/*
244
245PV HASHING Changes - JK 1/2007
246
247Pve's establish physical to virtual mappings. These are used for aliasing of a
248physical page to (potentially many) virtual addresses within pmaps. In the
249previous implementation the structure of the pv_entries (each 16 bytes in size) was
250
251typedef struct pv_entry {
252 struct pv_entry_t next;
253 pmap_t pmap;
254 vm_map_offset_t va;
255} *pv_entry_t;
256
257An initial array of these is created at boot time, one per physical page of
258memory, indexed by the physical page number. Additionally, a pool of entries
259is created from a pv_zone to be used as needed by pmap_enter() when it is
260creating new mappings. Originally, we kept this pool around because the code
261in pmap_enter() was unable to block if it needed an entry and none were
262available - we'd panic. Some time ago I restructured the pmap_enter() code
263so that for user pmaps it can block while zalloc'ing a pv structure and restart,
264removing a panic from the code (in the case of the kernel pmap we cannot block
265and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
266The pool has not been removed since there is a large performance gain keeping
267freed pv's around for reuse and not suffering the overhead of zalloc for every
268new pv we need.
269
270As pmap_enter() created new mappings it linked the new pve's for them off the
271fixed pv array for that ppn (off the next pointer). These pve's are accessed
272for several operations, one of them being address space teardown. In that case,
273we basically do this
274
275 for (every page/pte in the space) {
276 calc pve_ptr from the ppn in the pte
277 for (every pv in the list for the ppn) {
278 if (this pv is for this pmap/vaddr) {
279 do housekeeping
280 unlink/free the pv
281 }
282 }
283 }
284
285The problem arose when we were running, say 8000 (or even 2000) apache or
286other processes and one or all terminate. The list hanging off each pv array
287entry could have thousands of entries. We were continuously linearly searching
288each of these lists as we stepped through the address space we were tearing
289down. Because of the locks we hold, likely taking a cache miss for each node,
290and interrupt disabling for MP issues the system became completely unresponsive
291for many seconds while we did this.
292
293Realizing that pve's are accessed in two distinct ways (linearly running the
294list by ppn for operations like pmap_page_protect and finding and
295modifying/removing a single pve as part of pmap_enter processing) has led to
296modifying the pve structures and databases.
297
298There are now two types of pve structures. A "rooted" structure which is
299basically the original structure accessed in an array by ppn, and a "hashed"
300structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
301designed with the two goals of minimizing wired memory and making the lookup of
302a ppn faster. Since a vast majority of pages in the system are not aliased
303and hence represented by a single pv entry I've kept the rooted entry size as
304small as possible because there is one of these dedicated for every physical
305page of memory. The hashed pve's are larger due to the addition of the hash
306link and the ppn entry needed for matching while running the hash list to find
307the entry we are looking for. This way, only systems that have lots of
308aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
309structures have the same first three fields allowing some simplification in
310the code.
311
312They have these shapes
313
314typedef struct pv_rooted_entry {
315 queue_head_t qlink;
316 vm_map_offset_t va;
317 pmap_t pmap;
318} *pv_rooted_entry_t;
319
320
321typedef struct pv_hashed_entry {
322 queue_head_t qlink;
323 vm_map_offset_t va;
324 pmap_t pmap;
325 ppnum_t ppn;
326 struct pv_hashed_entry *nexth;
327} *pv_hashed_entry_t;
328
329The main flow difference is that the code is now aware of the rooted entry and
330the hashed entries. Code that runs the pv list still starts with the rooted
331entry and then continues down the qlink onto the hashed entries. Code that is
332looking up a specific pv entry first checks the rooted entry and then hashes
333and runs the hash list for the match. The hash list lengths are much smaller
334than the original pv lists that contained all aliases for the specific ppn.
335
336*/
337
338typedef struct pv_rooted_entry {
339 /* first three entries must match pv_hashed_entry_t */
340 queue_head_t qlink;
341 vm_map_offset_t va; /* virtual address for mapping */
342 pmap_t pmap; /* pmap where mapping lies */
343} *pv_rooted_entry_t;
344
345#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
346
347pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
348
349typedef struct pv_hashed_entry {
350 /* first three entries must match pv_rooted_entry_t */
351 queue_head_t qlink;
352 vm_map_offset_t va;
353 pmap_t pmap;
354 ppnum_t ppn;
355 struct pv_hashed_entry *nexth;
356} *pv_hashed_entry_t;
357
358#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
359
360#define NPVHASH 4095 /* MUST BE 2^N - 1 */
361pv_hashed_entry_t *pv_hash_table; /* hash lists */
362
363uint32_t npvhash = 0;
364
365//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */
366#ifdef PV_DEBUG
367#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
368#else
369#define CHK_NPVHASH()
370#endif
371
372pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
373pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
374decl_simple_lock_data(,pv_hashed_free_list_lock)
375decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
376decl_simple_lock_data(,pv_hash_table_lock)
377
378int pv_hashed_free_count = 0;
379int pv_hashed_kern_free_count = 0;
380#define PV_HASHED_LOW_WATER_MARK 5000
381#define PV_HASHED_KERN_LOW_WATER_MARK 100
382#define PV_HASHED_ALLOC_CHUNK 2000
383#define PV_HASHED_KERN_ALLOC_CHUNK 50
384thread_call_t mapping_adjust_call;
385static thread_call_data_t mapping_adjust_call_data;
386uint32_t mappingrecurse = 0;
387
388#define PV_HASHED_ALLOC(pvh_e) { \
389 simple_lock(&pv_hashed_free_list_lock); \
390 if ((pvh_e = pv_hashed_free_list) != 0) { \
391 pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
392 pv_hashed_free_count--; \
393 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
394 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
395 thread_call_enter(mapping_adjust_call); \
396 } \
397 simple_unlock(&pv_hashed_free_list_lock); \
398}
399
400#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
401 simple_lock(&pv_hashed_free_list_lock); \
402 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
403 pv_hashed_free_list = pvh_eh; \
404 pv_hashed_free_count += pv_cnt; \
405 simple_unlock(&pv_hashed_free_list_lock); \
406}
407
408#define PV_HASHED_KERN_ALLOC(pvh_e) { \
409 simple_lock(&pv_hashed_kern_free_list_lock); \
410 if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
411 pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
412 pv_hashed_kern_free_count--; \
413 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK)\
414 if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
415 thread_call_enter(mapping_adjust_call); \
416 } \
417 simple_unlock(&pv_hashed_kern_free_list_lock); \
418}
419
420#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
421 simple_lock(&pv_hashed_kern_free_list_lock); \
422 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
423 pv_hashed_kern_free_list = pvh_eh; \
424 pv_hashed_kern_free_count += pv_cnt; \
425 simple_unlock(&pv_hashed_kern_free_list_lock); \
426}
427
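/*
 * Illustrative sketch, not part of the original source and kept out of
 * the build: a caller tearing down many mappings chains the recovered
 * pv_hashed_entry_t's onto a private singly linked list (head pvh_eh,
 * tail pvh_et, count pv_cnt) and hands the whole batch back with one
 * PV_HASHED_FREE_LIST() call, so the free-list lock is taken once rather
 * than once per entry.  pmap_remove_range() below is the real instance
 * of this pattern.  example_recover_next_pv() is a hypothetical source
 * of entries used only for this sketch.
 */
#if 0
static void
pv_batch_free_example(void)
{
	pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t pvh_e;
	int		  pv_cnt = 0;

	while ((pvh_e = example_recover_next_pv()) != PV_HASHED_ENTRY_NULL) {
		pvh_e->qlink.next = (queue_entry_t) pvh_eh;	/* push on batch */
		pvh_eh = pvh_e;
		if (pvh_et == PV_HASHED_ENTRY_NULL)
			pvh_et = pvh_e;				/* remember tail */
		pv_cnt++;
	}
	if (pvh_eh != PV_HASHED_ENTRY_NULL)
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
}
#endif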
428zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
429
430static zone_t pdpt_zone;
431
432/*
433 * Each entry in the pv_head_table is locked by a bit in the
434 * pv_lock_table. The lock bits are accessed by the physical
435 * address of the page they lock.
436 */
437
438char *pv_lock_table; /* pointer to array of bits */
439#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
440
441char *pv_hash_lock_table;
442#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
443
444/*
445 * First and last physical addresses that we maintain any information
446 * for. Initialized to zero so that pmap operations done before
447 * pmap_init won't touch any non-existent structures.
448 */
449boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
450
451static struct vm_object kptobj_object_store;
452static struct vm_object kpml4obj_object_store;
453static struct vm_object kpdptobj_object_store;
454
455/*
456 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
457 */
458
459#define pa_index(pa) (i386_btop(pa))
460#define ppn_to_pai(ppn) ((int)ppn)
461
462#define pai_to_pvh(pai) (&pv_head_table[pai])
463#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
464#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
465
466static inline uint32_t
467pvhashidx(pmap_t pmap, vm_offset_t va)
468{
469 return ((uint32_t)(uint64_t)pmap ^
470 ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
471 npvhash;
472}
473#define pvhash(idx) (&pv_hash_table[idx])
474
475#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
476#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
477
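/*
 * Illustrative sketch, not part of the original source and kept out of
 * the build: locating the pv entry for a specific (pmap, vaddr, ppn)
 * triple under the combined rooted/hashed scheme described above -- the
 * rooted entry for the page is checked first, then [pmap, vaddr] is
 * hashed and the short bucket chain is walked.  This mirrors the search
 * half of pmap_pv_remove() below; a real caller holds the pv-head lock
 * for the page (and the hash bucket lock while walking the chain).
 */
#if 0
static pv_hashed_entry_t
pv_lookup_example(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t ppn)
{
	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_hashed_entry_t pvh_e;

	if (pv_h->pmap == pmap && pv_h->va == vaddr)
		return (pv_hashed_entry_t) pv_h;	/* hit in rooted entry */

	for (pvh_e = *pvhash(pvhashidx(pmap, vaddr));
	     pvh_e != PV_HASHED_ENTRY_NULL;
	     pvh_e = pvh_e->nexth) {
		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
			return pvh_e;			/* hit on hash chain */
	}
	return PV_HASHED_ENTRY_NULL;
}
#endif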
478/*
479 * Array of physical page attributes for managed pages.
480 * One byte per physical page.
481 */
482char *pmap_phys_attributes;
483unsigned int last_managed_page = 0;
484#define IS_MANAGED_PAGE(x) \
485 ((unsigned int)(x) <= last_managed_page && \
486 (pmap_phys_attributes[x] & PHYS_MANAGED))
487
488/*
489 * Physical page attributes. Copy bits from PTE definition.
490 */
491#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
492#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
493#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
494
495/*
496 * Amount of virtual memory mapped by one
497 * page-directory entry.
498 */
499#define PDE_MAPPED_SIZE (pdetova(1))
500uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
501
502/*
503 * Locking and TLB invalidation
504 */
505
506/*
507 * Locking Protocols: (changed 2/2007 JK)
508 *
509 * There are two structures in the pmap module that need locking:
510 * the pmaps themselves, and the per-page pv_lists (which are locked
511 * by locking the pv_lock_table entry that corresponds to the pv_head
512 * for the list in question.) Most routines want to lock a pmap and
513 * then do operations in it that require pv_list locking -- however
514 * pmap_remove_all and pmap_copy_on_write operate on a physical page
515 * basis and want to do the locking in the reverse order, i.e. lock
516 * a pv_list and then go through all the pmaps referenced by that list.
517 *
518 * The system wide pmap lock has been removed. Now, paths take a lock
519 * on the pmap before changing its 'shape' and the reverse order lockers
520 * (coming in by phys ppn) take a lock on the corresponding pv and then
521 * retest to be sure nothing changed during the window before they locked
522 * and can then run up/down the pv lists holding the list lock. This also
523 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
524 * previously.
525 */
526
527/*
528 * PV locking
529 */
530
531#define LOCK_PVH(index) { \
532 mp_disable_preemption(); \
533 lock_pvh_pai(index); \
534}
535
536#define UNLOCK_PVH(index) { \
537 unlock_pvh_pai(index); \
538 mp_enable_preemption(); \
539}
540/*
541 * PV hash locking
542 */
543
544#define LOCK_PV_HASH(hash) lock_hash_hash(hash)
545#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
546
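/*
 * Illustrative sketch, not part of the original source and kept out of
 * the build: the "reverse order" path from the locking protocol comment
 * above.  A caller that arrives by physical page takes the pv lock
 * first and must then re-validate what it found, since the mapping may
 * have been torn down while the lock was being acquired.
 * pmap_page_protect() below is the real instance of this pattern.
 */
#if 0
static void
phys_first_locking_example(ppnum_t pn)
{
	int		  pai = ppn_to_pai(pn);
	pv_rooted_entry_t pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);			/* pv lock taken first ... */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * ... so re-test the rooted entry before trusting it, then
		 * walk the qlink list doing the per-mapping work.
		 */
	}
	UNLOCK_PVH(pai);
}
#endif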
547unsigned pmap_memory_region_count;
548unsigned pmap_memory_region_current;
549
550pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
551
552/*
553 * Other useful macros.
554 */
555#define current_pmap() (vm_map_pmap(current_thread()->map))
556
557struct pmap kernel_pmap_store;
558pmap_t kernel_pmap;
559
560pd_entry_t high_shared_pde;
561pd_entry_t commpage64_pde;
562
563struct zone *pmap_zone; /* zone of pmap structures */
564
565int pmap_debug = 0; /* flag for debugging prints */
566
567unsigned int inuse_ptepages_count = 0;
568
569addr64_t kernel64_cr3;
570
571/*
572 * Pmap cache. Cache is threaded through ref_count field of pmap.
573 * Max will eventually be constant -- variable for experimentation.
574 */
575int pmap_cache_max = 32;
576int pmap_alloc_chunk = 8;
577pmap_t pmap_cache_list;
578int pmap_cache_count;
579decl_simple_lock_data(,pmap_cache_lock)
580
581extern char end;
582
583static int nkpt;
584
585pt_entry_t *DMAP1, *DMAP2;
586caddr_t DADDR1;
587caddr_t DADDR2;
588
589/*
590 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
591 * properly deals with the anchor.
592 * must be called with the hash locked, does not unlock it
593 */
594
595static inline void
596pmap_pvh_unlink(pv_hashed_entry_t pvh)
597{
598 pv_hashed_entry_t curh;
599 pv_hashed_entry_t *pprevh;
600 int pvhash_idx;
601
602 CHK_NPVHASH();
603 pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
604
605 pprevh = pvhash(pvhash_idx);
606
607#if PV_DEBUG
608 if (NULL == *pprevh)
609 panic("pvh_unlink null anchor"); /* JK DEBUG */
610#endif
611 curh = *pprevh;
612
613 while (PV_HASHED_ENTRY_NULL != curh) {
614 if (pvh == curh)
615 break;
616 pprevh = &curh->nexth;
617 curh = curh->nexth;
618 }
619 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
620 *pprevh = pvh->nexth;
621 return;
622}
623
624static inline void
625pv_hash_add(pv_hashed_entry_t pvh_e,
626 pv_rooted_entry_t pv_h)
627{
628 pv_hashed_entry_t *hashp;
629 int pvhash_idx;
630
631 CHK_NPVHASH();
632 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
633 LOCK_PV_HASH(pvhash_idx);
634 insque(&pvh_e->qlink, &pv_h->qlink);
635 hashp = pvhash(pvhash_idx);
636#if PV_DEBUG
637 if (NULL==hashp)
638 panic("pv_hash_add(%p) null hash bucket", pvh_e);
639#endif
640 pvh_e->nexth = *hashp;
641 *hashp = pvh_e;
642 UNLOCK_PV_HASH(pvhash_idx);
643}
644
645static inline void
646pv_hash_remove(pv_hashed_entry_t pvh_e)
647{
648 int pvhash_idx;
649
650 CHK_NPVHASH();
651 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
652 LOCK_PV_HASH(pvhash_idx);
653 remque(&pvh_e->qlink);
654 pmap_pvh_unlink(pvh_e);
655 UNLOCK_PV_HASH(pvhash_idx);
656}
657
658/*
659 * Remove pv list entry.
660 * Called with pv_head_table entry locked.
661 * Returns pv entry to be freed (or NULL).
662 */
663static inline pv_hashed_entry_t
664pmap_pv_remove(pmap_t pmap,
665 vm_map_offset_t vaddr,
666 ppnum_t ppn)
667{
668 pv_hashed_entry_t pvh_e;
669 pv_rooted_entry_t pv_h;
670 pv_hashed_entry_t *pprevh;
671 int pvhash_idx;
672 uint32_t pv_cnt;
673
674 pvh_e = PV_HASHED_ENTRY_NULL;
675 pv_h = pai_to_pvh(ppn_to_pai(ppn));
676 if (pv_h->pmap == PMAP_NULL)
677 panic("pmap_pv_remove(%p,%llu,%u): null pv_list!",
678 pmap, vaddr, ppn);
679
680 if (pv_h->va == vaddr && pv_h->pmap == pmap) {
681 /*
682 * Header is the pv_rooted_entry.
683 * We can't free that. If there is a queued
684 * entry after this one we remove that
685 * from the ppn queue, we remove it from the hash chain
686 * and copy it to the rooted entry. Then free it instead.
687 */
688 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
689 if (pv_h != (pv_rooted_entry_t) pvh_e) {
690 /*
691 * Entry queued to root, remove this from hash
692 * and install as new root.
693 */
694 CHK_NPVHASH();
695 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
696 LOCK_PV_HASH(pvhash_idx);
697 remque(&pvh_e->qlink);
698 pprevh = pvhash(pvhash_idx);
699 if (PV_HASHED_ENTRY_NULL == *pprevh) {
700 panic("pmap_pv_remove(%p,%llu,%u): "
701 "empty hash, removing rooted",
702 pmap, vaddr, ppn);
703 }
704 pmap_pvh_unlink(pvh_e);
705 UNLOCK_PV_HASH(pvhash_idx);
706 pv_h->pmap = pvh_e->pmap;
707 pv_h->va = pvh_e->va; /* dispose of pvh_e */
708 } else {
709 /* none queued after rooted */
710 pv_h->pmap = PMAP_NULL;
711 pvh_e = PV_HASHED_ENTRY_NULL;
712 }
713 } else {
714 /*
715 * not removing rooted pv. find it on hash chain, remove from
716 * ppn queue and hash chain and free it
717 */
718 CHK_NPVHASH();
719 pvhash_idx = pvhashidx(pmap, vaddr);
720 LOCK_PV_HASH(pvhash_idx);
721 pprevh = pvhash(pvhash_idx);
722 if (PV_HASHED_ENTRY_NULL == *pprevh) {
723 panic("pmap_pv_remove(%p,%llu,%u): empty hash",
724 pmap, vaddr, ppn);
725 }
726 pvh_e = *pprevh;
727 pmap_pv_hashlist_walks++;
728 pv_cnt = 0;
729 while (PV_HASHED_ENTRY_NULL != pvh_e) {
730 pv_cnt++;
731 if (pvh_e->pmap == pmap &&
732 pvh_e->va == vaddr &&
733 pvh_e->ppn == ppn)
734 break;
735 pprevh = &pvh_e->nexth;
736 pvh_e = pvh_e->nexth;
737 }
738 if (PV_HASHED_ENTRY_NULL == pvh_e)
739 panic("pmap_pv_remove(%p,%llu,%u): pv not on hash",
740 pmap, vaddr, ppn);
741 pmap_pv_hashlist_cnts += pv_cnt;
742 if (pmap_pv_hashlist_max < pv_cnt)
743 pmap_pv_hashlist_max = pv_cnt;
744 *pprevh = pvh_e->nexth;
745 remque(&pvh_e->qlink);
746 UNLOCK_PV_HASH(pvhash_idx);
747 }
748
749 return pvh_e;
750}
751
752/*
753 * for legacy, returns the address of the pde entry.
754 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
755 * then returns the mapped address of the pde entry in that page
756 */
757pd_entry_t *
758pmap_pde(pmap_t m, vm_map_offset_t v)
759{
760 pd_entry_t *pde;
761
762 assert(m);
763#if 0
764 if (m == kernel_pmap)
765 pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
766 else
767#endif
768 pde = pmap64_pde(m, v);
769
770 return pde;
771}
772
773/*
774 * the single pml4 page per pmap is allocated at pmap create time and exists
775 * for the duration of the pmap. we allocate this page in kernel vm.
776 * this returns the address of the requested pml4 entry in the top level page.
777 */
778static inline
779pml4_entry_t *
780pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
781{
782 return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
783}
784
785/*
786 * maps in the pml4 page, if any, containing the pdpt entry requested
787 * and returns the address of the pdpt entry in that mapped page
788 */
789pdpt_entry_t *
790pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
791{
792 pml4_entry_t newpf;
793 pml4_entry_t *pml4;
794
795 assert(pmap);
796 if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
797 (vaddr < 0xFFFF800000000000ULL)) {
798 return (0);
799 }
800
801 pml4 = pmap64_pml4(pmap, vaddr);
802 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
803 newpf = *pml4 & PG_FRAME;
804 return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
805 [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
806 }
807 return (NULL);
808}
809/*
810 * maps in the pdpt page, if any, containing the pde entry requested
811 * and returns the address of the pde entry in that mapped page
812 */
813pd_entry_t *
814pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
815{
816 pdpt_entry_t newpf;
817 pdpt_entry_t *pdpt;
818
819 assert(pmap);
820 if ((vaddr > 0x00007FFFFFFFFFFFULL) &&
821 (vaddr < 0xFFFF800000000000ULL)) {
822 return (0);
823 }
824
825 pdpt = pmap64_pdpt(pmap, vaddr);
826
827 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
828 newpf = *pdpt & PG_FRAME;
829 return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
830 [(vaddr >> PDSHIFT) & (NPDPG-1)];
831 }
832 return (NULL);
833}
834
835/*
836 * return address of mapped pte for vaddr va in pmap pmap.
837 *
838 * physically maps the pde page, if any, containing the pte in and returns
839 * the address of the pte in that mapped page
840 *
841 * In case the pde maps a superpage, return the pde, which, in this case
842 * is the actual page table entry.
843 */
844pt_entry_t *
845pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
846{
847 pd_entry_t *pde;
848 pd_entry_t newpf;
849
850 assert(pmap);
851 pde = pmap_pde(pmap, vaddr);
852
853 if (pde && ((*pde & INTEL_PTE_VALID))) {
854 if (*pde & INTEL_PTE_PS)
855 return pde;
856 newpf = *pde & PG_FRAME;
857 return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
858 [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
859 }
860 return (NULL);
861}
862
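/*
 * Illustrative sketch, not part of the original source and kept out of
 * the build: a complete virtual-to-physical translation built from the
 * walkers above.  pmap_pte() already descends PML4 -> PDPT -> PD -> PT
 * through the physical map window (PHYSMAP_PTOV); the superpage case is
 * skipped here for brevity (pmap_pte() then returns the PDE itself and
 * the low bits of the offset would have to come from the 2MB page).
 */
#if 0
static pmap_paddr_t
pmap_va_to_pa_example(pmap_t pmap, vm_map_offset_t vaddr)
{
	pt_entry_t *ptep = pmap_pte(pmap, vaddr);

	if (ptep == PT_ENTRY_NULL || !(*ptep & INTEL_PTE_VALID))
		return 0;		/* not mapped */
	if (*ptep & INTEL_PTE_PS)
		return 0;		/* superpage: not handled in this sketch */
	return pte_to_pa(*ptep) | (vaddr & PAGE_MASK);
}
#endif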
863/*
864 * Map memory at initialization. The physical addresses being
865 * mapped are not managed and are never unmapped.
866 *
867 * For now, VM is already on, we only need to map the
868 * specified memory.
869 */
870vm_offset_t
871pmap_map(
872 vm_offset_t virt,
873 vm_map_offset_t start_addr,
874 vm_map_offset_t end_addr,
875 vm_prot_t prot,
876 unsigned int flags)
877{
878 int ps;
879
880 ps = PAGE_SIZE;
881 while (start_addr < end_addr) {
882 pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
883 (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
884 virt += ps;
885 start_addr += ps;
886 }
887 return(virt);
888}
889
890/*
891 * Back-door routine for mapping kernel VM at initialization.
892 * Useful for mapping memory outside the range
893 * Sets no-cache, A, D.
894 * Otherwise like pmap_map.
895 */
896vm_offset_t
897pmap_map_bd(
898 vm_offset_t virt,
899 vm_map_offset_t start_addr,
900 vm_map_offset_t end_addr,
901 vm_prot_t prot,
902 unsigned int flags)
903{
904 pt_entry_t template;
905 pt_entry_t *pte;
906 spl_t spl;
907
908 template = pa_to_pte(start_addr)
909 | INTEL_PTE_REF
910 | INTEL_PTE_MOD
911 | INTEL_PTE_WIRED
912 | INTEL_PTE_VALID;
913
914 if (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
915 template |= INTEL_PTE_NCACHE;
916 if (!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
917 template |= INTEL_PTE_PTA;
918 }
919 if (prot & VM_PROT_WRITE)
920 template |= INTEL_PTE_WRITE;
921
922
923 while (start_addr < end_addr) {
924 spl = splhigh();
925 pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
926 if (pte == PT_ENTRY_NULL) {
927 panic("pmap_map_bd: Invalid kernel address\n");
928 }
929 pmap_store_pte(pte, template);
930 splx(spl);
931 pte_increment_pa(template);
932 virt += PAGE_SIZE;
933 start_addr += PAGE_SIZE;
934 }
935
936
937 flush_tlb();
938 return(virt);
939}
940
941extern char *first_avail;
942extern vm_offset_t virtual_avail, virtual_end;
943extern pmap_paddr_t avail_start, avail_end;
944extern vm_offset_t sHIB;
945extern vm_offset_t eHIB;
946extern vm_offset_t stext;
947extern vm_offset_t etext;
948extern vm_offset_t sdata;
949
950void
951pmap_cpu_init(void)
952{
953 /*
954 * Called early in the life of a processor (from cpu_mode_init()).
955 * Ensure global page feature is disabled.
956 */
957 set_cr4(get_cr4() &~ CR4_PGE);
958
959 /*
960 * Initialize the per-cpu, TLB-related fields.
961 */
962 current_cpu_datap()->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
963 current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
964 current_cpu_datap()->cpu_tlb_invalid = FALSE;
965}
966
967
968
969/*
970 * Bootstrap the system enough to run with virtual memory.
971 * Map the kernel's code and data, and allocate the system page table.
972 * Called with mapping OFF. Page_size must already be set.
973 */
974
975void
976pmap_bootstrap(
977 __unused vm_offset_t load_start,
978 __unused boolean_t IA32e)
979{
980#if NCOPY_WINDOWS > 0
981 vm_offset_t va;
982 int i;
983#endif
984
985 assert(IA32e);
986
987 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
988 * known to VM */
989 /*
990 * The kernel's pmap is statically allocated so we don't
991 * have to use pmap_create, which is unlikely to work
992 * correctly at this part of the boot sequence.
993 */
994
995 kernel_pmap = &kernel_pmap_store;
996 kernel_pmap->ref_count = 1;
997 kernel_pmap->nx_enabled = FALSE;
998 kernel_pmap->pm_task_map = TASK_MAP_64BIT;
999 kernel_pmap->pm_obj = (vm_object_t) NULL;
1000 kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD);
1001 kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT);
1002 kernel_pmap->pm_pml4 = IdlePML4;
1003 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
1004
1005
1006 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1007
1008 nkpt = NKPT;
1009 OSAddAtomic(NKPT, &inuse_ptepages_count);
1010
1011 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
1012 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1013
1014#if NCOPY_WINDOWS > 0
1015 /*
1016 * Reserve some special page table entries/VA space for temporary
1017 * mapping of pages.
1018 */
1019#define SYSMAP(c, p, v, n) \
1020 v = (c)va; va += ((n)*INTEL_PGBYTES);
1021
1022 va = virtual_avail;
1023
1024 for (i=0; i<PMAP_NWINDOWS; i++) {
1025#if 1
1026 kprintf("trying to do SYSMAP idx %d %p\n", i,
1027 current_cpu_datap());
1028 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
1029 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
1030 kprintf("two stuff %p %p\n",
1031 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1032 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
1033#endif
1034 SYSMAP(caddr_t,
1035 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1036 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1037 1);
1038 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
1039 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
1040 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1041 }
1042
1043 /* DMAP used by the debugger */
1044 SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1045 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */
1046
1047 virtual_avail = va;
1048#endif
1049
1050 if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
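		/*
		 * npvhash must have the form 2^N - 1: (npvhash + 1) & npvhash
		 * is zero exactly when npvhash + 1 is a power of two.
		 */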
1051 if (0 != ((npvhash + 1) & npvhash)) {
1052 kprintf("invalid hash %d, must be ((2^N)-1), "
1053 "using default %d\n", npvhash, NPVHASH);
1054 npvhash = NPVHASH;
1055 }
1056 } else {
1057 npvhash = NPVHASH;
1058 }
1059
1060 printf("npvhash=%d\n", npvhash);
1061
1062 simple_lock_init(&kernel_pmap->lock, 0);
1063 simple_lock_init(&pv_hashed_free_list_lock, 0);
1064 simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1065 simple_lock_init(&pv_hash_table_lock,0);
1066
1067 pmap_cpu_init();
1068
1069 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
1070 (long)KERNEL_BASE, (long)virtual_end);
1071 kprintf("Available physical space from 0x%llx to 0x%llx\n",
1072 avail_start, avail_end);
1073
1074 /*
1075 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
1076 * in the DEBUG kernel) to force the kernel to switch to its own map
1077 * (and cr3) when control is in kernelspace. The kernel's map does not
1078 * include (i.e. share) userspace so wild references will cause
1079 * a panic. Only copyin and copyout are exempt from this.
1080 */
1081 (void) PE_parse_boot_argn("-no_shared_cr3",
1082 &no_shared_cr3, sizeof (no_shared_cr3));
1083 if (no_shared_cr3)
1084 kprintf("Kernel not sharing user map\n");
1085
1086#ifdef PMAP_TRACES
1087 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
1088 kprintf("Kernel traces for pmap operations enabled\n");
1089 }
1090#endif /* PMAP_TRACES */
1091}
1092
1093void
1094pmap_virtual_space(
1095 vm_offset_t *startp,
1096 vm_offset_t *endp)
1097{
1098 *startp = virtual_avail;
1099 *endp = virtual_end;
1100}
1101
1102/*
1103 * Initialize the pmap module.
1104 * Called by vm_init, to initialize any structures that the pmap
1105 * system needs to map virtual memory.
1106 */
1107void
1108pmap_init(void)
1109{
1110 long npages;
1111 vm_offset_t addr;
1112 vm_size_t s;
1113 vm_map_offset_t vaddr;
1114 ppnum_t ppn;
1115
1116
1117 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
1118 _vm_object_allocate((vm_object_size_t)NPML4PGS, &kpml4obj_object_store);
1119
1120 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
1121 _vm_object_allocate((vm_object_size_t)NPDPTPGS, &kpdptobj_object_store);
1122
1123 kernel_pmap->pm_obj = &kptobj_object_store;
1124 _vm_object_allocate((vm_object_size_t)NPDEPGS, &kptobj_object_store);
1125
1126 /*
1127 * Allocate memory for the pv_head_table and its lock bits,
1128 * the modify bit array, and the pte_page table.
1129 */
1130
1131 /*
1132 * Bias all these arrays from zero rather than from avail_start
1133 * so that they cover all of memory.
1134 */
1135
1136 npages = i386_btop(avail_end);
1137 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1138 + (sizeof (pv_hashed_entry_t) * (npvhash+1))
1139 + pv_lock_table_size(npages)
1140 + pv_hash_lock_table_size((npvhash+1))
1141 + npages);
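	/*
	 * In order: pv_head_table, pv_hash_table, pv_lock_table,
	 * pv_hash_lock_table, and one pmap_phys_attributes byte per page;
	 * the block is carved up in that same order below.
	 */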
1142
1143 s = round_page(s);
1144 if (kernel_memory_allocate(kernel_map, &addr, s, 0,
1145 KMA_KOBJECT | KMA_PERMANENT)
1146 != KERN_SUCCESS)
1147 panic("pmap_init");
1148
1149 memset((char *)addr, 0, s);
1150
1151#if PV_DEBUG
1152 if (0 == npvhash) panic("npvhash not initialized");
1153#endif
1154
1155 /*
1156 * Allocate the structures first to preserve word-alignment.
1157 */
1158 pv_head_table = (pv_rooted_entry_t) addr;
1159 addr = (vm_offset_t) (pv_head_table + npages);
1160
1161 pv_hash_table = (pv_hashed_entry_t *)addr;
1162 addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1163
1164 pv_lock_table = (char *) addr;
1165 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1166
1167 pv_hash_lock_table = (char *) addr;
1168 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1169
1170 pmap_phys_attributes = (char *) addr;
1171
1172 ppnum_t last_pn = i386_btop(avail_end);
1173 unsigned int i;
1174 pmap_memory_region_t *pmptr = pmap_memory_regions;
1175 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1176 if (pmptr->type != kEfiConventionalMemory)
1177 continue;
1178 unsigned int pn;
1179 for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1180 if (pn < last_pn) {
1181 pmap_phys_attributes[pn] |= PHYS_MANAGED;
1182 if (pn > last_managed_page)
1183 last_managed_page = pn;
1184 }
1185 }
1186 }
1187
1188 /*
1189 * Create the zone of physical maps,
1190 * and of the physical-to-virtual entries.
1191 */
1192 s = (vm_size_t) sizeof(struct pmap);
1193 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
1194 s = (vm_size_t) sizeof(struct pv_hashed_entry);
1195 pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
1196 s = 63;
1197 pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
1198
1199
1200 /* create pv entries for kernel pages mapped by low level
1201 startup code. these have to exist so we can pmap_remove()
1202 e.g. kext pages from the middle of our addr space */
1203
1204 vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
1205 for (ppn = 0; ppn < i386_btop(avail_start); ppn++) {
1206 pv_rooted_entry_t pv_e;
1207
1208 pv_e = pai_to_pvh(ppn);
1209 pv_e->va = vaddr;
1210 vaddr += PAGE_SIZE;
1211 pv_e->pmap = kernel_pmap;
1212 queue_init(&pv_e->qlink);
1213 }
1214 pmap_initialized = TRUE;
1215
1216 /*
1217 * Initialize pmap cache.
1218 */
1219 pmap_cache_list = PMAP_NULL;
1220 pmap_cache_count = 0;
1221 simple_lock_init(&pmap_cache_lock, 0);
1222
1223 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1224
1225 /*
1226 * Ensure the kernel's PML4 entry exists for the basement
1227 * before this is shared with any user.
1228 */
1229 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT);
1230}
1231
1232
1233/*
1234 * this function is only used for debugging from the vm layer
1235 */
1236boolean_t
1237pmap_verify_free(
1238 ppnum_t pn)
1239{
1240 pv_rooted_entry_t pv_h;
1241 int pai;
1242 boolean_t result;
1243
1244 assert(pn != vm_page_fictitious_addr);
1245
1246 if (!pmap_initialized)
1247 return(TRUE);
1248
1249 if (pn == vm_page_guard_addr)
1250 return TRUE;
1251
1252 pai = ppn_to_pai(pn);
1253 if (!IS_MANAGED_PAGE(pai))
1254 return(FALSE);
1255 pv_h = pai_to_pvh(pn);
1256 result = (pv_h->pmap == PMAP_NULL);
1257 return(result);
1258}
1259
1260boolean_t
1261pmap_is_empty(
1262 pmap_t pmap,
1263 vm_map_offset_t va_start,
1264 vm_map_offset_t va_end)
1265{
1266 vm_map_offset_t offset;
1267 ppnum_t phys_page;
1268
1269 if (pmap == PMAP_NULL) {
1270 return TRUE;
1271 }
1272
1273 /*
1274 * Check the resident page count
1275 * - if it's zero, the pmap is completely empty.
1276 * This short-circuit test prevents a virtual address scan which is
1277 * painfully slow for 64-bit spaces.
1278 * This assumes the count is correct;
1279 * the debug kernel ought to check it, perhaps with a page table walk.
1280 */
1281 if (pmap->stats.resident_count == 0)
1282 return TRUE;
1283
1284 for (offset = va_start;
1285 offset < va_end;
1286 offset += PAGE_SIZE_64) {
1287 phys_page = pmap_find_phys(pmap, offset);
1288 if (phys_page) {
1289 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1290 "page %d at 0x%llx\n",
1291 pmap, va_start, va_end, phys_page, offset);
1292 return FALSE;
1293 }
1294 }
1295
1296 return TRUE;
1297}
1298
1299
1300/*
1301 * Create and return a physical map.
1302 *
1303 * If the size specified for the map
1304 * is zero, the map is an actual physical
1305 * map, and may be referenced by the
1306 * hardware.
1307 *
1308 * If the size specified is non-zero,
1309 * the map will be used in software only, and
1310 * is bounded by that size.
1311 */
1312pmap_t
1313pmap_create(
1314 vm_map_size_t sz,
1315 boolean_t is_64bit)
1316{
1317 pmap_t p;
1318 vm_size_t size;
1319 pml4_entry_t *pml4;
1320 pml4_entry_t *kpml4;
1321
1322 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1323 (uint32_t) (sz>>32), (uint32_t) sz, is_64bit, 0, 0);
1324
1325 size = (vm_size_t) sz;
1326
1327 /*
1328 * A software use-only map doesn't even need a map.
1329 */
1330
1331 if (size != 0) {
1332 return(PMAP_NULL);
1333 }
1334
1335 p = (pmap_t) zalloc(pmap_zone);
1336 if (PMAP_NULL == p)
1337 panic("pmap_create zalloc");
1338
1339 /* init counts now since we'll be bumping some */
1340 simple_lock_init(&p->lock, 0);
1341 p->stats.resident_count = 0;
1342 p->stats.resident_max = 0;
1343 p->stats.wired_count = 0;
1344 p->ref_count = 1;
1345 p->nx_enabled = 1;
1346 p->pm_shared = FALSE;
1347
1348 p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;
1349
1350 /* alloc the pml4 page in kernel vm */
1351 if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_pml4), PAGE_SIZE))
1352 panic("pmap_create kmem_alloc_kobject pml4");
1353
1354 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1355 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1356
1357 OSAddAtomic(1, &inuse_ptepages_count);
1358
1359 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1360
1361 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS));
1362 if (NULL == p->pm_obj_pml4)
1363 panic("pmap_create pml4 obj");
1364
1365 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS));
1366 if (NULL == p->pm_obj_pdpt)
1367 panic("pmap_create pdpt obj");
1368
1369 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS));
1370 if (NULL == p->pm_obj)
1371 panic("pmap_create pte obj");
1372
1373 /* All pmaps share the kernel's pml4 */
1374 pml4 = pmap64_pml4(p, 0ULL);
1375 kpml4 = kernel_pmap->pm_pml4;
1376 pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX];
1377 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1378 pml4[KERNEL_PHYSMAP_INDEX] = kpml4[KERNEL_PHYSMAP_INDEX];
1379
1380 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1381 p, is_64bit, 0, 0, 0);
1382
1383 return(p);
1384}
1385
1386/*
1387 * Retire the given physical map from service.
1388 * Should only be called if the map contains
1389 * no valid mappings.
1390 */
1391
1392void
1393pmap_destroy(
1394 register pmap_t p)
1395{
1396 register int c;
1397
1398 if (p == PMAP_NULL)
1399 return;
1400
1401 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1402 p, 0, 0, 0, 0);
1403
1404 PMAP_LOCK(p);
1405
1406 c = --p->ref_count;
1407
1408 if (c == 0) {
1409 /*
1410 * If some cpu is not using the physical pmap pointer that it
1411 * is supposed to be (see set_dirbase), we might be using the
1412 * pmap that is being destroyed! Make sure we are
1413 * physically on the right pmap:
1414 */
1415 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1416 }
1417
1418 PMAP_UNLOCK(p);
1419
1420 if (c != 0) {
1421 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1422 p, 1, 0, 0, 0);
1423 return; /* still in use */
1424 }
1425
1426 /*
1427 * Free the memory maps, then the
1428 * pmap structure.
1429 */
1430 int inuse_ptepages = 0;
1431
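	/* count the pml4 page itself; it was added to inuse_ptepages_count in pmap_create() */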
1432 inuse_ptepages++;
1433 kmem_free(kernel_map, (vm_offset_t)p->pm_pml4, PAGE_SIZE);
1434
1435 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1436 vm_object_deallocate(p->pm_obj_pml4);
1437
1438 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1439 vm_object_deallocate(p->pm_obj_pdpt);
1440
1441 inuse_ptepages += p->pm_obj->resident_page_count;
1442 vm_object_deallocate(p->pm_obj);
1443
1444 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1445
1446 zfree(pmap_zone, p);
1447
1448 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1449 0, 0, 0, 0, 0);
1450}
1451
1452/*
1453 * Add a reference to the specified pmap.
1454 */
1455
1456void
1457pmap_reference(pmap_t p)
1458{
1459 if (p != PMAP_NULL) {
1460 PMAP_LOCK(p);
1461 p->ref_count++;
1462 PMAP_UNLOCK(p);
1463 }
1464}
1465
1466/*
1467 * Remove a range of hardware page-table entries.
1468 * The entries given are the first (inclusive)
1469 * and last (exclusive) entries for the VM pages.
1470 * The virtual address is the va for the first pte.
1471 *
1472 * The pmap must be locked.
1473 * If the pmap is not the kernel pmap, the range must lie
1474 * entirely within one pte-page. This is NOT checked.
1475 * Assumes that the pte-page exists.
1476 */
1477
1478void
1479pmap_remove_range(
1480 pmap_t pmap,
1481 vm_map_offset_t start_vaddr,
1482 pt_entry_t *spte,
1483 pt_entry_t *epte)
1484{
1485 pt_entry_t *cpte;
1486 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1487 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1488 pv_hashed_entry_t pvh_e;
1489 int pvh_cnt = 0;
1490 int num_removed, num_unwired, num_found;
1491 int pai;
1492 pmap_paddr_t pa;
1493 vm_map_offset_t vaddr;
1494
1495 num_removed = 0;
1496 num_unwired = 0;
1497 num_found = 0;
1498
1499 /* invalidate the PTEs first to "freeze" them */
1500 for (cpte = spte, vaddr = start_vaddr;
1501 cpte < epte;
1502 cpte++, vaddr += PAGE_SIZE_64) {
1503
1504 pa = pte_to_pa(*cpte);
1505 if (pa == 0)
1506 continue;
1507 num_found++;
1508
1509 if (iswired(*cpte))
1510 num_unwired++;
1511
1512 pai = pa_index(pa);
1513
1514 if (!IS_MANAGED_PAGE(pai)) {
1515 /*
1516 * Outside range of managed physical memory.
1517 * Just remove the mappings.
1518 */
1519 pmap_store_pte(cpte, 0);
1520 continue;
1521 }
1522
1523 /* invalidate the PTE */
1524 pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1525 }
1526
1527 if (num_found == 0) {
1528 /* nothing was changed: we're done */
1529 goto update_counts;
1530 }
1531
1532 /* propagate the invalidates to other CPUs */
1533
1534 PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1535
1536 for (cpte = spte, vaddr = start_vaddr;
1537 cpte < epte;
1538 cpte++, vaddr += PAGE_SIZE_64) {
1539
1540 pa = pte_to_pa(*cpte);
1541 if (pa == 0)
1542 continue;
1543
1544 pai = pa_index(pa);
1545
1546 LOCK_PVH(pai);
1547
1548 pa = pte_to_pa(*cpte);
1549 if (pa == 0) {
1550 UNLOCK_PVH(pai);
1551 continue;
1552 }
1553 num_removed++;
1554
1555 /*
1556 * Get the modify and reference bits, then
1557 * nuke the entry in the page table
1558 */
1559 /* remember reference and change */
1560 pmap_phys_attributes[pai] |=
1561 (char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
1562 /* completely invalidate the PTE */
1563 pmap_store_pte(cpte, 0);
1564
1565 /*
1566 * Remove the mapping from the pvlist for this physical page.
1567 */
1568 pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai);
1569
1570 UNLOCK_PVH(pai);
1571
1572 if (pvh_e != PV_HASHED_ENTRY_NULL) {
1573 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1574 pvh_eh = pvh_e;
1575
1576 if (pvh_et == PV_HASHED_ENTRY_NULL) {
1577 pvh_et = pvh_e;
1578 }
1579 pvh_cnt++;
1580 }
1581 } /* for loop */
1582
1583 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1584 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1585 }
1586update_counts:
1587 /*
1588 * Update the counts
1589 */
1590#if TESTING
1591 if (pmap->stats.resident_count < num_removed)
1592 panic("pmap_remove_range: resident_count");
1593#endif
1594 assert(pmap->stats.resident_count >= num_removed);
1595 OSAddAtomic(-num_removed, &pmap->stats.resident_count);
1596
1597#if TESTING
1598 if (pmap->stats.wired_count < num_unwired)
1599 panic("pmap_remove_range: wired_count");
1600#endif
1601 assert(pmap->stats.wired_count >= num_unwired);
1602 OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
1603
1604 return;
1605}
1606
1607/*
1608 * Remove phys addr if mapped in specified map
1609 *
1610 */
1611void
1612pmap_remove_some_phys(
1613 __unused pmap_t map,
1614 __unused ppnum_t pn)
1615{
1616
1617/* Implement to support working set code */
1618
1619}
1620
1621/*
1622 * Remove the given range of addresses
1623 * from the specified map.
1624 *
1625 * It is assumed that the start and end are properly
1626 * rounded to the hardware page size.
1627 */
1628void
1629pmap_remove(
1630 pmap_t map,
1631 addr64_t s64,
1632 addr64_t e64)
1633{
1634 pt_entry_t *pde;
1635 pt_entry_t *spte, *epte;
1636 addr64_t l64;
1637 uint64_t deadline;
1638
1639 pmap_intr_assert();
1640
1641 if (map == PMAP_NULL || s64 == e64)
1642 return;
1643
1644 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
1645 map,
1646 (uint32_t) (s64 >> 32), s64,
1647 (uint32_t) (e64 >> 32), e64);
1648
1649
1650 PMAP_LOCK(map);
1651
1652#if 0
1653 /*
1654 * Check that address range in the kernel does not overlap the stacks.
1655 * We initialize local static min/max variables once to avoid making
1656 * 2 function calls for every remove. Note also that these functions
1657 * both return 0 before kernel stacks have been initialized, and hence
1658 * the panic is not triggered in this case.
1659 */
1660 if (map == kernel_pmap) {
1661 static vm_offset_t kernel_stack_min = 0;
1662 static vm_offset_t kernel_stack_max = 0;
1663
1664 if (kernel_stack_min == 0) {
1665 kernel_stack_min = min_valid_stack_address();
1666 kernel_stack_max = max_valid_stack_address();
1667 }
1668 if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
1669 (kernel_stack_min < e64 && e64 <= kernel_stack_max))
1670 panic("pmap_remove() attempted in kernel stack");
1671 }
1672#else
1673
1674 /*
1675 * The values of kernel_stack_min and kernel_stack_max are no longer
1676 * relevant now that we allocate kernel stacks in the kernel map,
1677 * so the old code above no longer applies. If we wanted to check that
1678 * we weren't removing a mapping of a page in a kernel stack we'd
1679 * mark the PTE with an unused bit and check that here.
1680 */
1681
1682#endif
1683
1684 deadline = rdtsc64() + max_preemption_latency_tsc;
1685
1686 while (s64 < e64) {
1687 l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1688 if (l64 > e64)
1689 l64 = e64;
1690 pde = pmap_pde(map, s64);
1691
1692 if (pde && (*pde & INTEL_PTE_VALID)) {
1693 if (*pde & INTEL_PTE_PS) {
1694 /*
1695 * If we're removing a superpage, pmap_remove_range()
1696 * must work on level 2 instead of level 1; and we're
1697 * only passing a single level 2 entry instead of a
1698 * level 1 range.
1699 */
1700 spte = pde;
1701 epte = spte+1; /* excluded */
1702 } else {
1703 spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
1704 spte = &spte[ptenum(s64)];
1705 epte = &spte[intel_btop(l64 - s64)];
1706 }
1707 pmap_remove_range(map, s64, spte, epte);
1708 }
1709 s64 = l64;
1710 pde++;
1711
1712 if (s64 < e64 && rdtsc64() >= deadline) {
1713 PMAP_UNLOCK(map)
1714 PMAP_LOCK(map)
1715 deadline = rdtsc64() + max_preemption_latency_tsc;
1716 }
1717 }
1718
1719 PMAP_UNLOCK(map);
1720
1721 PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
1722 map, 0, 0, 0, 0);
1723
1724}
1725
1726/*
1727 * Routine: pmap_page_protect
1728 *
1729 * Function:
1730 * Lower the permission for all mappings to a given
1731 * page.
1732 */
1733void
1734pmap_page_protect(
1735 ppnum_t pn,
1736 vm_prot_t prot)
1737{
1738 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1739 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1740 pv_hashed_entry_t nexth;
1741 int pvh_cnt = 0;
1742 pv_rooted_entry_t pv_h;
1743 pv_rooted_entry_t pv_e;
1744 pv_hashed_entry_t pvh_e;
1745 pt_entry_t *pte;
1746 int pai;
1747 pmap_t pmap;
1748 boolean_t remove;
1749
1750 pmap_intr_assert();
1751 assert(pn != vm_page_fictitious_addr);
1752 if (pn == vm_page_guard_addr)
1753 return;
1754
1755 pai = ppn_to_pai(pn);
1756
1757 if (!IS_MANAGED_PAGE(pai)) {
1758 /*
1759 * Not a managed page.
1760 */
1761 return;
1762 }
1763 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
1764 pn, prot, 0, 0, 0);
1765
1766 /*
1767 * Determine the new protection.
1768 */
1769 switch (prot) {
1770 case VM_PROT_READ:
1771 case VM_PROT_READ | VM_PROT_EXECUTE:
1772 remove = FALSE;
1773 break;
1774 case VM_PROT_ALL:
1775 return; /* nothing to do */
1776 default:
1777 remove = TRUE;
1778 break;
1779 }
1780
1781 pv_h = pai_to_pvh(pai);
1782
1783 LOCK_PVH(pai);
1784
1785
1786 /*
1787 * Walk down PV list, if any, changing or removing all mappings.
1788 */
1789 if (pv_h->pmap == PMAP_NULL)
1790 goto done;
1791
1792 pv_e = pv_h;
1793 pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */
1794
1795 do {
1796 vm_map_offset_t vaddr;
1797
1798 pmap = pv_e->pmap;
1799 vaddr = pv_e->va;
1800 pte = pmap_pte(pmap, vaddr);
1801 if (0 == pte) {
1802 panic("pmap_page_protect() "
1803 "pmap=%p pn=0x%x vaddr=0x%llx\n",
1804 pmap, pn, vaddr);
1805 }
1806 nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1807
1808 /*
1809 * Remove the mapping if new protection is NONE
1810 * or if write-protecting a kernel mapping.
1811 */
1812 if (remove || pmap == kernel_pmap) {
1813 /*
1814 * Remove the mapping, collecting dirty bits.
1815 */
1816 pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID);
1817 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1818 pmap_phys_attributes[pai] |=
1819 *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1820 pmap_store_pte(pte, 0);
1821
1822#if TESTING
1823 if (pmap->stats.resident_count < 1)
1824 panic("pmap_page_protect: resident_count");
1825#endif
1826 assert(pmap->stats.resident_count >= 1);
1827 OSAddAtomic(-1, &pmap->stats.resident_count);
1828
1829 /*
1830 * Deal with the pv_rooted_entry.
1831 */
1832
1833 if (pv_e == pv_h) {
1834 /*
1835 * Fix up head later.
1836 */
1837 pv_h->pmap = PMAP_NULL;
1838 } else {
1839 /*
1840 * Delete this entry.
1841 */
1842 pv_hash_remove(pvh_e);
1843 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1844 pvh_eh = pvh_e;
1845
1846 if (pvh_et == PV_HASHED_ENTRY_NULL)
1847 pvh_et = pvh_e;
1848 pvh_cnt++;
1849 }
1850 } else {
1851 /*
1852 * Write-protect.
1853 */
1854 pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE);
1855 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1856 }
1857 pvh_e = nexth;
1858 } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1859
1860
1861 /*
1862 * If pv_head mapping was removed, fix it up.
1863 */
1864 if (pv_h->pmap == PMAP_NULL) {
1865 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1866
1867 if (pvh_e != (pv_hashed_entry_t) pv_h) {
1868 pv_hash_remove(pvh_e);
1869 pv_h->pmap = pvh_e->pmap;
1870 pv_h->va = pvh_e->va;
1871 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1872 pvh_eh = pvh_e;
1873
1874 if (pvh_et == PV_HASHED_ENTRY_NULL)
1875 pvh_et = pvh_e;
1876 pvh_cnt++;
1877 }
1878 }
1879 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1880 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1881 }
1882done:
1883 UNLOCK_PVH(pai);
1884
1885 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
1886 0, 0, 0, 0, 0);
1887}
1888
1889
1890/*
1891 * Routine:
1892 * pmap_disconnect
1893 *
1894 * Function:
1895 * Disconnect all mappings for this page and return reference and change status
1896 * in generic format.
1897 *
1898 */
1899unsigned int pmap_disconnect(
1900 ppnum_t pa)
1901{
1902 pmap_page_protect(pa, 0); /* disconnect the page */
1903 return (pmap_get_refmod(pa)); /* return ref/chg status */
1904}
1905
1906/*
1907 * Set the physical protection on the
1908 * specified range of this map as requested.
1909 * Will not increase permissions.
1910 */
1911void
1912pmap_protect(
1913 pmap_t map,
1914 vm_map_offset_t sva,
1915 vm_map_offset_t eva,
1916 vm_prot_t prot)
1917{
1918 pt_entry_t *pde;
1919 pt_entry_t *spte, *epte;
1920 vm_map_offset_t lva;
1921 vm_map_offset_t orig_sva;
1922 boolean_t set_NX;
1923 int num_found = 0;
1924
1925 pmap_intr_assert();
1926
1927 if (map == PMAP_NULL)
1928 return;
1929
1930 if (prot == VM_PROT_NONE) {
1931 pmap_remove(map, sva, eva);
1932 return;
1933 }
1934 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1935 map,
1936 (uint32_t) (sva >> 32), (uint32_t) sva,
1937 (uint32_t) (eva >> 32), (uint32_t) eva);
1938
1939 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
1940 set_NX = FALSE;
1941 else
1942 set_NX = TRUE;
1943
1944 PMAP_LOCK(map);
1945
1946 orig_sva = sva;
1947 while (sva < eva) {
1948 lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
1949 if (lva > eva)
1950 lva = eva;
1951 pde = pmap_pde(map, sva);
1952 if (pde && (*pde & INTEL_PTE_VALID)) {
1953 if (*pde & INTEL_PTE_PS) {
1954 /* superpage */
1955 spte = pde;
1956 epte = spte+1; /* excluded */
1957 } else {
1958 spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
1959 spte = &spte[ptenum(sva)];
1960 epte = &spte[intel_btop(lva - sva)];
1961 }
1962
1963 for (; spte < epte; spte++) {
1964 if (!(*spte & INTEL_PTE_VALID))
1965 continue;
1966
1967 if (prot & VM_PROT_WRITE)
1968 pmap_update_pte(spte, *spte,
1969 *spte | INTEL_PTE_WRITE);
1970 else
1971 pmap_update_pte(spte, *spte,
1972 *spte & ~INTEL_PTE_WRITE);
1973
1974 if (set_NX)
1975 pmap_update_pte(spte, *spte,
1976 *spte | INTEL_PTE_NX);
1977 else
1978 pmap_update_pte(spte, *spte,
1979 *spte & ~INTEL_PTE_NX);
1980
1981 num_found++;
1982 }
1983 }
1984 sva = lva;
1985 }
1986 if (num_found)
1987 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1988
1989 PMAP_UNLOCK(map);
1990
1991 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
1992 0, 0, 0, 0, 0);
1993
1994}
1995
1996/* Map a (possibly) autogenned block */
1997void
1998pmap_map_block(
1999 pmap_t pmap,
2000 addr64_t va,
2001 ppnum_t pa,
2002 uint32_t size,
2003 vm_prot_t prot,
2004 int attr,
2005 __unused unsigned int flags)
2006{
2007 uint32_t page;
2008 int cur_page_size;
2009
2010 if (attr & VM_MEM_SUPERPAGE)
2011 cur_page_size = SUPERPAGE_SIZE;
2012 else
2013 cur_page_size = PAGE_SIZE;
2014
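	/* "size" is expressed in 4K pages; for a superpage mapping both the
	 * page counter and the physical page number advance by the number of
	 * 4K pages that a superpage spans. */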
2015 for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) {
2016 pmap_enter(pmap, va, pa, prot, attr, TRUE);
2017 va += cur_page_size;
2018 pa+=cur_page_size/PAGE_SIZE;
2019 }
2020}
2021
2022
2023/*
2024 * Insert the given physical page (p) at
2025 * the specified virtual address (v) in the
2026 * target physical map with the protection requested.
2027 *
2028 * If specified, the page will be wired down, meaning
2029 * that the related pte cannot be reclaimed.
2030 *
2031 * NB: This is the only routine which MAY NOT lazy-evaluate
2032 * or lose information. That is, this routine must actually
2033 * insert this page into the given map NOW.
2034 */
2035void
2036pmap_enter(
2037 register pmap_t pmap,
2038 vm_map_offset_t vaddr,
2039 ppnum_t pn,
2040 vm_prot_t prot,
2041 unsigned int flags,
2042 boolean_t wired)
2043{
2044 pt_entry_t *pte;
2045 pv_rooted_entry_t pv_h;
2046 int pai;
2047 pv_hashed_entry_t pvh_e;
2048 pv_hashed_entry_t pvh_new;
2049 pt_entry_t template;
2050 pmap_paddr_t old_pa;
2051 pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn);
2052 boolean_t need_tlbflush = FALSE;
2053 boolean_t set_NX;
2054 char oattr;
2055 boolean_t old_pa_locked;
2056 boolean_t superpage = flags & VM_MEM_SUPERPAGE;
2057 vm_object_t delpage_pm_obj = NULL;
2058 int delpage_pde_index = 0;
2059
2060
2061 pmap_intr_assert();
2062 assert(pn != vm_page_fictitious_addr);
2063 if (pmap_debug)
2064 kprintf("pmap_enter(%p,%llu,%u)\n", pmap, vaddr, pn);
2065 if (pmap == PMAP_NULL)
2066 return;
2067 if (pn == vm_page_guard_addr)
2068 return;
2069
2070 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2071 pmap,
2072 (uint32_t) (vaddr >> 32), (uint32_t) vaddr,
2073 pn, prot);
2074
2075 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
2076 set_NX = FALSE;
2077 else
2078 set_NX = TRUE;
2079
2080 /*
2081 * Must allocate a new pvlist entry while we're unlocked;
2082 * zalloc may cause pageout (which will lock the pmap system).
2083 * If we determine we need a pvlist entry, we will unlock
2084 * and allocate one. Then we will retry, throwing away
2085 * the allocated entry later (if we no longer need it).
2086 */
2087
2088 pvh_new = PV_HASHED_ENTRY_NULL;
2089Retry:
2090 pvh_e = PV_HASHED_ENTRY_NULL;
2091
2092 PMAP_LOCK(pmap);
2093
2094 /*
2095 * Expand pmap to include this pte. Assume that
2096 * pmap is always expanded to include enough hardware
2097 * pages to map one VM page.
2098 */
2099 if(superpage) {
2100 while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
2101 /* need room for another pde entry */
2102 PMAP_UNLOCK(pmap);
2103 pmap_expand_pdpt(pmap, vaddr);
2104 PMAP_LOCK(pmap);
2105 }
2106 } else {
2107 while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
2108 /*
2109 * Must unlock to expand the pmap;
2110 * we are going to grow pde-level page(s).
2111 */
2112 PMAP_UNLOCK(pmap);
2113 pmap_expand(pmap, vaddr);
2114 PMAP_LOCK(pmap);
2115 }
2116 }
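	/* "pte" now points at the PTE slot for vaddr, or at the PDE slot when
	 * a superpage mapping was requested. */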
2117
2118 if (superpage && *pte && !(*pte & INTEL_PTE_PS)) {
2119 /*
2120 * There is still an empty page table mapped that
2121 * was used for a previous base page mapping.
2122 * Remember the PDE and the PDE index, so that we
2123 * can free the page at the end of this function.
2124 */
2125 delpage_pde_index = (int)pdeidx(pmap, vaddr);
2126 delpage_pm_obj = pmap->pm_obj;
2127 *pte = 0;
2128 }
2129
2130 old_pa = pte_to_pa(*pte);
2131 pai = pa_index(old_pa);
2132 old_pa_locked = FALSE;
2133
2134 /*
2135 * if we have a previous managed page, lock the pv entry now. after
2136 * we lock it, check to see if someone beat us to the lock and if so
2137 * drop the lock
2138 */
2139 if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
2140 LOCK_PVH(pai);
2141 old_pa_locked = TRUE;
2142 old_pa = pte_to_pa(*pte);
2143 if (0 == old_pa) {
2144 UNLOCK_PVH(pai); /* another path beat us to it */
2145 old_pa_locked = FALSE;
2146 }
2147 }
2148
2149 /*
2150 * Special case if the incoming physical page is already mapped
2151 * at this address.
2152 */
2153 if (old_pa == pa) {
2154
2155 /*
2156 * May be changing its wired attribute or protection
2157 */
2158
2159 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2160
2161 if (VM_MEM_NOT_CACHEABLE ==
2162 (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
2163 if (!(flags & VM_MEM_GUARDED))
2164 template |= INTEL_PTE_PTA;
2165 template |= INTEL_PTE_NCACHE;
2166 }
2167 if (pmap != kernel_pmap)
2168 template |= INTEL_PTE_USER;
2169 if (prot & VM_PROT_WRITE)
2170 template |= INTEL_PTE_WRITE;
2171
2172 if (set_NX)
2173 template |= INTEL_PTE_NX;
2174
2175 if (wired) {
2176 template |= INTEL_PTE_WIRED;
2177 if (!iswired(*pte))
2178 OSAddAtomic(+1,
2179 &pmap->stats.wired_count);
2180 } else {
2181 if (iswired(*pte)) {
2182 assert(pmap->stats.wired_count >= 1);
2183 OSAddAtomic(-1,
2184 &pmap->stats.wired_count);
2185 }
2186 }
2187 if (superpage) /* this path can not be used */
2188 template |= INTEL_PTE_PS; /* to change the page size! */
2189
2190 /* store modified PTE and preserve RC bits */
2191 pmap_update_pte(pte, *pte,
2192 template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2193 if (old_pa_locked) {
2194 UNLOCK_PVH(pai);
2195 old_pa_locked = FALSE;
2196 }
2197 need_tlbflush = TRUE;
2198 goto Done;
2199 }
2200
2201 /*
2202 * Outline of code from here:
2203 * 1) If va was mapped, update TLBs, remove the mapping
2204 * and remove old pvlist entry.
2205 * 2) Add pvlist entry for new mapping
2206 * 3) Enter new mapping.
2207 *
2208 * If the old physical page is not managed, step 1) is skipped
2209 * (except for updating the TLBs), and the mapping is
2210 * overwritten at step 3). If the new physical page is not
2211 * managed, step 2) is skipped.
2212 */
2213
2214 if (old_pa != (pmap_paddr_t) 0) {
2215
2216 /*
2217 * Don't do anything to pages outside valid memory here.
2218 * Instead convince the code that enters a new mapping
2219 * to overwrite the old one.
2220 */
2221
2222 /* invalidate the PTE */
2223 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2224 /* propagate invalidate everywhere */
2225 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2226 /* remember reference and change */
2227 oattr = (char) (*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2228 /* completely invalidate the PTE */
2229 pmap_store_pte(pte, 0);
2230
2231 if (IS_MANAGED_PAGE(pai)) {
2232#if TESTING
2233 if (pmap->stats.resident_count < 1)
2234 panic("pmap_enter: resident_count");
2235#endif
2236 assert(pmap->stats.resident_count >= 1);
2237 OSAddAtomic(-1,
2238 &pmap->stats.resident_count);
2239
2240 if (iswired(*pte)) {
2241#if TESTING
2242 if (pmap->stats.wired_count < 1)
2243 panic("pmap_enter: wired_count");
2244#endif
2245 assert(pmap->stats.wired_count >= 1);
2246 OSAddAtomic(-1,
2247 &pmap->stats.wired_count);
2248 }
2249 pmap_phys_attributes[pai] |= oattr;
2250
2251 /*
2252 * Remove the mapping from the pvlist for
2253 * this physical page.
2254 * We'll end up with either a rooted pv or a
2255 * hashed pv
2256 */
2257 pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai);
2258
2259 } else {
2260
2261 /*
2262 * old_pa is not managed.
2263 * Do removal part of accounting.
2264 */
2265
2266 if (iswired(*pte)) {
2267 assert(pmap->stats.wired_count >= 1);
2268 OSAddAtomic(-1,
2269 &pmap->stats.wired_count);
2270 }
2271 }
2272 }
2273
2274 /*
2275 * if we had a previously managed page locked, unlock it now
2276 */
2277 if (old_pa_locked) {
2278 UNLOCK_PVH(pai);
2279 old_pa_locked = FALSE;
2280 }
2281
2282 pai = pa_index(pa); /* now working with new incoming phys page */
2283 if (IS_MANAGED_PAGE(pai)) {
2284
2285 /*
2286 * Step 2) Enter the mapping in the PV list for this
2287 * physical page.
2288 */
2289 pv_h = pai_to_pvh(pai);
2290
2291 LOCK_PVH(pai);
2292
2293 if (pv_h->pmap == PMAP_NULL) {
2294 /*
2295 * No mappings yet, use rooted pv
2296 */
2297 pv_h->va = vaddr;
2298 pv_h->pmap = pmap;
2299 queue_init(&pv_h->qlink);
2300 } else {
2301 /*
2302 * Add new pv_hashed_entry after header.
2303 */
2304 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2305 pvh_e = pvh_new;
2306 pvh_new = PV_HASHED_ENTRY_NULL;
2307 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2308 PV_HASHED_ALLOC(pvh_e);
2309 if (PV_HASHED_ENTRY_NULL == pvh_e) {
2310 /*
2311 * the pv free list is empty. if we are on
2312 * the kernel pmap we'll use one of
2313 * the special private kernel pv_e's,
2314 * else, we need to unlock
2315 * everything, zalloc a pv_e, and
2316 * restart bringing in the pv_e with
2317 * us.
2318 */
2319 if (kernel_pmap == pmap) {
2320 PV_HASHED_KERN_ALLOC(pvh_e);
2321 } else {
2322 UNLOCK_PVH(pai);
2323 PMAP_UNLOCK(pmap);
2324 pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2325 goto Retry;
2326 }
2327 }
2328 }
2329 if (PV_HASHED_ENTRY_NULL == pvh_e)
2330 panic("pvh_e exhaustion");
2331
2332 pvh_e->va = vaddr;
2333 pvh_e->pmap = pmap;
2334 pvh_e->ppn = pn;
2335 pv_hash_add(pvh_e, pv_h);
2336
2337 /*
2338 * Remember that we used the pvlist entry.
2339 */
2340 pvh_e = PV_HASHED_ENTRY_NULL;
2341 }
2342
2343 /*
2344 * only count the mapping
2345 * for 'managed memory'
2346 */
2347 OSAddAtomic(+1, & pmap->stats.resident_count);
2348 if (pmap->stats.resident_count > pmap->stats.resident_max) {
2349 pmap->stats.resident_max = pmap->stats.resident_count;
2350 }
2351 }
2352 /*
2353 * Step 3) Enter the mapping.
2354 *
2355 * Build a template to speed up entering -
2356 * only the pfn changes.
2357 */
2358 template = pa_to_pte(pa) | INTEL_PTE_VALID;
2359
2360 if (flags & VM_MEM_NOT_CACHEABLE) {
2361 if (!(flags & VM_MEM_GUARDED))
2362 template |= INTEL_PTE_PTA;
2363 template |= INTEL_PTE_NCACHE;
2364 }
2365 if (pmap != kernel_pmap)
2366 template |= INTEL_PTE_USER;
2367 if (prot & VM_PROT_WRITE)
2368 template |= INTEL_PTE_WRITE;
2369 if (set_NX)
2370 template |= INTEL_PTE_NX;
2371 if (wired) {
2372 template |= INTEL_PTE_WIRED;
2373 OSAddAtomic(+1, & pmap->stats.wired_count);
2374 }
2375 if (superpage)
2376 template |= INTEL_PTE_PS;
2377 pmap_store_pte(pte, template);
2378
2379 /*
2380 * if this was a managed page we delayed unlocking the pv until here
2381 * to prevent pmap_page_protect et al from finding it until the pte
2382 * has been stored
2383 */
2384 if (IS_MANAGED_PAGE(pai)) {
2385 UNLOCK_PVH(pai);
2386 }
2387Done:
2388 if (need_tlbflush == TRUE)
2389 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2390
2391 if (pvh_e != PV_HASHED_ENTRY_NULL) {
2392 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
2393 }
2394 if (pvh_new != PV_HASHED_ENTRY_NULL) {
2395 PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2396 }
2397 PMAP_UNLOCK(pmap);
2398
2399 if (delpage_pm_obj) {
2400 vm_page_t m;
2401
2402 vm_object_lock(delpage_pm_obj);
2403 m = vm_page_lookup(delpage_pm_obj, delpage_pde_index);
2404 if (m == VM_PAGE_NULL)
2405 panic("pmap_enter: pte page not in object");
2406 VM_PAGE_FREE(m);
2407 OSAddAtomic(-1, &inuse_ptepages_count);
2408 vm_object_unlock(delpage_pm_obj);
2409 }
2410
2411 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
2412}
2413
2414/*
2415 * Routine: pmap_change_wiring
2416 * Function: Change the wiring attribute for a map/virtual-address
2417 * pair.
2418 * In/out conditions:
2419 * The mapping must already exist in the pmap.
2420 */
2421void
2422pmap_change_wiring(
2423 pmap_t map,
2424 vm_map_offset_t vaddr,
2425 boolean_t wired)
2426{
2427 pt_entry_t *pte;
2428
2429 PMAP_LOCK(map);
2430
2431 if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
2432 panic("pmap_change_wiring: pte missing");
2433
2434 if (wired && !iswired(*pte)) {
2435 /*
2436 * wiring down mapping
2437 */
2438 OSAddAtomic(+1, &map->stats.wired_count);
2439 pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
2440 }
2441 else if (!wired && iswired(*pte)) {
2442 /*
2443 * unwiring mapping
2444 */
2445 assert(map->stats.wired_count >= 1);
2446 OSAddAtomic(-1, &map->stats.wired_count);
2447 pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
2448 }
2449
2450 PMAP_UNLOCK(map);
2451}
2452
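/*
 *	Routine:	pmap_expand_pml4
 *
 *	Allocates a page directory pointer table page and installs it in the
 *	PML4 entry covering vaddr.
 */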
2453void
2454pmap_expand_pml4(
2455 pmap_t map,
2456 vm_map_offset_t vaddr)
2457{
2458 vm_page_t m;
2459 pmap_paddr_t pa;
2460 uint64_t i;
2461 ppnum_t pn;
2462 pml4_entry_t *pml4p;
2463
2464 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
2465
2466 /*
2467 * Allocate a VM page for the pml4 page
2468 */
2469 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2470 VM_PAGE_WAIT();
2471
2472 /*
2473 * put the page into the pmap's obj list so it
2474 * can be found later.
2475 */
2476 pn = m->phys_page;
2477 pa = i386_ptob(pn);
2478 i = pml4idx(map, vaddr);
2479
2480 /*
2481 * Zero the page.
2482 */
2483 pmap_zero_page(pn);
2484
2485 vm_page_lockspin_queues();
2486 vm_page_wire(m);
2487 vm_page_unlock_queues();
2488
2489 OSAddAtomic(1, &inuse_ptepages_count);
2490
2491 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2492 vm_object_lock(map->pm_obj_pml4);
2493
2494 PMAP_LOCK(map);
2495 /*
2496 * See if someone else expanded us first
2497 */
2498 if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
2499 PMAP_UNLOCK(map);
2500 vm_object_unlock(map->pm_obj_pml4);
2501
2502 VM_PAGE_FREE(m);
2503
2504 OSAddAtomic(-1, &inuse_ptepages_count);
2505 return;
2506 }
2507
2508#if 0 /* DEBUG */
2509 if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
2510 panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2511 map, map->pm_obj_pml4, vaddr, i);
2512 }
2513#endif
2514 vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
2515 vm_object_unlock(map->pm_obj_pml4);
2516
2517 /*
2518 * Set the PML4 entry to point at the new page directory pointer table.
2519 */
2520 pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
2521
2522 pmap_store_pte(pml4p, pa_to_pte(pa)
2523 | INTEL_PTE_VALID
2524 | INTEL_PTE_USER
2525 | INTEL_PTE_WRITE);
2526
2527 PMAP_UNLOCK(map);
2528
2529 return;
2530}
2531
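/*
 *	Routine:	pmap_expand_pdpt
 *
 *	Allocates a page directory page and installs it in the PDPT entry
 *	covering vaddr, expanding the PML4 level first if necessary.
 */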
2532void
2533pmap_expand_pdpt(
2534 pmap_t map,
2535 vm_map_offset_t vaddr)
2536{
2537 vm_page_t m;
2538 pmap_paddr_t pa;
2539 uint64_t i;
2540 ppnum_t pn;
2541 pdpt_entry_t *pdptp;
2542
2543 DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
2544
2545 while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
2546 pmap_expand_pml4(map, vaddr);
2547 }
2548
2549 /*
2550 * Allocate a VM page for the pdpt page
2551 */
2552 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2553 VM_PAGE_WAIT();
2554
2555 /*
2556 * put the page into the pmap's obj list so it
2557 * can be found later.
2558 */
2559 pn = m->phys_page;
2560 pa = i386_ptob(pn);
2561 i = pdptidx(map, vaddr);
2562
2563 /*
2564 * Zero the page.
2565 */
2566 pmap_zero_page(pn);
2567
2568 vm_page_lockspin_queues();
2569 vm_page_wire(m);
2570 vm_page_unlock_queues();
2571
2572 OSAddAtomic(1, &inuse_ptepages_count);
2573
2574 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2575 vm_object_lock(map->pm_obj_pdpt);
2576
2577 PMAP_LOCK(map);
2578 /*
2579 * See if someone else expanded us first
2580 */
2581 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
2582 PMAP_UNLOCK(map);
2583 vm_object_unlock(map->pm_obj_pdpt);
2584
2585 VM_PAGE_FREE(m);
2586
2587 OSAddAtomic(-1, &inuse_ptepages_count);
2588 return;
2589 }
2590
2591#if 0 /* DEBUG */
2592 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
2593 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
2594 map, map->pm_obj_pdpt, vaddr, i);
2595 }
2596#endif
2597 vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
2598 vm_object_unlock(map->pm_obj_pdpt);
2599
2600 /*
2601 * Set the PDPT entry to point at the new page directory.
2602 */
2603 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
2604
2605 pmap_store_pte(pdptp, pa_to_pte(pa)
2606 | INTEL_PTE_VALID
2607 | INTEL_PTE_USER
2608 | INTEL_PTE_WRITE);
2609
2610 PMAP_UNLOCK(map);
2611
2612 return;
2613
2614}
2615
2616
2617
2618/*
2619 * Routine: pmap_expand
2620 *
2621 * Expands a pmap to be able to map the specified virtual address.
2622 *
2623 * Allocates new virtual memory for the P0 or P1 portion of the
2624 * pmap, then re-maps the physical pages that were in the old
2625 * pmap to be in the new pmap.
2626 *
2627 * Must be called with the pmap system and the pmap unlocked,
2628 * since these must be unlocked to use vm_allocate or vm_deallocate.
2629 * Thus it must be called in a loop that checks whether the map
2630 * has been expanded enough.
2631 * (We won't loop forever, since page tables aren't shrunk.)
2632 */
2633void
2634pmap_expand(
2635 pmap_t map,
2636 vm_map_offset_t vaddr)
2637{
2638 pt_entry_t *pdp;
2639 register vm_page_t m;
2640 register pmap_paddr_t pa;
2641 uint64_t i;
2642 ppnum_t pn;
2643
2644
2645 /*
2646 * For the kernel, the virtual address must be in or above the basement
2647 * which is for kexts and is in the 512GB immediately below the kernel.
2648 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
2649 */
2650 if (map == kernel_pmap &&
2651 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))
2652 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
2653
2654
2655 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
2656 /* need room for another pde entry */
2657 pmap_expand_pdpt(map, vaddr);
2658 }
2659
2660 /*
2661 * Allocate a VM page for the pde entries.
2662 */
2663 while ((m = vm_page_grab()) == VM_PAGE_NULL)
2664 VM_PAGE_WAIT();
2665
2666 /*
2667 * put the page into the pmap's obj list so it
2668 * can be found later.
2669 */
2670 pn = m->phys_page;
2671 pa = i386_ptob(pn);
2672 i = pdeidx(map, vaddr);
2673
2674 /*
2675 * Zero the page.
2676 */
2677 pmap_zero_page(pn);
2678
2679 vm_page_lockspin_queues();
2680 vm_page_wire(m);
2681 vm_page_unlock_queues();
2682
2683 OSAddAtomic(1, &inuse_ptepages_count);
2684
2685 /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
2686 vm_object_lock(map->pm_obj);
2687
2688 PMAP_LOCK(map);
2689
2690 /*
2691 * See if someone else expanded us first
2692 */
2693 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2694 PMAP_UNLOCK(map);
2695 vm_object_unlock(map->pm_obj);
2696
2697 VM_PAGE_FREE(m);
2698
2699 OSAddAtomic(-1, &inuse_ptepages_count);
2700 return;
2701 }
2702
2703#if 0 /* DEBUG */
2704 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
2705 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
2706 map, map->pm_obj, vaddr, i);
2707 }
2708#endif
2709 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
2710 vm_object_unlock(map->pm_obj);
2711
2712 /*
2713 * Set the page directory entry for this page table.
2714 */
2715 pdp = pmap_pde(map, vaddr);
2716 pmap_store_pte(pdp, pa_to_pte(pa)
2717 | INTEL_PTE_VALID
2718 | INTEL_PTE_USER
2719 | INTEL_PTE_WRITE);
2720
2721 PMAP_UNLOCK(map);
2722
2723 return;
2724}
2725
2726/* On K64 machines with more than 32GB of memory, pmap_steal_memory
2727 * will allocate past the 1GB of pre-expanded virtual kernel area. This
2728 * function allocates all the page tables using memory from the same pool
2729 * that pmap_steal_memory uses, rather than calling vm_page_grab (which
2730 * isn't available yet). */
2731void
2732pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr) {
2733 ppnum_t pn;
2734 pt_entry_t *pte;
2735
2736 PMAP_LOCK(pmap);
2737
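	/* Install any missing paging levels top-down: a PDPT, then a page
	 * directory, then a page table, using physical pages obtained from
	 * pmap_next_page_k64(). */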
2738 if(pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2739 if (!pmap_next_page_k64(&pn))
2740 panic("pmap_pre_expand");
2741
2742 pmap_zero_page(pn);
2743
2744 pte = pmap64_pml4(pmap, vaddr);
2745
2746 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2747 | INTEL_PTE_VALID
2748 | INTEL_PTE_USER
2749 | INTEL_PTE_WRITE);
2750 }
2751
2752 if(pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2753 if (!pmap_next_page_k64(&pn))
2754 panic("pmap_pre_expand");
2755
2756 pmap_zero_page(pn);
2757
2758 pte = pmap64_pdpt(pmap, vaddr);
2759
2760 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2761 | INTEL_PTE_VALID
2762 | INTEL_PTE_USER
2763 | INTEL_PTE_WRITE);
2764 }
2765
2766 if(pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
2767 if (!pmap_next_page_k64(&pn))
2768 panic("pmap_pre_expand");
2769
2770 pmap_zero_page(pn);
2771
2772 pte = pmap64_pde(pmap, vaddr);
2773
2774 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
2775 | INTEL_PTE_VALID
2776 | INTEL_PTE_USER
2777 | INTEL_PTE_WRITE);
2778 }
2779
2780 PMAP_UNLOCK(pmap);
2781}
2782
2783/*
2784 * pmap_sync_page_data_phys(ppnum_t pa)
2785 *
2786 * Invalidates all of the instruction cache on a physical page and
2787 * pushes any dirty data from the data cache for the same physical page
2788 * Not required in i386.
2789 */
2790void
2791pmap_sync_page_data_phys(__unused ppnum_t pa)
2792{
2793 return;
2794}
2795
2796/*
2797 * pmap_sync_page_attributes_phys(ppnum_t pa)
2798 *
2799 * Write back and invalidate all cachelines on a physical page.
2800 */
2801void
2802pmap_sync_page_attributes_phys(ppnum_t pa)
2803{
2804 cache_flush_page_phys(pa);
2805}
2806
2807
2808
2809#ifdef CURRENTLY_UNUSED_AND_UNTESTED
2810
2811int collect_ref;
2812int collect_unref;
2813
2814/*
2815 * Routine: pmap_collect
2816 * Function:
2817 * Garbage collects the physical map system for
2818 * pages which are no longer used.
2819 * Success need not be guaranteed -- that is, there
2820 * may well be pages which are not referenced, but
2821 * others may be collected.
2822 * Usage:
2823 * Called by the pageout daemon when pages are scarce.
2824 */
2825void
2826pmap_collect(
2827 pmap_t p)
2828{
2829 register pt_entry_t *pdp, *ptp;
2830 pt_entry_t *eptp;
2831 int wired;
2832
2833 if (p == PMAP_NULL)
2834 return;
2835
2836 if (p == kernel_pmap)
2837 return;
2838
2839 /*
2840 * Garbage collect map.
2841 */
2842 PMAP_LOCK(p);
2843
2844 for (pdp = (pt_entry_t *)p->dirbase;
2845 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
2846 pdp++)
2847 {
2848 if (*pdp & INTEL_PTE_VALID) {
2849 if(*pdp & INTEL_PTE_REF) {
2850 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
2851 collect_ref++;
2852 } else {
2853 collect_unref++;
2854 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
2855 eptp = ptp + NPTEPG;
2856
2857 /*
2858 * If the pte page has any wired mappings, we cannot
2859 * free it.
2860 */
2861 wired = 0;
2862 {
2863 register pt_entry_t *ptep;
2864 for (ptep = ptp; ptep < eptp; ptep++) {
2865 if (iswired(*ptep)) {
2866 wired = 1;
2867 break;
2868 }
2869 }
2870 }
2871 if (!wired) {
2872 /*
2873 * Remove the virtual addresses mapped by this pte page.
2874 */
2875 pmap_remove_range(p,
2876 pdetova(pdp - (pt_entry_t *)p->dirbase),
2877 ptp,
2878 eptp);
2879
2880 /*
2881 * Invalidate the page directory pointer.
2882 */
2883 pmap_store_pte(pdp, 0x0);
2884
2885 PMAP_UNLOCK(p);
2886
2887 /*
2888 * And free the pte page itself.
2889 */
2890 {
2891 register vm_page_t m;
2892
2893 vm_object_lock(p->pm_obj);
2894
2895 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
2896 if (m == VM_PAGE_NULL)
2897 panic("pmap_collect: pte page not in object");
2898
2899 VM_PAGE_FREE(m);
2900
2901 OSAddAtomic(-1, &inuse_ptepages_count);
2902
2903 vm_object_unlock(p->pm_obj);
2904 }
2905
2906 PMAP_LOCK(p);
2907 }
2908 }
2909 }
2910 }
2911
2912 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
2913 PMAP_UNLOCK(p);
2914 return;
2915
2916}
2917#endif
2918
2919
2920void
2921pmap_copy_page(ppnum_t src, ppnum_t dst)
2922{
2923 bcopy_phys((addr64_t)i386_ptob(src),
2924 (addr64_t)i386_ptob(dst),
2925 PAGE_SIZE);
2926}
2927
2928
2929/*
2930 * Routine: pmap_pageable
2931 * Function:
2932 * Make the specified pages (by pmap, offset)
2933 * pageable (or not) as requested.
2934 *
2935 * A page which is not pageable may not take
2936 * a fault; therefore, its page table entry
2937 * must remain valid for the duration.
2938 *
2939 * This routine is merely advisory; pmap_enter
2940 * will specify that these pages are to be wired
2941 * down (or not) as appropriate.
2942 */
2943void
2944pmap_pageable(
2945 __unused pmap_t pmap,
2946 __unused vm_map_offset_t start_addr,
2947 __unused vm_map_offset_t end_addr,
2948 __unused boolean_t pageable)
2949{
2950#ifdef lint
2951 pmap++; start_addr++; end_addr++; pageable++;
2952#endif /* lint */
2953}
2954
2955/*
2956 * Clear specified attribute bits.
2957 */
2958void
2959phys_attribute_clear(
2960 ppnum_t pn,
2961 int bits)
2962{
2963 pv_rooted_entry_t pv_h;
2964 pv_hashed_entry_t pv_e;
2965 pt_entry_t *pte;
2966 int pai;
2967 pmap_t pmap;
2968
2969 pmap_intr_assert();
2970 assert(pn != vm_page_fictitious_addr);
2971 if (pn == vm_page_guard_addr)
2972 return;
2973
2974 pai = ppn_to_pai(pn);
2975
2976 if (!IS_MANAGED_PAGE(pai)) {
2977 /*
2978 * Not a managed page.
2979 */
2980 return;
2981 }
2982
2983
2984 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
2985 pn, bits, 0, 0, 0);
2986
2987 pv_h = pai_to_pvh(pai);
2988
2989 LOCK_PVH(pai);
2990
2991 /*
2992 * Walk down PV list, clearing all modify or reference bits.
2993 * We do not have to lock the pv_list because we have
2994 * the entire pmap system locked.
2995 */
2996 if (pv_h->pmap != PMAP_NULL) {
2997 /*
2998 * There are some mappings.
2999 */
3000
3001 pv_e = (pv_hashed_entry_t)pv_h;
3002
3003 do {
3004 vm_map_offset_t va;
3005
3006 pmap = pv_e->pmap;
3007 va = pv_e->va;
3008
3009 /*
3010 * Clear modify and/or reference bits.
3011 */
3012 pte = pmap_pte(pmap, va);
3013 pmap_update_pte(pte, *pte, (*pte & ~bits));
3014 /* Ensure all processors using this translation
3015 * invalidate this TLB entry. The invalidation *must*
3016 * follow the PTE update, to ensure that the TLB
3017 * shadow of the 'D' bit (in particular) is
3018 * synchronized with the updated PTE.
3019 */
3020 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3021
3022 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3023
3024 } while (pv_e != (pv_hashed_entry_t)pv_h);
3025 }
3026 pmap_phys_attributes[pai] &= ~bits;
3027
3028 UNLOCK_PVH(pai);
3029
3030 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3031 0, 0, 0, 0, 0);
3032}
3033
3034/*
3035 * Check specified attribute bits.
3036 */
3037int
3038phys_attribute_test(
3039 ppnum_t pn,
3040 int bits)
3041{
3042 pv_rooted_entry_t pv_h;
3043 pv_hashed_entry_t pv_e;
3044 pt_entry_t *pte;
3045 int pai;
3046 pmap_t pmap;
3047 int attributes = 0;
3048
3049 pmap_intr_assert();
3050 assert(pn != vm_page_fictitious_addr);
3051 if (pn == vm_page_guard_addr)
3052 return 0;
3053
3054 pai = ppn_to_pai(pn);
3055
3056 if (!IS_MANAGED_PAGE(pai)) {
3057 /*
3058 * Not a managed page.
3059 */
3060 return 0;
3061 }
3062
3063 /*
3064 * super fast check... if bits already collected
3065 * no need to take any locks...
3066 * if not set, we need to recheck after taking
3067 * the lock in case they got pulled in while
3068 * we were waiting for the lock
3069 */
3070 if ((pmap_phys_attributes[pai] & bits) == bits)
3071 return bits;
3072
3073 pv_h = pai_to_pvh(pai);
3074
3075 LOCK_PVH(pai);
3076
3077 attributes = pmap_phys_attributes[pai] & bits;
3078
3079
3080 /*
3081 * Walk down PV list, checking the mappings until we
3082 * reach the end or we've found the attributes we've asked for.
3083 * We do not have to lock the pv_list because we have
3084 * the entire pmap system locked.
3085 */
3086 if (attributes != bits &&
3087 pv_h->pmap != PMAP_NULL) {
3088 /*
3089 * There are some mappings.
3090 */
3091 pv_e = (pv_hashed_entry_t)pv_h;
3092 do {
3093 vm_map_offset_t va;
3094
3095 pmap = pv_e->pmap;
3096 va = pv_e->va;
3097 /*
3098 * first make sure any processor actively
3099 * using this pmap flushes its TLB state
3100 */
3101 PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3102
3103 /*
3104 * pick up modify and/or reference bits from mapping
3105 */
3106
3107 pte = pmap_pte(pmap, va);
3108 attributes |= (int)(*pte & bits);
3109
3110 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3111
3112 } while ((attributes != bits) &&
3113 (pv_e != (pv_hashed_entry_t)pv_h));
3114 }
3115
3116 UNLOCK_PVH(pai);
3117 return (attributes);
3118}
3119
3120/*
3121 * Set specified attribute bits.
3122 */
3123void
3124phys_attribute_set(
3125 ppnum_t pn,
3126 int bits)
3127{
3128 int pai;
3129
3130 pmap_intr_assert();
3131 assert(pn != vm_page_fictitious_addr);
3132 if (pn == vm_page_guard_addr)
3133 return;
3134
3135 pai = ppn_to_pai(pn);
3136
3137 if (!IS_MANAGED_PAGE(pai)) {
3138 /* Not a managed page. */
3139 return;
3140 }
3141
3142 LOCK_PVH(pai);
3143 pmap_phys_attributes[pai] |= bits;
3144 UNLOCK_PVH(pai);
3145}
3146
3147/*
3148 * Set the modify bit on the specified physical page.
3149 */
3150
3151void
3152pmap_set_modify(ppnum_t pn)
3153{
3154 phys_attribute_set(pn, PHYS_MODIFIED);
3155}
3156
3157/*
3158 * Clear the modify bits on the specified physical page.
3159 */
3160
3161void
3162pmap_clear_modify(ppnum_t pn)
3163{
3164 phys_attribute_clear(pn, PHYS_MODIFIED);
3165}
3166
3167/*
3168 * pmap_is_modified:
3169 *
3170 * Return whether or not the specified physical page is modified
3171 * by any physical maps.
3172 */
3173
3174boolean_t
3175pmap_is_modified(ppnum_t pn)
3176{
3177 if (phys_attribute_test(pn, PHYS_MODIFIED))
3178 return TRUE;
3179 return FALSE;
3180}
3181
3182/*
3183 * pmap_clear_reference:
3184 *
3185 * Clear the reference bit on the specified physical page.
3186 */
3187
3188void
3189pmap_clear_reference(ppnum_t pn)
3190{
3191 phys_attribute_clear(pn, PHYS_REFERENCED);
3192}
3193
3194void
3195pmap_set_reference(ppnum_t pn)
3196{
3197 phys_attribute_set(pn, PHYS_REFERENCED);
3198}
3199
3200/*
3201 * pmap_is_referenced:
3202 *
3203 * Return whether or not the specified physical page is referenced
3204 * by any physical maps.
3205 */
3206
3207boolean_t
3208pmap_is_referenced(ppnum_t pn)
3209{
3210 if (phys_attribute_test(pn, PHYS_REFERENCED))
3211 return TRUE;
3212 return FALSE;
3213}
3214
3215/*
3216 * pmap_get_refmod(phys)
3217 * returns the referenced and modified bits of the specified
3218 * physical page.
3219 */
3220unsigned int
3221pmap_get_refmod(ppnum_t pn)
3222{
3223 int refmod;
3224 unsigned int retval = 0;
3225
3226 refmod = phys_attribute_test(pn, PHYS_MODIFIED | PHYS_REFERENCED);
3227
3228 if (refmod & PHYS_MODIFIED)
3229 retval |= VM_MEM_MODIFIED;
3230 if (refmod & PHYS_REFERENCED)
3231 retval |= VM_MEM_REFERENCED;
3232
3233 return (retval);
3234}
3235
3236/*
3237 * pmap_clear_refmod(phys, mask)
3238 * clears the referenced and modified bits as specified by the mask
3239 * of the specified physical page.
3240 */
3241void
3242pmap_clear_refmod(ppnum_t pn, unsigned int mask)
3243{
3244 unsigned int x86Mask;
3245
3246 x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0)
3247 | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3248 phys_attribute_clear(pn, x86Mask);
3249}
3250
3251void
3252invalidate_icache(__unused vm_offset_t addr,
3253 __unused unsigned cnt,
3254 __unused int phys)
3255{
3256 return;
3257}
3258
3259void
3260flush_dcache(__unused vm_offset_t addr,
3261 __unused unsigned count,
3262 __unused int phys)
3263{
3264 return;
3265}
3266
3267#if CONFIG_DTRACE
3268/*
3269 * Constrain DTrace copyin/copyout actions
3270 */
3271extern kern_return_t dtrace_copyio_preflight(addr64_t);
3272extern kern_return_t dtrace_copyio_postflight(addr64_t);
3273
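/*
 * Refuse DTrace copyin/copyout when running on the kernel map, when the
 * live CR3 does not match the current thread's pmap, or when a copy
 * window is already active.
 */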
3274kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3275{
3276 thread_t thread = current_thread();
3277
3278 if (current_map() == kernel_map)
3279 return KERN_FAILURE;
3280 else if (get_cr3() != thread->map->pmap->pm_cr3)
3281 return KERN_FAILURE;
3282 else if (thread->machine.specFlags & CopyIOActive)
3283 return KERN_FAILURE;
3284 else
3285 return KERN_SUCCESS;
3286}
3287
3288kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3289{
3290 return KERN_SUCCESS;
3291}
3292#endif /* CONFIG_DTRACE */
3293
3294#include <mach_vm_debug.h>
3295#if MACH_VM_DEBUG
3296#include <vm/vm_debug.h>
3297
3298int
3299pmap_list_resident_pages(
3300 __unused pmap_t pmap,
3301 __unused vm_offset_t *listp,
3302 __unused int space)
3303{
3304 return 0;
3305}
3306#endif /* MACH_VM_DEBUG */
3307
3308
3309
3310/* temporary workaround */
3311boolean_t
3312coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
3313{
3314#if 0
3315 pt_entry_t *ptep;
3316
3317 ptep = pmap_pte(map->pmap, va);
3318 if (0 == ptep)
3319 return FALSE;
3320 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
3321#else
3322 return TRUE;
3323#endif
3324}
3325
3326
3327boolean_t
3328phys_page_exists(ppnum_t pn)
3329{
3330 assert(pn != vm_page_fictitious_addr);
3331
3332 if (!pmap_initialized)
3333 return TRUE;
3334
3335 if (pn == vm_page_guard_addr)
3336 return FALSE;
3337
3338 if (!IS_MANAGED_PAGE(ppn_to_pai(pn)))
3339 return FALSE;
3340
3341 return TRUE;
3342}
3343
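/*
 * Pre-populate the user and kernel pv_hashed_entry free lists so that
 * mappings can be entered without having to call zalloc() while the
 * pmap system is locked.
 */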
3344void
3345mapping_free_prime(void)
3346{
3347 int i;
3348 pv_hashed_entry_t pvh_e;
3349 pv_hashed_entry_t pvh_eh;
3350 pv_hashed_entry_t pvh_et;
3351 int pv_cnt;
3352
3353 pv_cnt = 0;
3354 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3355 for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
3356 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3357
3358 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3359 pvh_eh = pvh_e;
3360
3361 if (pvh_et == PV_HASHED_ENTRY_NULL)
3362 pvh_et = pvh_e;
3363 pv_cnt++;
3364 }
3365 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3366
3367 pv_cnt = 0;
3368 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3369 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3370 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3371
3372 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3373 pvh_eh = pvh_e;
3374
3375 if (pvh_et == PV_HASHED_ENTRY_NULL)
3376 pvh_et = pvh_e;
3377 pv_cnt++;
3378 }
3379 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3380
3381}
3382
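/*
 * Replenish the kernel and user pv_hashed_entry free lists when they fall
 * below their low-water marks.  Runs as a thread call; mappingrecurse is
 * cleared once replenishment is complete.
 */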
3383void
3384mapping_adjust(void)
3385{
3386 pv_hashed_entry_t pvh_e;
3387 pv_hashed_entry_t pvh_eh;
3388 pv_hashed_entry_t pvh_et;
3389 int pv_cnt;
3390 int i;
3391
3392 if (mapping_adjust_call == NULL) {
3393 thread_call_setup(&mapping_adjust_call_data,
3394 (thread_call_func_t) mapping_adjust,
3395 (thread_call_param_t) NULL);
3396 mapping_adjust_call = &mapping_adjust_call_data;
3397 }
3398
3399 pv_cnt = 0;
3400 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3401 if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
3402 for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
3403 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3404
3405 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3406 pvh_eh = pvh_e;
3407
3408 if (pvh_et == PV_HASHED_ENTRY_NULL)
3409 pvh_et = pvh_e;
3410 pv_cnt++;
3411 }
3412 PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3413 }
3414
3415 pv_cnt = 0;
3416 pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
3417 if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
3418 for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
3419 pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
3420
3421 pvh_e->qlink.next = (queue_entry_t)pvh_eh;
3422 pvh_eh = pvh_e;
3423
3424 if (pvh_et == PV_HASHED_ENTRY_NULL)
3425 pvh_et = pvh_e;
3426 pv_cnt++;
3427 }
3428 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
3429 }
3430 mappingrecurse = 0;
3431}
3432
3433
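/*
 * Switch the current thread onto the address space of tpmap by loading
 * its directory base, with interrupts disabled across the switch.
 */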
3434void
3435pmap_switch(pmap_t tpmap)
3436{
3437 spl_t s;
3438
3439 s = splhigh(); /* Make sure interrupts are disabled */
3440 set_dirbase(tpmap, current_thread());
3441 splx(s);
3442}
3443
3444
3445/*
3446 * disable no-execute capability on
3447 * the specified pmap
3448 */
3449void
3450pmap_disable_NX(pmap_t pmap)
3451{
3452 pmap->nx_enabled = 0;
3453}
3454
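/*
 * Report page table page usage as a pseudo-zone; max_size is an upper
 * bound based on the pages that could still become page tables.
 */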
3455void
3456pt_fake_zone_info(
3457 int *count,
3458 vm_size_t *cur_size,
3459 vm_size_t *max_size,
3460 vm_size_t *elem_size,
3461 vm_size_t *alloc_size,
3462 int *collectable,
3463 int *exhaustable)
3464{
3465 *count = inuse_ptepages_count;
3466 *cur_size = PAGE_SIZE * inuse_ptepages_count;
3467 *max_size = PAGE_SIZE * (inuse_ptepages_count +
3468 vm_page_inactive_count +
3469 vm_page_active_count +
3470 vm_page_free_count);
3471 *elem_size = PAGE_SIZE;
3472 *alloc_size = PAGE_SIZE;
3473
3474 *collectable = 1;
3475 *exhaustable = 0;
3476}
3477
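/*
 * Send an NMI to each cpu in cpu_mask, then spin for the lock timeout so
 * the targets have a chance to field the interrupt.  Used below when
 * remote cpus fail to acknowledge a TLB flush.
 */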
3478static inline void
3479pmap_cpuset_NMIPI(cpu_set cpu_mask) {
3480 unsigned int cpu, cpu_bit;
3481 uint64_t deadline;
3482
3483 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3484 if (cpu_mask & cpu_bit)
3485 cpu_NMI_interrupt(cpu);
3486 }
3487 deadline = mach_absolute_time() + (LockTimeOut);
3488 while (mach_absolute_time() < deadline)
3489 cpu_pause();
3490}
3491
3492/*
3493 * Called with pmap locked, we:
3494 * - scan through per-cpu data to see which other cpus need to flush
3495 * - send an IPI to each non-idle cpu to be flushed
3496 * - wait for all to signal back that they are inactive or we see that
3497 * they are at a safe point (idle).
3498 * - flush the local tlb if active for this pmap
3499 * - return ... the caller will unlock the pmap
3500 */
3501void
3502pmap_flush_tlbs(pmap_t pmap)
3503{
3504 unsigned int cpu;
3505 unsigned int cpu_bit;
3506 cpu_set cpus_to_signal;
3507 unsigned int my_cpu = cpu_number();
3508 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
3509 boolean_t flush_self = FALSE;
3510 uint64_t deadline;
3511
3512 assert((processor_avail_count < 2) ||
3513 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
3514
3515 /*
3516 * Scan other cpus for matching active or task CR3.
3517 * For idle cpus (with no active map) we mark them invalid but
3518 * don't signal -- they'll check as they go busy.
3519 */
3520 cpus_to_signal = 0;
3521 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3522 if (!cpu_datap(cpu)->cpu_running)
3523 continue;
3524 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
3525 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
3526
3527 if ((pmap_cr3 == cpu_task_cr3) ||
3528 (pmap_cr3 == cpu_active_cr3) ||
3529 (pmap->pm_shared) ||
3530 (pmap == kernel_pmap)) {
3531 if (cpu == my_cpu) {
3532 flush_self = TRUE;
3533 continue;
3534 }
3535 cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
3536 __asm__ volatile("mfence");
3537
3538 /*
3539 * We don't need to signal processors which will flush
3540 * lazily at the idle state or kernel boundary.
3541 * For example, if we're invalidating the kernel pmap,
3542 * processors currently in userspace don't need to flush
3543 * their TLBs until the next time they enter the kernel.
3544 * Alterations to the address space of a task active
3545 * on a remote processor result in a signal, to
3546 * account for copy operations. (There may be room
3547 * for optimization in such cases).
3548 * The order of the loads below with respect
3549 * to the store to the "cpu_tlb_invalid" field above
3550 * is important--hence the barrier.
3551 */
3552 if (CPU_CR3_IS_ACTIVE(cpu) &&
3553 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
3554 pmap->pm_shared ||
3555 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
3556 cpus_to_signal |= cpu_bit;
3557 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
3558 }
3559 }
3560 }
3561
3562 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
3563 pmap, cpus_to_signal, flush_self, 0, 0);
3564
3565 /*
3566 * Flush local tlb if required.
3567 * Do this now to overlap with other processors responding.
3568 */
3569 if (flush_self)
3570 flush_tlb();
3571
3572 if (cpus_to_signal) {
3573 cpu_set cpus_to_respond = cpus_to_signal;
3574
3575 deadline = mach_absolute_time() + LockTimeOut;
3576 /*
3577 * Wait for those other cpus to acknowledge
3578 */
3579 while (cpus_to_respond != 0) {
3580 if (mach_absolute_time() > deadline) {
3581 if (mp_recent_debugger_activity())
3582 continue;
3583 if (!panic_active()) {
3584 pmap_tlb_flush_timeout = TRUE;
3585 pmap_cpuset_NMIPI(cpus_to_respond);
3586 }
3587 panic("pmap_flush_tlbs() timeout: "
3588 "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
3589 pmap, cpus_to_respond);
3590 }
3591
3592 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3593 if ((cpus_to_respond & cpu_bit) != 0) {
3594 if (!cpu_datap(cpu)->cpu_running ||
3595 cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
3596 !CPU_CR3_IS_ACTIVE(cpu)) {
3597 cpus_to_respond &= ~cpu_bit;
3598 }
3599 cpu_pause();
3600 }
3601 if (cpus_to_respond == 0)
3602 break;
3603 }
3604 }
3605 }
3606
3607 PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
3608 pmap, cpus_to_signal, flush_self, 0, 0);
3609}
3610
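/*
 * Handle a deferred TLB shootdown on this cpu: flush the local TLB and
 * clear cpu_tlb_invalid so that pmap_flush_tlbs() stops waiting on us.
 */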
3611void
3612process_pmap_updates(void)
3613{
3614 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
3615
3616 flush_tlb();
3617
3618 current_cpu_datap()->cpu_tlb_invalid = FALSE;
3619 __asm__ volatile("mfence");
3620}
3621
3622void
3623pmap_update_interrupt(void)
3624{
3625 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
3626 0, 0, 0, 0, 0);
3627
3628 process_pmap_updates();
3629
3630 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
3631 0, 0, 0, 0, 0);
3632}
3633
3634
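/*
 * Default cache attributes for a physical page: managed (regular RAM)
 * pages are write-back cacheable; anything else is treated as I/O space.
 */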
3635unsigned int
3636pmap_cache_attributes(ppnum_t pn)
3637{
3638 return IS_MANAGED_PAGE(ppn_to_pai(pn)) ? VM_WIMG_COPYBACK
3639 : VM_WIMG_IO;
3640}
3641
3642