/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <i386/proc_reg.h>
#include <i386/cpuid.h>

#include <vm/vm_map.h>
#include <i386/pmap_internal.h>
#include <i386/pmap_pcid.h>
/*
 * PCID (Process context identifier) aka tagged TLB support.
 * On processors with this feature, unless disabled via the -pmap_pcid_disable
 * boot-arg, the following algorithm is in effect:
 * Each processor maintains an array of tag refcounts indexed by tag.
 * Each address space maintains an array of tags indexed by CPU number.
 * Each address space maintains a coherency vector, indexed by CPU,
 * indicating that the TLB state for that address space has a pending
 * invalidation.
 * On a context switch, a refcounted tag is lazily assigned to the newly
 * dispatched (CPU, address space) tuple.
 * When an inactive address space is invalidated on a remote CPU, it is marked
 * for invalidation upon the next dispatch. Some invalidations are
 * also processed at the user/kernel boundary.
 * Provisions are made for the case where a CPU is overcommitted, i.e.
 * more active address spaces exist than the number of logical tags
 * provided for by the processor architecture (currently 4096).
 * The algorithm assumes the processor remaps the logical tags
 * to physical TLB context IDs in an LRU fashion for efficiency. (DRK '10)
 */
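/*
 * Tag conventions used below: the kernel pmap always uses PCID 0 (see
 * pmap_pcid_initialize_kernel() and the refcount seeded in
 * pmap_pcid_configure()); user address spaces receive tags in
 * [PMAP_PCID_MIN_PCID, PMAP_PCID_MAX_PCID), and the user-mode CR3 variant
 * uses that tag offset by PMAP_PCID_MAX_PCID (see PMAP_UPCIDP()).
 */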
uint32_t pmap_pcid_ncpus;
boolean_t pmap_pcid_disabled = FALSE;
bool invpcid_enabled = false;
static uint32_t INP_MAX = 0;
pcid_cdata_t pcid_data[MAX_CPUS] __attribute__((aligned(64)));
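/*
 * INP_MAX is the range size at or above which pmap_tlbi_range() stops issuing
 * per-page INVPCIDs and invalidates the whole address-space context instead;
 * it is tunable via the "pmap_inp_max" boot-arg on DEVELOPMENT || DEBUG
 * kernels. pcid_data[] holds each CPU's tag refcounts, free hint, and
 * last-dispatched-pmap table.
 */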
void
pmap_pcid_configure(void)
{
	int ccpu = cpu_number();
	uintptr_t cr4 = get_cr4();
	boolean_t pcid_present = FALSE;

	pmap_pcid_log("PCID configure invoked on CPU %d\n", ccpu);
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	pmap_assert(cpu_mode_is64bit());

	if (PE_parse_boot_argn("-pmap_pcid_disable", &pmap_pcid_disabled, sizeof(pmap_pcid_disabled))) {
		pmap_pcid_log("PMAP: PCID feature disabled\n");
		printf("PMAP: PCID feature disabled, %u\n", pmap_pcid_disabled);
		kprintf("PMAP: PCID feature disabled %u\n", pmap_pcid_disabled);
	}
	/* no_shared_cr3+PCID is currently unsupported */
#if DEBUG
	if (pmap_pcid_disabled == FALSE) {
		no_shared_cr3 = FALSE;
	} else {
		no_shared_cr3 = TRUE;
	}
#else
	if (no_shared_cr3) {
		pmap_pcid_disabled = TRUE;
	}
#endif
	if (pmap_pcid_disabled || no_shared_cr3) {
		unsigned i;
		/* Reset PCID status, as we may have picked up
		 * strays if discovered prior to platform
		 * expert initialization.
		 */
		for (i = 0; i < real_ncpus; i++) {
			if (cpu_datap(i)) {
				cpu_datap(i)->cpu_pmap_pcid_enabled = FALSE;
			}
			pmap_pcid_ncpus = 0;
		}
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		return;
	}
	/* DRKTODO: assert if features haven't been discovered yet. Redundant
	 * invocation of cpu_mode_init and descendants masks this for now.
	 */
	if ((cpuid_features() & CPUID_FEATURE_PCID)) {
		pcid_present = TRUE;
	} else {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		pmap_pcid_log("PMAP: PCID not detected CPU %d\n", ccpu);
		return;
	}
	if ((cr4 & (CR4_PCIDE | CR4_PGE)) == (CR4_PCIDE | CR4_PGE)) {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;
		pmap_pcid_log("PMAP: PCID already enabled %d\n", ccpu);
		return;
	}
	if (pcid_present == TRUE) {
		if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_INVPCID) {
			invpcid_enabled = true;
		}

#if DEVELOPMENT || DEBUG
		PE_parse_boot_argn("pmap_inp_max", &INP_MAX, sizeof(INP_MAX));
#endif
		pmap_pcid_log("Pre-PCID:CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, cr4);

		if (cpu_number() >= PMAP_PCID_MAX_CPUS) {
			panic("PMAP_PCID_MAX_CPUS %d\n", cpu_number());
		}
		if ((get_cr4() & CR4_PGE) == 0) {
			set_cr4(get_cr4() | CR4_PGE);
			pmap_pcid_log("Toggled PGE ON (CPU: %d\n", ccpu);
		}
		set_cr4(get_cr4() | CR4_PCIDE);
		pmap_pcid_log("Post PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, get_cr4());
		pmap_tlbi_range(0, ~0ULL, true, 0);
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;

		if (OSIncrementAtomic(&pmap_pcid_ncpus) == machine_info.max_cpus) {
			pmap_pcid_log("All PCIDs enabled: real_ncpus: %d, pmap_pcid_ncpus: %d\n", real_ncpus, pmap_pcid_ncpus);
		}
		cpu_datap(ccpu)->cpu_pmap_pcid_coherentp =
		    cpu_datap(ccpu)->cpu_pmap_pcid_coherentp_kernel =
		    &(kernel_pmap->pmap_pcid_coherency_vector[ccpu]);
		cpu_datap(ccpu)->cpu_pcid_data = &pcid_data[ccpu];
		cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[0] = 1;
	}
}
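/*
 * Mark every per-CPU tag slot of a newly created user pmap invalid; a tag is
 * assigned lazily by pmap_pcid_allocate_pcid() on first dispatch to a CPU.
 */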
void
pmap_pcid_initialize(pmap_t p)
{
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus) / sizeof(pcid_t);

	pmap_assert(nc >= real_ncpus);
	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = PMAP_PCID_INVALID_PCID;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}
void
pmap_pcid_initialize_kernel(pmap_t p)
{
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus) / sizeof(pcid_t);

	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = 0;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}
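/*
 * Return a refcounted tag for this CPU: prefer the cached free hint, then a
 * free slot found by linear scan; if every tag is in use, share the
 * least-referenced tag (its other users are flushed on their next dispatch
 * via the conflict path in pmap_pcid_activate()).
 */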
pcid_t
pmap_pcid_allocate_pcid(int ccpu)
{
	int i;
	pcid_ref_t cur_min = 0xFF;
	uint32_t cur_min_index = ~1;
	pcid_ref_t *cpu_pcid_refcounts = &cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[0];
	pcid_ref_t old_count;

	if ((i = cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint) != 0) {
		if (cpu_pcid_refcounts[i] == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint = 0;
			return i;
		}
	}
	/* Linear scan to discover free slot, with hint. Room for optimization
	 * but with intelligent prefetchers this should be
	 * adequately performant, as it is invoked
	 * only on first dispatch of a new address space onto
	 * a given processor. DRKTODO: use larger loads and
	 * zero byte discovery -- any pattern != ~1 should
	 * signify a free slot.
	 */
	for (i = PMAP_PCID_MIN_PCID; i < PMAP_PCID_MAX_PCID; i++) {
		pcid_ref_t cur_refcount = cpu_pcid_refcounts[i];

		pmap_assert(cur_refcount < PMAP_PCID_MAX_REFCOUNT);

		if (cur_refcount == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			return i;
		} else {
			if (cur_refcount < cur_min) {
				cur_min_index = i;
				cur_min = cur_refcount;
			}
		}
	}
	pmap_assert(cur_min_index > 0 && cur_min_index < PMAP_PCID_MAX_PCID);
	/* Consider "rebalancing" tags actively in highly oversubscribed cases
	 * perhaps selecting tags with lower activity.
	 */
	old_count = __sync_fetch_and_add(&cpu_pcid_refcounts[cur_min_index], 1);
	pmap_assert(old_count < PMAP_PCID_MAX_REFCOUNT);
	return cur_min_index;
}
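/*
 * Drop this pmap's reference on its tag for the given CPU; when the refcount
 * reaches zero, record the tag as a free hint for the next allocation, and
 * clear the last-dispatched entry if it still names this pmap.
 */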
void
pmap_pcid_deallocate_pcid(int ccpu, pmap_t tpmap)
{
	pcid_t pcid;
	pmap_t lp;
	pcid_ref_t prior_count;

	pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_assert(pcid != PMAP_PCID_INVALID_PCID);
	if (pcid == PMAP_PCID_INVALID_PCID) {
		return;
	}

	lp = cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_last_pmap_dispatched[pcid];
	pmap_assert(pcid > 0 && pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[pcid] >= 1);

	if (lp == tpmap) {
		(void)__sync_bool_compare_and_swap(&cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_last_pmap_dispatched[pcid], tpmap, PMAP_INVALID);
	}

	if ((prior_count = __sync_fetch_and_sub(&cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[pcid], 1)) == 1) {
		cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint = pcid;
	}
	pmap_assert(prior_count <= PMAP_PCID_MAX_REFCOUNT);
}
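/*
 * Release every per-CPU tag held by a pmap being destroyed. Callers must have
 * interrupts off or preemption disabled, per the assertion below.
 */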
void
pmap_destroy_pcid_sync(pmap_t p)
{
	int i;
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	for (i = 0; i < PMAP_PCID_MAX_CPUS; i++) {
		if (p->pmap_pcid_cpus[i] != PMAP_PCID_INVALID_PCID) {
			pmap_pcid_deallocate_pcid(i, p);
		}
	}
}
pcid_t
pcid_for_pmap_cpu_tuple(pmap_t cpmap, thread_t cthread, int ccpu)
{
	pmap_t active_pmap = cpmap;

	if (__improbable(cpmap->pagezero_accessible)) {
		if ((cthread->machine.specFlags & CopyIOActive) == 0) {
			active_pmap = kernel_pmap;
		}
	}

	return active_pmap->pmap_pcid_cpus[ccpu];
}
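/*
 * For pmaps with page zero accessible, the thread runs on the kernel's page
 * tables except while a copyio window is active, so the kernel pmap's tag is
 * the one actually resident in CR3 in that case; report it accordingly.
 */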
#if PMAP_ASSERT
#define PCID_RECORD_SIZE 128
uint64_t pcid_record_array[PCID_RECORD_SIZE];
#endif
#define PMAP_UPCIDP(p) ((p ? (p + PMAP_PCID_MAX_PCID) : 0) | 1ULL << 63)
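/*
 * PMAP_UPCIDP() derives the user-mode tag (kernel tag + PMAP_PCID_MAX_PCID)
 * and sets CR3 bit 63 so the tagged TLB entries are preserved across the CR3
 * load. pmap_pcid_activate() runs on the context-switch path: it lazily
 * assigns a tag for (tpmap, ccpu), detects pending invalidations and
 * tag-sharing conflicts, programs the kernel and user CR3 values with the new
 * tag, and flushes the tag's TLB entries only when a conflict or pending
 * invalidation requires it.
 */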
void
pmap_pcid_activate(pmap_t tpmap, int ccpu, boolean_t nopagezero, boolean_t copyio)
{
	pcid_t new_pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_t last_pmap;
	boolean_t pcid_conflict = FALSE, pending_flush = FALSE;
	pcid_cdata_t *pcdata = cpu_datap(ccpu)->cpu_pcid_data;

	pmap_assert(cpu_datap(ccpu)->cpu_pmap_pcid_enabled);
	if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) {
		new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu);
	}

	pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID);

	cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid;

	cpu_datap(ccpu)->cpu_active_pcid = new_pcid;

	pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
	if (__probable(pending_flush == FALSE)) {
		last_pmap = pcdata->cpu_pcid_last_pmap_dispatched[new_pcid];
		pcid_conflict = ((last_pmap != NULL) && (tpmap != last_pmap));
	}
	if (__improbable(pending_flush || pcid_conflict)) {
		pmap_pcid_validate_cpu(tpmap, ccpu);
	}
	/* Consider making this a unique id */
	pcdata->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap;

	pmap_assert(new_pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) ||
	    ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0)));
#if PMAP_ASSERT
	pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid | (((uint64_t)(!(pending_flush || pcid_conflict))) << 63);
	pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL);
	/* Diagnostic to detect pagetable anchor corruption */
	if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX]) {
		__asm__ volatile ("int3");
	}
#endif /* PMAP_ASSERT */

	pmap_paddr_t ncr3 = tpmap->pm_cr3;

	if (__improbable(nopagezero)) {
		pending_flush = TRUE;
		if (copyio == FALSE) {
			new_pcid = kernel_pmap->pmap_pcid_cpus[ccpu];
			ncr3 = kernel_pmap->pm_cr3;
		}
		cpu_datap(ccpu)->cpu_kernel_pcid = kernel_pmap->pmap_pcid_cpus[ccpu];
	}

	set_cr3_composed(ncr3, new_pcid, 1ULL);
	cpu_shadowp(ccpu)->cpu_shadowtask_cr3 = ncr3 | new_pcid | (1ULL << 63);

	bool preserve = !pcid_conflict && !pending_flush;
	if (preserve == true) {
		/* We did not previously observe a pending invalidation for this
		 * ASID. However, the load from the coherency vector
		 * could've been reordered ahead of the store to the
		 * active_cr3 field (in the context switch path, our
		 * caller). Re-consult the pending invalidation vector
		 * after the CR3 write. We rely on MOV CR3's documented
		 * serializing property to avoid insertion of an expensive
		 * barrier.
		 */
		pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
		if (__improbable(pending_flush != 0)) {
			pmap_pcid_validate_cpu(tpmap, ccpu);
			preserve = false;
		}
	}

	if (preserve == false) {
		bool gtlbi = (invpcid_enabled == false);
		pmap_tlbi_range(0, ~0ULL, gtlbi, new_pcid);
	}

	uint64_t spcid = PMAP_UPCIDP(new_pcid);
	uint64_t scr3 = tpmap->pm_ucr3 | spcid;

	cpu_datap(ccpu)->cpu_ucr3 = scr3;
	cpu_shadowp(ccpu)->cpu_ucr3 = scr3;

	cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]);
#if DEBUG
	cpu_datap(ccpu)->cpu_pcid_last_cr3 = scr3;
	KERNEL_DEBUG_CONSTANT(0x9c1d0000, tpmap, new_pcid, pending_flush, pcid_conflict, 0);
#endif
}
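/*
 * INVPCID descriptor and invalidation types, per the instruction definition:
 * 0 = individual address, 1 = single PCID (address space), 2 = all contexts
 * including globals, 3 = all contexts excluding globals.
 */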
typedef enum {
	INP_ALLG = 2, INP_ASPACE = 1, INP_SINGLE = 0, INP_ALLNG = 3
} invpcid_type_t;

typedef struct __attribute__((packed)) {
	uint64_t ipcid_and_rsvd;
	uint64_t iaddr;
} invpcid_desc_t;

static inline void
invpcid(invpcid_type_t itype, pcid_t ipcid, uint64_t iaddr)
{
	invpcid_desc_t ipcdt;

	ipcdt.ipcid_and_rsvd = ipcid;
	ipcdt.iaddr = iaddr;

	uint64_t iptype = itype; //promote to workaround assembler bug

	__asm__ volatile ("invpcid %0, %1" :: "m" (ipcdt), "r" (iptype) : "memory");
}
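/*
 * Invalidate the TLB for a virtual range on the local CPU. With INVPCID
 * available, invalidate per page (or the whole PCID context once the range
 * reaches INP_MAX), covering both the kernel-mode and user-mode tags;
 * otherwise fall back to toggling CR4.PGE (global flush) or reloading CR3.
 */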
void
pmap_tlbi_range(uint64_t startv, uint64_t endv, bool global, uint16_t pcid)
{
	assert(ml_get_interrupts_enabled() == FALSE ||
	    get_preemption_level() != 0);

	if (invpcid_enabled) {
		if (global) {
			invpcid(INP_ALLG, 0, 0ULL);
		} else {
			/* TODO: separate large page invalidation check */
			if ((endv - startv) >= INP_MAX) {
				invpcid(INP_ASPACE, pcid, 0ULL);
				if (pcid) {
					invpcid(INP_ASPACE, (pcid + PMAP_PCID_MAX_PCID), 0ULL);
				}
			} else {
				uint64_t cv = startv;
				for (; cv < endv; cv += PAGE_SIZE) {
					invpcid(INP_SINGLE, pcid, cv);
					if (pcid) {
						invpcid(INP_SINGLE, (pcid + PMAP_PCID_MAX_PCID), cv);
					}
				}
			}
		}
	} else {
		if (pmap_pcid_ncpus) {
			uintptr_t cr4 = get_cr4();
			if (__improbable((cr4 & CR4_PGE) == 0)) {
				set_cr4(cr4 | CR4_PGE);
			} else {
				set_cr4(cr4 & ~CR4_PGE);
				set_cr4(cr4 | CR4_PGE);
			}
		} else {
			set_cr3_raw(get_cr3_raw());
		}
	}
	__c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
}