2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
36 #include <mach_assert.h>
37 #include <machine/atomic.h>
39 #include <kern/assert.h>
40 #include <kern/kern_types.h>
41 #include <kern/mpqueue.h>
42 #include <kern/queue.h>
43 #include <kern/processor.h>
45 #include <pexpert/pexpert.h>
46 #include <mach/i386/thread_status.h>
47 #include <mach/i386/vm_param.h>
48 #include <i386/locks.h>
49 #include <i386/rtclock_protos.h>
50 #include <i386/pmCPU.h>
51 #include <i386/cpu_topology.h>
56 #include <i386/vmx/vmx_cpu.h>
60 #include <machine/monotonic.h>
61 #endif /* MONOTONIC */
63 #include <machine/pal_routines.h>
66 * Data structures referenced (anonymously) from per-cpu data:
/*
 * Forward declarations only: the two structure tags are named here so
 * they can be referred to by pointer; no members are defined at this
 * point in the file.
 */
68 struct cpu_cons_buffer
;
69 struct cpu_desc_table
;
74 * Data structures embedded in per-cpu data:
/*
 * Real-time clock timer state; it appears in struct cpu_data as the
 * rtclock_timer member (type rtclock_timer_t).
 * NOTE(review): only the has_expired flag is visible in this view; the
 * other members and the closing typedef name are not shown.
 */
76 typedef struct rtclock_timer
{
80 boolean_t has_expired
;
/*
 * Descriptor-table bookkeeping members.  Each item is stored twice, once
 * with a u suffix and once with a b suffix: TSS pointers (cdi_ktss*),
 * GDT and IDT descriptor registers (cdi_gdt*, cdi_idt*), LDT pointers
 * (cdi_ldt*) and the cdi_sstk* offsets.
 * NOTE(review): the b suffix presumably marks the base (not double-mapped)
 * address - confirm.  The enclosing type header is not visible in this
 * view; struct cpu_data refers to it as cpu_desc_index_t.
 */
84 /* The 'u' suffixed fields store the double-mapped descriptor addresses */
85 struct x86_64_tss
*cdi_ktssu
;
86 struct x86_64_tss
*cdi_ktssb
;
87 x86_64_desc_register_t cdi_gdtu
;
88 x86_64_desc_register_t cdi_gdtb
;
89 x86_64_desc_register_t cdi_idtu
;
90 x86_64_desc_register_t cdi_idtb
;
91 struct real_descriptor
*cdi_ldtu
;
92 struct real_descriptor
*cdi_ldtb
;
93 vm_offset_t cdi_sstku
;
94 vm_offset_t cdi_sstkb
;
/*
 * Enumerators distinguishing 32-bit and 64-bit user address-space modes.
 * NOTE(review): the enum header and typedef name are not visible here;
 * presumably this is the task_map_t used by cpu_task_map - confirm.
 */
98 TASK_MAP_32BIT
, /* 32-bit user, compatibility mode */
99 TASK_MAP_64BIT
, /* 64-bit user thread, shared space */
104 * This structure is used on entry into the (uber-)kernel on syscall from
105 * a 64-bit user. It contains the address of the machine state save area
106 * for the current thread and a temporary place to save the user's rsp
107 * before loading this address into rsp.
/*
 * Members of the syscall-entry scratch structure described by the comment
 * preceding them: cu_isf holds the save-area address, cu_tmp is scratch
 * space, and cu_user_gs_base is an additional 64-bit address slot.
 * NOTE(review): cu_user_gs_base presumably saves the user GS base -
 * confirm.  The enclosing struct header is not visible in this view.
 */
110 addr64_t cu_isf
; /* thread->pcb->iss.isf */
111 uint64_t cu_tmp
; /* temporary scratch */
112 addr64_t cu_user_gs_base
;
/*
 * PCID-related scalar types: pcid_t is a 16-bit identifier and
 * pcid_ref_t is an 8-bit counter type (used for the per-PCID
 * cpu_pcid_refcounts array).
 */
115 typedef uint16_t pcid_t
;
116 typedef uint8_t pcid_ref_t
;
/*
 * Array sizing constants.  CPU_RTIME_BINS and CPU_ITIME_BINS (defined to
 * be equal) size the cpu_rtimes[] and cpu_itimes[] arrays;
 * MAX_TRACE_BTFRAMES sizes the backtrace arrays in the trace records.
 */
118 #define CPU_RTIME_BINS (12)
119 #define CPU_ITIME_BINS (CPU_RTIME_BINS)
121 #define MAX_TRACE_BTFRAMES (16)
/*
 * Backtrace storage of a preemption-level trace record: room for
 * MAX_TRACE_BTFRAMES return addresses.
 * NOTE(review): the enclosing record type (plrecord_t, per its use in
 * struct cpu_data) and its other members are not visible in this view.
 */
125 uint64_t plbt
[MAX_TRACE_BTFRAMES
];
/*
 * I/O trace definitions: the first enumerator of the trace type
 * (IOTRACE_PHYS_READ, value 1) followed by members of a trace entry:
 * the entry type, its start time, and a backtrace of up to
 * MAX_TRACE_BTFRAMES return addresses.
 * NOTE(review): the remaining enumerators, the remaining entry members
 * and both type headers are not visible in this view.
 */
129 IOTRACE_PHYS_READ
= 1,
138 iotrace_type_e iotype
;
143 uint64_t start_time_abs
;
145 uint64_t backtrace
[MAX_TRACE_BTFRAMES
];
/*
 * I/O tracing configuration, compiled only on DEVELOPMENT or DEBUG
 * builds.  Declares the default and maximum per-CPU entry counts, the
 * enable flag and generator count, the configured entries per CPU, the
 * per-CPU next-slot index array (iotrace_next) and the per-CPU ring
 * array (iotrace_ring), plus the routine that sets up the buffers for
 * cpucnt CPUs with entries_per_cpu entries each.
 */
148 #if DEVELOPMENT || DEBUG
149 #define DEFAULT_IOTRACE_ENTRIES_PER_CPU (64)
150 #define IOTRACE_MAX_ENTRIES_PER_CPU (256)
151 extern volatile int mmiotrace_enabled
;
152 extern int iotrace_generators
;
153 extern int iotrace_entries_per_cpu
;
154 extern int *iotrace_next
;
155 extern iotrace_entry_t
**iotrace_ring
;
157 extern void init_iotrace_bufs(int cpucnt
, int entries_per_cpu
);
158 #endif /* DEVELOPMENT || DEBUG */
163 * Each processor has a per-cpu data area which is dereferenced through the
164 * current_cpu_datap() macro. For speed, the %gs segment is based here, and
165 * using this, inlines provide single-instruction access to frequently used
166 * members - such as get_cpu_number()/cpu_number(), and get_active_thread()/
169 * Cpu data owned by another processor can be accessed using the
170 * cpu_datap(cpu_number) macro which uses the cpu_data_ptr[] array of per-cpu
/*
 * Per-CPU PCID bookkeeping members: a free-slot hint plus two arrays with
 * PMAP_PCID_MAX_PCID (0x800) entries each, one of reference counts and
 * one of pmap_t values.
 * NOTE(review): the enclosing type header is not visible in this view;
 * struct cpu_data points at it through cpu_pcid_data (pcid_cdata_t).
 */
174 pcid_t cpu_pcid_free_hint
;
175 #define PMAP_PCID_MAX_PCID (0x800)
176 pcid_ref_t cpu_pcid_refcounts
[PMAP_PCID_MAX_PCID
];
177 pmap_t cpu_pcid_last_pmap_dispatched
[PMAP_PCID_MAX_PCID
];
/*
 * struct cpu_data: the per-processor data area.
 * cpu_this points back at the structure itself, cpu_number holds the
 * logical CPU number and cpu_phys_number the physical one.  Judging by
 * member names and the inline comments, the remaining members hold
 * thread, stack, signal, TLB/PCID, timing and debugging state for one
 * CPU.
 * NOTE(review): several members, preprocessor guards and the closing
 * typedef name are not visible in this view of the file - confirm
 * against the full header before relying on member order or offsets.
 */
180 typedef struct cpu_data
{
181 struct pal_cpu_data cpu_pal_data
; /* PAL-specific data */
182 #define cpu_pd cpu_pal_data /* convenience alias */
183 struct cpu_data
*cpu_this
; /* pointer to myself */
184 thread_t cpu_active_thread
;
185 thread_t cpu_nthread
;
186 volatile int cpu_preemption_level
;
187 int cpu_number
; /* Logical CPU */
188 void *cpu_int_state
; /* interrupt state */
189 vm_offset_t cpu_active_stack
; /* kernel stack base */
190 vm_offset_t cpu_kernel_stack
; /* kernel stack top */
191 vm_offset_t cpu_int_stack_top
;
192 int cpu_interrupt_level
;
193 volatile int cpu_signals
; /* IPI events */
194 volatile int cpu_prior_signals
; /* Last set of events,
197 ast_t cpu_pending_ast
;
198 volatile int cpu_running
;
200 boolean_t cpu_fixed_pmcs_enabled
;
201 #endif /* !MONOTONIC */
202 rtclock_timer_t rtclock_timer
;
203 uint64_t quantum_timer_deadline
;
204 volatile addr64_t cpu_active_cr3
__attribute((aligned(64)));
206 volatile uint32_t cpu_tlb_invalid
;
208 volatile uint16_t cpu_tlb_invalid_local
;
209 volatile uint16_t cpu_tlb_invalid_global
;
212 uint64_t cpu_ip_desc
[2];
213 volatile task_map_t cpu_task_map
;
214 volatile addr64_t cpu_task_cr3
;
215 addr64_t cpu_kernel_cr3
;
216 volatile addr64_t cpu_ucr3
;
217 volatile addr64_t cpu_shadowtask_cr3
;
218 boolean_t cpu_pagezero_mapped
;
220 /* Double-mapped per-CPU exception stack address */
223 int cpu_curtask_has_ldt
;
224 int cpu_curthread_do_segchk
;
225 /* Address of shadowed, partially mirrored CPU data structures located
226 * in the double mapped PML4
230 volatile uint32_t cpu_tlb_invalid_count
;
232 volatile uint16_t cpu_tlb_invalid_local_count
;
233 volatile uint16_t cpu_tlb_invalid_global_count
;
237 uint16_t cpu_tlb_gen_counts_local
[MAX_CPUS
];
238 uint16_t cpu_tlb_gen_counts_global
[MAX_CPUS
];
240 struct processor
*cpu_processor
;
241 #if NCOPY_WINDOWS > 0
242 struct cpu_pmap
*cpu_pmap
;
244 struct real_descriptor
*cpu_ldtp
;
245 struct cpu_desc_table
*cpu_desc_tablep
;
246 cpu_desc_index_t cpu_desc_index
;
248 #if NCOPY_WINDOWS > 0
249 vm_offset_t cpu_copywindow_base
;
250 uint64_t *cpu_copywindow_pdp
;
252 vm_offset_t cpu_physwindow_base
;
253 uint64_t *cpu_physwindow_ptep
;
256 #define HWINTCNT_SIZE 256
257 uint32_t cpu_hwIntCnt
[HWINTCNT_SIZE
]; /* Interrupt counts */
258 uint64_t cpu_hwIntpexits
[HWINTCNT_SIZE
];
259 uint64_t cpu_dr7
; /* debug control register */
260 uint64_t cpu_int_event_time
; /* intr entry/exit time */
261 pal_rtc_nanotime_t
*cpu_nanotime
; /* Nanotime info */
263 /* double-buffered performance counter data */
264 uint64_t *cpu_kpc_buf
[2];
265 /* PMC shadow and reload value buffers */
266 uint64_t *cpu_kpc_shadow
;
267 uint64_t *cpu_kpc_reload
;
270 struct mt_cpu cpu_monotonic
;
271 #endif /* MONOTONIC */
272 uint32_t cpu_pmap_pcid_enabled
;
273 pcid_t cpu_active_pcid
;
274 pcid_t cpu_last_pcid
;
275 pcid_t cpu_kernel_pcid
;
276 volatile pcid_ref_t
*cpu_pmap_pcid_coherentp
;
277 volatile pcid_ref_t
*cpu_pmap_pcid_coherentp_kernel
;
278 pcid_cdata_t
*cpu_pcid_data
;
280 uint64_t cpu_pmap_pcid_flushes
;
281 uint64_t cpu_pmap_pcid_preserves
;
288 uint64_t cpu_itime_total
;
289 uint64_t cpu_rtime_total
;
291 uint64_t cpu_idle_exits
;
292 uint64_t cpu_rtimes
[CPU_RTIME_BINS
];
293 uint64_t cpu_itimes
[CPU_ITIME_BINS
];
295 uint64_t cpu_cur_insns
;
296 uint64_t cpu_cur_ucc
;
297 uint64_t cpu_cur_urc
;
298 #endif /* !MONOTONIC */
299 uint64_t cpu_gpmcs
[4];
300 uint64_t cpu_max_observed_int_latency
;
301 int cpu_max_observed_int_latency_vector
;
302 volatile boolean_t cpu_NMI_acknowledged
;
303 uint64_t debugger_entry_time
;
304 uint64_t debugger_ipi_time
;
305 /* A separate nested interrupt stack flag, to account
306 * for non-nested interrupts arriving while on the interrupt stack
307 * Currently only occurs when AICPM enables interrupts on the
308 * interrupt stack during processor offlining.
310 uint32_t cpu_nested_istack
;
311 uint32_t cpu_nested_istack_events
;
312 x86_saved_state64_t
*cpu_fatal_trap_state
;
313 x86_saved_state64_t
*cpu_post_fatal_trap_state
;
315 vmx_cpu_t cpu_vmx
; /* wonderful world of virtualization */
318 struct mca_state
*cpu_mca_state
; /* State at MC fault */
324 boolean_t cpu_boot_complete
;
326 #define MAX_PREEMPTION_RECORDS (8)
327 #if DEVELOPMENT || DEBUG
329 plrecord_t plrecords
[MAX_PREEMPTION_RECORDS
];
331 void *cpu_console_buf
;
332 struct x86_lcpu lcpu
;
333 int cpu_phys_number
; /* Physical CPU */
334 cpu_id_t cpu_id
; /* Platform Expert */
336 uint64_t cpu_entry_cr3
;
337 uint64_t cpu_exit_cr3
;
338 uint64_t cpu_pcid_last_cr3
;
340 boolean_t cpu_rendezvous_in_progress
;
/*
 * Table of pointers to per-cpu data structures.  Later code in this file
 * indexes it by cpu number (cpu_data_ptr[cpu]).
 */
343 extern cpu_data_t
*cpu_data_ptr
[];
/*
 * Per-cpu accessor macros, provided in two flavours.
 *  - clang: GS_RELATIVE places a pointer in address space 256; each macro
 *    builds a cpu_data_t pointer with value 0 in that address space and
 *    accesses the requested member through it.
 *  - other compilers: inline asm with a gs segment override is used, with
 *    offsetof(cpu_data_t, member) as the displacement (or, for the indexed
 *    form, as a register operand).
 * CPU_DATA_GET reads a member, CPU_DATA_GET_INDEX reads member[index],
 * CPU_DATA_SET stores a value, and CPU_DATA_XCHG reads the old value and
 * stores a new one (an xchg instruction in the asm flavour).
 * NOTE(review): the lines declaring and returning the result variable and
 * the asm output operands are not visible in this view.
 */
345 /* Macro to generate inline bodies to retrieve per-cpu data fields. */
346 #if defined(__clang__)
347 #define GS_RELATIVE volatile __attribute__((address_space(256)))
349 #define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE,MEMBER)
352 #define CPU_DATA_GET(member, type) \
353 cpu_data_t GS_RELATIVE *cpu_data = \
354 (cpu_data_t GS_RELATIVE *)0UL; \
356 ret = cpu_data->member; \
359 #define CPU_DATA_GET_INDEX(member, index, type) \
360 cpu_data_t GS_RELATIVE *cpu_data = \
361 (cpu_data_t GS_RELATIVE *)0UL; \
363 ret = cpu_data->member[index]; \
366 #define CPU_DATA_SET(member, value) \
367 cpu_data_t GS_RELATIVE *cpu_data = \
368 (cpu_data_t GS_RELATIVE *)0UL; \
369 cpu_data->member = value;
371 #define CPU_DATA_XCHG(member, value, type) \
372 cpu_data_t GS_RELATIVE *cpu_data = \
373 (cpu_data_t GS_RELATIVE *)0UL; \
375 ret = cpu_data->member; \
376 cpu_data->member = value; \
379 #else /* !defined(__clang__) */
382 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
383 #endif /* offsetof */
384 #define CPU_DATA_GET(member, type) \
386 __asm__ volatile ("mov %%gs:%P1,%0" \
388 : "i" (offsetof(cpu_data_t,member))); \
391 #define CPU_DATA_GET_INDEX(member, index, type) \
393 __asm__ volatile ("mov %%gs:(%1),%0" \
395 : "r" (offsetof(cpu_data_t,member[index]))); \
398 #define CPU_DATA_SET(member, value) \
399 __asm__ volatile ("mov %0,%%gs:%P1" \
401 : "r" (value), "i" (offsetof(cpu_data_t,member)));
403 #define CPU_DATA_XCHG(member, value, type) \
405 __asm__ volatile ("xchg %0,%%gs:%P1" \
407 : "i" (offsetof(cpu_data_t,member)), "0" (value)); \
410 #endif /* !defined(__clang__) */
413 * Everyone within the osfmk part of the kernel can use the fast
414 * inline versions of these routines. Everyone outside must call
420 * The "volatile" flavor of current_thread() is intended for use by
421 * scheduler code which may need to update the thread pointer in the
422 * course of a context switch. Any call to current_thread() made
423 * prior to the thread pointer update should be safe to optimize away
424 * as it should be consistent with that thread's state to the extent
425 * the compiler can reason about it. Likewise, the context switch
426 * path will eventually result in an arbitrary branch to the new
427 * thread's pc, about which the compiler won't be able to reason.
428 * Thus any compile-time optimization of current_thread() calls made
429 * within the new thread should be safely encapsulated in its
430 * register/stack state. The volatile form therefore exists to cover
431 * the window between the thread pointer update and the branch to
/*
 * get_active_thread_volatile: read cpu_active_thread from the current
 * CPU's per-cpu data via CPU_DATA_GET.  This variant carries no const
 * attribute, so each call performs the read.
 * Returns: the thread_t stored in cpu_active_thread.
 */
434 static inline thread_t
435 get_active_thread_volatile(void)
437 CPU_DATA_GET(cpu_active_thread
, thread_t
)
/*
 * get_active_thread: read cpu_active_thread from the current CPU's
 * per-cpu data via CPU_DATA_GET.  The function is declared with
 * __attribute__((const)), which permits the compiler to merge repeated
 * calls.
 * Returns: the thread_t stored in cpu_active_thread.
 */
440 static inline __attribute__((const)) thread_t
441 get_active_thread(void)
443 CPU_DATA_GET(cpu_active_thread
, thread_t
)
/*
 * Thread accessor aliases: current_thread() and current_thread_fast()
 * both expand to get_active_thread(); current_thread_volatile() expands
 * to get_active_thread_volatile().  cpu_mode_is64bit() is the constant
 * TRUE.
 */
446 #define current_thread_fast() get_active_thread()
447 #define current_thread_volatile() get_active_thread_volatile()
448 #define current_thread() current_thread_fast()
450 #define cpu_mode_is64bit() TRUE
/*
 * get_preemption_level: return the current CPU's cpu_preemption_level
 * (an int) via CPU_DATA_GET.
 */
453 get_preemption_level(void)
455 CPU_DATA_GET(cpu_preemption_level
, int)
/*
 * get_interrupt_level: return the current CPU's cpu_interrupt_level
 * (an int) via CPU_DATA_GET.
 */
458 get_interrupt_level(void)
460 CPU_DATA_GET(cpu_interrupt_level
, int)
/*
 * Body of the accessor returning the current CPU's cpu_number (an int)
 * via CPU_DATA_GET.
 * NOTE(review): the function header is not visible in this view;
 * presumably this is get_cpu_number() - confirm.
 */
465 CPU_DATA_GET(cpu_number
, int)
/*
 * get_cpu_phys_number: return the current CPU's cpu_phys_number (an int)
 * via CPU_DATA_GET.
 */
468 get_cpu_phys_number(void)
470 CPU_DATA_GET(cpu_phys_number
, int)
/*
 * current_cpu_datap: return the cpu_this member of the current CPU's
 * per-cpu data, i.e. the pointer the structure keeps to itself.
 * Returns: cpu_data_t pointer for the executing CPU.
 */
473 static inline cpu_data_t
*
474 current_cpu_datap(void)
476 CPU_DATA_GET(cpu_this
, cpu_data_t
*);
480 * Facility to diagnose preemption-level imbalances, which are otherwise
481 * challenging to debug. On each operation that enables or disables preemption,
482 * we record a backtrace into a per-CPU ring buffer, along with the current
483 * preemption level and operation type. Thus, if an imbalance is observed,
484 * one can examine these per-CPU records to determine which codepath failed
485 * to re-enable preemption, enabled preemption without a corresponding
486 * disablement etc. The backtracer determines which stack is currently active,
487 * and uses that to perform bounds checks on unterminated stacks.
488 * To enable, sysctl -w machdep.pltrace=1 on DEVELOPMENT or DEBUG kernels (DRK '15)
489 * The bounds check currently doesn't account for non-default thread stack sizes.
491 #if DEVELOPMENT || DEBUG
/*
 * rbtrace_bt: record a frame-pointer backtrace into rets[].
 *   rets      - output array of return addresses
 *   maxframes - bound on the frame walk; the loop fills indices
 *               1 .. maxframes - 1
 *   cdata     - per-cpu data supplying the active thread and stack bounds
 * The current stack pointer is tested, in order, against the ranges
 * derived from cpu_active_stack, cpu_kernel_stack, cpu_int_stack_top,
 * low_eintstack and mp_slave_stack to choose the bounds kstackb..kstackt.
 * When both bounds are nonzero the walk starts at
 * __builtin_frame_address(0), storing the word after each frame pointer
 * and then following the saved frame pointer.
 * NOTE(review): the bounds test accepts cfp == kstackt while the walk
 * reads the word at cfp + 1; confirm this cannot read past the stack top.
 * NOTE(review): parts of this routine (return type, several declarations,
 * branch bodies) are not visible in this view.
 */
493 rbtrace_bt(uint64_t *rets
, int maxframes
, cpu_data_t
*cdata
)
495 extern uint32_t low_intstack
[]; /* bottom */
496 extern uint32_t low_eintstack
[]; /* top */
497 extern char mp_slave_stack
[];
499 uint64_t kstackb
, kstackt
;
501 /* Obtain the 'current' program counter, initial backtrace
502 * element. This will also indicate if we were unable to
503 * trace further up the stack for some reason
505 __asm__
volatile ("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:"
511 thread_t cplthread
= cdata
->cpu_active_thread
;
514 __asm__
__volatile__ ("movq %%rsp, %0": "=r" (csp
):);
515 /* Determine which stack we're on to populate stack bounds.
516 * We don't need to trace across stack boundaries for this
519 kstackb
= cdata
->cpu_active_stack
;
520 kstackt
= kstackb
+ KERNEL_STACK_SIZE
;
521 if (csp
< kstackb
|| csp
> kstackt
) {
522 kstackt
= cdata
->cpu_kernel_stack
;
523 kstackb
= kstackt
- KERNEL_STACK_SIZE
;
524 if (csp
< kstackb
|| csp
> kstackt
) {
525 kstackt
= cdata
->cpu_int_stack_top
;
526 kstackb
= kstackt
- INTSTACK_SIZE
;
527 if (csp
< kstackb
|| csp
> kstackt
) {
528 kstackt
= (uintptr_t)low_eintstack
;
529 kstackb
= kstackt
- INTSTACK_SIZE
;
530 if (csp
< kstackb
|| csp
> kstackt
) {
531 kstackb
= (uintptr_t) mp_slave_stack
;
532 kstackt
= kstackb
+ PAGE_SIZE
;
541 if (__probable(kstackb
&& kstackt
)) {
542 uint64_t *cfp
= (uint64_t *) __builtin_frame_address(0);
545 for (rbbtf
= 1; rbbtf
< maxframes
; rbbtf
++) {
546 if (((uint64_t)cfp
== 0) || (((uint64_t)cfp
< kstackb
) || ((uint64_t)cfp
> kstackt
))) {
550 rets
[rbbtf
] = *(cfp
+ 1);
551 cfp
= (uint64_t *) (*cfp
);
// pltrace_internal: log one preemption enable/disable event for this CPU.
//   enable - stored as the pltype of the record
// Reads cpu_preemption_level and the record index cpu_plri from the
// current per-cpu data, fills plrecords[index] (pltype and plevel),
// compares the index against MAX_PREEMPTION_RECORDS, stores it back to
// cpu_plri and then calls rbtrace_bt to capture up to
// MAX_TRACE_BTFRAMES - 1 frames into the plbt array of the record.
// NOTE(review): the index update between the visible lines is not shown
// here; presumably it increments and wraps to 0 - confirm.
558 pltrace_internal(boolean_t enable
)
560 cpu_data_t
*cdata
= current_cpu_datap();
561 int cpli
= cdata
->cpu_preemption_level
;
562 int cplrecord
= cdata
->cpu_plri
;
567 cdata
->plrecords
[cplrecord
].pltype
= enable
;
568 cdata
->plrecords
[cplrecord
].plevel
= cpli
;
570 plbts
= &cdata
->plrecords
[cplrecord
].plbt
[0];
574 if (cplrecord
>= MAX_PREEMPTION_RECORDS
) {
578 cdata
->cpu_plri
= cplrecord
;
580 rbtrace_bt(plbts
, MAX_TRACE_BTFRAMES
- 1, cdata
);
// Flag tested by pltrace(): preemption-level tracing runs only when this
// is nonzero.
583 extern int plctrace_enabled
;
// iotrace: append one I/O trace entry to the ring of the current CPU.
//   type, vaddr, paddr, size, val - copied into the entry
//   sabs     - stored as start_time_abs
//   duration - stored as duration
// The guard at the top tests mmiotrace_enabled == 0 or
// iotrace_generators == 0 and is marked improbable.
// NOTE(review): the body of that guard is not visible in this view;
// presumably it returns early - confirm.
// The slot index is read from iotrace_next[cpu_number]; after the entry
// is filled the index advances, wrapping to 0 once it reaches
// iotrace_entries_per_cpu, and rbtrace_bt records up to
// MAX_TRACE_BTFRAMES - 1 frames into the backtrace array of the entry.
586 iotrace(iotrace_type_e type
, uint64_t vaddr
, uint64_t paddr
, int size
, uint64_t val
,
587 uint64_t sabs
, uint64_t duration
)
590 int cpu_num
, nextidx
;
591 iotrace_entry_t
*cur_iotrace_ring
;
593 if (__improbable(mmiotrace_enabled
== 0 || iotrace_generators
== 0)) {
597 cdata
= current_cpu_datap();
598 cpu_num
= cdata
->cpu_number
;
599 nextidx
= iotrace_next
[cpu_num
];
600 cur_iotrace_ring
= iotrace_ring
[cpu_num
];
602 cur_iotrace_ring
[nextidx
].iotype
= type
;
603 cur_iotrace_ring
[nextidx
].vaddr
= vaddr
;
604 cur_iotrace_ring
[nextidx
].paddr
= paddr
;
605 cur_iotrace_ring
[nextidx
].size
= size
;
606 cur_iotrace_ring
[nextidx
].val
= val
;
607 cur_iotrace_ring
[nextidx
].start_time_abs
= sabs
;
608 cur_iotrace_ring
[nextidx
].duration
= duration
;
610 iotrace_next
[cpu_num
] = ((nextidx
+ 1) >= iotrace_entries_per_cpu
) ? 0 : (nextidx
+ 1);
612 rbtrace_bt(&cur_iotrace_ring
[nextidx
].backtrace
[0],
613 MAX_TRACE_BTFRAMES
- 1, cdata
);
615 #endif /* DEVELOPMENT || DEBUG */
/*
 * pltrace: preemption-level tracing hook.
 *   plenable - forwarded unchanged to pltrace_internal()
 * On DEVELOPMENT or DEBUG builds the call is made only when
 * plctrace_enabled is nonzero (a case marked improbable).
 */
618 pltrace(boolean_t plenable
)
620 #if DEVELOPMENT || DEBUG
621 if (__improbable(plctrace_enabled
!= 0)) {
622 pltrace_internal(plenable
);
/*
 * disable_preemption_internal: increment the current CPU's
 * cpu_preemption_level.
 * Asserts beforehand that the level is not negative.  The increment is
 * bracketed by os_compiler_barrier(release) before and
 * os_compiler_barrier(acquire) after.  With clang the member is updated
 * through a GS_RELATIVE pointer; otherwise an incl instruction targets
 * the gs-relative offset of cpu_preemption_level.
 */
630 disable_preemption_internal(void)
632 assert(get_preemption_level() >= 0);
634 os_compiler_barrier(release
);
635 #if defined(__clang__)
636 cpu_data_t GS_RELATIVE
*cpu_data
= (cpu_data_t GS_RELATIVE
*)0UL;
637 cpu_data
->cpu_preemption_level
++;
639 __asm__
volatile ("incl %%gs:%P0"
641 : "i" (offsetof(cpu_data_t
, cpu_preemption_level
)));
643 os_compiler_barrier(acquire
);
/*
 * enable_preemption_internal: decrement the current CPU's
 * cpu_preemption_level and run the preemption check when appropriate.
 * Asserts beforehand that the level is greater than zero.  With clang,
 * kernel_preempt_check() is called when the decremented level reaches 0.
 * The asm flavour issues decl followed by a call to
 * _kernel_preempt_check and declares eax, ecx, edx, cc and memory as
 * clobbered.
 * NOTE(review): the conditional branch between the decl and the call is
 * not visible in this view - confirm the call is skipped for a nonzero
 * level.
 * The update is bracketed by os_compiler_barrier(release) and
 * os_compiler_barrier(acquire).
 */
648 enable_preemption_internal(void)
650 assert(get_preemption_level() > 0);
652 os_compiler_barrier(release
);
653 #if defined(__clang__)
654 cpu_data_t GS_RELATIVE
*cpu_data
= (cpu_data_t GS_RELATIVE
*)0UL;
655 if (0 == --cpu_data
->cpu_preemption_level
) {
656 kernel_preempt_check();
659 __asm__
volatile ("decl %%gs:%P0 \n\t"
661 "call _kernel_preempt_check \n\t"
664 : "i" (offsetof(cpu_data_t
, cpu_preemption_level
))
665 : "eax", "ecx", "edx", "cc", "memory");
667 os_compiler_barrier(acquire
);
/*
 * enable_preemption_no_check: decrement the current CPU's
 * cpu_preemption_level without calling the preemption check.
 * Asserts beforehand that the level is greater than zero.  The decrement
 * is bracketed by os_compiler_barrier(release) and
 * os_compiler_barrier(acquire); with clang it goes through a GS_RELATIVE
 * pointer, otherwise a decl instruction on the gs-relative offset.
 */
671 enable_preemption_no_check(void)
673 assert(get_preemption_level() > 0);
676 os_compiler_barrier(release
);
677 #if defined(__clang__)
678 cpu_data_t GS_RELATIVE
*cpu_data
= (cpu_data_t GS_RELATIVE
*)0UL;
679 cpu_data
->cpu_preemption_level
--;
681 __asm__
volatile ("decl %%gs:%P0"
683 : "i" (offsetof(cpu_data_t
, cpu_preemption_level
))
686 os_compiler_barrier(acquire
);
/* Alias entry point: forwards to enable_preemption_no_check(). */
690 _enable_preemption_no_check(void)
692 enable_preemption_no_check();
/* Alias entry point: forwards to disable_preemption_internal(). */
696 mp_disable_preemption(void)
698 disable_preemption_internal();
/* Alias entry point: forwards to disable_preemption_internal(). */
702 _mp_disable_preemption(void)
704 disable_preemption_internal();
/* Alias entry point: forwards to enable_preemption_internal(). */
708 mp_enable_preemption(void)
710 enable_preemption_internal();
/* Alias entry point: forwards to enable_preemption_internal(). */
714 _mp_enable_preemption(void)
716 enable_preemption_internal();
/* Alias entry point: forwards to enable_preemption_no_check(). */
720 mp_enable_preemption_no_check(void)
722 enable_preemption_no_check();
/* Alias entry point: forwards to enable_preemption_no_check(). */
726 _mp_enable_preemption_no_check(void)
728 enable_preemption_no_check();
/*
 * When XNU_KERNEL_PRIVATE is defined, disable_preemption() and
 * enable_preemption() expand directly to the inline internal versions,
 * and MACHINE_PREEMPTION_MACROS is defined as (1).
 */
731 #ifdef XNU_KERNEL_PRIVATE
732 #define disable_preemption() disable_preemption_internal()
733 #define enable_preemption() enable_preemption_internal()
734 #define MACHINE_PREEMPTION_MACROS (1)
/*
 * Return the per-cpu data pointer stored at cpu_data_ptr[cpu].  No range
 * check on cpu is visible.
 * NOTE(review): the function name and parameter list are not visible in
 * this view; presumably this is cpu_datap(int cpu), which
 * cpu_is_running() calls - confirm.
 */
737 static inline cpu_data_t
*
740 return cpu_data_ptr
[cpu
];
/*
 * cpu_is_running: report whether the given CPU is marked running.
 *   cpu - index passed to cpu_datap()
 * Returns nonzero only when cpu_datap(cpu) is not NULL and its
 * cpu_running member is nonzero; the NULL test is evaluated first.
 */
744 cpu_is_running(int cpu
)
746 return (cpu_datap(cpu
) != NULL
) && (cpu_datap(cpu
)->cpu_running
);
749 #ifdef MACH_KERNEL_PRIVATE
/*
 * Return the cd_shadow member of the per-cpu data at cpu_data_ptr[cpu].
 * The table entry is dereferenced without a visible NULL check.
 * NOTE(review): the function name and parameter list are not visible in
 * this view - confirm the name (presumably cpu_shadowp) against the full
 * header.
 */
750 static inline cpu_data_t
*
753 return cpu_data_ptr
[cpu
]->cd_shadow
;
/*
 * Allocation interfaces defined elsewhere.  cpu_data_alloc() takes a flag
 * saying whether the caller is the boot CPU and returns a cpu_data_t
 * pointer; cpu_data_realloc() takes no arguments and returns nothing.
 */
757 extern cpu_data_t
*cpu_data_alloc(boolean_t is_boot_cpu
);
758 extern void cpu_data_realloc(void);
760 #endif /* I386_CPU_DATA */