X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/91447636331957f3d9b5ca5b508f07c526b0074d..cb3231590a3c94ab4375e2228bd5e86b0cf1ad7e:/osfmk/i386/mp.c diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 1edc74a25..b6654cc39 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -1,887 +1,840 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -#include -#include #include +#include #include -#include #include #include #include #include +#include #include #include #include #include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include - +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include +#include #include -#include +#include #include -#include -#include -#include -#include -#include -#include - -#if MP_DEBUG -#define PAUSE delay(1000000) -#define DBG(x...) kprintf(x) -#else -#define DBG(x...) -#define PAUSE -#endif /* MP_DEBUG */ - -/* - * By default, use high vectors to leave vector space for systems - * with multiple I/O APIC's. However some systems that boot with - * local APIC disabled will hang in SMM when vectors greater than - * 0x5F are used. Those systems are not expected to have I/O APIC - * so 16 (0x50 - 0x40) vectors for legacy PIC support is perfect. 
- */ -#define LAPIC_DEFAULT_INTERRUPT_BASE 0xD0 -#define LAPIC_REDUCED_INTERRUPT_BASE 0x50 -/* - * Specific lapic interrupts are relative to this base: - */ -#define LAPIC_PERFCNT_INTERRUPT 0xB -#define LAPIC_TIMER_INTERRUPT 0xC -#define LAPIC_SPURIOUS_INTERRUPT 0xD -#define LAPIC_INTERPROCESSOR_INTERRUPT 0xE -#define LAPIC_ERROR_INTERRUPT 0xF - -/* Initialize lapic_id so cpu_number() works on non SMP systems */ -unsigned long lapic_id_initdata = 0; -unsigned long lapic_id = (unsigned long)&lapic_id_initdata; -vm_offset_t lapic_start; - -static i386_intr_func_t lapic_timer_func; -static i386_intr_func_t lapic_pmi_func; - -/* TRUE if local APIC was enabled by the OS not by the BIOS */ -static boolean_t lapic_os_enabled = FALSE; - -/* Base vector for local APIC interrupt sources */ -int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; - -void slave_boot_init(void); - -static void mp_kdp_wait(void); -static void mp_rendezvous_action(void); - -boolean_t smp_initialized = FALSE; - -decl_simple_lock_data(,mp_kdp_lock); - -decl_mutex_data(static, mp_cpu_boot_lock); - -/* Variables needed for MP rendezvous. */ -static void (*mp_rv_setup_func)(void *arg); -static void (*mp_rv_action_func)(void *arg); -static void (*mp_rv_teardown_func)(void *arg); -static void *mp_rv_func_arg; -static int mp_rv_ncpus; -static long mp_rv_waiters[2]; -decl_simple_lock_data(,mp_rv_lock); - -int lapic_to_cpu[MAX_CPUS]; -int cpu_to_lapic[MAX_CPUS]; - -static void -lapic_cpu_map_init(void) -{ - int i; +#include +#if CONFIG_MCA +#include +#endif +#include - for (i = 0; i < MAX_CPUS; i++) { - lapic_to_cpu[i] = -1; - cpu_to_lapic[i] = -1; - } -} +#include -void -lapic_cpu_map(int apic_id, int cpu) -{ - cpu_to_lapic[cpu] = apic_id; - lapic_to_cpu[apic_id] = cpu; -} +#include -#ifdef MP_DEBUG -static void -lapic_cpu_map_dump(void) -{ - int i; +#if MONOTONIC +#include +#endif /* MONOTONIC */ - for (i = 0; i < MAX_CPUS; i++) { - if (cpu_to_lapic[i] == -1) - continue; - kprintf("cpu_to_lapic[%d]: %d\n", - i, cpu_to_lapic[i]); - } - for (i = 0; i < MAX_CPUS; i++) { - if (lapic_to_cpu[i] == -1) - continue; - kprintf("lapic_to_cpu[%d]: %d\n", - i, lapic_to_cpu[i]); - } -} -#define LAPIC_CPU_MAP_DUMP() lapic_cpu_map_dump() -#define LAPIC_DUMP() lapic_dump() +#if MP_DEBUG +#define PAUSE delay(1000000) +#define DBG(x...) kprintf(x) #else -#define LAPIC_CPU_MAP_DUMP() -#define LAPIC_DUMP() -#endif /* MP_DEBUG */ +#define DBG(x...) 
+#define PAUSE +#endif /* MP_DEBUG */ + +/* Debugging/test trace events: */ +#define TRACE_MP_TLB_FLUSH MACHDBG_CODE(DBG_MACH_MP, 0) +#define TRACE_MP_CPUS_CALL MACHDBG_CODE(DBG_MACH_MP, 1) +#define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2) +#define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3) +#define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4) +#define TRACE_MP_CPU_FAST_START MACHDBG_CODE(DBG_MACH_MP, 5) +#define TRACE_MP_CPU_START MACHDBG_CODE(DBG_MACH_MP, 6) +#define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7) + +#define ABS(v) (((v) > 0)?(v):-(v)) + +void slave_boot_init(void); +void i386_cpu_IPI(int cpu); + +#if MACH_KDP +static void mp_kdp_wait(boolean_t flush, boolean_t isNMI); +#endif /* MACH_KDP */ + +#if MACH_KDP +static boolean_t cpu_signal_pending(int cpu, mp_event_t event); +#endif /* MACH_KDP */ +static int NMIInterruptHandler(x86_saved_state_t *regs); + +boolean_t smp_initialized = FALSE; +uint32_t TSC_sync_margin = 0xFFF; +volatile boolean_t force_immediate_debugger_NMI = FALSE; +volatile boolean_t pmap_tlb_flush_timeout = FALSE; +#if DEBUG || DEVELOPMENT +boolean_t mp_interrupt_watchdog_enabled = TRUE; +uint32_t mp_interrupt_watchdog_events = 0; +#endif -#define LAPIC_REG(reg) \ - (*((volatile int *)(lapic_start + LAPIC_##reg))) -#define LAPIC_REG_OFFSET(reg,off) \ - (*((volatile int *)(lapic_start + LAPIC_##reg + (off)))) +decl_simple_lock_data(, debugger_callback_lock); +struct debugger_callback *debugger_callback = NULL; -#define LAPIC_VECTOR(src) \ - (lapic_interrupt_base + LAPIC_##src##_INTERRUPT) +decl_lck_mtx_data(static, mp_cpu_boot_lock); +lck_mtx_ext_t mp_cpu_boot_lock_ext; -#define LAPIC_ISR_IS_SET(base,src) \ - (LAPIC_REG_OFFSET(ISR_BASE,((base+LAPIC_##src##_INTERRUPT)/32)*0x10) & \ - (1 <<((base + LAPIC_##src##_INTERRUPT)%32))) +/* Variables needed for MP rendezvous. */ +decl_simple_lock_data(, mp_rv_lock); +static void (*mp_rv_setup_func)(void *arg); +static void (*mp_rv_action_func)(void *arg); +static void (*mp_rv_teardown_func)(void *arg); +static void *mp_rv_func_arg; +static volatile int mp_rv_ncpus; +/* Cache-aligned barriers: */ +static volatile long mp_rv_entry __attribute__((aligned(64))); +static volatile long mp_rv_exit __attribute__((aligned(64))); +static volatile long mp_rv_complete __attribute__((aligned(64))); + +volatile uint64_t debugger_entry_time; +volatile uint64_t debugger_exit_time; +#if MACH_KDP +#include +extern int kdp_snapshot; +static struct _kdp_xcpu_call_func { + kdp_x86_xcpu_func_t func; + void *arg0, *arg1; + volatile long ret; + volatile uint16_t cpu; +} kdp_xcpu_call_func = { + .cpu = KDP_XCPU_NONE +}; -#if GPROF -/* - * Initialize dummy structs for profiling. These aren't used but - * allows hertz_tick() to be built with GPROF defined. - */ -struct profile_vars _profile_vars; -struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars }; -#define GPROF_INIT() \ -{ \ - int i; \ - \ - /* Hack to initialize pointers to unused profiling structs */ \ - for (i = 1; i < MAX_CPUS; i++) \ - _profile_vars_cpus[i] = &_profile_vars; \ -} -#else -#define GPROF_INIT() -#endif /* GPROF */ +#endif -extern void master_up(void); +/* Variables needed for MP broadcast. 
*/ +static void (*mp_bc_action_func)(void *arg); +static void *mp_bc_func_arg; +static int mp_bc_ncpus; +static volatile long mp_bc_count; +decl_lck_mtx_data(static, mp_bc_lock); +lck_mtx_ext_t mp_bc_lock_ext; +static volatile int debugger_cpu = -1; +volatile long NMIPI_acks = 0; +volatile long NMI_count = 0; +static NMI_reason_t NMI_panic_reason = NONE; +static int vector_timed_out; + +extern void NMI_cpus(void); + +static void mp_cpus_call_init(void); +static void mp_cpus_call_action(void); +static void mp_call_PM(void); + +char mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init + +/* PAL-related routines */ +boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, + int ipi_vector, i386_intr_func_t ipi_handler); +void i386_start_cpu(int lapic_id, int cpu_num); +void i386_send_NMI(int cpu); +void NMIPI_enable(boolean_t); + +static lck_grp_t smp_lck_grp; +static lck_grp_attr_t smp_lck_grp_attr; + +#define NUM_CPU_WARM_CALLS 20 +struct timer_call cpu_warm_call_arr[NUM_CPU_WARM_CALLS]; +queue_head_t cpu_warm_call_list; +decl_simple_lock_data(static, cpu_warm_lock); + +typedef struct cpu_warm_data { + timer_call_t cwd_call; + uint64_t cwd_deadline; + int cwd_result; +} *cpu_warm_data_t; + +static void cpu_prewarm_init(void); +static void cpu_warm_timer_call_func(call_entry_param_t p0, call_entry_param_t p1); +static void _cpu_warm_setup(void *arg); +static timer_call_t grab_warm_timer_call(void); +static void free_warm_timer_call(timer_call_t call); void smp_init(void) { - int result; - vm_map_entry_t entry; - uint32_t lo; - uint32_t hi; - boolean_t is_boot_processor; - boolean_t is_lapic_enabled; - vm_offset_t lapic_base; - - simple_lock_init(&mp_kdp_lock, 0); simple_lock_init(&mp_rv_lock, 0); - mutex_init(&mp_cpu_boot_lock, 0); + simple_lock_init(&debugger_callback_lock, 0); + lck_grp_attr_setdefault(&smp_lck_grp_attr); + lck_grp_init(&smp_lck_grp, "i386_smp", &smp_lck_grp_attr); + lck_mtx_init_ext(&mp_cpu_boot_lock, &mp_cpu_boot_lock_ext, &smp_lck_grp, LCK_ATTR_NULL); + lck_mtx_init_ext(&mp_bc_lock, &mp_bc_lock_ext, &smp_lck_grp, LCK_ATTR_NULL); console_init(); - /* Local APIC? */ - if (!lapic_probe()) + if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler, + LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) { return; - - /* Examine the local APIC state */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - is_boot_processor = (lo & MSR_IA32_APIC_BASE_BSP) != 0; - is_lapic_enabled = (lo & MSR_IA32_APIC_BASE_ENABLE) != 0; - lapic_base = (lo & MSR_IA32_APIC_BASE_BASE); - kprintf("MSR_IA32_APIC_BASE 0x%x %s %s\n", lapic_base, - is_lapic_enabled ? "enabled" : "disabled", - is_boot_processor ? 
"BSP" : "AP"); - if (!is_boot_processor || !is_lapic_enabled) - panic("Unexpected local APIC state\n"); - - /* Establish a map to the local apic */ - lapic_start = vm_map_min(kernel_map); - result = vm_map_find_space(kernel_map, &lapic_start, - round_page(LAPIC_SIZE), 0, &entry); - if (result != KERN_SUCCESS) { - panic("smp_init: vm_map_find_entry FAILED (err=%d)", result); - } - vm_map_unlock(kernel_map); - pmap_enter(pmap_kernel(), - lapic_start, - (ppnum_t) i386_btop(lapic_base), - VM_PROT_READ|VM_PROT_WRITE, - VM_WIMG_USE_DEFAULT, - TRUE); - lapic_id = (unsigned long)(lapic_start + LAPIC_ID); - - if ((LAPIC_REG(VERSION)&LAPIC_VERSION_MASK) != 0x14) { - printf("Local APIC version not 0x14 as expected\n"); - } - - /* Set up the lapic_id <-> cpu_number map and add this boot processor */ - lapic_cpu_map_init(); - lapic_cpu_map((LAPIC_REG(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, 0); - - lapic_init(); + } cpu_thread_init(); - if (pmc_init() != KERN_SUCCESS) - printf("Performance counters not available\n"); - - GPROF_INIT(); DBGLOG_CPU_INIT(master_cpu); - slave_boot_init(); - master_up(); + mp_cpus_call_init(); + mp_cpus_call_cpu_init(master_cpu); + +#if DEBUG || DEVELOPMENT + if (PE_parse_boot_argn("interrupt_watchdog", + &mp_interrupt_watchdog_enabled, + sizeof(mp_interrupt_watchdog_enabled))) { + kprintf("Interrupt watchdog %sabled\n", + mp_interrupt_watchdog_enabled ? "en" : "dis"); + } +#endif + if (PE_parse_boot_argn("TSC_sync_margin", + &TSC_sync_margin, sizeof(TSC_sync_margin))) { + kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin); + } else if (cpuid_vmm_present()) { + kprintf("TSC sync margin disabled\n"); + TSC_sync_margin = 0; + } smp_initialized = TRUE; + cpu_prewarm_init(); + return; } +typedef struct { + int target_cpu; + int target_lapic; + int starter_cpu; +} processor_start_info_t; +static processor_start_info_t start_info __attribute__((aligned(64))); -static int -lapic_esr_read(void) -{ - /* write-read register */ - LAPIC_REG(ERROR_STATUS) = 0; - return LAPIC_REG(ERROR_STATUS); -} +/* + * Cache-alignment is to avoid cross-cpu false-sharing interference. + */ +static volatile long tsc_entry_barrier __attribute__((aligned(64))); +static volatile long tsc_exit_barrier __attribute__((aligned(64))); +static volatile uint64_t tsc_target __attribute__((aligned(64))); -static void -lapic_esr_clear(void) +/* + * Poll a CPU to see when it has marked itself as running. + */ +static void +mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay) { - LAPIC_REG(ERROR_STATUS) = 0; - LAPIC_REG(ERROR_STATUS) = 0; -} - -static const char *DM[8] = { - "Fixed", - "Lowest Priority", - "Invalid", - "Invalid", - "NMI", - "Reset", - "Invalid", - "ExtINT"}; - -void -lapic_dump(void) -{ - int i; - -#define BOOL(a) ((a)?' 
':'!') - - kprintf("LAPIC %d at 0x%x version 0x%x\n", - (LAPIC_REG(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, - lapic_start, - LAPIC_REG(VERSION)&LAPIC_VERSION_MASK); - kprintf("Priorities: Task 0x%x Arbitration 0x%x Processor 0x%x\n", - LAPIC_REG(TPR)&LAPIC_TPR_MASK, - LAPIC_REG(APR)&LAPIC_APR_MASK, - LAPIC_REG(PPR)&LAPIC_PPR_MASK); - kprintf("Destination Format 0x%x Logical Destination 0x%x\n", - LAPIC_REG(DFR)>>LAPIC_DFR_SHIFT, - LAPIC_REG(LDR)>>LAPIC_LDR_SHIFT); - kprintf("%cEnabled %cFocusChecking SV 0x%x\n", - BOOL(LAPIC_REG(SVR)&LAPIC_SVR_ENABLE), - BOOL(!(LAPIC_REG(SVR)&LAPIC_SVR_FOCUS_OFF)), - LAPIC_REG(SVR) & LAPIC_SVR_MASK); - kprintf("LVT_TIMER: Vector 0x%02x %s %cmasked %s\n", - LAPIC_REG(LVT_TIMER)&LAPIC_LVT_VECTOR_MASK, - (LAPIC_REG(LVT_TIMER)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_TIMER)&LAPIC_LVT_MASKED), - (LAPIC_REG(LVT_TIMER)&LAPIC_LVT_PERIODIC)?"Periodic":"OneShot"); - kprintf(" Initial Count: 0x%08x \n", LAPIC_REG(TIMER_INITIAL_COUNT)); - kprintf(" Current Count: 0x%08x \n", LAPIC_REG(TIMER_CURRENT_COUNT)); - kprintf(" Divide Config: 0x%08x \n", LAPIC_REG(TIMER_DIVIDE_CONFIG)); - kprintf("LVT_PERFCNT: Vector 0x%02x [%s] %s %cmasked\n", - LAPIC_REG(LVT_PERFCNT)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_PERFCNT)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_PERFCNT)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_PERFCNT)&LAPIC_LVT_MASKED)); - kprintf("LVT_LINT0: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", - LAPIC_REG(LVT_LINT0)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_LINT0)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_LINT0)&LAPIC_LVT_TM_LEVEL)?"Level":"Edge ", - (LAPIC_REG(LVT_LINT0)&LAPIC_LVT_IP_PLRITY_LOW)?"Low ":"High", - (LAPIC_REG(LVT_LINT0)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_LINT0)&LAPIC_LVT_MASKED)); - kprintf("LVT_LINT1: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", - LAPIC_REG(LVT_LINT1)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_LINT1)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_LINT1)&LAPIC_LVT_TM_LEVEL)?"Level":"Edge ", - (LAPIC_REG(LVT_LINT1)&LAPIC_LVT_IP_PLRITY_LOW)?"Low ":"High", - (LAPIC_REG(LVT_LINT1)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_LINT1)&LAPIC_LVT_MASKED)); - kprintf("LVT_ERROR: Vector 0x%02x %s %cmasked\n", - LAPIC_REG(LVT_ERROR)&LAPIC_LVT_VECTOR_MASK, - (LAPIC_REG(LVT_ERROR)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_ERROR)&LAPIC_LVT_MASKED)); - kprintf("ESR: %08x \n", lapic_esr_read()); - kprintf(" "); - for(i=0xf; i>=0; i--) - kprintf("%x%x%x%x",i,i,i,i); - kprintf("\n"); - kprintf("TMR: 0x"); - for(i=7; i>=0; i--) - kprintf("%08x",LAPIC_REG_OFFSET(TMR_BASE, i*0x10)); - kprintf("\n"); - kprintf("IRR: 0x"); - for(i=7; i>=0; i--) - kprintf("%08x",LAPIC_REG_OFFSET(IRR_BASE, i*0x10)); - kprintf("\n"); - kprintf("ISR: 0x"); - for(i=7; i >= 0; i--) - kprintf("%08x",LAPIC_REG_OFFSET(ISR_BASE, i*0x10)); - kprintf("\n"); + while (iters-- > 0) { + if (cpu_datap(slot_num)->cpu_running) { + break; + } + delay(usecdelay); + } } -boolean_t -lapic_probe(void) +/* + * Quickly bring a CPU back online which has been halted. + */ +kern_return_t +intel_startCPU_fast(int slot_num) { - uint32_t lo; - uint32_t hi; - - if (cpuid_features() & CPUID_FEATURE_APIC) - return TRUE; - - if (cpuid_family() == 6 || cpuid_family() == 15) { - /* - * Mobile Pentiums: - * There may be a local APIC which wasn't enabled by BIOS. - * So we try to enable it explicitly. 
- */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - lo &= ~MSR_IA32_APIC_BASE_BASE; - lo |= MSR_IA32_APIC_BASE_ENABLE | LAPIC_START; - lo |= MSR_IA32_APIC_BASE_ENABLE; - wrmsr(MSR_IA32_APIC_BASE, lo, hi); + kern_return_t rc; + /* + * Try to perform a fast restart + */ + rc = pmCPUExitHalt(slot_num); + if (rc != KERN_SUCCESS) { /* - * Re-initialize cpu features info and re-check. + * The CPU was not eligible for a fast restart. */ - set_cpu_model(); - if (cpuid_features() & CPUID_FEATURE_APIC) { - printf("Local APIC discovered and enabled\n"); - lapic_os_enabled = TRUE; - lapic_interrupt_base = LAPIC_REDUCED_INTERRUPT_BASE; - return TRUE; - } + return rc; } - return FALSE; -} - -void -lapic_shutdown(void) -{ - uint32_t lo; - uint32_t hi; - uint32_t value; - - /* Shutdown if local APIC was enabled by OS */ - if (lapic_os_enabled == FALSE) - return; + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_FAST_START | DBG_FUNC_START, + slot_num, 0, 0, 0, 0); + /* + * Wait until the CPU is back online. + */ mp_disable_preemption(); - /* ExtINT: masked */ - if (get_cpu_number() == master_cpu) { - value = LAPIC_REG(LVT_LINT0); - value |= LAPIC_LVT_MASKED; - LAPIC_REG(LVT_LINT0) = value; - } - - /* Timer: masked */ - LAPIC_REG(LVT_TIMER) |= LAPIC_LVT_MASKED; - - /* Perfmon: masked */ - LAPIC_REG(LVT_PERFCNT) |= LAPIC_LVT_MASKED; - - /* Error: masked */ - LAPIC_REG(LVT_ERROR) |= LAPIC_LVT_MASKED; - - /* APIC software disabled */ - LAPIC_REG(SVR) &= ~LAPIC_SVR_ENABLE; - - /* Bypass the APIC completely and update cpu features */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - lo &= ~MSR_IA32_APIC_BASE_ENABLE; - wrmsr(MSR_IA32_APIC_BASE, lo, hi); - set_cpu_model(); + /* + * We use short pauses (1us) for low latency. 30,000 iterations is + * longer than a full restart would require so it should be more + * than long enough. + */ + mp_wait_for_cpu_up(slot_num, 30000, 1); mp_enable_preemption(); -} - -void -lapic_init(void) -{ - int value; - - /* Set flat delivery model, logical processor id */ - LAPIC_REG(DFR) = LAPIC_DFR_FLAT; - LAPIC_REG(LDR) = (get_cpu_number()) << LAPIC_LDR_SHIFT; - /* Accept all */ - LAPIC_REG(TPR) = 0; + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_FAST_START | DBG_FUNC_END, + slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0); - LAPIC_REG(SVR) = LAPIC_VECTOR(SPURIOUS) | LAPIC_SVR_ENABLE; - - /* ExtINT */ - if (get_cpu_number() == master_cpu) { - value = LAPIC_REG(LVT_LINT0); - value &= ~LAPIC_LVT_MASKED; - value |= LAPIC_LVT_DM_EXTINT; - LAPIC_REG(LVT_LINT0) = value; + /* + * Check to make sure that the CPU is really running. If not, + * go through the slow path. + */ + if (cpu_datap(slot_num)->cpu_running) { + return KERN_SUCCESS; + } else { + return KERN_FAILURE; } - - /* Timer: unmasked, one-shot */ - LAPIC_REG(LVT_TIMER) = LAPIC_VECTOR(TIMER); - - /* Perfmon: unmasked */ - LAPIC_REG(LVT_PERFCNT) = LAPIC_VECTOR(PERFCNT); - - lapic_esr_clear(); - - LAPIC_REG(LVT_ERROR) = LAPIC_VECTOR(ERROR); - } -void -lapic_set_timer_func(i386_intr_func_t func) +static void +started_cpu(void) { - lapic_timer_func = func; -} + /* Here on the started cpu with cpu_running set TRUE */ -void -lapic_set_timer( - boolean_t interrupt, - lapic_timer_mode_t mode, - lapic_timer_divide_t divisor, - lapic_timer_count_t initial_count) -{ - boolean_t state; - uint32_t timer_vector; - - state = ml_set_interrupts_enabled(FALSE); - timer_vector = LAPIC_REG(LVT_TIMER); - timer_vector &= ~(LAPIC_LVT_MASKED|LAPIC_LVT_PERIODIC);; - timer_vector |= interrupt ? 0 : LAPIC_LVT_MASKED; - timer_vector |= (mode == periodic) ? 
LAPIC_LVT_PERIODIC : 0; - LAPIC_REG(LVT_TIMER) = timer_vector; - LAPIC_REG(TIMER_DIVIDE_CONFIG) = divisor; - LAPIC_REG(TIMER_INITIAL_COUNT) = initial_count; - ml_set_interrupts_enabled(state); + if (TSC_sync_margin && + start_info.target_cpu == cpu_number()) { + /* + * I've just started-up, synchronize again with the starter cpu + * and then snap my TSC. + */ + tsc_target = 0; + atomic_decl(&tsc_entry_barrier, 1); + while (tsc_entry_barrier != 0) { + ; /* spin for starter and target at barrier */ + } + tsc_target = rdtsc64(); + atomic_decl(&tsc_exit_barrier, 1); + } } -void -lapic_get_timer( - lapic_timer_mode_t *mode, - lapic_timer_divide_t *divisor, - lapic_timer_count_t *initial_count, - lapic_timer_count_t *current_count) -{ - boolean_t state; - - state = ml_set_interrupts_enabled(FALSE); - if (mode) - *mode = (LAPIC_REG(LVT_TIMER) & LAPIC_LVT_PERIODIC) ? - periodic : one_shot; - if (divisor) - *divisor = LAPIC_REG(TIMER_DIVIDE_CONFIG) & LAPIC_TIMER_DIVIDE_MASK; - if (initial_count) - *initial_count = LAPIC_REG(TIMER_INITIAL_COUNT); - if (current_count) - *current_count = LAPIC_REG(TIMER_CURRENT_COUNT); - ml_set_interrupts_enabled(state); -} - -void -lapic_set_pmi_func(i386_intr_func_t func) +static void +start_cpu(void *arg) { - lapic_pmi_func = func; -} + int i = 1000; + processor_start_info_t *psip = (processor_start_info_t *) arg; -static inline void -_lapic_end_of_interrupt(void) -{ - LAPIC_REG(EOI) = 0; -} + /* Ignore this if the current processor is not the starter */ + if (cpu_number() != psip->starter_cpu) { + return; + } -void -lapic_end_of_interrupt(void) -{ - _lapic_end_of_interrupt(); -} + DBG("start_cpu(%p) about to start cpu %d, lapic %d\n", + arg, psip->target_cpu, psip->target_lapic); -int -lapic_interrupt(int interrupt, void *state) -{ - interrupt -= lapic_interrupt_base; - if (interrupt < 0) - return 0; - - switch(interrupt) { - case LAPIC_PERFCNT_INTERRUPT: - if (lapic_pmi_func != NULL) - (*lapic_pmi_func)( - (struct i386_interrupt_state *) state); - /* Clear interrupt masked */ - LAPIC_REG(LVT_PERFCNT) = LAPIC_VECTOR(PERFCNT); - _lapic_end_of_interrupt(); - return 1; - case LAPIC_TIMER_INTERRUPT: - _lapic_end_of_interrupt(); - if (lapic_timer_func != NULL) - (*lapic_timer_func)( - (struct i386_interrupt_state *) state); - return 1; - case LAPIC_ERROR_INTERRUPT: - lapic_dump(); - panic("Local APIC error\n"); - _lapic_end_of_interrupt(); - return 1; - case LAPIC_SPURIOUS_INTERRUPT: - kprintf("SPIV\n"); - /* No EOI required here */ - return 1; - case LAPIC_INTERPROCESSOR_INTERRUPT: - cpu_signal_handler((struct i386_interrupt_state *) state); - _lapic_end_of_interrupt(); - return 1; - } - return 0; -} + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_START | DBG_FUNC_START, + psip->target_cpu, + psip->target_lapic, 0, 0, 0); -void -lapic_smm_restore(void) -{ - boolean_t state; + i386_start_cpu(psip->target_lapic, psip->target_cpu); - if (lapic_os_enabled == FALSE) - return; +#ifdef POSTCODE_DELAY + /* Wait much longer if postcodes are displayed for a delay period. */ + i *= 10000; +#endif + DBG("start_cpu(%p) about to wait for cpu %d\n", + arg, psip->target_cpu); - state = ml_set_interrupts_enabled(FALSE); + mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100); - if (LAPIC_ISR_IS_SET(LAPIC_REDUCED_INTERRUPT_BASE, TIMER)) { - /* - * Bogus SMI handler enables interrupts but does not know about - * local APIC interrupt sources. When APIC timer counts down to - * zero while in SMM, local APIC will end up waiting for an EOI - * but no interrupt was delivered to the OS. 
- */ - _lapic_end_of_interrupt(); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_START | DBG_FUNC_END, + psip->target_cpu, + cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0); + if (TSC_sync_margin && + cpu_datap(psip->target_cpu)->cpu_running) { /* - * timer is one-shot, trigger another quick countdown to trigger - * another timer interrupt. + * Compare the TSC from the started processor with ours. + * Report and log/panic if it diverges by more than + * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin + * can be overriden by boot-arg (with 0 meaning no checking). */ - if (LAPIC_REG(TIMER_CURRENT_COUNT) == 0) { - LAPIC_REG(TIMER_INITIAL_COUNT) = 1; + uint64_t tsc_starter; + int64_t tsc_delta; + atomic_decl(&tsc_entry_barrier, 1); + while (tsc_entry_barrier != 0) { + ; /* spin for both processors at barrier */ + } + tsc_starter = rdtsc64(); + atomic_decl(&tsc_exit_barrier, 1); + while (tsc_exit_barrier != 0) { + ; /* spin for target to store its TSC */ + } + tsc_delta = tsc_target - tsc_starter; + kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n", + psip->target_cpu, tsc_target, tsc_delta, tsc_delta); + if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) { +#if DEBUG + panic( +#else + printf( +#endif + "Unsynchronized TSC for cpu %d: " + "0x%016llx, delta 0x%llx\n", + psip->target_cpu, tsc_target, tsc_delta); } - - kprintf("lapic_smm_restore\n"); } - - ml_set_interrupts_enabled(state); } kern_return_t intel_startCPU( - int slot_num) + int slot_num) { - - int i = 1000; - int lapic = cpu_to_lapic[slot_num]; + int lapic = cpu_to_lapic[slot_num]; + boolean_t istate; assert(lapic != -1); DBGLOG_CPU_INIT(slot_num); DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic); - DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) IdlePTD); + DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD); - /* Initialize (or re-initialize) the descriptor tables for this cpu. */ - mp_desc_init(cpu_datap(slot_num), FALSE); + /* + * Initialize (or re-initialize) the descriptor tables for this cpu. + * Propagate processor mode to slave. + */ + cpu_desc_init(cpu_datap(slot_num)); - /* Serialize use of the slave boot stack. */ - mutex_lock(&mp_cpu_boot_lock); + /* Serialize use of the slave boot stack, etc. */ + lck_mtx_lock(&mp_cpu_boot_lock); - mp_disable_preemption(); + istate = ml_set_interrupts_enabled(FALSE); if (slot_num == get_cpu_number()) { - mp_enable_preemption(); - mutex_unlock(&mp_cpu_boot_lock); + ml_set_interrupts_enabled(istate); + lck_mtx_unlock(&mp_cpu_boot_lock); return KERN_SUCCESS; } - LAPIC_REG(ICRD) = lapic << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = LAPIC_ICR_DM_INIT; - delay(10000); + start_info.starter_cpu = cpu_number(); + start_info.target_cpu = slot_num; + start_info.target_lapic = lapic; + tsc_entry_barrier = 2; + tsc_exit_barrier = 2; - LAPIC_REG(ICRD) = lapic << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12); - delay(200); - - LAPIC_REG(ICRD) = lapic << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12); - delay(200); + /* + * Perform the processor startup sequence with all running + * processors rendezvous'ed. This is required during periods when + * the cache-disable bit is set for MTRR/PAT initialization. + */ + mp_rendezvous_no_intrs(start_cpu, (void *) &start_info); -#ifdef POSTCODE_DELAY - /* Wait much longer if postcodes are displayed for a delay period. 
*/ - i *= 10000; -#endif - while(i-- > 0) { - if (cpu_datap(slot_num)->cpu_running) - break; - delay(10000); - } + start_info.target_cpu = 0; - mp_enable_preemption(); - mutex_unlock(&mp_cpu_boot_lock); + ml_set_interrupts_enabled(istate); + lck_mtx_unlock(&mp_cpu_boot_lock); if (!cpu_datap(slot_num)->cpu_running) { - DBG("Failed to start CPU %02d\n", slot_num); + kprintf("Failed to start CPU %02d\n", slot_num); printf("Failed to start CPU %02d, rebooting...\n", slot_num); delay(1000000); - cpu_shutdown(); + halt_cpu(); return KERN_SUCCESS; } else { - DBG("Started CPU %02d\n", slot_num); - printf("Started CPU %02d\n", slot_num); + kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic); return KERN_SUCCESS; } } -extern char slave_boot_base[]; -extern char slave_boot_end[]; -extern void pstart(void); - -void -slave_boot_init(void) -{ - DBG("V(slave_boot_base)=%p P(slave_boot_base)=%p MP_BOOT=%p sz=0x%x\n", - slave_boot_base, - kvtophys((vm_offset_t) slave_boot_base), - MP_BOOT, - slave_boot_end-slave_boot_base); - - /* - * Copy the boot entry code to the real-mode vector area MP_BOOT. - * This is in page 1 which has been reserved for this purpose by - * machine_startup() from the boot processor. - * The slave boot code is responsible for switching to protected - * mode and then jumping to the common startup, _start(). - */ - bcopy_phys((addr64_t) kvtophys((vm_offset_t) slave_boot_base), - (addr64_t) MP_BOOT, - slave_boot_end-slave_boot_base); - - /* - * Zero a stack area above the boot code. - */ - DBG("bzero_phys 0x%x sz 0x%x\n",MP_BOOTSTACK+MP_BOOT-0x400, 0x400); - bzero_phys((addr64_t)MP_BOOTSTACK+MP_BOOT-0x400, 0x400); - - /* - * Set the location at the base of the stack to point to the - * common startup entry. - */ - DBG("writing 0x%x at phys 0x%x\n", - kvtophys((vm_offset_t) &pstart), MP_MACH_START+MP_BOOT); - ml_phys_write_word(MP_MACH_START+MP_BOOT, - kvtophys((vm_offset_t) &pstart)); - - /* Flush caches */ - __asm__("wbinvd"); -} - -#if MP_DEBUG -cpu_signal_event_log_t *cpu_signal[MAX_CPUS]; -cpu_signal_event_log_t *cpu_handle[MAX_CPUS]; +#if MP_DEBUG +cpu_signal_event_log_t *cpu_signal[MAX_CPUS]; +cpu_signal_event_log_t *cpu_handle[MAX_CPUS]; MP_EVENT_NAME_DECL(); -#endif /* MP_DEBUG */ +#endif /* MP_DEBUG */ -void -cpu_signal_handler(__unused struct i386_interrupt_state *regs) +/* + * Note: called with NULL state when polling for TLB flush and cross-calls. + */ +int +cpu_signal_handler(x86_saved_state_t *regs) { - int my_cpu; - volatile int *my_word; -#if MACH_KDB && MACH_ASSERT - int i=100; -#endif /* MACH_KDB && MACH_ASSERT */ +#if !MACH_KDP +#pragma unused (regs) +#endif /* !MACH_KDP */ + int my_cpu; + volatile int *my_word; - mp_disable_preemption(); + SCHED_STATS_IPI(current_processor()); my_cpu = cpu_number(); - my_word = ¤t_cpu_datap()->cpu_signals; + my_word = &cpu_data_ptr[my_cpu]->cpu_signals; + /* Store the initial set of signals for diagnostics. New + * signals could arrive while these are being processed + * so it's no more than a hint. 
+ */ + + cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word; do { -#if MACH_KDB && MACH_ASSERT - if (i-- <= 0) - Debugger("cpu_signal_handler"); -#endif /* MACH_KDB && MACH_ASSERT */ -#if MACH_KDP +#if MACH_KDP if (i_bit(MP_KDP, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_KDP); + DBGLOG(cpu_handle, my_cpu, MP_KDP); i_bit_clear(MP_KDP, my_word); - mp_kdp_wait(); +/* Ensure that the i386_kernel_state at the base of the + * current thread's stack (if any) is synchronized with the + * context at the moment of the interrupt, to facilitate + * access through the debugger. + */ + sync_iss_to_iks(regs); + if (pmsafe_debug && !kdp_snapshot) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + } + mp_kdp_wait(TRUE, FALSE); + if (pmsafe_debug && !kdp_snapshot) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL); + } } else -#endif /* MACH_KDP */ +#endif /* MACH_KDP */ if (i_bit(MP_TLB_FLUSH, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_TLB_FLUSH); + DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH); i_bit_clear(MP_TLB_FLUSH, my_word); pmap_update_interrupt(); + } else if (i_bit(MP_CALL, my_word)) { + DBGLOG(cpu_handle, my_cpu, MP_CALL); + i_bit_clear(MP_CALL, my_word); + mp_cpus_call_action(); + } else if (i_bit(MP_CALL_PM, my_word)) { + DBGLOG(cpu_handle, my_cpu, MP_CALL_PM); + i_bit_clear(MP_CALL_PM, my_word); + mp_call_PM(); + } + if (regs == NULL) { + /* Called to poll only for cross-calls and TLB flush */ + break; } else if (i_bit(MP_AST, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_AST); + DBGLOG(cpu_handle, my_cpu, MP_AST); i_bit_clear(MP_AST, my_word); ast_check(cpu_to_processor(my_cpu)); -#if MACH_KDB - } else if (i_bit(MP_KDB, my_word)) { - extern kdb_is_slave[]; - - i_bit_clear(MP_KDB, my_word); - kdb_is_slave[my_cpu]++; - kdb_kintr(); -#endif /* MACH_KDB */ - } else if (i_bit(MP_RENDEZVOUS, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_RENDEZVOUS); - i_bit_clear(MP_RENDEZVOUS, my_word); - mp_rendezvous_action(); } } while (*my_word); - mp_enable_preemption(); - + return 0; } -#ifdef MP_DEBUG -extern int max_lock_loops; -#endif /* MP_DEBUG */ -void -cpu_interrupt(int cpu) +extern void kprintf_break_lock(void); +int +NMIInterruptHandler(x86_saved_state_t *regs) { - boolean_t state; - - if (smp_initialized) { + void *stackptr; + char pstr[256]; + uint64_t now = mach_absolute_time(); - /* Wait for previous interrupt to be delivered... 
*/ -#ifdef MP_DEBUG - int pending_busy_count = 0; - while (LAPIC_REG(ICR) & LAPIC_ICR_DS_PENDING) { - if (++pending_busy_count > max_lock_loops) - panic("cpus_interrupt() deadlock\n"); -#else - while (LAPIC_REG(ICR) & LAPIC_ICR_DS_PENDING) { -#endif /* MP_DEBUG */ + if (panic_active() && !panicDebugging) { + if (pmsafe_debug) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + } + for (;;) { cpu_pause(); } + } + + atomic_incl(&NMIPI_acks, 1); + atomic_incl(&NMI_count, 1); + sync_iss_to_iks_unconditionally(regs); + __asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr)); + + if (cpu_number() == debugger_cpu) { + goto NMExit; + } - state = ml_set_interrupts_enabled(FALSE); - LAPIC_REG(ICRD) = - cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = - LAPIC_VECTOR(INTERPROCESSOR) | LAPIC_ICR_DM_FIXED; - (void) ml_set_interrupts_enabled(state); + if (NMI_panic_reason == SPINLOCK_TIMEOUT) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n", + cpu_number(), now, spinlock_timed_out, (void *) spinlock_timed_out->interlock.lock_data, current_thread(), spinlock_owner_cpu); + panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); + } else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n", + cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid); + panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs); + } else if (NMI_panic_reason == CROSSCALL_TIMEOUT) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n", + cpu_number(), now); + panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); + } else if (NMI_panic_reason == INTERRUPT_WATCHDOG) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n", + cpu_number(), now, vector_timed_out); + panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); } +#if MACH_KDP + if (pmsafe_debug && !kdp_snapshot) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + } + current_cpu_datap()->cpu_NMI_acknowledged = TRUE; + i_bit_clear(MP_KDP, ¤t_cpu_datap()->cpu_signals); + if (panic_active() || NMI_panic_reason != NONE) { + mp_kdp_wait(FALSE, TRUE); + } else if (!mp_kdp_trap && + !mp_kdp_is_NMI && + virtualized && (debug_boot_arg & DB_NMI)) { + /* + * Under a VMM with the debug boot-arg set, drop into kdp. + * Since an NMI is involved, there's a risk of contending with + * a panic. And side-effects of NMIs may result in entry into, + * and continuing from, the debugger being unreliable. + */ + if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) { + kprintf_break_lock(); + kprintf("Debugger entry requested by NMI\n"); + kdp_i386_trap(T_DEBUG, saved_state64(regs), 0, 0); + printf("Debugger entry requested by NMI\n"); + mp_kdp_is_NMI = FALSE; + } else { + mp_kdp_wait(FALSE, FALSE); + } + } else { + mp_kdp_wait(FALSE, FALSE); + } + if (pmsafe_debug && !kdp_snapshot) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL); + } +#endif +NMExit: + return 1; } + +/* + * cpu_interrupt is really just to be used by the scheduler to + * get a CPU's attention it may not always issue an IPI. If an + * IPI is always needed then use i386_cpu_IPI. 
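+ *
+ * Illustrative contrast (hypothetical caller):
+ *	cpu_interrupt(cpu);	// IPI may be skipped when pmCPUExitIdle()
+ *				// reports the target needs no interrupt
+ *	i386_cpu_IPI(cpu);	// IPI is always sent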
+ */ void -i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) +cpu_interrupt(int cpu) { - volatile int *signals = &cpu_datap(cpu)->cpu_signals; - uint64_t tsc_timeout; - + boolean_t did_IPI = FALSE; - if (!cpu_datap(cpu)->cpu_running) - return; + if (smp_initialized + && pmCPUExitIdle(cpu_datap(cpu))) { + i386_cpu_IPI(cpu); + did_IPI = TRUE; + } - DBGLOG(cpu_signal, cpu, event); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0); +} - i_bit_set(event, signals); - cpu_interrupt(cpu); - if (mode == SYNC) { - again: - tsc_timeout = rdtsc64() + (1000*1000*1000); - while (i_bit(event, signals) && rdtsc64() < tsc_timeout) { - cpu_pause(); - } - if (i_bit(event, signals)) { - DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n", - cpu, event); - goto again; - } +/* + * Send a true NMI via the local APIC to the specified CPU. + */ +void +cpu_NMI_interrupt(int cpu) +{ + if (smp_initialized) { + i386_send_NMI(cpu); } } void -i386_signal_cpus(mp_event_t event, mp_sync_t mode) +NMI_cpus(void) { - unsigned int cpu; - unsigned int my_cpu = cpu_number(); + unsigned int cpu; + boolean_t intrs_enabled; + uint64_t tsc_timeout; + intrs_enabled = ml_set_interrupts_enabled(FALSE); + NMIPI_enable(TRUE); for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) + if (!cpu_is_running(cpu)) { continue; - i386_signal_cpu(cpu, event, mode); + } + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(cpu); + tsc_timeout = !machine_timeout_suspended() ? + rdtsc64() + (1000 * 1000 * 1000 * 10ULL) : + ~0ULL; + while (!cpu_datap(cpu)->cpu_NMI_acknowledged) { + handle_pending_TLB_flushes(); + cpu_pause(); + if (rdtsc64() > tsc_timeout) { + panic("NMI_cpus() timeout cpu %d", cpu); + } + } + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; } + NMIPI_enable(FALSE); + + ml_set_interrupts_enabled(intrs_enabled); } -int -i386_active_cpus(void) +static void(*volatile mp_PM_func)(void) = NULL; + +static void +mp_call_PM(void) { - unsigned int cpu; - unsigned int ncpus = 0; + assert(!ml_get_interrupts_enabled()); - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_datap(cpu)->cpu_running) - ncpus++; + if (mp_PM_func != NULL) { + mp_PM_func(); + } +} + +void +cpu_PM_interrupt(int cpu) +{ + assert(!ml_get_interrupts_enabled()); + + if (mp_PM_func != NULL) { + if (cpu == cpu_number()) { + mp_PM_func(); + } else { + i386_signal_cpu(cpu, MP_CALL_PM, ASYNC); + } + } +} + +void +PM_interrupt_register(void (*fn)(void)) +{ + mp_PM_func = fn; +} + +void +i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) +{ + volatile int *signals = &cpu_datap(cpu)->cpu_signals; + uint64_t tsc_timeout; + + + if (!cpu_datap(cpu)->cpu_running) { + return; + } + + if (event == MP_TLB_FLUSH) { + KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0); + } + + DBGLOG(cpu_signal, cpu, event); + + i_bit_set(event, signals); + i386_cpu_IPI(cpu); + if (mode == SYNC) { +again: + tsc_timeout = !machine_timeout_suspended() ? + rdtsc64() + (1000 * 1000 * 1000) : + ~0ULL; + while (i_bit(event, signals) && rdtsc64() < tsc_timeout) { + cpu_pause(); + } + if (i_bit(event, signals)) { + DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n", + cpu, event); + goto again; + } + } + if (event == MP_TLB_FLUSH) { + KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0); + } +} + +/* + * Helper function called when busy-waiting: panic if too long + * a TSC-based time has elapsed since the start of the spin. 
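+ *
+ * Typical caller pattern, as a sketch ('done' is a placeholder condition):
+ *
+ *	uint64_t tsc_spin_start = rdtsc64();
+ *	while (!done) {
+ *		if (mp_spin_timeout(tsc_spin_start))
+ *			panic("spin timed out");
+ *	}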
+ */ +static boolean_t +mp_spin_timeout(uint64_t tsc_start) +{ + uint64_t tsc_timeout; + + cpu_pause(); + if (machine_timeout_suspended()) { + return FALSE; + } + + /* + * The timeout is 4 * the spinlock timeout period + * unless we have serial console printing (kprintf) enabled + * in which case we allow an even greater margin. + */ + tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2 + : LockTimeOutTSC << 4; + return rdtsc64() > tsc_start + tsc_timeout; +} + +/* + * Helper function to take a spinlock while ensuring that incoming IPIs + * are still serviced if interrupts are masked while we spin. + * Returns current interrupt state. + */ +boolean_t +mp_safe_spin_lock(usimple_lock_t lock) +{ + if (ml_get_interrupts_enabled()) { + simple_lock(lock, LCK_GRP_NULL); + return TRUE; + } else { + uint64_t tsc_spin_start = rdtsc64(); + while (!simple_lock_try(lock, LCK_GRP_NULL)) { + cpu_signal_handler(NULL); + if (mp_spin_timeout(tsc_spin_start)) { + uint32_t lock_cpu; + uintptr_t lowner = (uintptr_t) + lock->interlock.lock_data; + spinlock_timed_out = lock; + lock_cpu = spinlock_timeout_NMI(lowner); + NMIPI_panic(cpu_to_cpumask(lock_cpu), SPINLOCK_TIMEOUT); + panic("mp_safe_spin_lock() timed out, lock: %p, owner thread: 0x%lx, current_thread: %p, owner on CPU 0x%x, time: %llu", + lock, lowner, current_thread(), lock_cpu, mach_absolute_time()); + } + } + return FALSE; } - return(ncpus); } /* * All-CPU rendezvous: - * - CPUs are signalled, + * - CPUs are signalled, * - all execute the setup function (if specified), * - rendezvous (i.e. all cpus reach a barrier), * - all execute the action function (if specified), @@ -894,47 +847,90 @@ i386_active_cpus(void) */ static void -mp_rendezvous_action(void) +mp_rendezvous_action(__unused void *null) { + boolean_t intrs_enabled; + uint64_t tsc_spin_start; + + /* + * Note that mp_rv_lock was acquired by the thread that initiated the + * rendezvous and must have been acquired before we enter + * mp_rendezvous_action(). 
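+ *
+ * Each cpu then works through the three cache-aligned counters declared
+ * above: bump mp_rv_entry and spin until all mp_rv_ncpus participants
+ * have arrived, run the action function, bump mp_rv_exit and spin again,
+ * run the teardown function, and finally bump mp_rv_complete so that the
+ * initiator in mp_rendezvous() knows the shared state may be torn down.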
+ */ + current_cpu_datap()->cpu_rendezvous_in_progress = TRUE; /* setup function */ - if (mp_rv_setup_func != NULL) + if (mp_rv_setup_func != NULL) { mp_rv_setup_func(mp_rv_func_arg); + } + + intrs_enabled = ml_get_interrupts_enabled(); + /* spin on entry rendezvous */ - atomic_incl(&mp_rv_waiters[0], 1); - while (*((volatile long *) &mp_rv_waiters[0]) < mp_rv_ncpus) - cpu_pause(); + atomic_incl(&mp_rv_entry, 1); + tsc_spin_start = rdtsc64(); + + while (mp_rv_entry < mp_rv_ncpus) { + /* poll for pesky tlb flushes if interrupts disabled */ + if (!intrs_enabled) { + handle_pending_TLB_flushes(); + } + if (mp_spin_timeout(tsc_spin_start)) { + panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64()); + } + } + /* action function */ - if (mp_rv_action_func != NULL) + if (mp_rv_action_func != NULL) { mp_rv_action_func(mp_rv_func_arg); + } + /* spin on exit rendezvous */ - atomic_incl(&mp_rv_waiters[1], 1); - while (*((volatile long *) &mp_rv_waiters[1]) < mp_rv_ncpus) - cpu_pause(); + atomic_incl(&mp_rv_exit, 1); + tsc_spin_start = rdtsc64(); + while (mp_rv_exit < mp_rv_ncpus) { + if (!intrs_enabled) { + handle_pending_TLB_flushes(); + } + if (mp_spin_timeout(tsc_spin_start)) { + panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64()); + } + } + /* teardown function */ - if (mp_rv_teardown_func != NULL) + if (mp_rv_teardown_func != NULL) { mp_rv_teardown_func(mp_rv_func_arg); + } + + current_cpu_datap()->cpu_rendezvous_in_progress = FALSE; + + /* Bump completion count */ + atomic_incl(&mp_rv_complete, 1); } void -mp_rendezvous(void (*setup_func)(void *), - void (*action_func)(void *), - void (*teardown_func)(void *), - void *arg) +mp_rendezvous(void (*setup_func)(void *), + void (*action_func)(void *), + void (*teardown_func)(void *), + void *arg) { + uint64_t tsc_spin_start; if (!smp_initialized) { - if (setup_func != NULL) + if (setup_func != NULL) { setup_func(arg); - if (action_func != NULL) + } + if (action_func != NULL) { action_func(arg); - if (teardown_func != NULL) + } + if (teardown_func != NULL) { teardown_func(arg); + } return; } - + /* obtain rendezvous lock */ - simple_lock(&mp_rv_lock); + mp_rendezvous_lock(); /* set static function pointers */ mp_rv_setup_func = setup_func; @@ -942,36 +938,668 @@ mp_rendezvous(void (*setup_func)(void *), mp_rv_teardown_func = teardown_func; mp_rv_func_arg = arg; - mp_rv_waiters[0] = 0; /* entry rendezvous count */ - mp_rv_waiters[1] = 0; /* exit rendezvous count */ - mp_rv_ncpus = i386_active_cpus(); + mp_rv_entry = 0; + mp_rv_exit = 0; + mp_rv_complete = 0; /* * signal other processors, which will call mp_rendezvous_action() * with interrupts disabled */ - i386_signal_cpus(MP_RENDEZVOUS, ASYNC); + mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1; /* call executor function on this cpu */ - mp_rendezvous_action(); + mp_rendezvous_action(NULL); + + /* + * Spin for everyone to complete. + * This is necessary to ensure that all processors have proceeded + * from the exit barrier before we release the rendezvous structure. 
+ */ + tsc_spin_start = rdtsc64(); + while (mp_rv_complete < mp_rv_ncpus) { + if (mp_spin_timeout(tsc_spin_start)) { + panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64()); + } + } + + /* Tidy up */ + mp_rv_setup_func = NULL; + mp_rv_action_func = NULL; + mp_rv_teardown_func = NULL; + mp_rv_func_arg = NULL; /* release lock */ + mp_rendezvous_unlock(); +} + +void +mp_rendezvous_lock(void) +{ + (void) mp_safe_spin_lock(&mp_rv_lock); +} + +void +mp_rendezvous_unlock(void) +{ simple_unlock(&mp_rv_lock); } -#if MACH_KDP -volatile boolean_t mp_kdp_trap = FALSE; -long mp_kdp_ncpus; -boolean_t mp_kdp_state; +void +mp_rendezvous_break_lock(void) +{ + simple_lock_init(&mp_rv_lock, 0); +} + +static void +setup_disable_intrs(__unused void * param_not_used) +{ + /* disable interrupts before the first barrier */ + boolean_t intr = ml_set_interrupts_enabled(FALSE); + + current_cpu_datap()->cpu_iflag = intr; + DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__); +} +static void +teardown_restore_intrs(__unused void * param_not_used) +{ + /* restore interrupt flag following MTRR changes */ + ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag); + DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__); +} +/* + * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled. + * This is exported for use by kexts. + */ void -mp_kdp_enter(void) +mp_rendezvous_no_intrs( + void (*action_func)(void *), + void *arg) +{ + mp_rendezvous(setup_disable_intrs, + action_func, + teardown_restore_intrs, + arg); +} + + +typedef struct { + queue_chain_t link; /* queue linkage */ + void (*func)(void *, void *); /* routine to call */ + void *arg0; /* routine's 1st arg */ + void *arg1; /* routine's 2nd arg */ + cpumask_t *maskp; /* completion response mask */ +} mp_call_t; + + +typedef struct { + queue_head_t queue; + decl_simple_lock_data(, lock); +} mp_call_queue_t; +#define MP_CPUS_CALL_BUFS_PER_CPU MAX_CPUS +static mp_call_queue_t mp_cpus_call_freelist; +static mp_call_queue_t mp_cpus_call_head[MAX_CPUS]; + +static inline boolean_t +mp_call_head_lock(mp_call_queue_t *cqp) { - unsigned int cpu; - unsigned int ncpus; - unsigned int my_cpu = cpu_number(); - uint64_t tsc_timeout; + boolean_t intrs_enabled; + + intrs_enabled = ml_set_interrupts_enabled(FALSE); + simple_lock(&cqp->lock, LCK_GRP_NULL); + + return intrs_enabled; +} + +/* + * Deliver an NMIPI to a set of processors to cause them to panic . 
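+ * The reason is latched in NMI_panic_reason (SPINLOCK_TIMEOUT,
+ * TLB_FLUSH_TIMEOUT, CROSSCALL_TIMEOUT or INTERRUPT_WATCHDOG) so that
+ * NMIInterruptHandler() on each target cpu can format the matching
+ * panic string and backtrace.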
+ */ +void +NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why) +{ + unsigned int cpu; + cpumask_t cpu_bit; + uint64_t deadline; + + NMIPI_enable(TRUE); + NMI_panic_reason = why; + + for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { + if ((cpu_mask & cpu_bit) == 0) { + continue; + } + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(cpu); + } + + /* Wait (only so long) for NMi'ed cpus to respond */ + deadline = mach_absolute_time() + LockTimeOut; + for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { + if ((cpu_mask & cpu_bit) == 0) { + continue; + } + while (!cpu_datap(cpu)->cpu_NMI_acknowledged && + mach_absolute_time() < deadline) { + cpu_pause(); + } + } +} + +#if MACH_ASSERT +static inline boolean_t +mp_call_head_is_locked(mp_call_queue_t *cqp) +{ + return !ml_get_interrupts_enabled() && + hw_lock_held((hw_lock_t)&cqp->lock); +} +#endif + +static inline void +mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled) +{ + simple_unlock(&cqp->lock); + ml_set_interrupts_enabled(intrs_enabled); +} + +static inline mp_call_t * +mp_call_alloc(void) +{ + mp_call_t *callp = NULL; + boolean_t intrs_enabled; + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + intrs_enabled = mp_call_head_lock(cqp); + if (!queue_empty(&cqp->queue)) { + queue_remove_first(&cqp->queue, callp, typeof(callp), link); + } + mp_call_head_unlock(cqp, intrs_enabled); + + return callp; +} + +static inline void +mp_call_free(mp_call_t *callp) +{ + boolean_t intrs_enabled; + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + intrs_enabled = mp_call_head_lock(cqp); + queue_enter_first(&cqp->queue, callp, typeof(callp), link); + mp_call_head_unlock(cqp, intrs_enabled); +} + +static inline mp_call_t * +mp_call_dequeue_locked(mp_call_queue_t *cqp) +{ + mp_call_t *callp = NULL; + + assert(mp_call_head_is_locked(cqp)); + if (!queue_empty(&cqp->queue)) { + queue_remove_first(&cqp->queue, callp, typeof(callp), link); + } + return callp; +} + +static inline void +mp_call_enqueue_locked( + mp_call_queue_t *cqp, + mp_call_t *callp) +{ + queue_enter(&cqp->queue, callp, typeof(callp), link); +} + +/* Called on the boot processor to initialize global structures */ +static void +mp_cpus_call_init(void) +{ + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + DBG("mp_cpus_call_init()\n"); + simple_lock_init(&cqp->lock, 0); + queue_init(&cqp->queue); +} + +/* + * Called at processor registration to add call buffers to the free list + * and to initialize the per-cpu call queue. + */ +void +mp_cpus_call_cpu_init(int cpu) +{ + int i; + mp_call_queue_t *cqp = &mp_cpus_call_head[cpu]; + mp_call_t *callp; + + simple_lock_init(&cqp->lock, 0); + queue_init(&cqp->queue); + for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) { + callp = (mp_call_t *) kalloc(sizeof(mp_call_t)); + mp_call_free(callp); + } + + DBG("mp_cpus_call_init(%d) done\n", cpu); +} + +/* + * This is called from cpu_signal_handler() to process an MP_CALL signal. + * And also from i386_deactivate_cpu() when a cpu is being taken offline. 
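+ * It drains this cpu's call queue, invoking each queued function with the
+ * queue unlocked, and acknowledges completion by setting this cpu's bit
+ * in the caller-supplied response mask (call.maskp) when one was given.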
+ */ +static void +mp_cpus_call_action(void) +{ + mp_call_queue_t *cqp; + boolean_t intrs_enabled; + mp_call_t *callp; + mp_call_t call; + + assert(!ml_get_interrupts_enabled()); + cqp = &mp_cpus_call_head[cpu_number()]; + intrs_enabled = mp_call_head_lock(cqp); + while ((callp = mp_call_dequeue_locked(cqp)) != NULL) { + /* Copy call request to the stack to free buffer */ + call = *callp; + mp_call_free(callp); + if (call.func != NULL) { + mp_call_head_unlock(cqp, intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_ACTION, + VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0), + VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0); + call.func(call.arg0, call.arg1); + (void) mp_call_head_lock(cqp); + } + if (call.maskp != NULL) { + i_bit_set(cpu_number(), call.maskp); + } + } + mp_call_head_unlock(cqp, intrs_enabled); +} + +/* + * mp_cpus_call() runs a given function on cpus specified in a given cpu mask. + * Possible modes are: + * SYNC: function is called serially on target cpus in logical cpu order + * waiting for each call to be acknowledged before proceeding + * ASYNC: function call is queued to the specified cpus + * waiting for all calls to complete in parallel before returning + * NOSYNC: function calls are queued + * but we return before confirmation of calls completing. + * The action function may be NULL. + * The cpu mask may include the local cpu. Offline cpus are ignored. + * The return value is the number of cpus on which the call was made or queued. + */ +cpu_t +mp_cpus_call( + cpumask_t cpus, + mp_sync_t mode, + void (*action_func)(void *), + void *arg) +{ + return mp_cpus_call1( + cpus, + mode, + (void (*)(void *, void *))action_func, + arg, + NULL, + NULL); +} + +static void +mp_cpus_call_wait(boolean_t intrs_enabled, + cpumask_t cpus_called, + cpumask_t *cpus_responded) +{ + mp_call_queue_t *cqp; + uint64_t tsc_spin_start; + + assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); + cqp = &mp_cpus_call_head[cpu_number()]; + + tsc_spin_start = rdtsc64(); + while (*cpus_responded != cpus_called) { + if (!intrs_enabled) { + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) { + mp_cpus_call_action(); + } + cpu_signal_handler(NULL); + } + if (mp_spin_timeout(tsc_spin_start)) { + cpumask_t cpus_unresponsive; + + cpus_unresponsive = cpus_called & ~(*cpus_responded); + NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT); + panic("mp_cpus_call_wait() timeout, cpus: 0x%llx", + cpus_unresponsive); + } + } +} + +cpu_t +mp_cpus_call1( + cpumask_t cpus, + mp_sync_t mode, + void (*action_func)(void *, void *), + void *arg0, + void *arg1, + cpumask_t *cpus_calledp) +{ + cpu_t cpu = 0; + boolean_t intrs_enabled = FALSE; + boolean_t call_self = FALSE; + cpumask_t cpus_called = 0; + cpumask_t cpus_responded = 0; + long cpus_call_count = 0; + uint64_t tsc_spin_start; + boolean_t topo_lock; + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL | DBG_FUNC_START, + cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1)); + + if (!smp_initialized) { + if ((cpus & CPUMASK_SELF) == 0) { + goto out; + } + if (action_func != NULL) { + intrs_enabled = ml_set_interrupts_enabled(FALSE); + action_func(arg0, arg1); + ml_set_interrupts_enabled(intrs_enabled); + } + call_self = TRUE; + goto out; + } + + /* + * Queue the call for each non-local requested cpu. 
+ * This is performed under the topo lock to prevent changes to + * cpus online state and to prevent concurrent rendezvouses -- + * although an exception is made if we're calling only the master + * processor since that always remains active. Note: this exception + * is expected for longterm timer nosync cross-calls to the master cpu. + */ + mp_disable_preemption(); + intrs_enabled = ml_get_interrupts_enabled(); + topo_lock = (cpus != cpu_to_cpumask(master_cpu)); + if (topo_lock) { + ml_set_interrupts_enabled(FALSE); + (void) mp_safe_spin_lock(&x86_topo_lock); + } + for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) { + if (((cpu_to_cpumask(cpu) & cpus) == 0) || + !cpu_is_running(cpu)) { + continue; + } + tsc_spin_start = rdtsc64(); + if (cpu == (cpu_t) cpu_number()) { + /* + * We don't IPI ourself and if calling asynchronously, + * we defer our call until we have signalled all others. + */ + call_self = TRUE; + if (mode == SYNC && action_func != NULL) { + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_LOCAL, + VM_KERNEL_UNSLIDE(action_func), + VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0); + action_func(arg0, arg1); + } + } else { + /* + * Here to queue a call to cpu and IPI. + */ + mp_call_t *callp = NULL; + mp_call_queue_t *cqp = &mp_cpus_call_head[cpu]; + boolean_t intrs_inner; + +queue_call: + if (callp == NULL) { + callp = mp_call_alloc(); + } + intrs_inner = mp_call_head_lock(cqp); + if (callp == NULL) { + mp_call_head_unlock(cqp, intrs_inner); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_NOBUF, + cpu, 0, 0, 0, 0); + if (!intrs_inner) { + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) { + mp_cpus_call_action(); + } + handle_pending_TLB_flushes(); + } + if (mp_spin_timeout(tsc_spin_start)) { + panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx", + tsc_spin_start, rdtsc64()); + } + goto queue_call; + } + callp->maskp = (mode == NOSYNC) ? 
NULL : &cpus_responded; + callp->func = action_func; + callp->arg0 = arg0; + callp->arg1 = arg1; + mp_call_enqueue_locked(cqp, callp); + cpus_call_count++; + cpus_called |= cpu_to_cpumask(cpu); + i386_signal_cpu(cpu, MP_CALL, ASYNC); + mp_call_head_unlock(cqp, intrs_inner); + if (mode == SYNC) { + mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded); + } + } + } + if (topo_lock) { + simple_unlock(&x86_topo_lock); + ml_set_interrupts_enabled(intrs_enabled); + } + + /* Call locally if mode not SYNC */ + if (mode != SYNC && call_self) { + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_LOCAL, + VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0); + if (action_func != NULL) { + ml_set_interrupts_enabled(FALSE); + action_func(arg0, arg1); + ml_set_interrupts_enabled(intrs_enabled); + } + } + + /* For ASYNC, now wait for all signaled cpus to complete their calls */ + if (mode == ASYNC) { + mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded); + } + + /* Safe to allow pre-emption now */ + mp_enable_preemption(); + +out: + if (call_self) { + cpus_called |= cpu_to_cpumask(cpu); + cpus_call_count++; + } + + if (cpus_calledp) { + *cpus_calledp = cpus_called; + } + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL | DBG_FUNC_END, + cpus_call_count, cpus_called, 0, 0, 0); + + return (cpu_t) cpus_call_count; +} + + +static void +mp_broadcast_action(__unused void *null) +{ + /* call action function */ + if (mp_bc_action_func != NULL) { + mp_bc_action_func(mp_bc_func_arg); + } + + /* if we're the last one through, wake up the instigator */ + if (atomic_decl_and_test(&mp_bc_count, 1)) { + thread_wakeup(((event_t)(uintptr_t) &mp_bc_count)); + } +} + +/* + * mp_broadcast() runs a given function on all active cpus. + * The caller blocks until the functions has run on all cpus. + * The caller will also block if there is another pending broadcast. 
+ */ +void +mp_broadcast( + void (*action_func)(void *), + void *arg) +{ + if (!smp_initialized) { + if (action_func != NULL) { + action_func(arg); + } + return; + } + + /* obtain broadcast lock */ + lck_mtx_lock(&mp_bc_lock); + + /* set static function pointers */ + mp_bc_action_func = action_func; + mp_bc_func_arg = arg; + + assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT); + + /* + * signal other processors, which will call mp_broadcast_action() + */ + mp_bc_count = real_ncpus; /* assume max possible active */ + mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL); + atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */ + + /* block for other cpus to have run action_func */ + if (mp_bc_ncpus > 1) { + thread_block(THREAD_CONTINUE_NULL); + } else { + clear_wait(current_thread(), THREAD_AWAKENED); + } + + /* release lock */ + lck_mtx_unlock(&mp_bc_lock); +} + +void +mp_cpus_kick(cpumask_t cpus) +{ + cpu_t cpu; + boolean_t intrs_enabled = FALSE; + + intrs_enabled = ml_set_interrupts_enabled(FALSE); + mp_safe_spin_lock(&x86_topo_lock); + + for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) { + if ((cpu == (cpu_t) cpu_number()) + || ((cpu_to_cpumask(cpu) & cpus) == 0) + || !cpu_is_running(cpu)) { + continue; + } + + lapic_send_ipi(cpu, LAPIC_VECTOR(KICK)); + } + + simple_unlock(&x86_topo_lock); + ml_set_interrupts_enabled(intrs_enabled); +} + +void +i386_activate_cpu(void) +{ + cpu_data_t *cdp = current_cpu_datap(); + + assert(!ml_get_interrupts_enabled()); + + if (!smp_initialized) { + cdp->cpu_running = TRUE; + return; + } + + mp_safe_spin_lock(&x86_topo_lock); + cdp->cpu_running = TRUE; + started_cpu(); + pmap_tlbi_range(0, ~0ULL, true, 0); + simple_unlock(&x86_topo_lock); +} + +void +i386_deactivate_cpu(void) +{ + cpu_data_t *cdp = current_cpu_datap(); + + assert(!ml_get_interrupts_enabled()); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START, + 0, 0, 0, 0, 0); + + mp_safe_spin_lock(&x86_topo_lock); + cdp->cpu_running = FALSE; + simple_unlock(&x86_topo_lock); + + /* + * Move all of this cpu's timers to the master/boot cpu, + * and poke it in case there's a sooner deadline for it to schedule. + */ + timer_queue_shutdown(&cdp->rtclock_timer.queue); + mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL); + +#if MONOTONIC + mt_cpu_down(cdp); +#endif /* MONOTONIC */ + + /* + * Open an interrupt window + * and ensure any pending IPI or timer is serviced + */ + mp_disable_preemption(); + ml_set_interrupts_enabled(TRUE); + + while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) { + cpu_pause(); + } + /* + * Ensure there's no remaining timer deadline set + * - AICPM may have left one active. + */ + setPop(0); + + ml_set_interrupts_enabled(FALSE); + mp_enable_preemption(); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END, + 0, 0, 0, 0, 0); +} + +int pmsafe_debug = 1; + +#if MACH_KDP +volatile boolean_t mp_kdp_trap = FALSE; +volatile boolean_t mp_kdp_is_NMI = FALSE; +volatile unsigned long mp_kdp_ncpus; +boolean_t mp_kdp_state; + + +void +mp_kdp_enter(boolean_t proceed_on_failure) +{ + unsigned int cpu; + unsigned int ncpus = 0; + unsigned int my_cpu; + uint64_t tsc_timeout; DBG("mp_kdp_enter()\n"); @@ -981,261 +1609,559 @@ mp_kdp_enter(void) * stopping others. 
*/ mp_kdp_state = ml_set_interrupts_enabled(FALSE); - simple_lock(&mp_kdp_lock); - while (mp_kdp_trap) { - simple_unlock(&mp_kdp_lock); - DBG("mp_kdp_enter() race lost\n"); - mp_kdp_wait(); - simple_lock(&mp_kdp_lock); - } - mp_kdp_ncpus = 1; /* self */ + my_cpu = cpu_number(); + + if (my_cpu == (unsigned) debugger_cpu) { + kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n"); + kdp_reset(); + return; + } + + uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time(); + int locked = 0; + while (!locked || mp_kdp_trap) { + if (locked) { + simple_unlock(&x86_topo_lock); + } + if (proceed_on_failure) { + if (mach_absolute_time() - start_time > 500000000ll) { + paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n"); + break; + } + locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL); + if (!locked) { + cpu_pause(); + } + } else { + mp_safe_spin_lock(&x86_topo_lock); + locked = TRUE; + } + + if (locked && mp_kdp_trap) { + simple_unlock(&x86_topo_lock); + DBG("mp_kdp_enter() race lost\n"); +#if MACH_KDP + mp_kdp_wait(TRUE, FALSE); +#endif + locked = FALSE; + } + } + + if (pmsafe_debug && !kdp_snapshot) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + } + + debugger_cpu = my_cpu; + ncpus = 1; + atomic_incl((volatile long *)&mp_kdp_ncpus, 1); mp_kdp_trap = TRUE; - simple_unlock(&mp_kdp_lock); + debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time; - /* Deliver a nudge to other cpus, counting how many */ + /* + * Deliver a nudge to other cpus, counting how many + */ DBG("mp_kdp_enter() signaling other processors\n"); - for (ncpus = 1, cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) + if (force_immediate_debugger_NMI == FALSE) { + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu == my_cpu || !cpu_is_running(cpu)) { + continue; + } + ncpus++; + i386_signal_cpu(cpu, MP_KDP, ASYNC); + } + /* + * Wait other processors to synchronize + */ + DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus); + + /* + * This timeout is rather arbitrary; we don't want to NMI + * processors that are executing at potentially + * "unsafe-to-interrupt" points such as the trampolines, + * but neither do we want to lose state by waiting too long. + */ + tsc_timeout = rdtsc64() + (LockTimeOutTSC); + + while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { + /* + * A TLB shootdown request may be pending--this would + * result in the requesting processor waiting in + * PMAP_UPDATE_TLBS() until this processor deals with it. + * Process it, so it can now enter mp_kdp_wait() + */ + handle_pending_TLB_flushes(); + cpu_pause(); + } + /* If we've timed out, and some processor(s) are still unresponsive, + * interrupt them with an NMI via the local APIC, iff a panic is + * in progress. 
+ */ + if (panic_active()) { + NMIPI_enable(TRUE); + } + if (mp_kdp_ncpus != ncpus) { + unsigned int wait_cycles = 0; + if (proceed_on_failure) { + paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); + } else { + DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); + } + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu == my_cpu || !cpu_is_running(cpu)) { + continue; + } + if (cpu_signal_pending(cpu, MP_KDP)) { + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(cpu); + } + } + /* Wait again for the same timeout */ + tsc_timeout = rdtsc64() + (LockTimeOutTSC); + while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { + handle_pending_TLB_flushes(); + cpu_pause(); + ++wait_cycles; + } + if (mp_kdp_ncpus != ncpus) { + paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:"); + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) { + paniclog_append_noflush(" %d", cpu); + } + } + paniclog_append_noflush("\n"); + if (proceed_on_failure) { + paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;" + "expected %u acks but received %lu after %u loops in %llu ticks\n", + (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC); + } else { + panic("mp_kdp_enter() timed-out during %s wait after NMI;" + "expected %u acks but received %lu after %u loops in %llu ticks", + (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC); + } + } + } + } else { + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu == my_cpu || !cpu_is_running(cpu)) { + continue; + } + cpu_NMI_interrupt(cpu); + } + } + + if (locked) { + simple_unlock(&x86_topo_lock); + } + + DBG("mp_kdp_enter() %d processors done %s\n", + (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out"); + + postcode(MP_KDP_ENTER); +} + +boolean_t +mp_kdp_all_cpus_halted() +{ + unsigned int ncpus = 0, cpu = 0, my_cpu = 0; + + my_cpu = cpu_number(); + ncpus = 1; /* current CPU */ + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu == my_cpu || !cpu_is_running(cpu)) { continue; + } ncpus++; - i386_signal_cpu(cpu, MP_KDP, ASYNC); } - /* Wait other processors to spin. */ - DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus); - tsc_timeout = rdtsc64() + (1000*1000*1000); - while (*((volatile unsigned int *) &mp_kdp_ncpus) != ncpus - && rdtsc64() < tsc_timeout) { + return mp_kdp_ncpus == ncpus; +} + +static boolean_t +cpu_signal_pending(int cpu, mp_event_t event) +{ + volatile int *signals = &cpu_datap(cpu)->cpu_signals; + boolean_t retval = FALSE; + + if (i_bit(event, signals)) { + retval = TRUE; + } + return retval; +} + +long +kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func, + void *arg0, void *arg1) +{ + if (lcpu > (real_ncpus - 1)) { + return -1; + } + + if (func == NULL) { + return -1; + } + + kdp_xcpu_call_func.func = func; + kdp_xcpu_call_func.ret = -1; + kdp_xcpu_call_func.arg0 = arg0; + kdp_xcpu_call_func.arg1 = arg1; + kdp_xcpu_call_func.cpu = lcpu; + DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu); + while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE) { cpu_pause(); } - DBG("mp_kdp_enter() %d processors done %s\n", - mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? 
"OK" : "timed out"); - postcode(MP_KDP_ENTER); + return kdp_xcpu_call_func.ret; } static void -mp_kdp_wait(void) +kdp_x86_xcpu_poll(void) { - boolean_t state; + if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) { + kdp_xcpu_call_func.ret = + kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0, + kdp_xcpu_call_func.arg1, + cpu_number()); + kdp_xcpu_call_func.cpu = KDP_XCPU_NONE; + } +} - state = ml_set_interrupts_enabled(TRUE); +static void +mp_kdp_wait(boolean_t flush, boolean_t isNMI) +{ DBG("mp_kdp_wait()\n"); - atomic_incl(&mp_kdp_ncpus, 1); - while (mp_kdp_trap) { + + current_cpu_datap()->debugger_ipi_time = mach_absolute_time(); +#if CONFIG_MCA + /* If we've trapped due to a machine-check, save MCA registers */ + mca_check_save(); +#endif + + atomic_incl((volatile long *)&mp_kdp_ncpus, 1); + while (mp_kdp_trap || (isNMI == TRUE)) { + /* + * A TLB shootdown request may be pending--this would result + * in the requesting processor waiting in PMAP_UPDATE_TLBS() + * until this processor handles it. + * Process it, so it can now enter mp_kdp_wait() + */ + if (flush) { + handle_pending_TLB_flushes(); + } + + kdp_x86_xcpu_poll(); cpu_pause(); } - atomic_decl(&mp_kdp_ncpus, 1); + + atomic_decl((volatile long *)&mp_kdp_ncpus, 1); DBG("mp_kdp_wait() done\n"); - (void) ml_set_interrupts_enabled(state); } void mp_kdp_exit(void) { DBG("mp_kdp_exit()\n"); - atomic_decl(&mp_kdp_ncpus, 1); + debugger_cpu = -1; + atomic_decl((volatile long *)&mp_kdp_ncpus, 1); + + debugger_exit_time = mach_absolute_time(); + mp_kdp_trap = FALSE; + mfence(); /* Wait other processors to stop spinning. XXX needs timeout */ DBG("mp_kdp_exit() waiting for processors to resume\n"); - while (*((volatile long *) &mp_kdp_ncpus) > 0) { + while (mp_kdp_ncpus > 0) { + /* + * a TLB shootdown request may be pending... this would result in the requesting + * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. + * Process it, so it can now enter mp_kdp_wait() + */ + handle_pending_TLB_flushes(); + cpu_pause(); } + + if (pmsafe_debug && !kdp_snapshot) { + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL); + } + + debugger_exit_time = mach_absolute_time(); + DBG("mp_kdp_exit() done\n"); (void) ml_set_interrupts_enabled(mp_kdp_state); - postcode(0); + postcode(MP_KDP_EXIT); +} + +#endif /* MACH_KDP */ + +boolean_t +mp_recent_debugger_activity(void) +{ + uint64_t abstime = mach_absolute_time(); + return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) || + ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance); } -#endif /* MACH_KDP */ /*ARGSUSED*/ void init_ast_check( - __unused processor_t processor) + __unused processor_t processor) { } void cause_ast_check( - processor_t processor) + processor_t processor) { - int cpu = PROCESSOR_DATA(processor, slot_num); + int cpu = processor->cpu_id; if (cpu != cpu_number()) { i386_signal_cpu(cpu, MP_AST, ASYNC); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0); } } -/* - * invoke kdb on slave processors - */ - void -remote_kdb(void) +slave_machine_init(void *param) { - unsigned int my_cpu = cpu_number(); - unsigned int cpu; - - mp_disable_preemption(); - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) - continue; - i386_signal_cpu(cpu, MP_KDB, SYNC); + /* + * Here in process context, but with interrupts disabled. 
+ */ + DBG("slave_machine_init() CPU%d\n", get_cpu_number()); + + if (param == FULL_SLAVE_INIT) { + /* + * Cold start + */ + clock_init(); } - mp_enable_preemption(); + cpu_machine_init(); /* Interrupts enabled hereafter */ } -/* - * Clear kdb interrupt - */ - -void -clear_kdb_intr(void) +#undef cpu_number +int +cpu_number(void) { - mp_disable_preemption(); - i_bit_clear(MP_KDB, ¤t_cpu_datap()->cpu_signals); - mp_enable_preemption(); + return get_cpu_number(); } -/* - * i386_init_slave() is called from pstart. - * We're in the cpu's interrupt stack with interrupts disabled. - */ -void -i386_init_slave(void) +static void +cpu_prewarm_init() { - postcode(I386_INIT_SLAVE); - - /* Ensure that caching and write-through are enabled */ - set_cr0(get_cr0() & ~(CR0_NW|CR0_CD)); - - DBG("i386_init_slave() CPU%d: phys (%d) active.\n", - get_cpu_number(), get_cpu_phys_number()); - - lapic_init(); - - LAPIC_DUMP(); - LAPIC_CPU_MAP_DUMP(); - - mtrr_update_cpu(); + int i; - pat_init(); - - cpu_init(); - - slave_main(); - - panic("i386_init_slave() returned from slave_main()"); + simple_lock_init(&cpu_warm_lock, 0); + queue_init(&cpu_warm_call_list); + for (i = 0; i < NUM_CPU_WARM_CALLS; i++) { + enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]); + } } -void -slave_machine_init(void) +static timer_call_t +grab_warm_timer_call() { - /* - * Here in process context. - */ - DBG("slave_machine_init() CPU%d\n", get_cpu_number()); - - init_fpu(); + spl_t x; + timer_call_t call = NULL; - cpu_thread_init(); + x = splsched(); + simple_lock(&cpu_warm_lock, LCK_GRP_NULL); + if (!queue_empty(&cpu_warm_call_list)) { + call = (timer_call_t) dequeue_head(&cpu_warm_call_list); + } + simple_unlock(&cpu_warm_lock); + splx(x); - pmc_init(); + return call; +} - cpu_machine_init(); +static void +free_warm_timer_call(timer_call_t call) +{ + spl_t x; - clock_init(); + x = splsched(); + simple_lock(&cpu_warm_lock, LCK_GRP_NULL); + enqueue_head(&cpu_warm_call_list, (queue_entry_t)call); + simple_unlock(&cpu_warm_lock); + splx(x); } -#undef cpu_number() -int cpu_number(void) +/* + * Runs in timer call context (interrupts disabled). + */ +static void +cpu_warm_timer_call_func( + call_entry_param_t p0, + __unused call_entry_param_t p1) { - return get_cpu_number(); + free_warm_timer_call((timer_call_t)p0); + return; } -#if MACH_KDB -#include - -#define TRAP_DEBUG 0 /* Must match interrupt.s and spl.s */ - +/* + * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0). + */ +static void +_cpu_warm_setup( + void *arg) +{ + cpu_warm_data_t cwdp = (cpu_warm_data_t)arg; -#if TRAP_DEBUG -#define MTRAPS 100 -struct mp_trap_hist_struct { - unsigned char type; - unsigned char data[5]; -} trap_hist[MTRAPS], *cur_trap_hist = trap_hist, - *max_trap_hist = &trap_hist[MTRAPS]; + timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL); + cwdp->cwd_result = 0; -void db_trap_hist(void); + return; +} /* - * SPL: - * 1: new spl - * 2: old spl - * 3: new tpr - * 4: old tpr - * INT: - * 1: int vec - * 2: old spl - * 3: new spl - * 4: post eoi tpr - * 5: exit tpr + * Not safe to call with interrupts disabled. 
*/ +kern_return_t +ml_interrupt_prewarm( + uint64_t deadline) +{ + struct cpu_warm_data cwd; + timer_call_t call; + cpu_t ct; -void -db_trap_hist(void) -{ - int i,j; - for(i=0;i=cur_trap_hist)?"*":" ", - (trap_hist[i].type == 1)?"SPL":"INT"); - for(j=0;j<5;j++) - db_printf(" %02x", trap_hist[i].data[j]); - db_printf("\n"); - } - -} -#endif /* TRAP_DEBUG */ - -void db_lapic(int cpu); -unsigned int db_remote_read(int cpu, int reg); -void db_ioapic(unsigned int); -void kdb_console(void); + if (ml_get_interrupts_enabled() == FALSE) { + panic("%s: Interrupts disabled?\n", __FUNCTION__); + } -void -kdb_console(void) -{ -} + /* + * If the platform doesn't need our help, say that we succeeded. + */ + if (!ml_get_interrupt_prewake_applicable()) { + return KERN_SUCCESS; + } -#define BOOLP(a) ((a)?' ':'!') + /* + * Grab a timer call to use. + */ + call = grab_warm_timer_call(); + if (call == NULL) { + return KERN_RESOURCE_SHORTAGE; + } -static char *DM[8] = { - "Fixed", - "Lowest Priority", - "Invalid", - "Invalid", - "NMI", - "Reset", - "Invalid", - "ExtINT"}; + timer_call_setup(call, cpu_warm_timer_call_func, call); + cwd.cwd_call = call; + cwd.cwd_deadline = deadline; + cwd.cwd_result = 0; -unsigned int -db_remote_read(int cpu, int reg) -{ - return -1; + /* + * For now, non-local interrupts happen on the master processor. + */ + ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd); + if (ct == 0) { + free_warm_timer_call(call); + return KERN_FAILURE; + } else { + return cwd.cwd_result; + } } +#if DEBUG || DEVELOPMENT void -db_lapic(int cpu) +kernel_spin(uint64_t spin_ns) { + boolean_t istate; + uint64_t spin_abs; + uint64_t deadline; + cpu_data_t *cdp; + + kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns); + istate = ml_set_interrupts_enabled(FALSE); + cdp = current_cpu_datap(); + nanoseconds_to_absolutetime(spin_ns, &spin_abs); + + /* Fake interrupt handler entry for testing mp_interrupt_watchdog() */ + cdp->cpu_int_event_time = mach_absolute_time(); + cdp->cpu_int_state = (void *) USER_STATE(current_thread()); + + deadline = mach_absolute_time() + spin_ns; + while (mach_absolute_time() < deadline) { + cpu_pause(); + } + + cdp->cpu_int_event_time = 0; + cdp->cpu_int_state = NULL; + + ml_set_interrupts_enabled(istate); + kprintf("kernel_spin() continuing\n"); } +/* + * Called from the scheduler's maintenance thread, + * scan running processors for long-running ISRs and: + * - panic if longer than LockTimeOut, or + * - log if more than a quantum. + */ void -db_ioapic(unsigned int ind) +mp_interrupt_watchdog(void) { -} + cpu_t cpu; + boolean_t intrs_enabled = FALSE; + uint16_t cpu_int_num; + uint64_t cpu_int_event_time; + uint64_t cpu_rip; + uint64_t cpu_int_duration; + uint64_t now; + x86_saved_state_t *cpu_int_state; + + if (__improbable(!mp_interrupt_watchdog_enabled)) { + return; + } -#endif /* MACH_KDB */ + intrs_enabled = ml_set_interrupts_enabled(FALSE); + now = mach_absolute_time(); + /* + * While timeouts are not suspended, + * check all other processors for long outstanding interrupt handling. 
+ */ + for (cpu = 0; + cpu < (cpu_t) real_ncpus && !machine_timeout_suspended(); + cpu++) { + if ((cpu == (cpu_t) cpu_number()) || + (!cpu_is_running(cpu))) { + continue; + } + cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time; + if (cpu_int_event_time == 0) { + continue; + } + if (__improbable(now < cpu_int_event_time)) { + continue; /* skip due to inter-processor skew */ + } + cpu_int_state = cpu_datap(cpu)->cpu_int_state; + if (__improbable(cpu_int_state == NULL)) { + /* The interrupt may have been dismissed */ + continue; + } + /* Here with a cpu handling an interrupt */ + + cpu_int_duration = now - cpu_int_event_time; + if (__improbable(cpu_int_duration > LockTimeOut)) { + cpu_int_num = saved_state64(cpu_int_state)->isf.trapno; + cpu_rip = saved_state64(cpu_int_state)->isf.rip; + vector_timed_out = cpu_int_num; + NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG); + panic("Interrupt watchdog, " + "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx", + cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip); + /* NOT REACHED */ + } else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) { + mp_interrupt_watchdog_events++; + cpu_int_num = saved_state64(cpu_int_state)->isf.trapno; + cpu_rip = saved_state64(cpu_int_state)->isf.rip; + ml_set_interrupts_enabled(intrs_enabled); + printf("Interrupt watchdog, " + "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n", + cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip); + return; + } + } + + ml_set_interrupts_enabled(intrs_enabled); +} +#endif