X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/d41d1dae2cd00cc08c7982087d1c445180cad9f5..d26ffc64f583ab2d29df48f13518685602bc8832:/osfmk/i386/mp.c?ds=inline diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 4dd1e625d..3b7232687 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,9 +29,8 @@ * @OSF_COPYRIGHT@ */ -#include -#include #include +#include #include #include @@ -48,13 +47,17 @@ #include #include #include -#include +#include +#include +#include +#include #include #include #include +#include #include #include #include @@ -65,31 +68,22 @@ #include #include #include -#include #include #include #include -#include #include #if CONFIG_MCA #include #endif #include -#include -#include - #include -#if MACH_KDB -#include -#include -#include -#include -#include -#include -#include -#include -#endif + +#include + +#if MONOTONIC +#include +#endif /* MONOTONIC */ #if MP_DEBUG #define PAUSE delay(1000000) @@ -99,30 +93,41 @@ #define PAUSE #endif /* MP_DEBUG */ +/* Debugging/test trace events: */ +#define TRACE_MP_TLB_FLUSH MACHDBG_CODE(DBG_MACH_MP, 0) +#define TRACE_MP_CPUS_CALL MACHDBG_CODE(DBG_MACH_MP, 1) +#define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2) +#define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3) +#define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4) +#define TRACE_MP_CPU_FAST_START MACHDBG_CODE(DBG_MACH_MP, 5) +#define TRACE_MP_CPU_START MACHDBG_CODE(DBG_MACH_MP, 6) +#define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7) #define ABS(v) (((v) > 0)?(v):-(v)) void slave_boot_init(void); +void i386_cpu_IPI(int cpu); -#if MACH_KDB -static void mp_kdb_wait(void); -volatile boolean_t mp_kdb_trap = FALSE; -volatile long mp_kdb_ncpus = 0; -#endif - +#if MACH_KDP static void mp_kdp_wait(boolean_t flush, boolean_t isNMI); 
-static void mp_rendezvous_action(void); -static void mp_broadcast_action(void); +#endif /* MACH_KDP */ +#if MACH_KDP static boolean_t cpu_signal_pending(int cpu, mp_event_t event); -static int cpu_signal_handler(x86_saved_state_t *regs); +#endif /* MACH_KDP */ static int NMIInterruptHandler(x86_saved_state_t *regs); boolean_t smp_initialized = FALSE; uint32_t TSC_sync_margin = 0xFFF; volatile boolean_t force_immediate_debugger_NMI = FALSE; volatile boolean_t pmap_tlb_flush_timeout = FALSE; -decl_simple_lock_data(,mp_kdp_lock); +#if DEBUG || DEVELOPMENT +boolean_t mp_interrupt_watchdog_enabled = TRUE; +uint32_t mp_interrupt_watchdog_events = 0; +#endif + +decl_simple_lock_data(,debugger_callback_lock); +struct debugger_callback *debugger_callback = NULL; decl_lck_mtx_data(static, mp_cpu_boot_lock); lck_mtx_ext_t mp_cpu_boot_lock_ext; @@ -142,7 +147,7 @@ static volatile long mp_rv_complete __attribute__((aligned(64))); volatile uint64_t debugger_entry_time; volatile uint64_t debugger_exit_time; #if MACH_KDP - +#include extern int kdp_snapshot; static struct _kdp_xcpu_call_func { kdp_x86_xcpu_func_t func; @@ -163,13 +168,25 @@ static volatile long mp_bc_count; decl_lck_mtx_data(static, mp_bc_lock); lck_mtx_ext_t mp_bc_lock_ext; static volatile int debugger_cpu = -1; +volatile long NMIPI_acks = 0; +volatile long NMI_count = 0; +static NMI_reason_t NMI_panic_reason = NONE; +static int vector_timed_out; + +extern void NMI_cpus(void); +static void mp_cpus_call_init(void); static void mp_cpus_call_action(void); static void mp_call_PM(void); char mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init - +/* PAL-related routines */ +boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, + int ipi_vector, i386_intr_func_t ipi_handler); +void i386_start_cpu(int lapic_id, int cpu_num); +void i386_send_NMI(int cpu); +void NMIPI_enable(boolean_t); #if GPROF /* * Initialize dummy structs for profiling. 
These aren't used but @@ -192,40 +209,66 @@ struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars }; static lck_grp_t smp_lck_grp; static lck_grp_attr_t smp_lck_grp_attr; -extern void slave_pstart(void); +#define NUM_CPU_WARM_CALLS 20 +struct timer_call cpu_warm_call_arr[NUM_CPU_WARM_CALLS]; +queue_head_t cpu_warm_call_list; +decl_simple_lock_data(static, cpu_warm_lock); + +typedef struct cpu_warm_data { + timer_call_t cwd_call; + uint64_t cwd_deadline; + int cwd_result; +} *cpu_warm_data_t; + +static void cpu_prewarm_init(void); +static void cpu_warm_timer_call_func(call_entry_param_t p0, call_entry_param_t p1); +static void _cpu_warm_setup(void *arg); +static timer_call_t grab_warm_timer_call(void); +static void free_warm_timer_call(timer_call_t call); void smp_init(void) { - simple_lock_init(&mp_kdp_lock, 0); simple_lock_init(&mp_rv_lock, 0); + simple_lock_init(&debugger_callback_lock, 0); lck_grp_attr_setdefault(&smp_lck_grp_attr); lck_grp_init(&smp_lck_grp, "i386_smp", &smp_lck_grp_attr); lck_mtx_init_ext(&mp_cpu_boot_lock, &mp_cpu_boot_lock_ext, &smp_lck_grp, LCK_ATTR_NULL); lck_mtx_init_ext(&mp_bc_lock, &mp_bc_lock_ext, &smp_lck_grp, LCK_ATTR_NULL); console_init(); - /* Local APIC? */ - if (!lapic_probe()) + if(!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler, + LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) return; - lapic_init(); - lapic_configure(); - lapic_set_intr_func(LAPIC_NMI_INTERRUPT, NMIInterruptHandler); - lapic_set_intr_func(LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler); - cpu_thread_init(); GPROF_INIT(); DBGLOG_CPU_INIT(master_cpu); - install_real_mode_bootstrap(slave_pstart); + mp_cpus_call_init(); + mp_cpus_call_cpu_init(master_cpu); + +#if DEBUG || DEVELOPMENT + if (PE_parse_boot_argn("interrupt_watchdog", + &mp_interrupt_watchdog_enabled, + sizeof(mp_interrupt_watchdog_enabled))) { + kprintf("Interrupt watchdog %sabled\n", + mp_interrupt_watchdog_enabled ? 
"en" : "dis"); + } +#endif if (PE_parse_boot_argn("TSC_sync_margin", - &TSC_sync_margin, sizeof(TSC_sync_margin))) + &TSC_sync_margin, sizeof(TSC_sync_margin))) { kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin); + } else if (cpuid_vmm_present()) { + kprintf("TSC sync margin disabled\n"); + TSC_sync_margin = 0; + } smp_initialized = TRUE; + cpu_prewarm_init(); + return; } @@ -274,6 +317,10 @@ intel_startCPU_fast(int slot_num) */ return(rc); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_FAST_START | DBG_FUNC_START, + slot_num, 0, 0, 0, 0); + /* * Wait until the CPU is back online. */ @@ -284,9 +331,14 @@ intel_startCPU_fast(int slot_num) * longer than a full restart would require so it should be more * than long enough. */ + mp_wait_for_cpu_up(slot_num, 30000, 1); mp_enable_preemption(); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_FAST_START | DBG_FUNC_END, + slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0); + /* * Check to make sure that the CPU is really running. If not, * go through the slow path. @@ -327,18 +379,30 @@ start_cpu(void *arg) if (cpu_number() != psip->starter_cpu) return; - LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); - delay(100); + DBG("start_cpu(%p) about to start cpu %d, lapic %d\n", + arg, psip->target_cpu, psip->target_lapic); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_START | DBG_FUNC_START, + psip->target_cpu, + psip->target_lapic, 0, 0, 0); - LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12)); + i386_start_cpu(psip->target_lapic, psip->target_cpu); #ifdef POSTCODE_DELAY /* Wait much longer if postcodes are displayed for a delay period. 
*/ i *= 10000; #endif + DBG("start_cpu(%p) about to wait for cpu %d\n", + arg, psip->target_cpu); + mp_wait_for_cpu_up(psip->target_cpu, i*100, 100); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_START | DBG_FUNC_END, + psip->target_cpu, + cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0); + if (TSC_sync_margin && cpu_datap(psip->target_cpu)->cpu_running) { /* @@ -372,12 +436,6 @@ start_cpu(void *arg) } } -extern char prot_mode_gdt[]; -extern char slave_boot_base[]; -extern char real_mode_bootstrap_base[]; -extern char real_mode_bootstrap_end[]; -extern char slave_boot_end[]; - kern_return_t intel_startCPU( int slot_num) @@ -390,16 +448,13 @@ intel_startCPU( DBGLOG_CPU_INIT(slot_num); DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic); - DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) IdlePTD); + DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD); /* * Initialize (or re-initialize) the descriptor tables for this cpu. * Propagate processor mode to slave. */ - if (cpu_mode_is64bit()) - cpu_desc_init64(cpu_datap(slot_num)); - else - cpu_desc_init(cpu_datap(slot_num)); + cpu_desc_init(cpu_datap(slot_num)); /* Serialize use of the slave boot stack, etc. */ lck_mtx_lock(&mp_cpu_boot_lock); @@ -449,25 +504,30 @@ MP_EVENT_NAME_DECL(); #endif /* MP_DEBUG */ +/* + * Note: called with NULL state when polling for TLB flush and cross-calls. + */ int cpu_signal_handler(x86_saved_state_t *regs) { +#if !MACH_KDP +#pragma unused (regs) +#endif /* !MACH_KDP */ int my_cpu; volatile int *my_word; -#if MACH_KDB && MACH_ASSERT - int i=100; -#endif /* MACH_KDB && MACH_ASSERT */ - mp_disable_preemption(); + SCHED_STATS_IPI(current_processor()); my_cpu = cpu_number(); - my_word = &current_cpu_datap()->cpu_signals; + my_word = &cpu_data_ptr[my_cpu]->cpu_signals; + /* Store the initial set of signals for diagnostics. New + * signals could arrive while these are being processed + * so it's no more than a hint. 
+ */ + + cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word; do { -#if MACH_KDB && MACH_ASSERT - if (i-- <= 0) - Debugger("cpu_signal_handler: signals did not clear"); -#endif /* MACH_KDB && MACH_ASSERT */ #if MACH_KDP if (i_bit(MP_KDP, my_word)) { DBGLOG(cpu_handle,my_cpu,MP_KDP); @@ -489,30 +549,6 @@ cpu_signal_handler(x86_saved_state_t *regs) DBGLOG(cpu_handle,my_cpu,MP_TLB_FLUSH); i_bit_clear(MP_TLB_FLUSH, my_word); pmap_update_interrupt(); - } else if (i_bit(MP_AST, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_AST); - i_bit_clear(MP_AST, my_word); - ast_check(cpu_to_processor(my_cpu)); -#if MACH_KDB - } else if (i_bit(MP_KDB, my_word)) { - - i_bit_clear(MP_KDB, my_word); - current_cpu_datap()->cpu_kdb_is_slave++; - mp_kdb_wait(); - current_cpu_datap()->cpu_kdb_is_slave--; -#endif /* MACH_KDB */ - } else if (i_bit(MP_RENDEZVOUS, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_RENDEZVOUS); - i_bit_clear(MP_RENDEZVOUS, my_word); - mp_rendezvous_action(); - } else if (i_bit(MP_BROADCAST, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_BROADCAST); - i_bit_clear(MP_BROADCAST, my_word); - mp_broadcast_action(); - } else if (i_bit(MP_CHUD, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_CHUD); - i_bit_clear(MP_CHUD, my_word); - chudxnu_cpu_signal_handler(); } else if (i_bit(MP_CALL, my_word)) { DBGLOG(cpu_handle,my_cpu,MP_CALL); i_bit_clear(MP_CALL, my_word); @@ -522,38 +558,92 @@ cpu_signal_handler(x86_saved_state_t *regs) i_bit_clear(MP_CALL_PM, my_word); mp_call_PM(); } + if (regs == NULL) { + /* Called to poll only for cross-calls and TLB flush */ + break; + } else if (i_bit(MP_AST, my_word)) { + DBGLOG(cpu_handle,my_cpu,MP_AST); + i_bit_clear(MP_AST, my_word); + ast_check(cpu_to_processor(my_cpu)); + } } while (*my_word); - mp_enable_preemption(); - return 0; } +extern void kprintf_break_lock(void); static int NMIInterruptHandler(x86_saved_state_t *regs) { - void *stackptr; - + void *stackptr; + char pstr[192]; + uint64_t now = mach_absolute_time(); + + if (panic_active() && 
!panicDebugging) { + if (pmsafe_debug) + pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + for(;;) + cpu_pause(); + } + + atomic_incl(&NMIPI_acks, 1); + atomic_incl(&NMI_count, 1); sync_iss_to_iks_unconditionally(regs); -#if defined (__i386__) - __asm__ volatile("movl %%ebp, %0" : "=m" (stackptr)); -#elif defined (__x86_64__) __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); -#endif if (cpu_number() == debugger_cpu) - goto NMExit; - - if (pmap_tlb_flush_timeout == TRUE && current_cpu_datap()->cpu_tlb_invalid) { - char pstr[128]; - snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor\n", cpu_number()); - panic_i386_backtrace(stackptr, 16, &pstr[0], TRUE, regs); + goto NMExit; + + if (NMI_panic_reason == SPINLOCK_TIMEOUT) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n", + cpu_number(), now, spinlock_timed_out, (void *) spinlock_timed_out->interlock.lock_data, current_thread(), spinlock_owner_cpu); + panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); + } else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n", + cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid); + panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs); + } else if (NMI_panic_reason == CROSSCALL_TIMEOUT) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n", + cpu_number(), now); + panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); + } else if (NMI_panic_reason == INTERRUPT_WATCHDOG) { + snprintf(&pstr[0], sizeof(pstr), + "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n", + cpu_number(), now, vector_timed_out); + panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); } - + 
#if MACH_KDP if (pmsafe_debug && !kdp_snapshot) pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); - mp_kdp_wait(FALSE, pmap_tlb_flush_timeout); + current_cpu_datap()->cpu_NMI_acknowledged = TRUE; + i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals); + if (panic_active() || NMI_panic_reason != NONE) { + mp_kdp_wait(FALSE, TRUE); + } else if (!mp_kdp_trap && + !mp_kdp_is_NMI && + virtualized && (debug_boot_arg & DB_NMI)) { + /* + * Under a VMM with the debug boot-arg set, drop into kdp. + * Since an NMI is involved, there's a risk of contending with + * a panic. And side-effects of NMIs may result in entry into, + * and continuing from, the debugger being unreliable. + */ + if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) { + kprintf_break_lock(); + kprintf("Debugger entry requested by NMI\n"); + kdp_i386_trap(T_DEBUG, saved_state64(regs), 0, 0); + printf("Debugger entry requested by NMI\n"); + mp_kdp_is_NMI = FALSE; + } else { + mp_kdp_wait(FALSE, FALSE); + } + } else { + mp_kdp_wait(FALSE, FALSE); + } if (pmsafe_debug && !kdp_snapshot) pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL); #endif @@ -561,51 +651,6 @@ NMExit: return 1; } -#ifdef MP_DEBUG -int max_lock_loops = 100000000; -int trappedalready = 0; /* (BRINGUP) */ -#endif /* MP_DEBUG */ - -static void -i386_cpu_IPI(int cpu) -{ - boolean_t state; - -#ifdef MP_DEBUG - if(cpu_datap(cpu)->cpu_signals & 6) { /* (BRINGUP) */ - kprintf("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d\n", cpu_datap(cpu)->cpu_signals, cpu); - } -#endif /* MP_DEBUG */ - -#if MACH_KDB -#ifdef MP_DEBUG - if(!trappedalready && (cpu_datap(cpu)->cpu_signals & 6)) { /* (BRINGUP) */ - if(kdb_cpu != cpu_number()) { - trappedalready = 1; - panic("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d and I do not own debugger, owner = %08X\n", - cpu_datap(cpu)->cpu_signals, cpu, kdb_cpu); - } - } -#endif /* MP_DEBUG */ -#endif - - /* Wait for previous interrupt to be delivered... 
*/ -#ifdef MP_DEBUG - int pending_busy_count = 0; - while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { - if (++pending_busy_count > max_lock_loops) - panic("i386_cpu_IPI() deadlock\n"); -#else - while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { -#endif /* MP_DEBUG */ - cpu_pause(); - } - - state = ml_set_interrupts_enabled(FALSE); - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR) | LAPIC_ICR_DM_FIXED); - (void) ml_set_interrupts_enabled(state); -} /* * cpu_interrupt is really just to be used by the scheduler to @@ -615,10 +660,15 @@ i386_cpu_IPI(int cpu) void cpu_interrupt(int cpu) { + boolean_t did_IPI = FALSE; + if (smp_initialized && pmCPUExitIdle(cpu_datap(cpu))) { i386_cpu_IPI(cpu); + did_IPI = TRUE; } + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0); } /* @@ -627,18 +677,38 @@ cpu_interrupt(int cpu) void cpu_NMI_interrupt(int cpu) { - boolean_t state; - if (smp_initialized) { - state = ml_set_interrupts_enabled(FALSE); -/* Program the interrupt command register */ - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); -/* The vector is ignored in this case--the target CPU will enter on the - * NMI vector. - */ - LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI); - (void) ml_set_interrupts_enabled(state); + i386_send_NMI(cpu); + } +} + +void +NMI_cpus(void) +{ + unsigned int cpu; + boolean_t intrs_enabled; + uint64_t tsc_timeout; + + intrs_enabled = ml_set_interrupts_enabled(FALSE); + + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (!cpu_is_running(cpu)) + continue; + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(cpu); + tsc_timeout = !machine_timeout_suspended() ? 
+ rdtsc64() + (1000 * 1000 * 1000 * 10ULL) : + ~0ULL; + while (!cpu_datap(cpu)->cpu_NMI_acknowledged) { + handle_pending_TLB_flushes(); + cpu_pause(); + if (rdtsc64() > tsc_timeout) + panic("NMI_cpus() timeout cpu %d", cpu); + } + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; } + + ml_set_interrupts_enabled(intrs_enabled); } static void (* volatile mp_PM_func)(void) = NULL; @@ -682,7 +752,7 @@ i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) return; if (event == MP_TLB_FLUSH) - KERNEL_DEBUG(0xef800020 | DBG_FUNC_START, cpu, 0, 0, 0, 0); + KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0); DBGLOG(cpu_signal, cpu, event); @@ -690,7 +760,9 @@ i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) i386_cpu_IPI(cpu); if (mode == SYNC) { again: - tsc_timeout = rdtsc64() + (1000*1000*1000); + tsc_timeout = !machine_timeout_suspended() ? + rdtsc64() + (1000*1000*1000) : + ~0ULL; while (i_bit(event, signals) && rdtsc64() < tsc_timeout) { cpu_pause(); } @@ -701,45 +773,60 @@ i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) } } if (event == MP_TLB_FLUSH) - KERNEL_DEBUG(0xef800020 | DBG_FUNC_END, cpu, 0, 0, 0, 0); + KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0); } /* - * Send event to all running cpus. - * Called with the topology locked. + * Helper function called when busy-waiting: panic if too long + * a TSC-based time has elapsed since the start of the spin. 
*/ -void -i386_signal_cpus(mp_event_t event, mp_sync_t mode) +static boolean_t +mp_spin_timeout(uint64_t tsc_start) { - unsigned int cpu; - unsigned int my_cpu = cpu_number(); + uint64_t tsc_timeout; - assert(hw_lock_held((hw_lock_t)&x86_topo_lock)); + cpu_pause(); + if (machine_timeout_suspended()) + return FALSE; - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) - continue; - i386_signal_cpu(cpu, event, mode); - } + /* + * The timeout is 4 * the spinlock timeout period + * unless we have serial console printing (kprintf) enabled + * in which case we allow an even greater margin. + */ + tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2 + : LockTimeOutTSC << 4; + return (rdtsc64() > tsc_start + tsc_timeout); } /* - * Return the number of running cpus. - * Called with the topology locked. + * Helper function to take a spinlock while ensuring that incoming IPIs + * are still serviced if interrupts are masked while we spin. + * Returns current interrupt state. 
*/ -int -i386_active_cpus(void) +boolean_t +mp_safe_spin_lock(usimple_lock_t lock) { - unsigned int cpu; - unsigned int ncpus = 0; - - assert(hw_lock_held((hw_lock_t)&x86_topo_lock)); - - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_datap(cpu)->cpu_running) - ncpus++; - } - return(ncpus); + if (ml_get_interrupts_enabled()) { + simple_lock(lock); + return TRUE; + } else { + uint64_t tsc_spin_start = rdtsc64(); + while (!simple_lock_try(lock)) { + cpu_signal_handler(NULL); + if (mp_spin_timeout(tsc_spin_start)) { + uint32_t lock_cpu; + uintptr_t lowner = (uintptr_t) + lock->interlock.lock_data; + spinlock_timed_out = lock; + lock_cpu = spinlock_timeout_NMI(lowner); + NMIPI_panic(cpu_to_cpumask(lock_cpu), SPINLOCK_TIMEOUT); + panic("mp_safe_spin_lock() timed out, lock: %p, owner thread: 0x%lx, current_thread: %p, owner on CPU 0x%x, time: %llu", + lock, lowner, current_thread(), lock_cpu, mach_absolute_time()); + } + } + return FALSE; + } } /* @@ -757,9 +844,10 @@ i386_active_cpus(void) */ static void -mp_rendezvous_action(void) +mp_rendezvous_action(__unused void *null) { - boolean_t intrs_enabled; + boolean_t intrs_enabled; + uint64_t tsc_spin_start; /* setup function */ if (mp_rv_setup_func != NULL) @@ -767,25 +855,33 @@ mp_rendezvous_action(void) intrs_enabled = ml_get_interrupts_enabled(); - /* spin on entry rendezvous */ atomic_incl(&mp_rv_entry, 1); + tsc_spin_start = rdtsc64(); + while (mp_rv_entry < mp_rv_ncpus) { /* poll for pesky tlb flushes if interrupts disabled */ if (!intrs_enabled) handle_pending_TLB_flushes(); - cpu_pause(); + if (mp_spin_timeout(tsc_spin_start)) { + panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64()); + } } + /* action function */ if (mp_rv_action_func != NULL) mp_rv_action_func(mp_rv_func_arg); + /* spin on exit rendezvous */ atomic_incl(&mp_rv_exit, 1); + tsc_spin_start = rdtsc64(); while (mp_rv_exit < mp_rv_ncpus) { if (!intrs_enabled) 
handle_pending_TLB_flushes(); - cpu_pause(); + if (mp_spin_timeout(tsc_spin_start)) + panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64()); } + /* teardown function */ if (mp_rv_teardown_func != NULL) mp_rv_teardown_func(mp_rv_func_arg); @@ -800,6 +896,7 @@ mp_rendezvous(void (*setup_func)(void *), void (*teardown_func)(void *), void *arg) { + uint64_t tsc_spin_start; if (!smp_initialized) { if (setup_func != NULL) @@ -812,7 +909,7 @@ mp_rendezvous(void (*setup_func)(void *), } /* obtain rendezvous lock */ - simple_lock(&mp_rv_lock); + (void) mp_safe_spin_lock(&mp_rv_lock); /* set static function pointers */ mp_rv_setup_func = setup_func; @@ -828,21 +925,20 @@ mp_rendezvous(void (*setup_func)(void *), * signal other processors, which will call mp_rendezvous_action() * with interrupts disabled */ - simple_lock(&x86_topo_lock); - mp_rv_ncpus = i386_active_cpus(); - i386_signal_cpus(MP_RENDEZVOUS, ASYNC); - simple_unlock(&x86_topo_lock); + mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1; /* call executor function on this cpu */ - mp_rendezvous_action(); + mp_rendezvous_action(NULL); /* * Spin for everyone to complete. * This is necessary to ensure that all processors have proceeded * from the exit barrier before we release the rendezvous structure. 
*/ + tsc_spin_start = rdtsc64(); while (mp_rv_complete < mp_rv_ncpus) { - cpu_pause(); + if (mp_spin_timeout(tsc_spin_start)) + panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64()); } /* Tidy up */ @@ -894,38 +990,204 @@ mp_rendezvous_no_intrs( arg); } + +typedef struct { + queue_chain_t link; /* queue linkage */ + void (*func)(void *,void *); /* routine to call */ + void *arg0; /* routine's 1st arg */ + void *arg1; /* routine's 2nd arg */ + cpumask_t *maskp; /* completion response mask */ +} mp_call_t; + + +typedef struct { + queue_head_t queue; + decl_simple_lock_data(, lock); +} mp_call_queue_t; +#define MP_CPUS_CALL_BUFS_PER_CPU MAX_CPUS +static mp_call_queue_t mp_cpus_call_freelist; +static mp_call_queue_t mp_cpus_call_head[MAX_CPUS]; + +static inline boolean_t +mp_call_head_lock(mp_call_queue_t *cqp) +{ + boolean_t intrs_enabled; + + intrs_enabled = ml_set_interrupts_enabled(FALSE); + simple_lock(&cqp->lock); + + return intrs_enabled; +} + +/* + * Deliver an NMIPI to a set of processors to cause them to panic . 
+ */ +void -handle_pending_TLB_flushes(void) +NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why) { + unsigned int cpu, cpu_bit; + uint64_t deadline; + + NMIPI_enable(TRUE); + NMI_panic_reason = why; + + for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { + if ((cpu_mask & cpu_bit) == 0) + continue; + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(cpu); + } + + /* Wait (only so long) for NMi'ed cpus to respond */ + deadline = mach_absolute_time() + LockTimeOut; + for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { + if ((cpu_mask & cpu_bit) == 0) + continue; + while (!cpu_datap(cpu)->cpu_NMI_acknowledged && + mach_absolute_time() < deadline) { + cpu_pause(); + } + } +} + +#if MACH_ASSERT +static inline boolean_t +mp_call_head_is_locked(mp_call_queue_t *cqp) { - volatile int *my_word = &current_cpu_datap()->cpu_signals; + return !ml_get_interrupts_enabled() && + hw_lock_held((hw_lock_t)&cqp->lock); +} +#endif - if (i_bit(MP_TLB_FLUSH, my_word)) { - DBGLOG(cpu_handle, cpu_number(), MP_TLB_FLUSH); - i_bit_clear(MP_TLB_FLUSH, my_word); - pmap_update_interrupt(); +static inline void +mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled) +{ + simple_unlock(&cqp->lock); + ml_set_interrupts_enabled(intrs_enabled); +} + +static inline mp_call_t * +mp_call_alloc(void) +{ + mp_call_t *callp = NULL; + boolean_t intrs_enabled; + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + intrs_enabled = mp_call_head_lock(cqp); + if (!queue_empty(&cqp->queue)) + queue_remove_first(&cqp->queue, callp, typeof(callp), link); + mp_call_head_unlock(cqp, intrs_enabled); + + return callp; +} + +static inline void +mp_call_free(mp_call_t *callp) +{ + boolean_t intrs_enabled; + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + intrs_enabled = mp_call_head_lock(cqp); + queue_enter_first(&cqp->queue, callp, typeof(callp), link); + mp_call_head_unlock(cqp, intrs_enabled); +} + +static inline mp_call_t * +mp_call_dequeue_locked(mp_call_queue_t 
*cqp) +{ + mp_call_t *callp = NULL; + + assert(mp_call_head_is_locked(cqp)); + if (!queue_empty(&cqp->queue)) + queue_remove_first(&cqp->queue, callp, typeof(callp), link); + return callp; +} + +static inline void +mp_call_enqueue_locked( + mp_call_queue_t *cqp, + mp_call_t *callp) +{ + queue_enter(&cqp->queue, callp, typeof(callp), link); +} + +/* Called on the boot processor to initialize global structures */ +static void +mp_cpus_call_init(void) +{ + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + DBG("mp_cpus_call_init()\n"); + simple_lock_init(&cqp->lock, 0); + queue_init(&cqp->queue); +} + +/* + * Called at processor registration to add call buffers to the free list + * and to initialize the per-cpu call queue. + */ +void +mp_cpus_call_cpu_init(int cpu) +{ + int i; + mp_call_queue_t *cqp = &mp_cpus_call_head[cpu]; + mp_call_t *callp; + + simple_lock_init(&cqp->lock, 0); + queue_init(&cqp->queue); + for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) { + callp = (mp_call_t *) kalloc(sizeof(mp_call_t)); + mp_call_free(callp); } + + DBG("mp_cpus_call_init(%d) done\n", cpu); } /* * This is called from cpu_signal_handler() to process an MP_CALL signal. + * And also from i386_deactivate_cpu() when a cpu is being taken offline. 
*/ static void mp_cpus_call_action(void) { - if (mp_rv_action_func != NULL) - mp_rv_action_func(mp_rv_func_arg); - atomic_incl(&mp_rv_complete, 1); + mp_call_queue_t *cqp; + boolean_t intrs_enabled; + mp_call_t *callp; + mp_call_t call; + + assert(!ml_get_interrupts_enabled()); + cqp = &mp_cpus_call_head[cpu_number()]; + intrs_enabled = mp_call_head_lock(cqp); + while ((callp = mp_call_dequeue_locked(cqp)) != NULL) { + /* Copy call request to the stack to free buffer */ + call = *callp; + mp_call_free(callp); + if (call.func != NULL) { + mp_call_head_unlock(cqp, intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_ACTION, + VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0), + VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0); + call.func(call.arg0, call.arg1); + (void) mp_call_head_lock(cqp); + } + if (call.maskp != NULL) + i_bit_set(cpu_number(), call.maskp); + } + mp_call_head_unlock(cqp, intrs_enabled); } /* * mp_cpus_call() runs a given function on cpus specified in a given cpu mask. - * If the mode is SYNC, the function is called serially on the target cpus - * in logical cpu order. If the mode is ASYNC, the function is called in - * parallel over the specified cpus. + * Possible modes are: + * SYNC: function is called serially on target cpus in logical cpu order + * waiting for each call to be acknowledged before proceeding + * ASYNC: function call is queued to the specified cpus + * waiting for all calls to complete in parallel before returning + * NOSYNC: function calls are queued + * but we return before confirmation of calls completing. * The action function may be NULL. * The cpu mask may include the local cpu. Offline cpus are ignored. - * Return does not occur until the function has completed on all cpus. - * The return value is the number of cpus on which the function was called. + * The return value is the number of cpus on which the call was made or queued. 
*/ cpu_t mp_cpus_call( @@ -934,35 +1196,99 @@ mp_cpus_call( void (*action_func)(void *), void *arg) { - cpu_t cpu; - boolean_t intrs_enabled = ml_get_interrupts_enabled(); + return mp_cpus_call1( + cpus, + mode, + (void (*)(void *,void *))action_func, + arg, + NULL, + NULL); +} + +static void +mp_cpus_call_wait(boolean_t intrs_enabled, + cpumask_t cpus_called, + cpumask_t *cpus_responded) +{ + mp_call_queue_t *cqp; + uint64_t tsc_spin_start; + + assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); + cqp = &mp_cpus_call_head[cpu_number()]; + + tsc_spin_start = rdtsc64(); + while (*cpus_responded != cpus_called) { + if (!intrs_enabled) { + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) + mp_cpus_call_action(); + cpu_signal_handler(NULL); + } + if (mp_spin_timeout(tsc_spin_start)) { + cpumask_t cpus_unresponsive; + + cpus_unresponsive = cpus_called & ~(*cpus_responded); + NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT); + panic("mp_cpus_call_wait() timeout, cpus: 0x%llx", + cpus_unresponsive); + } + } +} + +cpu_t +mp_cpus_call1( + cpumask_t cpus, + mp_sync_t mode, + void (*action_func)(void *, void *), + void *arg0, + void *arg1, + cpumask_t *cpus_calledp) +{ + cpu_t cpu = 0; + boolean_t intrs_enabled = FALSE; boolean_t call_self = FALSE; + cpumask_t cpus_called = 0; + cpumask_t cpus_responded = 0; + long cpus_call_count = 0; + uint64_t tsc_spin_start; + boolean_t topo_lock; + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL | DBG_FUNC_START, + cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1)); if (!smp_initialized) { if ((cpus & CPUMASK_SELF) == 0) - return 0; + goto out; if (action_func != NULL) { - (void) ml_set_interrupts_enabled(FALSE); - action_func(arg); + intrs_enabled = ml_set_interrupts_enabled(FALSE); + action_func(arg0, arg1); ml_set_interrupts_enabled(intrs_enabled); } - return 1; + call_self = TRUE; + goto out; } - - /* obtain rendezvous lock */ - 
simple_lock(&mp_rv_lock); - - /* Use the rendezvous data structures for this call */ - mp_rv_action_func = action_func; - mp_rv_func_arg = arg; - mp_rv_ncpus = 0; - mp_rv_complete = 0; - simple_lock(&x86_topo_lock); + /* + * Queue the call for each non-local requested cpu. + * This is performed under the topo lock to prevent changes to + * cpus online state and to prevent concurrent rendezvouses -- + * although an exception is made if we're calling only the master + * processor since that always remains active. Note: this exception + * is expected for longterm timer nosync cross-calls to the master cpu. + */ + mp_disable_preemption(); + intrs_enabled = ml_get_interrupts_enabled(); + topo_lock = (cpus != cpu_to_cpumask(master_cpu)); + if (topo_lock) { + ml_set_interrupts_enabled(FALSE); + (void) mp_safe_spin_lock(&x86_topo_lock); + } for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) { if (((cpu_to_cpumask(cpu) & cpus) == 0) || - !cpu_datap(cpu)->cpu_running) + !cpu_is_running(cpu)) continue; + tsc_spin_start = rdtsc64(); if (cpu == (cpu_t) cpu_number()) { /* * We don't IPI ourself and if calling asynchronously, @@ -970,62 +1296,97 @@ mp_cpus_call( */ call_self = TRUE; if (mode == SYNC && action_func != NULL) { - (void) ml_set_interrupts_enabled(FALSE); - action_func(arg); - ml_set_interrupts_enabled(intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_LOCAL, + VM_KERNEL_UNSLIDE(action_func), + VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0); + action_func(arg0, arg1); } } else { /* - * Bump count of other cpus called and signal this cpu. - * Note: we signal asynchronously regardless of mode - * because we wait on mp_rv_complete either here - * (if mode == SYNC) or later (if mode == ASYNC). - * While spinning, poll for TLB flushes if interrupts - * are disabled. + * Here to queue a call to cpu and IPI. 
*/ - mp_rv_ncpus++; + mp_call_t *callp = NULL; + mp_call_queue_t *cqp = &mp_cpus_call_head[cpu]; + boolean_t intrs_inner; + + queue_call: + if (callp == NULL) + callp = mp_call_alloc(); + intrs_inner = mp_call_head_lock(cqp); + if (callp == NULL) { + mp_call_head_unlock(cqp, intrs_inner); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_NOBUF, + cpu, 0, 0, 0, 0); + if (!intrs_inner) { + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) + mp_cpus_call_action(); + handle_pending_TLB_flushes(); + } + if (mp_spin_timeout(tsc_spin_start)) + panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx", + tsc_spin_start, rdtsc64()); + goto queue_call; + } + callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded; + callp->func = action_func; + callp->arg0 = arg0; + callp->arg1 = arg1; + mp_call_enqueue_locked(cqp, callp); + cpus_call_count++; + cpus_called |= cpu_to_cpumask(cpu); i386_signal_cpu(cpu, MP_CALL, ASYNC); + mp_call_head_unlock(cqp, intrs_inner); if (mode == SYNC) { - simple_unlock(&x86_topo_lock); - while (mp_rv_complete < mp_rv_ncpus) { - if (!intrs_enabled) - handle_pending_TLB_flushes(); - cpu_pause(); - } - simple_lock(&x86_topo_lock); + mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded); } } } - simple_unlock(&x86_topo_lock); + if (topo_lock) { + simple_unlock(&x86_topo_lock); + ml_set_interrupts_enabled(intrs_enabled); + } - /* - * If calls are being made asynchronously, - * make the local call now if needed, and then - * wait for all other cpus to finish their calls. 
- */ - if (mode == ASYNC) { - if (call_self && action_func != NULL) { - (void) ml_set_interrupts_enabled(FALSE); - action_func(arg); + /* Call locally if mode not SYNC */ + if (mode != SYNC && call_self ) { + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_LOCAL, + VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0); + if (action_func != NULL) { + ml_set_interrupts_enabled(FALSE); + action_func(arg0, arg1); ml_set_interrupts_enabled(intrs_enabled); } - while (mp_rv_complete < mp_rv_ncpus) { - if (!intrs_enabled) - handle_pending_TLB_flushes(); - cpu_pause(); - } } - - /* Determine the number of cpus called */ - cpu = mp_rv_ncpus + (call_self ? 1 : 0); - simple_unlock(&mp_rv_lock); + /* For ASYNC, now wait for all signaled cpus to complete their calls */ + if (mode == ASYNC) + mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded); + + /* Safe to allow pre-emption now */ + mp_enable_preemption(); + +out: + if (call_self){ + cpus_called |= cpu_to_cpumask(cpu); + cpus_call_count++; + } + + if (cpus_calledp) + *cpus_calledp = cpus_called; - return cpu; + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL | DBG_FUNC_END, + cpus_call_count, cpus_called, 0, 0, 0); + + return (cpu_t) cpus_call_count; } + static void -mp_broadcast_action(void) +mp_broadcast_action(__unused void *null) { /* call action function */ if (mp_bc_action_func != NULL) @@ -1064,16 +1425,14 @@ mp_broadcast( /* * signal other processors, which will call mp_broadcast_action() */ - simple_lock(&x86_topo_lock); - mp_bc_ncpus = i386_active_cpus(); /* total including this cpu */ - mp_bc_count = mp_bc_ncpus; - i386_signal_cpus(MP_BROADCAST, ASYNC); + mp_bc_count = real_ncpus; /* assume max possible active */ + mp_bc_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, *mp_broadcast_action, NULL) + 1; + atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */ /* call executor function on this cpu */ - mp_broadcast_action(); - 
simple_unlock(&x86_topo_lock); + mp_broadcast_action(NULL); - /* block for all cpus to have run action_func */ + /* block for other cpus to have run action_func */ if (mp_bc_ncpus > 1) thread_block(THREAD_CONTINUE_NULL); else @@ -1083,6 +1442,30 @@ mp_broadcast( lck_mtx_unlock(&mp_bc_lock); } +void +mp_cpus_kick(cpumask_t cpus) +{ + cpu_t cpu; + boolean_t intrs_enabled = FALSE; + + intrs_enabled = ml_set_interrupts_enabled(FALSE); + mp_safe_spin_lock(&x86_topo_lock); + + for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) { + if ((cpu == (cpu_t) cpu_number()) + || ((cpu_to_cpumask(cpu) & cpus) == 0) + || !cpu_is_running(cpu)) + { + continue; + } + + lapic_send_ipi(cpu, LAPIC_VECTOR(KICK)); + } + + simple_unlock(&x86_topo_lock); + ml_set_interrupts_enabled(intrs_enabled); +} + void i386_activate_cpu(void) { @@ -1095,55 +1478,76 @@ i386_activate_cpu(void) return; } - simple_lock(&x86_topo_lock); + mp_safe_spin_lock(&x86_topo_lock); cdp->cpu_running = TRUE; started_cpu(); simple_unlock(&x86_topo_lock); + flush_tlb_raw(); } -extern void etimer_timer_expire(void *arg); - void i386_deactivate_cpu(void) { cpu_data_t *cdp = current_cpu_datap(); assert(!ml_get_interrupts_enabled()); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START, + 0, 0, 0, 0, 0); - simple_lock(&x86_topo_lock); + mp_safe_spin_lock(&x86_topo_lock); cdp->cpu_running = FALSE; simple_unlock(&x86_topo_lock); + /* + * Move all of this cpu's timers to the master/boot cpu, + * and poke it in case there's a sooner deadline for it to schedule. 
+ */ timer_queue_shutdown(&cdp->rtclock_timer.queue); - cdp->rtclock_timer.deadline = EndOfAllTime; - mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, etimer_timer_expire, NULL); + mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL); + +#if MONOTONIC + mt_cpu_down(cdp); +#endif /* MONOTONIC */ + + /* + * Open an interrupt window + * and ensure any pending IPI or timer is serviced + */ + mp_disable_preemption(); + ml_set_interrupts_enabled(TRUE); + while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) + cpu_pause(); /* - * In case a rendezvous/braodcast/call was initiated to this cpu - * before we cleared cpu_running, we must perform any actions due. + * Ensure there's no remaining timer deadline set + * - AICPM may have left one active. */ - if (i_bit(MP_RENDEZVOUS, &cdp->cpu_signals)) - mp_rendezvous_action(); - if (i_bit(MP_BROADCAST, &cdp->cpu_signals)) - mp_broadcast_action(); - if (i_bit(MP_CALL, &cdp->cpu_signals)) - mp_cpus_call_action(); - cdp->cpu_signals = 0; /* all clear */ + setPop(0); + + ml_set_interrupts_enabled(FALSE); + mp_enable_preemption(); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END, + 0, 0, 0, 0, 0); } int pmsafe_debug = 1; #if MACH_KDP volatile boolean_t mp_kdp_trap = FALSE; +volatile boolean_t mp_kdp_is_NMI = FALSE; volatile unsigned long mp_kdp_ncpus; boolean_t mp_kdp_state; void -mp_kdp_enter(void) +mp_kdp_enter(boolean_t proceed_on_failure) { unsigned int cpu; - unsigned int ncpus; + unsigned int ncpus = 0; unsigned int my_cpu; uint64_t tsc_timeout; @@ -1155,32 +1559,60 @@ mp_kdp_enter(void) * stopping others. 
*/ mp_kdp_state = ml_set_interrupts_enabled(FALSE); - simple_lock(&mp_kdp_lock); - debugger_entry_time = mach_absolute_time(); - if (pmsafe_debug && !kdp_snapshot) - pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + my_cpu = cpu_number(); + + if (my_cpu == (unsigned) debugger_cpu) { + kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n"); + kdp_reset(); + return; + } + + uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time(); + int locked = 0; + while (!locked || mp_kdp_trap) { + if (locked) { + simple_unlock(&x86_topo_lock); + } + if (proceed_on_failure) { + if (mach_absolute_time() - start_time > 500000000ll) { + kprintf("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n"); + break; + } + locked = simple_lock_try(&x86_topo_lock); + if (!locked) { + cpu_pause(); + } + } else { + mp_safe_spin_lock(&x86_topo_lock); + locked = TRUE; + } - while (mp_kdp_trap) { - simple_unlock(&mp_kdp_lock); - DBG("mp_kdp_enter() race lost\n"); + if (locked && mp_kdp_trap) { + simple_unlock(&x86_topo_lock); + DBG("mp_kdp_enter() race lost\n"); #if MACH_KDP - mp_kdp_wait(TRUE, FALSE); + mp_kdp_wait(TRUE, FALSE); #endif - simple_lock(&mp_kdp_lock); + locked = FALSE; + } } - my_cpu = cpu_number(); + + if (pmsafe_debug && !kdp_snapshot) + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + debugger_cpu = my_cpu; - mp_kdp_ncpus = 1; /* self */ + ncpus = 1; + atomic_incl((volatile long *)&mp_kdp_ncpus, 1); mp_kdp_trap = TRUE; - simple_unlock(&mp_kdp_lock); + debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time; /* * Deliver a nudge to other cpus, counting how many */ DBG("mp_kdp_enter() signaling other processors\n"); if (force_immediate_debugger_NMI == FALSE) { - for (ncpus = 1, cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu == my_cpu || !cpu_is_running(cpu)) continue; ncpus++; i386_signal_cpu(cpu, MP_KDP, ASYNC); @@ -1196,7 +1628,7 @@ 
mp_kdp_enter(void) * "unsafe-to-interrupt" points such as the trampolines, * but neither do we want to lose state by waiting too long. */ - tsc_timeout = rdtsc64() + (ncpus * 1000 * 1000); + tsc_timeout = rdtsc64() + (LockTimeOutTSC); while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { /* @@ -1209,26 +1641,47 @@ mp_kdp_enter(void) cpu_pause(); } /* If we've timed out, and some processor(s) are still unresponsive, - * interrupt them with an NMI via the local APIC. + * interrupt them with an NMI via the local APIC, iff a panic is + * in progress. */ + if (panic_active()) { + NMIPI_enable(TRUE); + } if (mp_kdp_ncpus != ncpus) { + cpumask_t cpus_NMI_pending = 0; + DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) + if (cpu == my_cpu || !cpu_is_running(cpu)) continue; - if (cpu_signal_pending(cpu, MP_KDP)) + if (cpu_signal_pending(cpu, MP_KDP)) { + cpus_NMI_pending |= cpu_to_cpumask(cpu); cpu_NMI_interrupt(cpu); + } + } + /* Wait again for the same timeout */ + tsc_timeout = rdtsc64() + (LockTimeOutTSC); + while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { + handle_pending_TLB_flushes(); + cpu_pause(); + } + if (mp_kdp_ncpus != ncpus) { + kdb_printf("mp_kdp_enter(): %llu, %lu, %u TIMED-OUT WAITING FOR NMI-ACK, PROCEEDING\n", cpus_NMI_pending, mp_kdp_ncpus, ncpus); } } } else for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) + if (cpu == my_cpu || !cpu_is_running(cpu)) continue; cpu_NMI_interrupt(cpu); } - DBG("mp_kdp_enter() %u processors done %s\n", - mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out"); + if (locked) { + simple_unlock(&x86_topo_lock); + } + + DBG("mp_kdp_enter() %d processors done %s\n", + (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? 
"OK" : "timed out"); postcode(MP_KDP_ENTER); } @@ -1280,9 +1733,8 @@ static void mp_kdp_wait(boolean_t flush, boolean_t isNMI) { DBG("mp_kdp_wait()\n"); - /* If an I/O port has been specified as a debugging aid, issue a read */ - panic_io_port_read(); + current_cpu_datap()->debugger_ipi_time = mach_absolute_time(); #if CONFIG_MCA /* If we've trapped due to a machine-check, save MCA registers */ mca_check_save(); @@ -1317,7 +1769,7 @@ mp_kdp_exit(void) debugger_exit_time = mach_absolute_time(); mp_kdp_trap = FALSE; - __asm__ volatile("mfence"); + mfence(); /* Wait other processors to stop spinning. XXX needs timeout */ DBG("mp_kdp_exit() waiting for processors to resume\n"); @@ -1335,16 +1787,20 @@ mp_kdp_exit(void) if (pmsafe_debug && !kdp_snapshot) pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL); + debugger_exit_time = mach_absolute_time(); + DBG("mp_kdp_exit() done\n"); (void) ml_set_interrupts_enabled(mp_kdp_state); - postcode(0); + postcode(MP_KDP_EXIT); } + #endif /* MACH_KDP */ boolean_t -mp_recent_debugger_activity() { - return (((mach_absolute_time() - debugger_entry_time) < LastDebuggerEntryAllowance) || - ((mach_absolute_time() - debugger_exit_time) < LastDebuggerEntryAllowance)); +mp_recent_debugger_activity(void) { + uint64_t abstime = mach_absolute_time(); + return (((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) || + ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance)); } /*ARGSUSED*/ @@ -1362,176 +1818,244 @@ cause_ast_check( if (cpu != cpu_number()) { i386_signal_cpu(cpu, MP_AST, ASYNC); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0); } } -#if MACH_KDB -/* - * invoke kdb on slave processors - */ - void -remote_kdb(void) +slave_machine_init(void *param) { - unsigned int my_cpu = cpu_number(); - unsigned int cpu; - int kdb_ncpus; - uint64_t tsc_timeout = 0; - - mp_kdb_trap = TRUE; - mp_kdb_ncpus = 1; - for (kdb_ncpus = 1, cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || 
!cpu_datap(cpu)->cpu_running) - continue; - kdb_ncpus++; - i386_signal_cpu(cpu, MP_KDB, ASYNC); - } - DBG("remote_kdb() waiting for (%d) processors to suspend\n",kdb_ncpus); - - tsc_timeout = rdtsc64() + (kdb_ncpus * 100 * 1000 * 1000); + /* + * Here in process context, but with interrupts disabled. + */ + DBG("slave_machine_init() CPU%d\n", get_cpu_number()); - while (mp_kdb_ncpus != kdb_ncpus && rdtsc64() < tsc_timeout) { - /* - * a TLB shootdown request may be pending... this would result in the requesting - * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. - * Process it, so it can now enter mp_kdp_wait() + if (param == FULL_SLAVE_INIT) { + /* + * Cold start */ - handle_pending_TLB_flushes(); - - cpu_pause(); + clock_init(); } - DBG("mp_kdp_enter() %d processors done %s\n", - mp_kdb_ncpus, (mp_kdb_ncpus == kdb_ncpus) ? "OK" : "timed out"); + cpu_machine_init(); /* Interrupts enabled hereafter */ +} + +#undef cpu_number +int cpu_number(void) +{ + return get_cpu_number(); } static void -mp_kdb_wait(void) +cpu_prewarm_init() { - DBG("mp_kdb_wait()\n"); + int i; - /* If an I/O port has been specified as a debugging aid, issue a read */ - panic_io_port_read(); + simple_lock_init(&cpu_warm_lock, 0); + queue_init(&cpu_warm_call_list); + for (i = 0; i < NUM_CPU_WARM_CALLS; i++) { + enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]); + } +} - atomic_incl(&mp_kdb_ncpus, 1); - while (mp_kdb_trap) { - /* - * a TLB shootdown request may be pending... this would result in the requesting - * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. 
- * Process it, so it can now enter mp_kdp_wait() - */ - handle_pending_TLB_flushes(); +static timer_call_t +grab_warm_timer_call() +{ + spl_t x; + timer_call_t call = NULL; - cpu_pause(); + x = splsched(); + simple_lock(&cpu_warm_lock); + if (!queue_empty(&cpu_warm_call_list)) { + call = (timer_call_t) dequeue_head(&cpu_warm_call_list); } - atomic_decl((volatile long *)&mp_kdb_ncpus, 1); - DBG("mp_kdb_wait() done\n"); + simple_unlock(&cpu_warm_lock); + splx(x); + + return call; +} + +static void +free_warm_timer_call(timer_call_t call) +{ + spl_t x; + + x = splsched(); + simple_lock(&cpu_warm_lock); + enqueue_head(&cpu_warm_call_list, (queue_entry_t)call); + simple_unlock(&cpu_warm_lock); + splx(x); } /* - * Clear kdb interrupt + * Runs in timer call context (interrupts disabled). */ - -void -clear_kdb_intr(void) +static void +cpu_warm_timer_call_func( + call_entry_param_t p0, + __unused call_entry_param_t p1) { - mp_disable_preemption(); - i_bit_clear(MP_KDB, ¤t_cpu_datap()->cpu_signals); - mp_enable_preemption(); + free_warm_timer_call((timer_call_t)p0); + return; } -void -mp_kdb_exit(void) +/* + * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0). + */ +static void +_cpu_warm_setup( + void *arg) { - DBG("mp_kdb_exit()\n"); - atomic_decl((volatile long *)&mp_kdb_ncpus, 1); - mp_kdb_trap = FALSE; - __asm__ volatile("mfence"); + cpu_warm_data_t cwdp = (cpu_warm_data_t)arg; - while (mp_kdb_ncpus > 0) { - /* - * a TLB shootdown request may be pending... this would result in the requesting - * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. 
- * Process it, so it can now enter mp_kdp_wait() - */ - handle_pending_TLB_flushes(); - - cpu_pause(); - } + timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL); + cwdp->cwd_result = 0; - DBG("mp_kdb_exit() done\n"); + return; } -#endif /* MACH_KDB */ - -void -slave_machine_init(void *param) +/* + * Not safe to call with interrupts disabled. + */ +kern_return_t +ml_interrupt_prewarm( + uint64_t deadline) { + struct cpu_warm_data cwd; + timer_call_t call; + cpu_t ct; + + if (ml_get_interrupts_enabled() == FALSE) { + panic("%s: Interrupts disabled?\n", __FUNCTION__); + } + + /* + * If the platform doesn't need our help, say that we succeeded. + */ + if (!ml_get_interrupt_prewake_applicable()) { + return KERN_SUCCESS; + } + /* - * Here in process context, but with interrupts disabled. + * Grab a timer call to use. */ - DBG("slave_machine_init() CPU%d\n", get_cpu_number()); + call = grab_warm_timer_call(); + if (call == NULL) { + return KERN_RESOURCE_SHORTAGE; + } - if (param == FULL_SLAVE_INIT) { - /* - * Cold start - */ - clock_init(); + timer_call_setup(call, cpu_warm_timer_call_func, call); + cwd.cwd_call = call; + cwd.cwd_deadline = deadline; + cwd.cwd_result = 0; - cpu_machine_init(); /* Interrupts enabled hereafter */ + /* + * For now, non-local interrupts happen on the master processor. 
+ */ + ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd); + if (ct == 0) { + free_warm_timer_call(call); + return KERN_FAILURE; + } else { + return cwd.cwd_result; } } -#undef cpu_number -int cpu_number(void) +#if DEBUG || DEVELOPMENT +void +kernel_spin(uint64_t spin_ns) { - return get_cpu_number(); -} + boolean_t istate; + uint64_t spin_abs; + uint64_t deadline; + cpu_data_t *cdp; -#if MACH_KDB -#include + kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns); + istate = ml_set_interrupts_enabled(FALSE); + cdp = current_cpu_datap(); + nanoseconds_to_absolutetime(spin_ns, &spin_abs); -#define TRAP_DEBUG 0 /* Must match interrupt.s and spl.s */ + /* Fake interrupt handler entry for testing mp_interrupt_watchdog() */ + cdp->cpu_int_event_time = mach_absolute_time(); + cdp->cpu_int_state = (void *) USER_STATE(current_thread()); + deadline = mach_absolute_time() + spin_ns; + while (mach_absolute_time() < deadline) + cpu_pause(); -#if TRAP_DEBUG -#define MTRAPS 100 -struct mp_trap_hist_struct { - unsigned char type; - unsigned char data[5]; -} trap_hist[MTRAPS], *cur_trap_hist = trap_hist, - *max_trap_hist = &trap_hist[MTRAPS]; + cdp->cpu_int_event_time = 0; + cdp->cpu_int_state = NULL; -void db_trap_hist(void); + ml_set_interrupts_enabled(istate); + kprintf("kernel_spin() continuing\n"); +} /* - * SPL: - * 1: new spl - * 2: old spl - * 3: new tpr - * 4: old tpr - * INT: - * 1: int vec - * 2: old spl - * 3: new spl - * 4: post eoi tpr - * 5: exit tpr + * Called from the scheduler's maintenance thread, + * scan running processors for long-running ISRs and: + * - panic if longer than LockTimeOut, or + * - log if more than a quantum. 
*/ - void -db_trap_hist(void) +mp_interrupt_watchdog(void) { - int i,j; - for(i=0;i=cur_trap_hist)?"*":" ", - (trap_hist[i].type == 1)?"SPL":"INT"); - for(j=0;j<5;j++) - db_printf(" %02x", trap_hist[i].data[j]); - db_printf("\n"); - } - -} -#endif /* TRAP_DEBUG */ -#endif /* MACH_KDB */ + cpu_t cpu; + boolean_t intrs_enabled = FALSE; + uint16_t cpu_int_num; + uint64_t cpu_int_event_time; + uint64_t cpu_rip; + uint64_t cpu_int_duration; + uint64_t now; + x86_saved_state_t *cpu_int_state; + + if (__improbable(!mp_interrupt_watchdog_enabled)) + return; + intrs_enabled = ml_set_interrupts_enabled(FALSE); + now = mach_absolute_time(); + /* + * While timeouts are not suspended, + * check all other processors for long outstanding interrupt handling. + */ + for (cpu = 0; + cpu < (cpu_t) real_ncpus && !machine_timeout_suspended(); + cpu++) { + if ((cpu == (cpu_t) cpu_number()) || + (!cpu_is_running(cpu))) + continue; + cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time; + if (cpu_int_event_time == 0) + continue; + if (__improbable(now < cpu_int_event_time)) + continue; /* skip due to inter-processor skew */ + cpu_int_state = cpu_datap(cpu)->cpu_int_state; + if (__improbable(cpu_int_state == NULL)) + /* The interrupt may have been dismissed */ + continue; + + /* Here with a cpu handling an interrupt */ + + cpu_int_duration = now - cpu_int_event_time; + if (__improbable(cpu_int_duration > LockTimeOut)) { + cpu_int_num = saved_state64(cpu_int_state)->isf.trapno; + cpu_rip = saved_state64(cpu_int_state)->isf.rip; + vector_timed_out = cpu_int_num; + NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG); + panic("Interrupt watchdog, " + "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx", + cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip); + /* NOT REACHED */ + } else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) { + mp_interrupt_watchdog_events++; + cpu_int_num = saved_state64(cpu_int_state)->isf.trapno; + cpu_rip = 
saved_state64(cpu_int_state)->isf.rip; + ml_set_interrupts_enabled(intrs_enabled); + printf("Interrupt watchdog, " + "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n", + cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip); + return; + } + } + + ml_set_interrupts_enabled(intrs_enabled); +} +#endif