X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/89b3af67bb32e691275bf6fa803d1834b2284115..HEAD:/osfmk/i386/trap.c diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 62bd38713..10b1ac10d 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,45 +22,44 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ /* */ + /* * Hardware trap/fault handler. */ -#include -#include #include #include @@ -69,7 +68,8 @@ #include #include #include -#include /* panic_io_port_read() */ +#include +#include #include #include @@ -88,470 +88,664 @@ #include #include #include - +#include +#if CONFIG_TELEMETRY +#include +#endif #include - -#if MACH_KGDB -#include -#endif /* MACH_KGDB */ - -#if MACH_KDB -#include -#include -#include -#include -#include -#endif /* MACH_KDB */ +#include +#include +#include #include -#include #include #include #include +#include +#if CONFIG_MCA +#include +#endif #include +#include +#include +#include + +extern void throttle_lowpri_io(int); +extern void kprint_state(x86_saved_state64_t *saved_state); +#if DEVELOPMENT || DEBUG +int insnstream_force_cacheline_mismatch = 0; +extern int panic_on_cacheline_mismatch; +extern char panic_on_trap_procname[]; +extern uint32_t panic_on_trap_mask; +#endif + +extern int insn_copyin_count; + /* * Forward declarations */ -static void user_page_fault_continue(kern_return_t kret); -static void panic_trap(x86_saved_state32_t *saved_state); -static void set_recovery_ip(x86_saved_state32_t *saved_state, vm_offset_t ip); +static void panic_trap(x86_saved_state64_t *saved_state, uint32_t pl, kern_return_t fault_result) __dead2; +static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip); +#if DEVELOPMENT || DEBUG +static __attribute__((noinline)) void copy_instruction_stream(thread_t thread, uint64_t rip, int trap_code, bool inspect_cacheline); +#else +static __attribute__((noinline)) void copy_instruction_stream(thread_t thread, uint64_t rip, int trap_code); +#endif -perfCallback perfTrapHook = NULL; /* Pointer to CHUD trap hook routine */ -perfCallback perfASTHook = NULL; /* Pointer to CHUD AST hook routine */ +#if CONFIG_DTRACE +/* See */ +perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */ +extern boolean_t dtrace_tally_fault(user_addr_t); +extern boolean_t dtrace_handle_trap(int, x86_saved_state_t *); +#endif + +#ifdef MACH_BSD +extern char * proc_name_address(void *p); +#endif /* MACH_BSD */ + +extern boolean_t pmap_smep_enabled; +extern boolean_t pmap_smap_enabled; + +__attribute__((noreturn)) void thread_syscall_return( - kern_return_t ret) + kern_return_t ret) { - thread_t thr_act = current_thread(); + thread_t thr_act = current_thread(); + boolean_t is_mach; + int code; + + pal_register_cache_state(thr_act, DIRTY); + + if (thread_is_64bit_addr(thr_act)) { + x86_saved_state64_t *regs; - if (thread_is_64bit(thr_act)) { - x86_saved_state64_t *regs; - regs = USER_REGS64(thr_act); - if (kdebug_enable && ((regs->rax & SYSCALL_CLASS_MASK) == (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT))) { - /* Mach trap */ - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_SC, ((int) (regs->rax & SYSCALL_NUMBER_MASK))) - | DBG_FUNC_END, - ret, 0, 0, 0, 0); + code = (int) (regs->rax & SYSCALL_NUMBER_MASK); + is_mach = (regs->rax & SYSCALL_CLASS_MASK) + == (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT); + if (kdebug_enable && is_mach) { + /* Mach trap */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC, code) | DBG_FUNC_END, + ret, 0, 0, 0, 0); } regs->rax = ret; - +#if DEBUG + if (is_mach) { + DEBUG_KPRINT_SYSCALL_MACH( + "thread_syscall_return: 64-bit mach ret=%u\n", + ret); + } else { + DEBUG_KPRINT_SYSCALL_UNIX( + "thread_syscall_return: 64-bit unix ret=%u\n", + ret); + } +#endif } else { - x86_saved_state32_t *regs; - + x86_saved_state32_t *regs; + regs = USER_REGS32(thr_act); - if (kdebug_enable && ((int) regs->eax < 0)) { - /* Mach trap */ - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_SC, -((int) regs->eax)) - | DBG_FUNC_END, - ret, 0, 0, 0, 0); + code = ((int) regs->eax); + is_mach = (code < 0); + if (kdebug_enable && is_mach) { + /* Mach trap */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC, -code) | DBG_FUNC_END, + ret, 0, 0, 0, 0); } regs->eax = ret; +#if DEBUG + if (is_mach) { + DEBUG_KPRINT_SYSCALL_MACH( + "thread_syscall_return: 32-bit mach ret=%u\n", + ret); + } else { + DEBUG_KPRINT_SYSCALL_UNIX( + "thread_syscall_return: 32-bit unix ret=%u\n", + ret); + } +#endif } - thread_exception_return(); - /*NOTREACHED*/ + +#if DEBUG || DEVELOPMENT + kern_allocation_name_t + prior __assert_only = thread_get_kernel_state(thr_act)->allocation_name; + assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior)); +#endif /* DEBUG || DEVELOPMENT */ + + throttle_lowpri_io(1); + + thread_exception_return(); + /*NOTREACHED*/ } +/* + * Fault recovery in copyin/copyout routines. + */ +struct recovery { + uintptr_t fault_addr; + uintptr_t recover_addr; +}; + +extern struct recovery recover_table[]; +extern struct recovery recover_table_end[]; -#if MACH_KDB -boolean_t debug_all_traps_with_kdb = FALSE; -extern struct db_watchpoint *db_watchpoint_list; -extern boolean_t db_watchpoints_inserted; -extern boolean_t db_breakpoints_inserted; +const char * trap_type[] = {TRAP_NAMES}; +unsigned TRAP_TYPES = sizeof(trap_type) / sizeof(trap_type[0]); +extern void PE_incoming_interrupt(int interrupt); + +#if defined(__x86_64__) && DEBUG void -thread_kdb_return(void) +kprint_state(x86_saved_state64_t *saved_state) { - thread_t thr_act = current_thread(); - x86_saved_state_t *iss = USER_STATE(thr_act); + kprintf("current_cpu_datap() 0x%lx\n", (uintptr_t)current_cpu_datap()); + kprintf("Current GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_GS_BASE)); + kprintf("Kernel GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_KERNEL_GS_BASE)); + kprintf("state at 0x%lx:\n", (uintptr_t) saved_state); + + kprintf(" rdi 0x%llx\n", saved_state->rdi); + kprintf(" rsi 0x%llx\n", saved_state->rsi); + kprintf(" rdx 0x%llx\n", saved_state->rdx); + kprintf(" r10 0x%llx\n", saved_state->r10); + kprintf(" r8 0x%llx\n", saved_state->r8); + kprintf(" r9 0x%llx\n", saved_state->r9); + + kprintf(" cr2 0x%llx\n", saved_state->cr2); + kprintf("real cr2 0x%lx\n", get_cr2()); + kprintf(" r15 0x%llx\n", saved_state->r15); + kprintf(" r14 0x%llx\n", saved_state->r14); + kprintf(" r13 0x%llx\n", saved_state->r13); + kprintf(" r12 0x%llx\n", saved_state->r12); + kprintf(" r11 0x%llx\n", saved_state->r11); + kprintf(" rbp 0x%llx\n", saved_state->rbp); + kprintf(" rbx 0x%llx\n", saved_state->rbx); + kprintf(" rcx 0x%llx\n", saved_state->rcx); + kprintf(" rax 0x%llx\n", saved_state->rax); + + kprintf(" gs 0x%x\n", saved_state->gs); + kprintf(" fs 0x%x\n", saved_state->fs); + + kprintf(" isf.trapno 0x%x\n", saved_state->isf.trapno); + kprintf(" isf._pad 0x%x\n", saved_state->isf._pad); + kprintf(" isf.trapfn 0x%llx\n", saved_state->isf.trapfn); + kprintf(" isf.err 0x%llx\n", saved_state->isf.err); + kprintf(" isf.rip 0x%llx\n", saved_state->isf.rip); + kprintf(" isf.cs 0x%llx\n", saved_state->isf.cs); + kprintf(" isf.rflags 0x%llx\n", saved_state->isf.rflags); + kprintf(" isf.rsp 0x%llx\n", saved_state->isf.rsp); + kprintf(" isf.ss 0x%llx\n", saved_state->isf.ss); +} +#endif - if (is_saved_state64(iss)) { - x86_saved_state64_t *regs; - - regs = saved_state64(iss); - if (kdb_trap(regs->isf.trapno, (int)regs->isf.err, (void *)regs)) { - thread_exception_return(); - /*NOTREACHED*/ - } +/* + * Non-zero indicates latency assert is enabled and capped at valued + * absolute time units. + */ + +uint64_t interrupt_latency_cap = 0; +boolean_t ilat_assert = FALSE; +void +interrupt_latency_tracker_setup(void) +{ + uint32_t ilat_cap_us; + if (PE_parse_boot_argn("interrupt_latency_cap_us", &ilat_cap_us, sizeof(ilat_cap_us))) { + interrupt_latency_cap = ilat_cap_us * NSEC_PER_USEC; + nanoseconds_to_absolutetime(interrupt_latency_cap, &interrupt_latency_cap); } else { - x86_saved_state32_t *regs; - - regs = saved_state32(iss); + interrupt_latency_cap = LockTimeOut; + } + PE_parse_boot_argn("-interrupt_latency_assert_enable", &ilat_assert, sizeof(ilat_assert)); +} - if (kdb_trap(regs->trapno, regs->err, (void *)regs)) { - thread_exception_return(); - /*NOTREACHED*/ +void +interrupt_reset_latency_stats(void) +{ + uint32_t i; + for (i = 0; i < real_ncpus; i++) { + cpu_data_ptr[i]->cpu_max_observed_int_latency = + cpu_data_ptr[i]->cpu_max_observed_int_latency_vector = 0; + } +} + +void +interrupt_populate_latency_stats(char *buf, unsigned bufsize) +{ + uint32_t i, tcpu = ~0; + uint64_t cur_max = 0; + + for (i = 0; i < real_ncpus; i++) { + if (cur_max < cpu_data_ptr[i]->cpu_max_observed_int_latency) { + cur_max = cpu_data_ptr[i]->cpu_max_observed_int_latency; + tcpu = i; } } + + if (tcpu < real_ncpus) { + snprintf(buf, bufsize, "0x%x 0x%x 0x%llx", tcpu, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency_vector, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency); + } } -#endif /* MACH_KDB */ +uint32_t interrupt_timer_coalescing_enabled = 1; +uint64_t interrupt_coalesced_timers; +/* + * Handle interrupts: + * - local APIC interrupts (IPIs, timers, etc) are handled by the kernel, + * - device interrupts go to the platform expert. + */ void -user_page_fault_continue( - kern_return_t kr) +interrupt(x86_saved_state_t *state) { - thread_t thread = current_thread(); - x86_saved_state_t *regs = USER_STATE(thread); - ast_t *myast; - boolean_t intr; - user_addr_t vaddr; -#if MACH_KDB - int err; - int trapno; + uint64_t rip; + uint64_t rsp; + int interrupt_num; + boolean_t user_mode = FALSE; + int ipl; + int cnum = cpu_number(); + cpu_data_t *cdp = cpu_data_ptr[cnum]; + int itype = DBG_INTR_TYPE_UNKNOWN; + int handled; + + x86_saved_state64_t *state64 = saved_state64(state); + rip = state64->isf.rip; + rsp = state64->isf.rsp; + interrupt_num = state64->isf.trapno; + if (state64->isf.cs & 0x03) { + user_mode = TRUE; + } + +#if DEVELOPMENT || DEBUG + uint64_t frameptr = is_saved_state64(state) ? state64->rbp : saved_state32(state)->ebp; + uint32_t traptrace_index = traptrace_start(interrupt_num, rip, mach_absolute_time(), frameptr); #endif - assert((is_saved_state32(regs) && !thread_is_64bit(thread)) || - (is_saved_state64(regs) && thread_is_64bit(thread))); + if (cpu_data_ptr[cnum]->lcpu.package->num_idle == topoParms.nLThreadsPerPackage) { + cpu_data_ptr[cnum]->cpu_hwIntpexits[interrupt_num]++; + } - if (thread_is_64bit(thread)) { - x86_saved_state64_t *uregs; + if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT)) { + itype = DBG_INTR_TYPE_IPI; + } else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT)) { + itype = DBG_INTR_TYPE_TIMER; + } else { + itype = DBG_INTR_TYPE_OTHER; + } - uregs = USER_REGS64(thread); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START, + interrupt_num, + (user_mode ? rip : VM_KERNEL_UNSLIDE(rip)), + user_mode, itype, 0); -#if MACH_KDB - trapno = uregs->isf.trapno; - err = uregs->isf.err; + SCHED_STATS_INC(interrupt_count); + +#if CONFIG_TELEMETRY + if (telemetry_needs_record) { + telemetry_mark_curthread(user_mode, FALSE); + } #endif - vaddr = (user_addr_t)uregs->cr2; - } else { - x86_saved_state32_t *uregs; - uregs = USER_REGS32(thread); + ipl = get_preemption_level(); -#if MACH_KDB - trapno = uregs->trapno; - err = uregs->err; -#endif - vaddr = uregs->cr2; + /* + * Handle local APIC interrupts + * else call platform expert for devices. + */ + handled = lapic_interrupt(interrupt_num, state); + + if (!handled) { + if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_CMCI_INTERRUPT)) { + /* + * CMCI can be signalled on any logical processor, and the kexts + * that implement handling CMCI use IOKit to register handlers for + * the CMCI vector, so if we see a CMCI, do not encode a CPU + * number in bits 8:31 (since the vector is the same regardless of + * the handling CPU). + */ + PE_incoming_interrupt(interrupt_num); + } else if (cnum <= lapic_max_interrupt_cpunum) { + PE_incoming_interrupt((cnum << 8) | interrupt_num); + } + } + + if (__improbable(get_preemption_level() != ipl)) { + panic("Preemption level altered by interrupt vector 0x%x: initial 0x%x, final: 0x%x\n", interrupt_num, ipl, get_preemption_level()); } - if ((kr == KERN_SUCCESS) || (kr == KERN_ABORTED)) { -#if MACH_KDB - if (!db_breakpoints_inserted) { - db_set_breakpoints(); + + if (__improbable(cdp->cpu_nested_istack)) { + cdp->cpu_nested_istack_events++; + } else { + uint64_t ctime = mach_absolute_time(); + uint64_t int_latency = ctime - cdp->cpu_int_event_time; + uint64_t esdeadline, ehdeadline; + /* Attempt to process deferred timers in the context of + * this interrupt, unless interrupt time has already exceeded + * TCOAL_ILAT_THRESHOLD. + */ +#define TCOAL_ILAT_THRESHOLD (30000ULL) + + if ((int_latency < TCOAL_ILAT_THRESHOLD) && + interrupt_timer_coalescing_enabled) { + esdeadline = cdp->rtclock_timer.queue.earliest_soft_deadline; + ehdeadline = cdp->rtclock_timer.deadline; + if ((ctime >= esdeadline) && (ctime < ehdeadline)) { + interrupt_coalesced_timers++; + TCOAL_DEBUG(0x88880000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, interrupt_coalesced_timers, 0); + rtclock_intr(state); + TCOAL_DEBUG(0x88880000 | DBG_FUNC_END, ctime, esdeadline, interrupt_coalesced_timers, 0, 0); + } else { + TCOAL_DEBUG(0x77770000, ctime, cdp->rtclock_timer.queue.earliest_soft_deadline, cdp->rtclock_timer.deadline, interrupt_coalesced_timers, 0); + } } - if (db_watchpoint_list && - db_watchpoints_inserted && - (err & T_PF_WRITE) && - db_find_watchpoint(thread->map, - (vm_offset_t)vaddr, - regs)) - kdb_trap(T_WATCHPOINT, 0, regs); -#endif /* MACH_KDB */ - intr = ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); - while (*myast & AST_ALL) { - ast_taken(AST_ALL, intr); - ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); + + if (__improbable(ilat_assert && (int_latency > interrupt_latency_cap) && !machine_timeout_suspended())) { + panic("Interrupt vector 0x%x exceeded interrupt latency threshold, 0x%llx absolute time delta, prior signals: 0x%x, current signals: 0x%x", interrupt_num, int_latency, cdp->cpu_prior_signals, cdp->cpu_signals); } - ml_set_interrupts_enabled(intr); - thread_exception_return(); - /*NOTREACHED*/ + if (__improbable(int_latency > cdp->cpu_max_observed_int_latency)) { + cdp->cpu_max_observed_int_latency = int_latency; + cdp->cpu_max_observed_int_latency_vector = interrupt_num; + } } -#if MACH_KDB - if (debug_all_traps_with_kdb && - kdb_trap(trapno, err, regs)) { - thread_exception_return(); - /*NOTREACHED*/ + /* + * Having serviced the interrupt first, look at the interrupted stack depth. + */ + if (!user_mode) { + uint64_t depth = cdp->cpu_kernel_stack + + sizeof(struct thread_kernel_state) + + sizeof(struct i386_exception_link *) + - rsp; + if (__improbable(depth > kernel_stack_depth_max)) { + kernel_stack_depth_max = (vm_offset_t)depth; + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DEPTH), + (long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0); + } } -#endif /* MACH_KDB */ - i386_exception(EXC_BAD_ACCESS, kr, vaddr); - /*NOTREACHED*/ -} + if (cnum == master_cpu) { + entropy_collect(); + } -/* - * Fault recovery in copyin/copyout routines. - */ -struct recovery { - uint32_t fault_addr; - uint32_t recover_addr; -}; +#if KPERF + kperf_interrupt(); +#endif /* KPERF */ -extern struct recovery recover_table[]; -extern struct recovery recover_table_end[]; + KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, + interrupt_num); -const char * trap_type[] = {TRAP_NAMES}; -unsigned TRAP_TYPES = sizeof(trap_type)/sizeof(trap_type[0]); + assert(ml_get_interrupts_enabled() == FALSE); + +#if DEVELOPMENT || DEBUG + if (traptrace_index != TRAPTRACE_INVALID_INDEX) { + traptrace_end(traptrace_index, mach_absolute_time()); + } +#endif +} static inline void reset_dr7(void) { - uint32_t dr7 = 0x400; /* magic dr7 reset value */ - __asm__ volatile("movl %0,%%dr7" : : "r" (dr7)); + long dr7 = 0x400; /* magic dr7 reset value; 32 bit on i386, 64 bit on x86_64 */ + __asm__ volatile ("mov %0,%%dr7" : : "r" (dr7)); } #if MACH_KDP unsigned kdp_has_active_watchpoints = 0; +#define NO_WATCHPOINTS (!kdp_has_active_watchpoints) +#else +#define NO_WATCHPOINTS 1 #endif /* * Trap from kernel mode. Only page-fault errors are recoverable, * and then only in special circumstances. All other errors are * fatal. Return value indicates if trap was handled. */ + void kernel_trap( - x86_saved_state_t *state) + x86_saved_state_t *state, + uintptr_t *lo_spp) { - x86_saved_state32_t *saved_state; - int code; - user_addr_t vaddr; - int type; - vm_map_t map; - kern_return_t result = KERN_FAILURE; - thread_t thread; - ast_t *myast; + x86_saved_state64_t *saved_state; + int code; + user_addr_t vaddr; + int type; + vm_map_t map = 0; /* protected by T_PAGE_FAULT */ + kern_return_t result = KERN_FAILURE; + kern_return_t fault_result = KERN_SUCCESS; + thread_t thread; boolean_t intr; - vm_prot_t prot; - struct recovery *rp; - vm_offset_t kern_ip; - int fault_in_copy_window = -1; - int is_user = 0; -#if MACH_KDB - pt_entry_t *pte; -#endif /* MACH_KDB */ + vm_prot_t prot; + struct recovery *rp; + vm_offset_t kern_ip; + int is_user; + int trap_pl = get_preemption_level(); thread = current_thread(); - if (is_saved_state64(state)) - panic("kernel_trap(%p) with 64-bit state", state); - saved_state = saved_state32(state); + if (__improbable(is_saved_state32(state))) { + panic("kernel_trap(%p) with 32-bit state", state); + } + saved_state = saved_state64(state); + + /* Record cpu where state was captured */ + saved_state->isf.cpu = cpu_number(); vaddr = (user_addr_t)saved_state->cr2; - type = saved_state->trapno; - code = saved_state->err & 0xffff; - intr = (saved_state->efl & EFL_IF) != 0; /* state of ints at trap */ - - kern_ip = (vm_offset_t)saved_state->eip; + type = saved_state->isf.trapno; + code = (int)(saved_state->isf.err & 0xffff); + intr = (saved_state->isf.rflags & EFL_IF) != 0; /* state of ints at trap */ + kern_ip = (vm_offset_t)saved_state->isf.rip; - myast = ast_pending(); + is_user = (vaddr < VM_MAX_USER_PAGE_ADDRESS); - if (perfASTHook) { - if (*myast & AST_CHUD_ALL) - perfASTHook(type, NULL, 0, 0); - } else - *myast &= ~AST_CHUD_ALL; +#if DEVELOPMENT || DEBUG + uint32_t traptrace_index = traptrace_start(type, kern_ip, mach_absolute_time(), saved_state->rbp); +#endif +#if CONFIG_DTRACE /* - * Is there a hook? + * Is there a DTrace hook? */ - if (perfTrapHook) { - if (perfTrapHook(type, NULL, 0, 0) == KERN_SUCCESS) { - /* + if (__improbable(tempDTraceTrapHook != NULL)) { + if (tempDTraceTrapHook(type, state, lo_spp, 0) == KERN_SUCCESS) { + /* * If it succeeds, we are done... */ - return; + goto common_return; + } + } + + /* Handle traps originated from probe context. */ + if (thread != THREAD_NULL && thread->t_dtrace_inprobe) { + if (dtrace_handle_trap(type, state)) { + goto common_return; } } + +#endif /* CONFIG_DTRACE */ + /* * we come here with interrupts off as we don't want to recurse * on preemption below. but we do want to re-enable interrupts * as soon we possibly can to hold latency down */ - if (T_PREEMPT == type) { + if (__improbable(T_PREEMPT == type)) { + ast_taken_kernel(); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, - 0, 0, 0, kern_ip, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, + 0, 0, 0, VM_KERNEL_UNSLIDE(kern_ip), 0); - ast_taken(AST_PREEMPTION, FALSE); - return; + goto common_return; } - + + user_addr_t kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, + (unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user, + VM_KERNEL_UNSLIDE(kern_ip), 0); + + if (T_PAGE_FAULT == type) { /* * assume we're faulting in the kernel map */ map = kernel_map; - if (thread != THREAD_NULL && thread->map != kernel_map) { - vm_offset_t copy_window_base; - vm_offset_t kvaddr; - int window_index; - - kvaddr = (vm_offset_t)vaddr; - /* - * must determine if fault occurred in - * the copy window while pre-emption is - * disabled for this processor so that - * we only need to look at the window - * associated with this processor + if (__probable((thread != THREAD_NULL) && (thread->map != kernel_map) && + (vaddr < VM_MAX_USER_PAGE_ADDRESS))) { + /* fault occurred in userspace */ + map = thread->map; + + /* Intercept a potential Supervisor Mode Execute + * Protection fault. These criteria identify + * both NX faults and SMEP faults, but both + * are fatal. We avoid checking PTEs (racy). + * (The VM could just redrive a SMEP fault, hence + * the intercept). */ - copy_window_base = current_cpu_datap()->cpu_copywindow_base; - - if (kvaddr >= copy_window_base && kvaddr < (copy_window_base + (NBPDE * NCOPY_WINDOWS)) ) { - - window_index = (kvaddr - copy_window_base) / NBPDE; - - if (thread->machine.copy_window[window_index].user_base != (user_addr_t)-1) { + if (__improbable((code == (T_PF_PROT | T_PF_EXECUTE)) && + (pmap_smep_enabled) && (saved_state->isf.rip == vaddr))) { + goto debugger_entry; + } - kvaddr -= (copy_window_base + (NBPDE * window_index)); - vaddr = thread->machine.copy_window[window_index].user_base + kvaddr; + /* + * Additionally check for SMAP faults... + * which are characterized by page-present and + * the AC bit unset (i.e. not from copyin/out path). + */ + if (__improbable(code & T_PF_PROT && + pmap_smap_enabled && + (saved_state->isf.rflags & EFL_AC) == 0)) { + goto debugger_entry; + } - map = thread->map; - fault_in_copy_window = window_index; - } - is_user = -1; + /* + * If we're not sharing cr3 with the user + * and we faulted in copyio, + * then switch cr3 here and dismiss the fault. + */ + if (no_shared_cr3 && + (thread->machine.specFlags & CopyIOActive) && + map->pmap->pm_cr3 != get_cr3_base()) { + pmap_assert(current_cpu_datap()->cpu_pmap_pcid_enabled == FALSE); + set_cr3_raw(map->pmap->pm_cr3); + return; + } + if (__improbable(vaddr < PAGE_SIZE) && + ((thread->machine.specFlags & CopyIOActive) == 0)) { + goto debugger_entry; } } } - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, - (int)(vaddr >> 32), (int)vaddr, is_user, kern_ip, 0); - (void) ml_set_interrupts_enabled(intr); switch (type) { - - case T_NO_FPU: + case T_NO_FPU: fpnoextflt(); - return; + goto common_return; - case T_FPU_FAULT: + case T_FPU_FAULT: fpextovrflt(); - return; + goto common_return; - case T_FLOATING_POINT_ERROR: + case T_FLOATING_POINT_ERROR: fpexterrflt(); - return; - - case T_SSE_FLOAT_ERROR: - fpSSEexterrflt(); - return; - case T_DEBUG: - if ((saved_state->efl & EFL_TF) == 0 - && !kdp_has_active_watchpoints) { - /* We've somehow encountered a debug - * register match that does not belong - * to the kernel debugger. - * This isn't supposed to happen. - */ - reset_dr7(); - return; - } - goto debugger_entry; - case T_PAGE_FAULT: - /* - * If the current map is a submap of the kernel map, - * and the address is within that map, fault on that - * map. If the same check is done in vm_fault - * (vm_map_lookup), we may deadlock on the kernel map - * lock. - */ - - prot = VM_PROT_READ; - - if (code & T_PF_WRITE) - prot |= VM_PROT_WRITE; -#if PAE - if (code & T_PF_EXECUTE) - prot |= VM_PROT_EXECUTE; -#endif - -#if MACH_KDB - /* - * Check for watchpoint on kernel static data. - * vm_fault would fail in this case - */ - if (map == kernel_map && db_watchpoint_list && db_watchpoints_inserted && - (code & T_PF_WRITE) && vaddr < vm_map_max(map) && - ((*(pte = pmap_pte(kernel_pmap, (vm_map_offset_t)vaddr))) & INTEL_PTE_WRITE) == 0) { - pmap_store_pte( - pte, - *pte | INTEL_PTE_VALID | INTEL_PTE_WRITE); - /* XXX need invltlb here? */ - - result = KERN_SUCCESS; - goto look_for_watchpoints; - } -#endif /* MACH_KDB */ - - result = vm_fault(map, - vm_map_trunc_page(vaddr), - prot, - FALSE, - THREAD_UNINT, NULL, 0); - -#if MACH_KDB - if (result == KERN_SUCCESS) { - /* - * Look for watchpoints + goto common_return; + + case T_SSE_FLOAT_ERROR: + fpSSEexterrflt(); + goto common_return; + + case T_INVALID_OPCODE: + fpUDflt(kern_ip); + goto debugger_entry; + + case T_DEBUG: + if ((saved_state->isf.rflags & EFL_TF) == 0 && NO_WATCHPOINTS) { + /* We've somehow encountered a debug + * register match that does not belong + * to the kernel debugger. + * This isn't supposed to happen. */ -look_for_watchpoints: - if (map == kernel_map && db_watchpoint_list && db_watchpoints_inserted && (code & T_PF_WRITE) && - db_find_watchpoint(map, vaddr, saved_state)) - kdb_trap(T_WATCHPOINT, 0, saved_state); + reset_dr7(); + goto common_return; } -#endif /* MACH_KDB */ - - if (result == KERN_SUCCESS) { - - if (fault_in_copy_window != -1) { - pt_entry_t *updp; - pt_entry_t *kpdp; - + goto debugger_entry; + case T_INT3: + goto debugger_entry; + case T_PAGE_FAULT: + +#if CONFIG_DTRACE + if (thread != THREAD_NULL && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ + if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */ /* - * in case there was no page table assigned - * for the user base address and the pmap - * got 'expanded' due to this fault, we'll - * copy in the descriptor - * - * we're either setting the page table descriptor - * to the same value or it was 0... no need - * for a TLB flush in either case + * DTrace has "anticipated" the possibility of this fault, and has + * established the suitable recovery state. Drop down now into the + * recovery handling code in "case T_GENERAL_PROTECTION:". */ + goto FALL_THROUGH; + } + } +#endif /* CONFIG_DTRACE */ - ml_set_interrupts_enabled(FALSE); - updp = pmap_pde(map->pmap, thread->machine.copy_window[fault_in_copy_window].user_base); - assert(updp); - if (0 == updp) panic("trap: updp 0"); /* XXX DEBUG */ - kpdp = current_cpu_datap()->cpu_copywindow_pdp; - kpdp += fault_in_copy_window; + prot = VM_PROT_READ; -#if JOE_DEBUG - if (*kpdp && (*kpdp & PG_FRAME) != (*updp & PG_FRAME)) - panic("kernel_fault: user pdp doesn't match - updp = 0x%x, kpdp = 0x%x\n", updp, kpdp); -#endif - pmap_store_pte(kpdp, *updp); + if (code & T_PF_WRITE) { + prot |= VM_PROT_WRITE; + } + if (code & T_PF_EXECUTE) { + prot |= VM_PROT_EXECUTE; + } - (void) ml_set_interrupts_enabled(intr); - } - return; + fault_result = result = vm_fault(map, + vaddr, + prot, + FALSE, VM_KERN_MEMORY_NONE, + THREAD_UNINT, NULL, 0); + + if (result == KERN_SUCCESS) { + goto common_return; } /* * fall through */ +#if CONFIG_DTRACE +FALL_THROUGH: +#endif /* CONFIG_DTRACE */ - case T_GENERAL_PROTECTION: + case T_GENERAL_PROTECTION: /* * If there is a failure recovery address * for this fault, go there. */ - for (rp = recover_table; rp < recover_table_end; rp++) { - if (kern_ip == rp->fault_addr) { - set_recovery_ip(saved_state, rp->recover_addr); - return; + for (rp = recover_table; rp < recover_table_end; rp++) { + if (kern_ip == rp->fault_addr) { + set_recovery_ip(saved_state, rp->recover_addr); + goto common_return; } } /* * Check thread recovery address also. */ - if (thread->recover) { - set_recovery_ip(saved_state, thread->recover); + if (thread != THREAD_NULL && thread->recover) { + set_recovery_ip(saved_state, thread->recover); thread->recover = 0; - return; + goto common_return; } /* * Unanticipated page-fault errors in kernel @@ -559,15 +753,15 @@ look_for_watchpoints: * * fall through... */ - - default: + OS_FALLTHROUGH; + default: /* * Exception 15 is reserved but some chips may generate it * spuriously. Seen at startup on AMD Athlon-64. */ - if (type == 15) { - kprintf("kernel_trap() ignoring spurious trap 15\n"); - return; + if (type == 15) { + kprintf("kernel_trap() ignoring spurious trap 15\n"); + goto common_return; } debugger_entry: /* Ensure that the i386_kernel_state at the base of the @@ -575,257 +769,107 @@ debugger_entry: * context at the moment of the trap, to facilitate * access through the debugger. */ - sync_iss_to_iks(saved_state); -#if MACH_KDB -restart_debugger: -#endif /* MACH_KDB */ + sync_iss_to_iks(state); #if MACH_KDP - if (current_debugger != KDB_CUR_DB) { - if (kdp_i386_trap(type, saved_state, result, vaddr)) - return; + if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr)) { + goto common_return; } -#endif /* MACH_KDP */ -#if MACH_KDB - else - if (kdb_trap(type, code, saved_state)) { - if (switch_debugger) { - current_debugger = KDP_CUR_DB; - switch_debugger = 0; - goto restart_debugger; - } - return; - } -#endif /* MACH_KDB */ +#endif } - - panic_trap(saved_state); + pal_cli(); + panic_trap(saved_state, trap_pl, fault_result); /* * NO RETURN */ -} - -static void -set_recovery_ip(x86_saved_state32_t *saved_state, vm_offset_t ip) -{ - saved_state->eip = ip; +common_return: +#if DEVELOPMENT || DEBUG + if (traptrace_index != TRAPTRACE_INVALID_INDEX) { + traptrace_end(traptrace_index, mach_absolute_time()); + } +#endif + return; } - static void -panic_trap(x86_saved_state32_t *regs) -{ - const char *trapname = "Unknown"; - uint32_t cr0 = get_cr0(); - uint32_t cr2 = get_cr2(); - uint32_t cr3 = get_cr3(); - uint32_t cr4 = get_cr4(); - - panic_io_port_read(); - - kprintf("panic trap number 0x%x, eip 0x%x\n", regs->trapno, regs->eip); - kprintf("cr0 0x%08x cr2 0x%08x cr3 0x%08x cr4 0x%08x\n", - cr0, cr2, cr3, cr4); - - if (regs->trapno < TRAP_TYPES) - trapname = trap_type[regs->trapno]; - - panic("Unresolved kernel trap (CPU %d, Type %d=%s), registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "CR2: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x, CS: 0x%08x, DS: 0x%08x\n", - cpu_number(), regs->trapno, trapname, cr0, cr2, cr3, cr4, - regs->eax,regs->ebx,regs->ecx,regs->edx, - regs->cr2,regs->ebp,regs->esi,regs->edi, - regs->efl,regs->eip,regs->cs, regs->ds); - /* - * This next statement is not executed, - * but it's needed to stop the compiler using tail call optimization - * for the panic call - which confuses the subsequent backtrace. - */ - cr0 = 0; -} - -extern void kprintf_break_lock(void); - - -/* - * Called from locore on a special reserved stack after a double-fault - * is taken in kernel space. - * Kernel stack overflow is one route here. - */ -void -panic_double_fault(int code) +set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip) { - struct i386_tss *my_ktss = current_ktss(); - - /* Set postcode (DEBUG only) */ - postcode(PANIC_DOUBLE_FAULT); - -/* Issue an I/O port read if one has been requested - this is an event logic - * analyzers can use as a trigger point. - */ - panic_io_port_read(); - - /* - * Break kprintf lock in case of recursion, - * and record originally faulted instruction address. - */ - kprintf_break_lock(); - -#if MACH_KDP - /* - * Print backtrace leading to first fault: - */ - panic_i386_backtrace((void *) my_ktss->ebp, 10); -#endif - - panic("Double fault (CPU:%d, thread:%p, code:0x%x)," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", - cpu_number(), current_thread(), code, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - my_ktss->eax, my_ktss->ebx, my_ktss->ecx, my_ktss->edx, - my_ktss->esp, my_ktss->ebp, my_ktss->esi, my_ktss->edi, - my_ktss->eflags, my_ktss->eip); + saved_state->isf.rip = ip; } - -/* - * Called from locore on a special reserved stack after a machine-check - */ -void -panic_machine_check(int code) +static void +panic_trap(x86_saved_state64_t *regs, uint32_t pl, kern_return_t fault_result) { - struct i386_tss *my_ktss = current_ktss(); - - /* Set postcode (DEBUG only) */ - postcode(PANIC_MACHINE_CHECK); - + const char *trapname = "Unknown"; + pal_cr_t cr0, cr2, cr3, cr4; + boolean_t potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE; + boolean_t potential_smap_fault = FALSE; + + pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); + assert(ml_get_interrupts_enabled() == FALSE); + current_cpu_datap()->cpu_fatal_trap_state = regs; /* - * Break kprintf lock in case of recursion, - * and record originally faulted instruction address. + * Issue an I/O port read if one has been requested - this is an + * event logic analyzers can use as a trigger point. */ - kprintf_break_lock(); - panic("Machine-check (CPU:%d, thread:%p, code:0x%x)," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", - cpu_number(), current_thread(), code, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - my_ktss->eax, my_ktss->ebx, my_ktss->ecx, my_ktss->edx, - my_ktss->esp, my_ktss->ebp, my_ktss->esi, my_ktss->edi, - my_ktss->eflags, my_ktss->eip); -} + panic_notify(); -void -panic_double_fault64(x86_saved_state_t *esp) -{ - /* Set postcode (DEBUG only) */ - postcode(PANIC_DOUBLE_FAULT); + kprintf("CPU %d panic trap number 0x%x, rip 0x%016llx\n", + cpu_number(), regs->isf.trapno, regs->isf.rip); + kprintf("cr0 0x%016llx cr2 0x%016llx cr3 0x%016llx cr4 0x%016llx\n", + cr0, cr2, cr3, cr4); - /* - * Break kprintf lock in case of recursion, - * and record originally faulted instruction address. - */ - kprintf_break_lock(); + if (regs->isf.trapno < TRAP_TYPES) { + trapname = trap_type[regs->isf.trapno]; + } - /* - * Dump the interrupt stack frame at last kernel entry. - */ - if (is_saved_state64(esp)) { - x86_saved_state64_t *ss64p = saved_state64(esp); - panic("Double fault (CPU:%d, thread:%p, trapno:0x%x, err:0x%qx)," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "RAX: 0x%016qx, RBX: 0x%016qx, RCX: 0x%016qx, RDX: 0x%016qx\n" - "RSP: 0x%016qx, RBP: 0x%016qx, RSI: 0x%016qx, RDI: 0x%016qx\n" - "R8: 0x%016qx, R9: 0x%016qx, R10: 0x%016qx, R11: 0x%016qx\n" - "R12: 0x%016qx, R13: 0x%016qx, R14: 0x%016qx, R15: 0x%016qx\n" - "RFL: 0x%016qx, RIP: 0x%016qx, CR2: 0x%016qx\n", - cpu_number(), current_thread(), ss64p->isf.trapno, ss64p->isf.err, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - ss64p->rax, ss64p->rbx, ss64p->rcx, ss64p->rdx, - ss64p->isf.rsp, ss64p->rbp, ss64p->rsi, ss64p->rdi, - ss64p->r8, ss64p->r9, ss64p->r10, ss64p->r11, - ss64p->r12, ss64p->r13, ss64p->r14, ss64p->r15, - ss64p->isf.rflags, ss64p->isf.rip, ss64p->cr2); - } else { - x86_saved_state32_t *ss32p = saved_state32(esp); - panic("Double fault (CPU:%d, thread:%p, trapno:0x%x, err:0x%x)," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", - cpu_number(), current_thread(), ss32p->trapno, ss32p->err, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - ss32p->eax, ss32p->ebx, ss32p->ecx, ss32p->edx, - ss32p->uesp, ss32p->ebp, ss32p->esi, ss32p->edi, - ss32p->efl, ss32p->eip); + if ((regs->isf.trapno == T_PAGE_FAULT) && (regs->isf.err == (T_PF_PROT | T_PF_EXECUTE)) && (regs->isf.rip == regs->cr2)) { + if (pmap_smep_enabled && (regs->isf.rip < VM_MAX_USER_PAGE_ADDRESS)) { + potential_smep_fault = TRUE; + } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + potential_kernel_NX_fault = TRUE; + } + } else if (pmap_smap_enabled && + regs->isf.trapno == T_PAGE_FAULT && + regs->isf.err & T_PF_PROT && + regs->cr2 < VM_MAX_USER_PAGE_ADDRESS && + regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + potential_smap_fault = TRUE; } -} -/* - * Simplistic machine check handler. - * We could peruse all those MSRs but we only dump register state as we do for - * the double fault exception. - * Note: the machine check registers are non-volatile across warm boot - so - * they'll be around when we return. - */ -void -panic_machine_check64(x86_saved_state_t *esp) -{ - /* Set postcode (DEBUG only) */ - postcode(PANIC_MACHINE_CHECK); +#undef panic + panic("Kernel trap at 0x%016llx, type %d=%s, registers:\n" + "CR0: 0x%016llx, CR2: 0x%016llx, CR3: 0x%016llx, CR4: 0x%016llx\n" + "RAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n" + "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n" + "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" + "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" + "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" + "Fault CR2: 0x%016llx, Error code: 0x%016llx, Fault CPU: 0x%x%s%s%s%s, PL: %d, VF: %d\n", + regs->isf.rip, regs->isf.trapno, trapname, + cr0, cr2, cr3, cr4, + regs->rax, regs->rbx, regs->rcx, regs->rdx, + regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi, + regs->r8, regs->r9, regs->r10, regs->r11, + regs->r12, regs->r13, regs->r14, regs->r15, + regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, + regs->isf.ss & 0xFFFF, regs->cr2, regs->isf.err, regs->isf.cpu, + virtualized ? " VMM" : "", + potential_kernel_NX_fault ? " Kernel NX fault" : "", + potential_smep_fault ? " SMEP/User NX fault" : "", + potential_smap_fault ? " SMAP fault" : "", + pl, + fault_result); +} - /* - * Break kprintf lock in case of recursion, - * and record originally faulted instruction address. - */ - kprintf_break_lock(); +#if CONFIG_DTRACE +extern kern_return_t dtrace_user_probe(x86_saved_state_t *); +#endif - /* - * Dump the interrupt stack frame at last kernel entry. - */ - if (is_saved_state64(esp)) { - x86_saved_state64_t *ss64p = saved_state64(esp); - panic("Machine Check (CPU:%d, thread:%p, trapno:0x%x, err:0x%qx)," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "RAX: 0x%016qx, RBX: 0x%016qx, RCX: 0x%016qx, RDX: 0x%016qx\n" - "RSP: 0x%016qx, RBP: 0x%016qx, RSI: 0x%016qx, RDI: 0x%016qx\n" - "R8: 0x%016qx, R9: 0x%016qx, R10: 0x%016qx, R11: 0x%016qx\n" - "R12: 0x%016qx, R13: 0x%016qx, R14: 0x%016qx, R15: 0x%016qx\n" - "RFL: 0x%016qx, RIP: 0x%016qx\n", - cpu_number(), current_thread(), ss64p->isf.trapno, ss64p->isf.err, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - ss64p->rax, ss64p->rbx, ss64p->rcx, ss64p->rdx, - ss64p->isf.rsp, ss64p->rbp, ss64p->rsi, ss64p->rdi, - ss64p->r8, ss64p->r9, ss64p->r10, ss64p->r11, - ss64p->r12, ss64p->r13, ss64p->r14, ss64p->r15, - ss64p->isf.rflags, ss64p->isf.rip); - } else { - x86_saved_state32_t *ss32p = saved_state32(esp); - panic("Machine Check (CPU:%d, thread:%p, trapno:0x%x, err:0x%x)," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", - cpu_number(), current_thread(), ss32p->trapno, ss32p->err, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - ss32p->eax, ss32p->ebx, ss32p->ecx, ss32p->edx, - ss32p->uesp, ss32p->ebp, ss32p->esi, ss32p->edi, - ss32p->efl, ss32p->eip); - } -} +#if DEBUG +uint32_t fsigs[2]; +uint32_t fsigns, fsigcs; +#endif /* * Trap from user mode. @@ -834,235 +878,548 @@ void user_trap( x86_saved_state_t *saved_state) { - int exc; - int code; - int err; - unsigned int subcode; - int type; - user_addr_t vaddr; - vm_prot_t prot; - thread_t thread = current_thread(); - ast_t *myast; - boolean_t intr; - kern_return_t kret; - user_addr_t rip; - - assert((is_saved_state32(saved_state) && !thread_is_64bit(thread)) || - (is_saved_state64(saved_state) && thread_is_64bit(thread))); + int exc; + int err; + mach_exception_code_t code; + mach_exception_subcode_t subcode; + int type; + user_addr_t vaddr; + vm_prot_t prot; + thread_t thread = current_thread(); + kern_return_t kret; + user_addr_t rip; + unsigned long dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */ + int current_cpu = cpu_number(); +#if DEVELOPMENT || DEBUG + bool inspect_cacheline = false; + uint32_t traptrace_index; +#endif + assert((is_saved_state32(saved_state) && !thread_is_64bit_addr(thread)) || + (is_saved_state64(saved_state) && thread_is_64bit_addr(thread))); if (is_saved_state64(saved_state)) { - x86_saved_state64_t *regs; + x86_saved_state64_t *regs; regs = saved_state64(saved_state); + /* Record cpu where state was captured */ + regs->isf.cpu = current_cpu; + type = regs->isf.trapno; - err = regs->isf.err & 0xffff; + err = (int)regs->isf.err & 0xffff; vaddr = (user_addr_t)regs->cr2; rip = (user_addr_t)regs->isf.rip; +#if DEVELOPMENT || DEBUG + traptrace_index = traptrace_start(type, rip, mach_absolute_time(), regs->rbp); +#endif } else { - x86_saved_state32_t *regs; + x86_saved_state32_t *regs; regs = saved_state32(saved_state); + /* Record cpu where state was captured */ + regs->cpu = current_cpu; + type = regs->trapno; err = regs->err & 0xffff; vaddr = (user_addr_t)regs->cr2; rip = (user_addr_t)regs->eip; +#if DEVELOPMENT || DEBUG + traptrace_index = traptrace_start(type, rip, mach_absolute_time(), regs->ebp); +#endif } - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE, - (int)(vaddr>>32), (int)vaddr, (int)(rip>>32), (int)rip, 0); +#if DEVELOPMENT || DEBUG + /* + * Copy the cacheline of code into the thread's instruction stream save area + * before enabling interrupts (the assumption is that we have not otherwise faulted or + * trapped since the original cache line stores). If the saved code is not valid, + * we'll catch it below when we process the copyin() for unhandled faults. + */ + if (type == T_PAGE_FAULT || type == T_INVALID_OPCODE || type == T_GENERAL_PROTECTION) { +#define CACHELINE_SIZE 64 + THREAD_TO_PCB(thread)->insn_cacheline[CACHELINE_SIZE] = (uint8_t)(rip & (CACHELINE_SIZE - 1)); + bcopy(&cpu_shadowp(current_cpu)->cpu_rtimes[0], + &THREAD_TO_PCB(thread)->insn_cacheline[0], + sizeof(THREAD_TO_PCB(thread)->insn_cacheline) - 1); + inspect_cacheline = true; + } +#endif + + if (type == T_DEBUG) { + if (thread->machine.ids) { + unsigned long clear = 0; + /* Stash and clear this processor's DR6 value, in the event + * this was a debug register match + */ + __asm__ volatile ("mov %%db6, %0" : "=r" (dr6)); + __asm__ volatile ("mov %0, %%db6" : : "r" (clear)); + } + /* [Re]Enable LBRs *BEFORE* enabling interrupts to ensure we hit the right CPU */ + i386_lbr_enable(); + } + + pal_sti(); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE, + (unsigned)(vaddr >> 32), (unsigned)vaddr, + (unsigned)(rip >> 32), (unsigned)rip, 0); code = 0; subcode = 0; exc = 0; -#if DEBUG_TRACE - kprintf("user_trap(0x%08x) type=%d vaddr=0x%016llx\n", - saved_state, type, vaddr); +#if CONFIG_DTRACE + /* + * DTrace does not consume all user traps, only INT_3's for now. + * Avoid needlessly calling tempDTraceTrapHook here, and let the + * INT_3 case handle them. + */ #endif - myast = ast_pending(); - if (perfASTHook) { - if (*myast & AST_CHUD_ALL) { - perfASTHook(type, saved_state, 0, 0); - } - } else { - *myast &= ~AST_CHUD_ALL; - } - /* Is there a hook? */ - if (perfTrapHook) { - if (perfTrapHook(type, saved_state, 0, 0) == KERN_SUCCESS) - return; /* If it succeeds, we are done... */ - } + DEBUG_KPRINT_SYSCALL_MASK(1, + "user_trap: type=0x%x(%s) err=0x%x cr2=%p rip=%p\n", + type, trap_type[type], err, (void *)(long) vaddr, (void *)(long) rip); switch (type) { - - case T_DIVIDE_ERROR: + case T_DIVIDE_ERROR: exc = EXC_ARITHMETIC; code = EXC_I386_DIV; break; - case T_DEBUG: - { - pcb_t pcb; - unsigned int clear = 0; + case T_DEBUG: + { + pcb_t pcb; + /* + * Update the PCB with this processor's DR6 value + * in the event this was a debug register match. + */ + pcb = THREAD_TO_PCB(thread); + if (pcb->ids) { /* - * get dr6 and set it in the thread's pcb before - * returning to userland + * We can get and set the status register + * in 32-bit mode even on a 64-bit thread + * because the high order bits are not + * used on x86_64 */ - pcb = thread->machine.pcb; - if (pcb->ids) { - /* - * We can get and set the status register - * in 32-bit mode even on a 64-bit thread - * because the high order bits are not - * used on x86_64 - */ - if (thread_is_64bit(thread)) { - uint32_t dr6; - x86_debug_state64_t *ids = pcb->ids; - dr6 = (uint32_t)ids->dr6; - __asm__ volatile ("movl %%db6, %0" : "=r" (dr6)); - ids->dr6 = dr6; - } else { /* 32 bit thread */ - x86_debug_state32_t *ids = pcb->ids; - __asm__ volatile ("movl %%db6, %0" : "=r" (ids->dr6)); - } - __asm__ volatile ("movl %0, %%db6" : : "r" (clear)); + if (thread_is_64bit_addr(thread)) { + x86_debug_state64_t *ids = pcb->ids; + ids->dr6 = dr6; + } else { /* 32 bit thread */ + x86_debug_state32_t *ids = pcb->ids; + ids->dr6 = (uint32_t) dr6; } - exc = EXC_BREAKPOINT; - code = EXC_I386_SGL; - break; } - case T_INT3: + exc = EXC_BREAKPOINT; + code = EXC_I386_SGL; + break; + } + case T_INT3: +#if CONFIG_DTRACE + if (dtrace_user_probe(saved_state) == KERN_SUCCESS) { + return; /* If it succeeds, we are done... */ + } +#endif exc = EXC_BREAKPOINT; code = EXC_I386_BPT; break; - case T_OVERFLOW: + case T_OVERFLOW: exc = EXC_ARITHMETIC; code = EXC_I386_INTO; break; - case T_OUT_OF_BOUNDS: + case T_OUT_OF_BOUNDS: exc = EXC_SOFTWARE; code = EXC_I386_BOUND; break; - case T_INVALID_OPCODE: - exc = EXC_BAD_INSTRUCTION; - code = EXC_I386_INVOP; + case T_INVALID_OPCODE: + if (fpUDflt(rip) == 1) { + exc = EXC_BAD_INSTRUCTION; + code = EXC_I386_INVOP; + } break; - case T_NO_FPU: - case 32: /* XXX */ + case T_NO_FPU: fpnoextflt(); - return; + break; - case T_FPU_FAULT: + case T_FPU_FAULT: fpextovrflt(); - return; + /* + * Raise exception. + */ + exc = EXC_BAD_ACCESS; + code = VM_PROT_READ | VM_PROT_EXECUTE; + subcode = 0; + break; - case 10: /* invalid TSS == iret with NT flag set */ + case T_INVALID_TSS: /* invalid TSS == iret with NT flag set */ exc = EXC_BAD_INSTRUCTION; code = EXC_I386_INVTSSFLT; subcode = err; break; - case T_SEGMENT_NOT_PRESENT: + case T_SEGMENT_NOT_PRESENT: exc = EXC_BAD_INSTRUCTION; code = EXC_I386_SEGNPFLT; subcode = err; break; - case T_STACK_FAULT: + case T_STACK_FAULT: exc = EXC_BAD_INSTRUCTION; code = EXC_I386_STKFLT; subcode = err; break; - case T_GENERAL_PROTECTION: - exc = EXC_BAD_INSTRUCTION; + case T_GENERAL_PROTECTION: + /* + * There's a wide range of circumstances which generate this + * class of exception. From user-space, many involve bad + * addresses (such as a non-canonical 64-bit address). + * So we map this to EXC_BAD_ACCESS (and thereby SIGSEGV). + * The trouble is cr2 doesn't contain the faulting address; + * we'd need to decode the faulting instruction to really + * determine this. We'll leave that to debuggers. + * However, attempted execution of privileged instructions + * (e.g. cli) also generate GP faults and so we map these to + * to EXC_BAD_ACCESS (and thence SIGSEGV) also - rather than + * EXC_BAD_INSTRUCTION which is more accurate. We just can't + * win! + */ + exc = EXC_BAD_ACCESS; code = EXC_I386_GPFLT; subcode = err; break; - case T_PAGE_FAULT: + case T_PAGE_FAULT: + { prot = VM_PROT_READ; - if (err & T_PF_WRITE) - prot |= VM_PROT_WRITE; -#if PAE - if (err & T_PF_EXECUTE) - prot |= VM_PROT_EXECUTE; + if (err & T_PF_WRITE) { + prot |= VM_PROT_WRITE; + } + if (__improbable(err & T_PF_EXECUTE)) { + prot |= VM_PROT_EXECUTE; + } +#if DEVELOPMENT || DEBUG + uint32_t fsig = 0; + fsig = thread_fpsimd_hash(thread); +#if DEBUG + fsigs[0] = fsig; +#endif +#endif + kret = vm_fault(thread->map, + vaddr, + prot, FALSE, VM_KERN_MEMORY_NONE, + THREAD_ABORTSAFE, NULL, 0); +#if DEVELOPMENT || DEBUG + if (fsig) { + uint32_t fsig2 = thread_fpsimd_hash(thread); +#if DEBUG + fsigcs++; + fsigs[1] = fsig2; +#endif + if (fsig != fsig2) { + panic("FP/SIMD state hash mismatch across fault thread: %p 0x%x->0x%x", thread, fsig, fsig2); + } + } else { +#if DEBUG + fsigns++; #endif - kret = vm_fault(thread->map, vm_map_trunc_page(vaddr), - prot, FALSE, - THREAD_ABORTSAFE, NULL, 0); + } +#endif + if (__probable((kret == KERN_SUCCESS) || (kret == KERN_ABORTED))) { + break; + } else if (__improbable(kret == KERN_FAILURE)) { + /* + * For a user trap, vm_fault() should never return KERN_FAILURE. + * If it does, we're leaking preemption disables somewhere in the kernel. + */ + panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread); + } - user_page_fault_continue(kret); - - /* NOTREACHED */ - break; + /* PAL debug hook (empty on x86) */ + pal_dbg_page_fault(thread, vaddr, kret); + exc = EXC_BAD_ACCESS; + code = kret; + subcode = vaddr; + } + break; - case T_SSE_FLOAT_ERROR: - fpSSEexterrflt(); - return; + case T_SSE_FLOAT_ERROR: + fpSSEexterrflt(); + exc = EXC_ARITHMETIC; + code = EXC_I386_SSEEXTERR; + subcode = ((struct x86_fx_thread_state *)thread->machine.ifps)->fx_MXCSR; + break; - case T_FLOATING_POINT_ERROR: + case T_FLOATING_POINT_ERROR: fpexterrflt(); - return; + exc = EXC_ARITHMETIC; + code = EXC_I386_EXTERR; + subcode = ((struct x86_fx_thread_state *)thread->machine.ifps)->fx_status; + break; - default: -#if MACH_KGDB - Debugger("Unanticipated user trap"); - return; -#endif /* MACH_KGDB */ -#if MACH_KDB - if (kdb_trap(type, err, saved_state)) - return; -#endif /* MACH_KDB */ - panic("user trap"); - return; - } - intr = ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); - while (*myast & AST_ALL) { - ast_taken(AST_ALL, intr); - ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); + case T_DTRACE_RET: +#if CONFIG_DTRACE + if (dtrace_user_probe(saved_state) == KERN_SUCCESS) { + return; /* If it succeeds, we are done... */ + } +#endif + /* + * If we get an INT 0x7f when we do not expect to, + * treat it as an illegal instruction + */ + exc = EXC_BAD_INSTRUCTION; + code = EXC_I386_INVOP; + break; + + default: + panic("Unexpected user trap, type %d", type); } - ml_set_interrupts_enabled(intr); - i386_exception(exc, code, subcode); - /*NOTREACHED*/ -} + if (exc != 0) { + uint16_t cs; + boolean_t intrs; + + if (is_saved_state64(saved_state)) { + cs = saved_state64(saved_state)->isf.cs; + } else { + cs = saved_state32(saved_state)->cs; + } + + if (last_branch_support_enabled) { + intrs = ml_set_interrupts_enabled(FALSE); + /* + * This is a bit racy (it's possible for this thread to migrate to another CPU, then + * migrate back, but that seems rather rare in practice), but good enough to ensure + * the LBRs are saved before proceeding with exception/signal dispatch. + */ + if (current_cpu == cpu_number()) { + i386_lbr_synch(thread); + } + ml_set_interrupts_enabled(intrs); + } + /* + * Do not try to copyin from the instruction stream if the page fault was due + * to an access to rip and was unhandled. + * Do not deal with cases when %cs != USER[64]_CS + * And of course there's no need to copy the instruction stream if the boot-arg + * was set to 0. + */ + if (insn_copyin_count > 0 && + (cs == USER64_CS || cs == USER_CS) && (type != T_PAGE_FAULT || vaddr != rip)) { +#if DEVELOPMENT || DEBUG + copy_instruction_stream(thread, rip, type, inspect_cacheline); +#else + copy_instruction_stream(thread, rip, type); +#endif + } + +#if DEVELOPMENT || DEBUG + if (traptrace_index != TRAPTRACE_INVALID_INDEX) { + traptrace_end(traptrace_index, mach_absolute_time()); + } +#endif + /* + * Note: Codepaths that directly return from user_trap() have pending + * ASTs processed in locore + */ + i386_exception(exc, code, subcode); + /* NOTREACHED */ + } else { +#if DEVELOPMENT || DEBUG + if (traptrace_index != TRAPTRACE_INVALID_INDEX) { + traptrace_end(traptrace_index, mach_absolute_time()); + } +#endif + } +} /* - * Handle AST traps for i386. - * Check for delayed floating-point exception from - * AT-bus machines. + * Copyin up to x86_INSTRUCTION_STATE_MAX_INSN_BYTES bytes from the page that includes `rip`, + * ensuring that we stay on the same page, clipping the start or end, as needed. + * Add the clipped amount back at the start or end, depending on where it fits. + * Consult the variable populated by the boot-arg `insn_capcnt' */ +static __attribute__((noinline)) void +copy_instruction_stream(thread_t thread, uint64_t rip, int __unused trap_code +#if DEVELOPMENT || DEBUG + , bool inspect_cacheline +#endif + ) +{ +#if x86_INSTRUCTION_STATE_MAX_INSN_BYTES > 4096 +#error x86_INSTRUCTION_STATE_MAX_INSN_BYTES cannot exceed a page in size. +#endif + pcb_t pcb = THREAD_TO_PCB(thread); + vm_map_offset_t pagemask = ~vm_map_page_mask(current_map()); + vm_map_offset_t rip_page = rip & pagemask; + vm_map_offset_t start_addr; + vm_map_offset_t insn_offset; + vm_map_offset_t end_addr = rip + (insn_copyin_count / 2); + void *stack_buffer; + int copyin_err = 0; +#if defined(MACH_BSD) && (DEVELOPMENT || DEBUG) + void *procname; +#endif -extern void log_thread_action (thread_t, char *); +#if DEVELOPMENT || DEBUG + assert(insn_copyin_count <= x86_INSTRUCTION_STATE_MAX_INSN_BYTES); +#else + if (insn_copyin_count > x86_INSTRUCTION_STATE_MAX_INSN_BYTES || + insn_copyin_count < 64 /* CACHELINE_SIZE */) { + return; + } +#endif -void -i386_astintr(int preemption) -{ - ast_t mask = AST_ALL; - spl_t s; +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Walloca" + stack_buffer = __builtin_alloca(insn_copyin_count); +#pragma clang diagnostic pop - if (preemption) - mask = AST_PREEMPTION; + if (rip >= (insn_copyin_count / 2)) { + start_addr = rip - (insn_copyin_count / 2); + } else { + start_addr = 0; + } + + if (start_addr < rip_page) { + insn_offset = (insn_copyin_count / 2) - (rip_page - start_addr); + end_addr += (rip_page - start_addr); + start_addr = rip_page; + } else if (end_addr >= (rip_page + (~pagemask + 1))) { + start_addr -= (end_addr - (rip_page + (~pagemask + 1))); /* Adjust start address backward */ + /* Adjust instruction offset due to start address change */ + insn_offset = (insn_copyin_count / 2) + (end_addr - (rip_page + (~pagemask + 1))); + end_addr = rip_page + (~pagemask + 1); /* clip to the start of the next page (non-inclusive */ + } else { + insn_offset = insn_copyin_count / 2; + } + + disable_preemption(); /* Prevent copyin from faulting in the instruction stream */ + if ( +#if DEVELOPMENT || DEBUG + (insnstream_force_cacheline_mismatch < 2) && +#endif + ((end_addr > start_addr) && (copyin_err = copyin(start_addr, stack_buffer, end_addr - start_addr)) == 0)) { + enable_preemption(); + + if (pcb->insn_state == 0) { + pcb->insn_state = kalloc(sizeof(x86_instruction_state_t)); + } + + if (pcb->insn_state != 0) { + bcopy(stack_buffer, pcb->insn_state->insn_bytes, end_addr - start_addr); + bzero(&pcb->insn_state->insn_bytes[end_addr - start_addr], + insn_copyin_count - (end_addr - start_addr)); + + pcb->insn_state->insn_stream_valid_bytes = (int)(end_addr - start_addr); + pcb->insn_state->insn_offset = (int)insn_offset; + +#if DEVELOPMENT || DEBUG + /* Now try to validate the cacheline we read at early-fault time matches the code + * copied in. Before we do that, we have to make sure the buffer contains a valid + * cacheline by looking for the 2 sentinel values written in the event the cacheline + * could not be copied. + */ +#define CACHELINE_DATA_NOT_PRESENT 0xdeadc0debeefcafeULL +#define CACHELINE_MASK (CACHELINE_SIZE - 1) - s = splsched(); + if (inspect_cacheline && + (*(uint64_t *)(uintptr_t)&pcb->insn_cacheline[0] != CACHELINE_DATA_NOT_PRESENT && + *(uint64_t *)(uintptr_t)&pcb->insn_cacheline[8] != CACHELINE_DATA_NOT_PRESENT)) { + /* + * The position of the cacheline in the instruction buffer is at offset + * insn_offset - (rip & CACHELINE_MASK) + */ + if (__improbable((rip & CACHELINE_MASK) > insn_offset)) { + printf("thread %p code cacheline @ %p clipped wrt copied-in code (offset %d)\n", + thread, (void *)(rip & ~CACHELINE_MASK), (int)(rip & CACHELINE_MASK)); + } else if (bcmp(&pcb->insn_state->insn_bytes[insn_offset - (rip & CACHELINE_MASK)], + &pcb->insn_cacheline[0], CACHELINE_SIZE) != 0 + || insnstream_force_cacheline_mismatch + ) { +#if x86_INSTRUCTION_STATE_CACHELINE_SIZE != CACHELINE_SIZE +#error cacheline size mismatch +#endif + bcopy(&pcb->insn_cacheline[0], &pcb->insn_state->insn_cacheline[0], + x86_INSTRUCTION_STATE_CACHELINE_SIZE); + /* Mark the instruction stream as being out-of-synch */ + pcb->insn_state->out_of_synch = 1; + + printf("thread %p code cacheline @ %p mismatches with copied-in code [trap 0x%x]\n", + thread, (void *)(rip & ~CACHELINE_MASK), trap_code); + for (int i = 0; i < 8; i++) { + printf("\t[%d] cl=0x%08llx vs. ci=0x%08llx\n", i, *(uint64_t *)(uintptr_t)&pcb->insn_cacheline[i * 8], + *(uint64_t *)(uintptr_t)&pcb->insn_state->insn_bytes[(i * 8) + insn_offset - (rip & CACHELINE_MASK)]); + } + if (panic_on_cacheline_mismatch) { + panic("Cacheline mismatch while processing unhandled exception."); + } + } else { + printf("thread %p code cacheline @ %p DOES match with copied-in code\n", + thread, (void *)(rip & ~CACHELINE_MASK)); + pcb->insn_state->out_of_synch = 0; + } + } else if (inspect_cacheline) { + printf("thread %p could not capture code cacheline at fault IP %p [offset %d]\n", + (void *)thread, (void *)rip, (int)(insn_offset - (rip & CACHELINE_MASK))); + pcb->insn_state->out_of_synch = 0; + } +#else + pcb->insn_state->out_of_synch = 0; +#endif /* DEVELOPMENT || DEBUG */ + +#if defined(MACH_BSD) && (DEVELOPMENT || DEBUG) + if (panic_on_trap_procname[0] != 0) { + char procnamebuf[65] = {0}; + + if (thread->task->bsd_info != NULL) { + procname = proc_name_address(thread->task->bsd_info); + strlcpy(procnamebuf, procname, sizeof(procnamebuf)); + + if (strcasecmp(panic_on_trap_procname, procnamebuf) == 0 && + ((1U << trap_code) & panic_on_trap_mask) != 0) { + panic("Panic requested on trap type 0x%x for process `%s'", trap_code, + panic_on_trap_procname); + /*NORETURN*/ + } + } + } +#endif /* MACH_BSD && (DEVELOPMENT || DEBUG) */ + } + } else { + enable_preemption(); - ast_taken(mask, s); + pcb->insn_state_copyin_failure_errorcode = copyin_err; +#if DEVELOPMENT || DEBUG + if (inspect_cacheline && pcb->insn_state == 0) { + pcb->insn_state = kalloc(sizeof(x86_instruction_state_t)); + } + if (pcb->insn_state != 0) { + pcb->insn_state->insn_stream_valid_bytes = 0; + pcb->insn_state->insn_offset = 0; - splx(s); + if (inspect_cacheline && + (*(uint64_t *)(uintptr_t)&pcb->insn_cacheline[0] != CACHELINE_DATA_NOT_PRESENT && + *(uint64_t *)(uintptr_t)&pcb->insn_cacheline[8] != CACHELINE_DATA_NOT_PRESENT)) { + /* + * We can still copy the cacheline into the instruction state structure + * if it contains valid data + */ + pcb->insn_state->out_of_synch = 1; + bcopy(&pcb->insn_cacheline[0], &pcb->insn_state->insn_cacheline[0], + x86_INSTRUCTION_STATE_CACHELINE_SIZE); + } + } +#endif /* DEVELOPMENT || DEBUG */ + } } /* @@ -1077,137 +1434,76 @@ i386_astintr(int preemption) */ void i386_exception( - int exc, - int code, - int subcode) + int exc, + mach_exception_code_t code, + mach_exception_subcode_t subcode) { - exception_data_type_t codes[EXCEPTION_CODE_MAX]; + mach_exception_data_type_t codes[EXCEPTION_CODE_MAX]; - codes[0] = code; /* new exception interface */ + DEBUG_KPRINT_SYSCALL_MACH("i386_exception: exc=%d code=0x%llx subcode=0x%llx\n", + exc, code, subcode); + codes[0] = code; /* new exception interface */ codes[1] = subcode; exception_triage(exc, codes, 2); /*NOTREACHED*/ } -void -kernel_preempt_check(void) -{ - ast_t *myast; - boolean_t intr; - - /* - * disable interrupts to both prevent pre-emption - * and to keep the ast state from changing via - * an interrupt handler making something runnable - */ - intr = ml_set_interrupts_enabled(FALSE); - - myast = ast_pending(); - - if ((*myast & AST_URGENT) && intr == TRUE && get_interrupt_level() == 0) { - /* - * can handle interrupts and preemptions - * at this point - */ - ml_set_interrupts_enabled(intr); - - /* - * now cause the PRE-EMPTION trap - */ - __asm__ volatile (" int $0xff"); - } else { - /* - * if interrupts were already disabled or - * we're in an interrupt context, we can't - * preempt... of course if AST_URGENT - * isn't set we also don't want to - */ - ml_set_interrupts_enabled(intr); - } -} - -#if MACH_KDB - -extern void db_i386_state(x86_saved_state32_t *regs); - -#include - -void -db_i386_state( - x86_saved_state32_t *regs) -{ - db_printf("eip %8x\n", regs->eip); - db_printf("trap %8x\n", regs->trapno); - db_printf("err %8x\n", regs->err); - db_printf("efl %8x\n", regs->efl); - db_printf("ebp %8x\n", regs->ebp); - db_printf("esp %8x\n", regs->cr2); - db_printf("uesp %8x\n", regs->uesp); - db_printf("cs %8x\n", regs->cs & 0xff); - db_printf("ds %8x\n", regs->ds & 0xff); - db_printf("es %8x\n", regs->es & 0xff); - db_printf("fs %8x\n", regs->fs & 0xff); - db_printf("gs %8x\n", regs->gs & 0xff); - db_printf("ss %8x\n", regs->ss & 0xff); - db_printf("eax %8x\n", regs->eax); - db_printf("ebx %8x\n", regs->ebx); - db_printf("ecx %8x\n", regs->ecx); - db_printf("edx %8x\n", regs->edx); - db_printf("esi %8x\n", regs->esi); - db_printf("edi %8x\n", regs->edi); -} - -#endif /* MACH_KDB */ - -/* Synchronize a thread's i386_kernel_state (if any) with the given - * i386_saved_state_t obtained from the trap/IPI handler; called in +/* Synchronize a thread's x86_kernel_state (if any) with the given + * x86_saved_state_t obtained from the trap/IPI handler; called in * kernel_trap() prior to entering the debugger, and when receiving - * an "MP_KDP" IPI. + * an "MP_KDP" IPI. Called with null saved_state if an incoming IPI + * was detected from the kernel while spinning with interrupts masked. */ - + void -sync_iss_to_iks(x86_saved_state32_t *saved_state) +sync_iss_to_iks(x86_saved_state_t *saved_state) { - struct x86_kernel_state32 *iks; + struct x86_kernel_state *iks = NULL; vm_offset_t kstack; boolean_t record_active_regs = FALSE; - if ((kstack = current_thread()->kernel_stack) != 0) { - x86_saved_state32_t *regs; + /* The PAL may have a special way to sync registers */ + if (saved_state && saved_state->flavor == THREAD_STATE_NONE) { + pal_get_kern_regs( saved_state ); + } - regs = saved_state; + if (current_thread() != NULL && + (kstack = current_thread()->kernel_stack) != 0) { + x86_saved_state64_t *regs = saved_state64(saved_state); iks = STACK_IKS(kstack); - /* - * Did we take the trap/interrupt in kernel mode? - */ - if (regs == USER_REGS32(current_thread())) - record_active_regs = TRUE; - else { - iks->k_ebx = regs->ebx; - iks->k_esp = (int)regs; - iks->k_ebp = regs->ebp; - iks->k_edi = regs->edi; - iks->k_esi = regs->esi; - iks->k_eip = regs->eip; + /* Did we take the trap/interrupt in kernel mode? */ + if (saved_state == NULL || /* NULL => polling in kernel */ + regs == USER_REGS64(current_thread())) { + record_active_regs = TRUE; + } else { + iks->k_rbx = regs->rbx; + iks->k_rsp = regs->isf.rsp; + iks->k_rbp = regs->rbp; + iks->k_r12 = regs->r12; + iks->k_r13 = regs->r13; + iks->k_r14 = regs->r14; + iks->k_r15 = regs->r15; + iks->k_rip = regs->isf.rip; } } if (record_active_regs == TRUE) { - /* - * Show the trap handler path - */ - __asm__ volatile("movl %%ebx, %0" : "=m" (iks->k_ebx)); - __asm__ volatile("movl %%esp, %0" : "=m" (iks->k_esp)); - __asm__ volatile("movl %%ebp, %0" : "=m" (iks->k_ebp)); - __asm__ volatile("movl %%edi, %0" : "=m" (iks->k_edi)); - __asm__ volatile("movl %%esi, %0" : "=m" (iks->k_esi)); - /* - * "Current" instruction pointer - */ - __asm__ volatile("movl $1f, %0\n1:" : "=m" (iks->k_eip)); + /* Show the trap handler path */ + __asm__ volatile ("movq %%rbx, %0" : "=m" (iks->k_rbx)); + __asm__ volatile ("movq %%rsp, %0" : "=m" (iks->k_rsp)); + __asm__ volatile ("movq %%rbp, %0" : "=m" (iks->k_rbp)); + __asm__ volatile ("movq %%r12, %0" : "=m" (iks->k_r12)); + __asm__ volatile ("movq %%r13, %0" : "=m" (iks->k_r13)); + __asm__ volatile ("movq %%r14, %0" : "=m" (iks->k_r14)); + __asm__ volatile ("movq %%r15, %0" : "=m" (iks->k_r15)); + /* "Current" instruction pointer */ + __asm__ volatile ("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" + : "=m" (iks->k_rip) + : + : "rax"); } } @@ -1218,26 +1514,52 @@ sync_iss_to_iks(x86_saved_state32_t *saved_state) * or user space. */ void -sync_iss_to_iks_unconditionally(__unused x86_saved_state32_t *saved_state) { - struct x86_kernel_state32 *iks; +sync_iss_to_iks_unconditionally(__unused x86_saved_state_t *saved_state) +{ + struct x86_kernel_state *iks; vm_offset_t kstack; - boolean_t record_active_regs = FALSE; if ((kstack = current_thread()->kernel_stack) != 0) { - iks = STACK_IKS(kstack); - /* - * Show the trap handler path - */ - __asm__ volatile("movl %%ebx, %0" : "=m" (iks->k_ebx)); - __asm__ volatile("movl %%esp, %0" : "=m" (iks->k_esp)); - __asm__ volatile("movl %%ebp, %0" : "=m" (iks->k_ebp)); - __asm__ volatile("movl %%edi, %0" : "=m" (iks->k_edi)); - __asm__ volatile("movl %%esi, %0" : "=m" (iks->k_esi)); - /* - * "Current" instruction pointer - */ - __asm__ volatile("movl $1f, %0\n1:" : "=m" (iks->k_eip)); + /* Display the trap handler path */ + __asm__ volatile ("movq %%rbx, %0" : "=m" (iks->k_rbx)); + __asm__ volatile ("movq %%rsp, %0" : "=m" (iks->k_rsp)); + __asm__ volatile ("movq %%rbp, %0" : "=m" (iks->k_rbp)); + __asm__ volatile ("movq %%r12, %0" : "=m" (iks->k_r12)); + __asm__ volatile ("movq %%r13, %0" : "=m" (iks->k_r13)); + __asm__ volatile ("movq %%r14, %0" : "=m" (iks->k_r14)); + __asm__ volatile ("movq %%r15, %0" : "=m" (iks->k_r15)); + /* "Current" instruction pointer */ + __asm__ volatile ("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" : "=m" (iks->k_rip)::"rax"); + } +} + +#if DEBUG +#define TERI 1 +#endif +#if TERI +extern void thread_exception_return_internal(void) __dead2; + +void +thread_exception_return(void) +{ + thread_t thread = current_thread(); + ml_set_interrupts_enabled(FALSE); + if (thread_is_64bit_addr(thread) != task_has_64Bit_addr(thread->task)) { + panic("Task/thread bitness mismatch %p %p, task: %d, thread: %d", thread, thread->task, thread_is_64bit_addr(thread), task_has_64Bit_addr(thread->task)); } + + if (thread_is_64bit_addr(thread)) { + if ((gdt_desc_p(USER64_CS)->access & ACC_PL_U) == 0) { + panic("64-GDT mismatch %p, descriptor: %p", thread, gdt_desc_p(USER64_CS)); + } + } else { + if ((gdt_desc_p(USER_CS)->access & ACC_PL_U) == 0) { + panic("32-GDT mismatch %p, descriptor: %p", thread, gdt_desc_p(USER_CS)); + } + } + assert(get_preemption_level() == 0); + thread_exception_return_internal(); } +#endif