X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/e5568f75972dfc723778653c11cb6b4dc825716a..cb3231590a3c94ab4375e2228bd5e86b0cf1ad7e:/osfmk/i386/trap.c diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 0554da8a5..bfc24c4aa 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -1,62 +1,65 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ /* */ + /* * Hardware trap/fault handler. 
*/ -#include -#include -#include -#include #include #include @@ -65,6 +68,8 @@ #include #include #include +#include /* panic_io_port_read() */ +#include #include #include @@ -74,9 +79,8 @@ #include #include -#include #include -#include +#include #include #include #include @@ -84,125 +88,151 @@ #include #include #include +#include +#if CONFIG_TELEMETRY +#include +#endif +#include +#include +#include -#if MACH_KGDB -#include -#endif /* MACH_KGDB */ - -#include - -#if MACH_KGDB -#include -#endif /* MACH_KGDB */ +#include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ +#include +#include +#include +#if CONFIG_MCA +#include +#endif +#include -#include +#include +#include +#include -#include +extern void throttle_lowpri_io(int); +extern void kprint_state(x86_saved_state64_t *saved_state); /* * Forward declarations */ -extern void user_page_fault_continue( - kern_return_t kr); +static void user_page_fault_continue(kern_return_t kret); +static void panic_trap(x86_saved_state64_t *saved_state, uint32_t pl, kern_return_t fault_result) __dead2; +static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip); -extern boolean_t v86_assist( - thread_t thread, - struct i386_saved_state *regs); +#if CONFIG_DTRACE +/* See */ +perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */ -extern boolean_t check_io_fault( - struct i386_saved_state *regs); +extern boolean_t dtrace_tally_fault(user_addr_t); +#endif -extern int inst_fetch( - int eip, - int cs); +extern boolean_t pmap_smep_enabled; +extern boolean_t pmap_smap_enabled; +__attribute__((noreturn)) void thread_syscall_return( - kern_return_t ret) + kern_return_t ret) { - register thread_act_t thr_act = current_act(); - register struct i386_saved_state *regs = USER_REGS(thr_act); - regs->eax = ret; - thread_exception_return(); - /*NOTREACHED*/ -} + thread_t thr_act = current_thread(); + boolean_t is_mach; + int code; + pal_register_cache_state(thr_act, 
DIRTY); -#if MACH_KDB -boolean_t debug_all_traps_with_kdb = FALSE; -extern struct db_watchpoint *db_watchpoint_list; -extern boolean_t db_watchpoints_inserted; -extern boolean_t db_breakpoints_inserted; + if (thread_is_64bit_addr(thr_act)) { + x86_saved_state64_t *regs; -void -thread_kdb_return(void) -{ - register thread_act_t thr_act = current_act(); - register thread_t cur_thr = current_thread(); - register struct i386_saved_state *regs = USER_REGS(thr_act); - - if (kdb_trap(regs->trapno, regs->err, regs)) { -#if MACH_LDEBUG - assert(cur_thr->mutex_count == 0); -#endif /* MACH_LDEBUG */ - check_simple_locks(); - thread_exception_return(); - /*NOTREACHED*/ + regs = USER_REGS64(thr_act); + + code = (int) (regs->rax & SYSCALL_NUMBER_MASK); + is_mach = (regs->rax & SYSCALL_CLASS_MASK) + == (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT); + if (kdebug_enable && is_mach) { + /* Mach trap */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC, code) | DBG_FUNC_END, + ret, 0, 0, 0, 0); + } + regs->rax = ret; +#if DEBUG + if (is_mach) { + DEBUG_KPRINT_SYSCALL_MACH( + "thread_syscall_return: 64-bit mach ret=%u\n", + ret); + } else { + DEBUG_KPRINT_SYSCALL_UNIX( + "thread_syscall_return: 64-bit unix ret=%u\n", + ret); + } +#endif + } else { + x86_saved_state32_t *regs; + + regs = USER_REGS32(thr_act); + + code = ((int) regs->eax); + is_mach = (code < 0); + if (kdebug_enable && is_mach) { + /* Mach trap */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC, -code) | DBG_FUNC_END, + ret, 0, 0, 0, 0); + } + regs->eax = ret; +#if DEBUG + if (is_mach) { + DEBUG_KPRINT_SYSCALL_MACH( + "thread_syscall_return: 32-bit mach ret=%u\n", + ret); + } else { + DEBUG_KPRINT_SYSCALL_UNIX( + "thread_syscall_return: 32-bit unix ret=%u\n", + ret); + } +#endif } -} -boolean_t let_ddb_vm_fault = FALSE; -#if NCPUS > 1 -extern int kdb_active[NCPUS]; -#endif /* NCPUS > 1 */ +#if DEBUG || DEVELOPMENT + kern_allocation_name_t + prior __assert_only = 
thread_get_kernel_state(thr_act)->allocation_name; + assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior)); +#endif /* DEBUG || DEVELOPMENT */ -#endif /* MACH_KDB */ + throttle_lowpri_io(1); -void + thread_exception_return(); + /*NOTREACHED*/ +} + + +static inline void user_page_fault_continue( - kern_return_t kr) + kern_return_t kr) { - register thread_act_t thr_act = current_act(); - register thread_t cur_thr = current_thread(); - register struct i386_saved_state *regs = USER_REGS(thr_act); - - if ((kr == KERN_SUCCESS) || (kr == KERN_ABORTED)) { -#if MACH_KDB - if (!db_breakpoints_inserted) { - db_set_breakpoints(); - } - if (db_watchpoint_list && - db_watchpoints_inserted && - (regs->err & T_PF_WRITE) && - db_find_watchpoint(thr_act->map, - (vm_offset_t)regs->cr2, - regs)) - kdb_trap(T_WATCHPOINT, 0, regs); -#endif /* MACH_KDB */ - thread_exception_return(); - /*NOTREACHED*/ - } + thread_t thread = current_thread(); + user_addr_t vaddr; -#if MACH_KDB - if (debug_all_traps_with_kdb && - kdb_trap(regs->trapno, regs->err, regs)) { -#if MACH_LDEBUG - assert(cur_thr->mutex_count == 0); -#endif /* MACH_LDEBUG */ - check_simple_locks(); - thread_exception_return(); - /*NOTREACHED*/ + if (thread_is_64bit_addr(thread)) { + x86_saved_state64_t *uregs; + + uregs = USER_REGS64(thread); + + vaddr = (user_addr_t)uregs->cr2; + } else { + x86_saved_state32_t *uregs; + + uregs = USER_REGS32(thread); + + vaddr = uregs->cr2; } -#endif /* MACH_KDB */ - i386_exception(EXC_BAD_ACCESS, kr, regs->cr2); + + /* PAL debug hook */ + pal_dbg_page_fault( thread, vaddr, kr ); + + i386_exception(EXC_BAD_ACCESS, kr, vaddr); /*NOTREACHED*/ } @@ -210,928 +240,927 @@ user_page_fault_continue( * Fault recovery in copyin/copyout routines. 
*/ struct recovery { - int fault_addr; - int recover_addr; + uintptr_t fault_addr; + uintptr_t recover_addr; }; -extern struct recovery recover_table[]; -extern struct recovery recover_table_end[]; +extern struct recovery recover_table[]; +extern struct recovery recover_table_end[]; + +const char * trap_type[] = {TRAP_NAMES}; +unsigned TRAP_TYPES = sizeof(trap_type) / sizeof(trap_type[0]); + +extern void PE_incoming_interrupt(int interrupt); + +#if defined(__x86_64__) && DEBUG +void +kprint_state(x86_saved_state64_t *saved_state) +{ + kprintf("current_cpu_datap() 0x%lx\n", (uintptr_t)current_cpu_datap()); + kprintf("Current GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_GS_BASE)); + kprintf("Kernel GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_KERNEL_GS_BASE)); + kprintf("state at 0x%lx:\n", (uintptr_t) saved_state); + + kprintf(" rdi 0x%llx\n", saved_state->rdi); + kprintf(" rsi 0x%llx\n", saved_state->rsi); + kprintf(" rdx 0x%llx\n", saved_state->rdx); + kprintf(" r10 0x%llx\n", saved_state->r10); + kprintf(" r8 0x%llx\n", saved_state->r8); + kprintf(" r9 0x%llx\n", saved_state->r9); + + kprintf(" cr2 0x%llx\n", saved_state->cr2); + kprintf("real cr2 0x%lx\n", get_cr2()); + kprintf(" r15 0x%llx\n", saved_state->r15); + kprintf(" r14 0x%llx\n", saved_state->r14); + kprintf(" r13 0x%llx\n", saved_state->r13); + kprintf(" r12 0x%llx\n", saved_state->r12); + kprintf(" r11 0x%llx\n", saved_state->r11); + kprintf(" rbp 0x%llx\n", saved_state->rbp); + kprintf(" rbx 0x%llx\n", saved_state->rbx); + kprintf(" rcx 0x%llx\n", saved_state->rcx); + kprintf(" rax 0x%llx\n", saved_state->rax); + + kprintf(" gs 0x%x\n", saved_state->gs); + kprintf(" fs 0x%x\n", saved_state->fs); + + kprintf(" isf.trapno 0x%x\n", saved_state->isf.trapno); + kprintf(" isf._pad 0x%x\n", saved_state->isf._pad); + kprintf(" isf.trapfn 0x%llx\n", saved_state->isf.trapfn); + kprintf(" isf.err 0x%llx\n", saved_state->isf.err); + kprintf(" isf.rip 0x%llx\n", saved_state->isf.rip); + kprintf(" isf.cs 0x%llx\n", 
saved_state->isf.cs); + kprintf(" isf.rflags 0x%llx\n", saved_state->isf.rflags); + kprintf(" isf.rsp 0x%llx\n", saved_state->isf.rsp); + kprintf(" isf.ss 0x%llx\n", saved_state->isf.ss); +} +#endif + /* - * Recovery from Successful fault in copyout does not - * return directly - it retries the pte check, since - * the 386 ignores write protection in kernel mode. + * Non-zero indicates latency assert is enabled and capped at valued + * absolute time units. */ -extern struct recovery retry_table[]; -extern struct recovery retry_table_end[]; -char * trap_type[] = {TRAP_NAMES}; -int TRAP_TYPES = sizeof(trap_type)/sizeof(trap_type[0]); +uint64_t interrupt_latency_cap = 0; +boolean_t ilat_assert = FALSE; + +void +interrupt_latency_tracker_setup(void) +{ + uint32_t ilat_cap_us; + if (PE_parse_boot_argn("interrupt_latency_cap_us", &ilat_cap_us, sizeof(ilat_cap_us))) { + interrupt_latency_cap = ilat_cap_us * NSEC_PER_USEC; + nanoseconds_to_absolutetime(interrupt_latency_cap, &interrupt_latency_cap); + } else { + interrupt_latency_cap = LockTimeOut; + } + PE_parse_boot_argn("-interrupt_latency_assert_enable", &ilat_assert, sizeof(ilat_assert)); +} + +void +interrupt_reset_latency_stats(void) +{ + uint32_t i; + for (i = 0; i < real_ncpus; i++) { + cpu_data_ptr[i]->cpu_max_observed_int_latency = + cpu_data_ptr[i]->cpu_max_observed_int_latency_vector = 0; + } +} + +void +interrupt_populate_latency_stats(char *buf, unsigned bufsize) +{ + uint32_t i, tcpu = ~0; + uint64_t cur_max = 0; + + for (i = 0; i < real_ncpus; i++) { + if (cur_max < cpu_data_ptr[i]->cpu_max_observed_int_latency) { + cur_max = cpu_data_ptr[i]->cpu_max_observed_int_latency; + tcpu = i; + } + } + + if (tcpu < real_ncpus) { + snprintf(buf, bufsize, "0x%x 0x%x 0x%llx", tcpu, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency_vector, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency); + } +} + +uint32_t interrupt_timer_coalescing_enabled = 1; +uint64_t interrupt_coalesced_timers; + +/* + * Handle interrupts: + * - 
local APIC interrupts (IPIs, timers, etc) are handled by the kernel, + * - device interrupts go to the platform expert. + */ +void +interrupt(x86_saved_state_t *state) +{ + uint64_t rip; + uint64_t rsp; + int interrupt_num; + boolean_t user_mode = FALSE; + int ipl; + int cnum = cpu_number(); + cpu_data_t *cdp = cpu_data_ptr[cnum]; + int itype = DBG_INTR_TYPE_UNKNOWN; + int handled; + + x86_saved_state64_t *state64 = saved_state64(state); + rip = state64->isf.rip; + rsp = state64->isf.rsp; + interrupt_num = state64->isf.trapno; + if (state64->isf.cs & 0x03) { + user_mode = TRUE; + } + + if (cpu_data_ptr[cnum]->lcpu.package->num_idle == topoParms.nLThreadsPerPackage) { + cpu_data_ptr[cnum]->cpu_hwIntpexits[interrupt_num]++; + } + + if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT)) { + itype = DBG_INTR_TYPE_IPI; + } else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT)) { + itype = DBG_INTR_TYPE_TIMER; + } else { + itype = DBG_INTR_TYPE_OTHER; + } + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START, + interrupt_num, + (user_mode ? rip : VM_KERNEL_UNSLIDE(rip)), + user_mode, itype, 0); + + SCHED_STATS_INTERRUPT(current_processor()); + +#if CONFIG_TELEMETRY + if (telemetry_needs_record) { + telemetry_mark_curthread(user_mode, FALSE); + } +#endif + + ipl = get_preemption_level(); + + /* + * Handle local APIC interrupts + * else call platform expert for devices. + */ + handled = lapic_interrupt(interrupt_num, state); + + if (!handled) { + if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_CMCI_INTERRUPT)) { + /* + * CMCI can be signalled on any logical processor, and the kexts + * that implement handling CMCI use IOKit to register handlers for + * the CMCI vector, so if we see a CMCI, do not encode a CPU + * number in bits 8:31 (since the vector is the same regardless of + * the handling CPU). 
+ */ + PE_incoming_interrupt(interrupt_num); + } else if (cnum <= lapic_max_interrupt_cpunum) { + PE_incoming_interrupt((cnum << 8) | interrupt_num); + } + } + + if (__improbable(get_preemption_level() != ipl)) { + panic("Preemption level altered by interrupt vector 0x%x: initial 0x%x, final: 0x%x\n", interrupt_num, ipl, get_preemption_level()); + } + + + if (__improbable(cdp->cpu_nested_istack)) { + cdp->cpu_nested_istack_events++; + } else { + uint64_t ctime = mach_absolute_time(); + uint64_t int_latency = ctime - cdp->cpu_int_event_time; + uint64_t esdeadline, ehdeadline; + /* Attempt to process deferred timers in the context of + * this interrupt, unless interrupt time has already exceeded + * TCOAL_ILAT_THRESHOLD. + */ +#define TCOAL_ILAT_THRESHOLD (30000ULL) + + if ((int_latency < TCOAL_ILAT_THRESHOLD) && + interrupt_timer_coalescing_enabled) { + esdeadline = cdp->rtclock_timer.queue.earliest_soft_deadline; + ehdeadline = cdp->rtclock_timer.deadline; + if ((ctime >= esdeadline) && (ctime < ehdeadline)) { + interrupt_coalesced_timers++; + TCOAL_DEBUG(0x88880000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, interrupt_coalesced_timers, 0); + rtclock_intr(state); + TCOAL_DEBUG(0x88880000 | DBG_FUNC_END, ctime, esdeadline, interrupt_coalesced_timers, 0, 0); + } else { + TCOAL_DEBUG(0x77770000, ctime, cdp->rtclock_timer.queue.earliest_soft_deadline, cdp->rtclock_timer.deadline, interrupt_coalesced_timers, 0); + } + } + + if (__improbable(ilat_assert && (int_latency > interrupt_latency_cap) && !machine_timeout_suspended())) { + panic("Interrupt vector 0x%x exceeded interrupt latency threshold, 0x%llx absolute time delta, prior signals: 0x%x, current signals: 0x%x", interrupt_num, int_latency, cdp->cpu_prior_signals, cdp->cpu_signals); + } + + if (__improbable(int_latency > cdp->cpu_max_observed_int_latency)) { + cdp->cpu_max_observed_int_latency = int_latency; + cdp->cpu_max_observed_int_latency_vector = interrupt_num; + } + } + /* + * Having serviced the 
interrupt first, look at the interrupted stack depth. + */ + if (!user_mode) { + uint64_t depth = cdp->cpu_kernel_stack + + sizeof(struct thread_kernel_state) + + sizeof(struct i386_exception_link *) + - rsp; + if (__improbable(depth > kernel_stack_depth_max)) { + kernel_stack_depth_max = (vm_offset_t)depth; + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DEPTH), + (long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0); + } + } + + if (cnum == master_cpu) { + ml_entropy_collect(); + } + +#if KPERF + kperf_interrupt(); +#endif /* KPERF */ + + KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, + interrupt_num); + + assert(ml_get_interrupts_enabled() == FALSE); +} + +static inline void +reset_dr7(void) +{ + long dr7 = 0x400; /* magic dr7 reset value; 32 bit on i386, 64 bit on x86_64 */ + __asm__ volatile ("mov %0,%%dr7" : : "r" (dr7)); +} +#if MACH_KDP +unsigned kdp_has_active_watchpoints = 0; +#define NO_WATCHPOINTS (!kdp_has_active_watchpoints) +#else +#define NO_WATCHPOINTS 1 +#endif /* * Trap from kernel mode. Only page-fault errors are recoverable, * and then only in special circumstances. All other errors are * fatal. Return value indicates if trap was handled. 
*/ -boolean_t + +void kernel_trap( - register struct i386_saved_state *regs) + x86_saved_state_t *state, + uintptr_t *lo_spp) { - int exc; - int code; - int subcode; - int interruptible; - register int type; - vm_map_t map; - kern_return_t result; - register thread_t thread; - thread_act_t thr_act; - etap_data_t probe_data; - pt_entry_t *pte; - extern vm_offset_t vm_last_phys; - - type = regs->trapno; - code = regs->err; - thread = current_thread(); - thr_act = current_act(); + x86_saved_state64_t *saved_state; + int code; + user_addr_t vaddr; + int type; + vm_map_t map = 0; /* protected by T_PAGE_FAULT */ + kern_return_t result = KERN_FAILURE; + kern_return_t fault_result = KERN_SUCCESS; + thread_t thread; + boolean_t intr; + vm_prot_t prot; + struct recovery *rp; + vm_offset_t kern_ip; +#if NCOPY_WINDOWS > 0 + int fault_in_copy_window = -1; +#endif + int is_user; + int trap_pl = get_preemption_level(); - ETAP_DATA_LOAD(probe_data[0], regs->trapno); - ETAP_DATA_LOAD(probe_data[1], MACH_PORT_NULL); - ETAP_DATA_LOAD(probe_data[2], MACH_PORT_NULL); - ETAP_PROBE_DATA(ETAP_P_EXCEPTION, - 0, - thread, - &probe_data, - ETAP_DATA_ENTRY*3); + thread = current_thread(); - switch (type) { - case T_PREEMPT: - return (TRUE); + if (__improbable(is_saved_state32(state))) { + panic("kernel_trap(%p) with 32-bit state", state); + } + saved_state = saved_state64(state); - case T_NO_FPU: - fpnoextflt(); - return (TRUE); + /* Record cpu where state was captured */ + saved_state->isf.cpu = cpu_number(); - case T_FPU_FAULT: - fpextovrflt(); - return (TRUE); + vaddr = (user_addr_t)saved_state->cr2; + type = saved_state->isf.trapno; + code = (int)(saved_state->isf.err & 0xffff); + intr = (saved_state->isf.rflags & EFL_IF) != 0; /* state of ints at trap */ + kern_ip = (vm_offset_t)saved_state->isf.rip; - case T_FLOATING_POINT_ERROR: - fpexterrflt(); - return (TRUE); + is_user = (vaddr < VM_MAX_USER_PAGE_ADDRESS); - case T_PAGE_FAULT: - /* - * If the current map is a submap of the kernel 
map, - * and the address is within that map, fault on that - * map. If the same check is done in vm_fault - * (vm_map_lookup), we may deadlock on the kernel map - * lock. - */ -#if MACH_KDB - mp_disable_preemption(); - if (db_active -#if NCPUS > 1 - && kdb_active[cpu_number()] -#endif /* NCPUS > 1 */ - && !let_ddb_vm_fault) { +#if CONFIG_DTRACE + /* + * Is there a DTrace hook? + */ + if (__improbable(tempDTraceTrapHook != NULL)) { + if (tempDTraceTrapHook(type, state, lo_spp, 0) == KERN_SUCCESS) { /* - * Force kdb to handle this one. + * If it succeeds, we are done... */ - mp_enable_preemption(); - return (FALSE); - } - mp_enable_preemption(); -#endif /* MACH_KDB */ - subcode = regs->cr2; /* get faulting address */ - - if (subcode > LINEAR_KERNEL_ADDRESS) { - map = kernel_map; - subcode -= LINEAR_KERNEL_ADDRESS; - } else if (thr_act == THR_ACT_NULL || thread == THREAD_NULL) - map = kernel_map; - else { - map = thr_act->map; + return; } + } +#endif /* CONFIG_DTRACE */ + + /* + * we come here with interrupts off as we don't want to recurse + * on preemption below. but we do want to re-enable interrupts + * as soon we possibly can to hold latency down + */ + if (__improbable(T_PREEMPT == type)) { + ast_taken_kernel(); -#if MACH_KDB + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, + 0, 0, 0, VM_KERNEL_UNSLIDE(kern_ip), 0); + return; + } + + user_addr_t kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, + (unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user, + VM_KERNEL_UNSLIDE(kern_ip), 0); + + + if (T_PAGE_FAULT == type) { /* - * Check for watchpoint on kernel static data. 
- * vm_fault would fail in this case + * assume we're faulting in the kernel map */ - if (map == kernel_map && - db_watchpoint_list && - db_watchpoints_inserted && - (code & T_PF_WRITE) && - (vm_offset_t)subcode < vm_last_phys && - ((*(pte = pmap_pte(kernel_pmap, (vm_offset_t)subcode))) & - INTEL_PTE_WRITE) == 0) { - *pte = INTEL_PTE_VALID | INTEL_PTE_WRITE | - pa_to_pte(trunc_page((vm_offset_t)subcode) - - VM_MIN_KERNEL_ADDRESS); - result = KERN_SUCCESS; - } else -#endif /* MACH_KDB */ - { - /* - * Since the 386 ignores write protection in - * kernel mode, always try for write permission - * first. If that fails and the fault was a - * read fault, retry with read permission. + map = kernel_map; + + if (__probable(thread != THREAD_NULL && thread->map != kernel_map)) { +#if NCOPY_WINDOWS > 0 + vm_offset_t copy_window_base; + vm_offset_t kvaddr; + int window_index; + + kvaddr = (vm_offset_t)vaddr; + /* + * must determine if fault occurred in + * the copy window while pre-emption is + * disabled for this processor so that + * we only need to look at the window + * associated with this processor */ - if (map == kernel_map) { - register struct recovery *rp; - - interruptible = THREAD_UNINT; - for (rp = recover_table; rp < recover_table_end; rp++) { - if (regs->eip == rp->fault_addr) { - interruptible = THREAD_ABORTSAFE; - break; - } + copy_window_base = current_cpu_datap()->cpu_copywindow_base; + + if (kvaddr >= copy_window_base && kvaddr < (copy_window_base + (NBPDE * NCOPY_WINDOWS))) { + window_index = (int)((kvaddr - copy_window_base) / NBPDE); + + if (thread->machine.copy_window[window_index].user_base != (user_addr_t)-1) { + kvaddr -= (copy_window_base + (NBPDE * window_index)); + vaddr = thread->machine.copy_window[window_index].user_base + kvaddr; + + map = thread->map; + fault_in_copy_window = window_index; } } +#else + if (__probable(vaddr < VM_MAX_USER_PAGE_ADDRESS)) { + /* fault occurred in userspace */ + map = thread->map; + + /* Intercept a potential 
Supervisor Mode Execute + * Protection fault. These criteria identify + * both NX faults and SMEP faults, but both + * are fatal. We avoid checking PTEs (racy). + * (The VM could just redrive a SMEP fault, hence + * the intercept). + */ + if (__improbable((code == (T_PF_PROT | T_PF_EXECUTE)) && + (pmap_smep_enabled) && (saved_state->isf.rip == vaddr))) { + goto debugger_entry; + } + + /* + * Additionally check for SMAP faults... + * which are characterized by page-present and + * the AC bit unset (i.e. not from copyin/out path). + */ + if (__improbable(code & T_PF_PROT && + pmap_smap_enabled && + (saved_state->isf.rflags & EFL_AC) == 0)) { + goto debugger_entry; + } - result = vm_fault(map, - trunc_page((vm_offset_t)subcode), - VM_PROT_READ|VM_PROT_WRITE, - FALSE, - (map == kernel_map) ? interruptible : THREAD_ABORTSAFE, NULL, 0); + /* + * If we're not sharing cr3 with the user + * and we faulted in copyio, + * then switch cr3 here and dismiss the fault. + */ + if (no_shared_cr3 && + (thread->machine.specFlags & CopyIOActive) && + map->pmap->pm_cr3 != get_cr3_base()) { + pmap_assert(current_cpu_datap()->cpu_pmap_pcid_enabled == FALSE); + set_cr3_raw(map->pmap->pm_cr3); + return; + } + if (__improbable(vaddr < PAGE_SIZE) && + ((thread->machine.specFlags & CopyIOActive) == 0)) { + goto debugger_entry; + } + } +#endif } -#if MACH_KDB - if (result == KERN_SUCCESS) { - /* Look for watchpoints */ - if (db_watchpoint_list && - db_watchpoints_inserted && - (code & T_PF_WRITE) && - db_find_watchpoint(map, - (vm_offset_t)subcode, regs)) - kdb_trap(T_WATCHPOINT, 0, regs); + } + + (void) ml_set_interrupts_enabled(intr); + + switch (type) { + case T_NO_FPU: + fpnoextflt(); + return; + + case T_FPU_FAULT: + fpextovrflt(); + return; + + case T_FLOATING_POINT_ERROR: + fpexterrflt(); + return; + + case T_SSE_FLOAT_ERROR: + fpSSEexterrflt(); + return; + + case T_INVALID_OPCODE: + fpUDflt(kern_ip); + goto debugger_entry; + + case T_DEBUG: + if ((saved_state->isf.rflags & EFL_TF) == 0 
&& NO_WATCHPOINTS) { + /* We've somehow encountered a debug + * register match that does not belong + * to the kernel debugger. + * This isn't supposed to happen. + */ + reset_dr7(); + return; } - else -#endif /* MACH_KDB */ - if ((code & T_PF_WRITE) == 0 && - result == KERN_PROTECTION_FAILURE) - { - /* - * Must expand vm_fault by hand, - * so that we can ask for read-only access - * but enter a (kernel)writable mapping. - */ - result = intel_read_fault(map, - trunc_page((vm_offset_t)subcode)); + goto debugger_entry; + case T_INT3: + goto debugger_entry; + case T_PAGE_FAULT: + +#if CONFIG_DTRACE + if (thread != THREAD_NULL && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ + if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */ + /* + * DTrace has "anticipated" the possibility of this fault, and has + * established the suitable recovery state. Drop down now into the + * recovery handling code in "case T_GENERAL_PROTECTION:". + */ + goto FALL_THROUGH; + } } +#endif /* CONFIG_DTRACE */ - if (result == KERN_SUCCESS) { - /* - * Certain faults require that we back up - * the EIP. 
- */ - register struct recovery *rp; - - for (rp = retry_table; rp < retry_table_end; rp++) { - if (regs->eip == rp->fault_addr) { - regs->eip = rp->recover_addr; - break; - } - } - return (TRUE); + prot = VM_PROT_READ; + + if (code & T_PF_WRITE) { + prot |= VM_PROT_WRITE; + } + if (code & T_PF_EXECUTE) { + prot |= VM_PROT_EXECUTE; } - /* fall through */ + fault_result = result = vm_fault(map, + vaddr, + prot, + FALSE, VM_KERN_MEMORY_NONE, + THREAD_UNINT, NULL, 0); - case T_GENERAL_PROTECTION: + if (result == KERN_SUCCESS) { +#if NCOPY_WINDOWS > 0 + if (fault_in_copy_window != -1) { + ml_set_interrupts_enabled(FALSE); + copy_window_fault(thread, map, + fault_in_copy_window); + (void) ml_set_interrupts_enabled(intr); + } +#endif /* NCOPY_WINDOWS > 0 */ + return; + } + /* + * fall through + */ +#if CONFIG_DTRACE +FALL_THROUGH: +#endif /* CONFIG_DTRACE */ + case T_GENERAL_PROTECTION: /* * If there is a failure recovery address * for this fault, go there. */ - { - register struct recovery *rp; - - for (rp = recover_table; - rp < recover_table_end; - rp++) { - if (regs->eip == rp->fault_addr) { - regs->eip = rp->recover_addr; - return (TRUE); + for (rp = recover_table; rp < recover_table_end; rp++) { + if (kern_ip == rp->fault_addr) { + set_recovery_ip(saved_state, rp->recover_addr); + return; } - } } /* - * Check thread recovery address also - - * v86 assist uses it. + * Check thread recovery address also. */ - if (thread->recover) { - regs->eip = thread->recover; - thread->recover = 0; - return (TRUE); + if (thread != THREAD_NULL && thread->recover) { + set_recovery_ip(saved_state, thread->recover); + thread->recover = 0; + return; } - + /* + * Unanticipated page-fault errors in kernel + * should not happen. + * + * fall through... + */ + default: /* - * Unanticipated page-fault errors in kernel - * should not happen. + * Exception 15 is reserved but some chips may generate it + * spuriously. Seen at startup on AMD Athlon-64. */ - /* fall through... 
*/ - - default: - /* - * ...and return failure, so that locore can call into - * debugger. + if (type == 15) { + kprintf("kernel_trap() ignoring spurious trap 15\n"); + return; + } +debugger_entry: + /* Ensure that the i386_kernel_state at the base of the + * current thread's stack (if any) is synchronized with the + * context at the moment of the trap, to facilitate + * access through the debugger. */ + sync_iss_to_iks(state); #if MACH_KDP - kdp_i386_trap(type, regs, result, regs->cr2); + if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr)) { + return; + } #endif - return (FALSE); } - return (TRUE); + pal_cli(); + panic_trap(saved_state, trap_pl, fault_result); + /* + * NO RETURN + */ } -/* - * Called if both kernel_trap() and kdb_trap() fail. - */ -void -panic_trap( - register struct i386_saved_state *regs) +static void +set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip) { - int code; - register int type; + saved_state->isf.rip = ip; +} + +static void +panic_trap(x86_saved_state64_t *regs, uint32_t pl, kern_return_t fault_result) +{ + const char *trapname = "Unknown"; + pal_cr_t cr0, cr2, cr3, cr4; + boolean_t potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE; + boolean_t potential_smap_fault = FALSE; + + pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); + assert(ml_get_interrupts_enabled() == FALSE); + current_cpu_datap()->cpu_fatal_trap_state = regs; + /* + * Issue an I/O port read if one has been requested - this is an + * event logic analyzers can use as a trigger point. 
+ */ + panic_io_port_read(); + + kprintf("CPU %d panic trap number 0x%x, rip 0x%016llx\n", + cpu_number(), regs->isf.trapno, regs->isf.rip); + kprintf("cr0 0x%016llx cr2 0x%016llx cr3 0x%016llx cr4 0x%016llx\n", + cr0, cr2, cr3, cr4); - type = regs->trapno; - code = regs->err; + if (regs->isf.trapno < TRAP_TYPES) { + trapname = trap_type[regs->isf.trapno]; + } + + if ((regs->isf.trapno == T_PAGE_FAULT) && (regs->isf.err == (T_PF_PROT | T_PF_EXECUTE)) && (regs->isf.rip == regs->cr2)) { + if (pmap_smep_enabled && (regs->isf.rip < VM_MAX_USER_PAGE_ADDRESS)) { + potential_smep_fault = TRUE; + } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + potential_kernel_NX_fault = TRUE; + } + } else if (pmap_smap_enabled && + regs->isf.trapno == T_PAGE_FAULT && + regs->isf.err & T_PF_PROT && + regs->cr2 < VM_MAX_USER_PAGE_ADDRESS && + regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + potential_smap_fault = TRUE; + } - printf("trap type %d, code = %x, pc = %x\n", - type, code, regs->eip); - panic("trap"); +#undef panic + panic("Kernel trap at 0x%016llx, type %d=%s, registers:\n" + "CR0: 0x%016llx, CR2: 0x%016llx, CR3: 0x%016llx, CR4: 0x%016llx\n" + "RAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n" + "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n" + "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" + "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" + "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" + "Fault CR2: 0x%016llx, Error code: 0x%016llx, Fault CPU: 0x%x%s%s%s%s, PL: %d, VF: %d\n", + regs->isf.rip, regs->isf.trapno, trapname, + cr0, cr2, cr3, cr4, + regs->rax, regs->rbx, regs->rcx, regs->rdx, + regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi, + regs->r8, regs->r9, regs->r10, regs->r11, + regs->r12, regs->r13, regs->r14, regs->r15, + regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, + regs->isf.ss & 0xFFFF, regs->cr2, regs->isf.err, regs->isf.cpu, + virtualized ? 
" VMM" : "", + potential_kernel_NX_fault ? " Kernel NX fault" : "", + potential_smep_fault ? " SMEP/User NX fault" : "", + potential_smap_fault ? " SMAP fault" : "", + pl, + fault_result); } +#if CONFIG_DTRACE +extern kern_return_t dtrace_user_probe(x86_saved_state_t *); +#endif + +#if DEBUG +uint32_t fsigs[2]; +uint32_t fsigns, fsigcs; +#endif /* * Trap from user mode. */ void user_trap( - register struct i386_saved_state *regs) + x86_saved_state_t *saved_state) { - int exc; - int code; - int subcode; - register int type; - vm_map_t map; - vm_prot_t prot; - kern_return_t result; - register thread_act_t thr_act = current_act(); - thread_t thread = (thr_act ? thr_act->thread : THREAD_NULL); - boolean_t kernel_act = FALSE; - etap_data_t probe_data; - - if (regs->efl & EFL_VM) { - /* - * If hardware assist can handle exception, - * continue execution. - */ - if (v86_assist(thread, regs)) - return; + int exc; + int err; + mach_exception_code_t code; + mach_exception_subcode_t subcode; + int type; + user_addr_t vaddr; + vm_prot_t prot; + thread_t thread = current_thread(); + kern_return_t kret; + user_addr_t rip; + unsigned long dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */ + + assert((is_saved_state32(saved_state) && !thread_is_64bit_addr(thread)) || + (is_saved_state64(saved_state) && thread_is_64bit_addr(thread))); + + if (is_saved_state64(saved_state)) { + x86_saved_state64_t *regs; + + regs = saved_state64(saved_state); + + /* Record cpu where state was captured */ + regs->isf.cpu = cpu_number(); + + type = regs->isf.trapno; + err = (int)regs->isf.err & 0xffff; + vaddr = (user_addr_t)regs->cr2; + rip = (user_addr_t)regs->isf.rip; + } else { + x86_saved_state32_t *regs; + + regs = saved_state32(saved_state); + + /* Record cpu where state was captured */ + regs->cpu = cpu_number(); + + type = regs->trapno; + err = regs->err & 0xffff; + vaddr = (user_addr_t)regs->cr2; + rip = (user_addr_t)regs->eip; + } + + if ((type == T_DEBUG) && thread->machine.ids) { + 
unsigned long clear = 0; + /* Stash and clear this processor's DR6 value, in the event + * this was a debug register match + */ + __asm__ volatile ("mov %%db6, %0" : "=r" (dr6)); + __asm__ volatile ("mov %0, %%db6" : : "r" (clear)); } - type = regs->trapno; + pal_sti(); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE, + (unsigned)(vaddr >> 32), (unsigned)vaddr, + (unsigned)(rip >> 32), (unsigned)rip, 0); + code = 0; subcode = 0; + exc = 0; - switch (type) { +#if CONFIG_DTRACE + /* + * DTrace does not consume all user traps, only INT_3's for now. + * Avoid needlessly calling tempDTraceTrapHook here, and let the + * INT_3 case handle them. + */ +#endif + + DEBUG_KPRINT_SYSCALL_MASK(1, + "user_trap: type=0x%x(%s) err=0x%x cr2=%p rip=%p\n", + type, trap_type[type], err, (void *)(long) vaddr, (void *)(long) rip); - case T_DIVIDE_ERROR: + switch (type) { + case T_DIVIDE_ERROR: exc = EXC_ARITHMETIC; code = EXC_I386_DIV; break; - case T_DEBUG: + case T_DEBUG: + { + pcb_t pcb; + /* + * Update the PCB with this processor's DR6 value + * in the event this was a debug register match. + */ + pcb = THREAD_TO_PCB(thread); + if (pcb->ids) { + /* + * We can get and set the status register + * in 32-bit mode even on a 64-bit thread + * because the high order bits are not + * used on x86_64 + */ + if (thread_is_64bit_addr(thread)) { + x86_debug_state64_t *ids = pcb->ids; + ids->dr6 = dr6; + } else { /* 32 bit thread */ + x86_debug_state32_t *ids = pcb->ids; + ids->dr6 = (uint32_t) dr6; + } + } exc = EXC_BREAKPOINT; code = EXC_I386_SGL; break; - - case T_INT3: + } + case T_INT3: +#if CONFIG_DTRACE + if (dtrace_user_probe(saved_state) == KERN_SUCCESS) { + return; /* If it succeeds, we are done... 
*/ + } +#endif exc = EXC_BREAKPOINT; code = EXC_I386_BPT; break; - case T_OVERFLOW: + case T_OVERFLOW: exc = EXC_ARITHMETIC; code = EXC_I386_INTO; break; - case T_OUT_OF_BOUNDS: + case T_OUT_OF_BOUNDS: exc = EXC_SOFTWARE; code = EXC_I386_BOUND; break; - case T_INVALID_OPCODE: + case T_INVALID_OPCODE: +#if !defined(RC_HIDE_XNU_J137) + fpUDflt(rip); /* May return from exception directly */ +#endif exc = EXC_BAD_INSTRUCTION; code = EXC_I386_INVOP; break; - case T_NO_FPU: - case 32: /* XXX */ + case T_NO_FPU: fpnoextflt(); return; - case T_FPU_FAULT: - fpextovrflt(); + case T_FPU_FAULT: + fpextovrflt(); /* Propagates exception directly, doesn't return */ return; - case 10: /* invalid TSS == iret with NT flag set */ + case T_INVALID_TSS: /* invalid TSS == iret with NT flag set */ exc = EXC_BAD_INSTRUCTION; code = EXC_I386_INVTSSFLT; - subcode = regs->err & 0xffff; + subcode = err; break; - case T_SEGMENT_NOT_PRESENT: + case T_SEGMENT_NOT_PRESENT: exc = EXC_BAD_INSTRUCTION; code = EXC_I386_SEGNPFLT; - subcode = regs->err & 0xffff; + subcode = err; break; - case T_STACK_FAULT: + case T_STACK_FAULT: exc = EXC_BAD_INSTRUCTION; code = EXC_I386_STKFLT; - subcode = regs->err & 0xffff; + subcode = err; break; - case T_GENERAL_PROTECTION: - if (!(regs->efl & EFL_VM)) { - if (check_io_fault(regs)) - return; - } - exc = EXC_BAD_INSTRUCTION; + case T_GENERAL_PROTECTION: + /* + * There's a wide range of circumstances which generate this + * class of exception. From user-space, many involve bad + * addresses (such as a non-canonical 64-bit address). + * So we map this to EXC_BAD_ACCESS (and thereby SIGSEGV). + * The trouble is cr2 doesn't contain the faulting address; + * we'd need to decode the faulting instruction to really + * determine this. We'll leave that to debuggers. + * However, attempted execution of privileged instructions + * (e.g. 
cli) also generates GP faults and so we map these + * to EXC_BAD_ACCESS (and thence SIGSEGV) also - rather than + * EXC_BAD_INSTRUCTION which is more accurate. We just can't + * win! + */ + exc = EXC_BAD_ACCESS; code = EXC_I386_GPFLT; - subcode = regs->err & 0xffff; + subcode = err; break; - case T_PAGE_FAULT: - subcode = regs->cr2; - prot = VM_PROT_READ|VM_PROT_WRITE; - if (kernel_act == FALSE) { - if (!(regs->err & T_PF_WRITE)) - prot = VM_PROT_READ; - (void) user_page_fault_continue(vm_fault(thr_act->map, - trunc_page((vm_offset_t)subcode), - prot, - FALSE, - THREAD_ABORTSAFE, NULL, 0)); - /* NOTREACHED */ + case T_PAGE_FAULT: + { + prot = VM_PROT_READ; + + if (err & T_PF_WRITE) { + prot |= VM_PROT_WRITE; } - else { - if (subcode > LINEAR_KERNEL_ADDRESS) { - map = kernel_map; - subcode -= LINEAR_KERNEL_ADDRESS; - } - result = vm_fault(thr_act->map, - trunc_page((vm_offset_t)subcode), - prot, - FALSE, - (map == kernel_map) ? THREAD_UNINT : THREAD_ABORTSAFE, NULL, 0); - if ((result != KERN_SUCCESS) && (result != KERN_ABORTED)) { - /* - * Must expand vm_fault by hand, - * so that we can ask for read-only access - * but enter a (kernel) writable mapping. 
- */ - result = intel_read_fault(thr_act->map, - trunc_page((vm_offset_t)subcode)); + if (__improbable(err & T_PF_EXECUTE)) { + prot |= VM_PROT_EXECUTE; + } +#if DEVELOPMENT || DEBUG + uint32_t fsig = 0; + fsig = thread_fpsimd_hash(thread); +#if DEBUG + fsigs[0] = fsig; +#endif +#endif + kret = vm_fault(thread->map, + vaddr, + prot, FALSE, VM_KERN_MEMORY_NONE, + THREAD_ABORTSAFE, NULL, 0); +#if DEVELOPMENT || DEBUG + if (fsig) { + uint32_t fsig2 = thread_fpsimd_hash(thread); +#if DEBUG + fsigcs++; + fsigs[1] = fsig2; +#endif + if (fsig != fsig2) { + panic("FP/SIMD state hash mismatch across fault thread: %p 0x%x->0x%x", thread, fsig, fsig2); } - user_page_fault_continue(result); + } else { +#if DEBUG + fsigns++; +#endif + } +#endif + if (__probable((kret == KERN_SUCCESS) || (kret == KERN_ABORTED))) { + thread_exception_return(); /*NOTREACHED*/ } - break; - - case T_FLOATING_POINT_ERROR: - fpexterrflt(); - return; - - default: -#if MACH_KGDB - Debugger("Unanticipated user trap"); - return; -#endif /* MACH_KGDB */ -#if MACH_KDB - if (kdb_trap(type, regs->err, regs)) - return; -#endif /* MACH_KDB */ - printf("user trap type %d, code = %x, pc = %x\n", - type, regs->err, regs->eip); - panic("user trap"); - return; - } - -#if MACH_KDB - if (debug_all_traps_with_kdb && - kdb_trap(type, regs->err, regs)) - return; -#endif /* MACH_KDB */ - -#if ETAP_EVENT_MONITOR - if (thread != THREAD_NULL) { - ETAP_DATA_LOAD(probe_data[0], regs->trapno); - ETAP_DATA_LOAD(probe_data[1], - thr_act->exc_actions[exc].port); - ETAP_DATA_LOAD(probe_data[2], - thr_act->task->exc_actions[exc].port); - ETAP_PROBE_DATA(ETAP_P_EXCEPTION, - 0, - thread, - &probe_data, - ETAP_DATA_ENTRY*3); - } -#endif /* ETAP_EVENT_MONITOR */ - - i386_exception(exc, code, subcode); - /*NOTREACHED*/ -} - -/* - * V86 mode assist for interrupt handling. 
- */ -boolean_t v86_assist_on = TRUE; -boolean_t v86_unsafe_ok = FALSE; -boolean_t v86_do_sti_cli = TRUE; -boolean_t v86_do_sti_immediate = FALSE; -#define V86_IRET_PENDING 0x4000 - -int cli_count = 0; -int sti_count = 0; - -boolean_t -v86_assist( - thread_t thread, - register struct i386_saved_state *regs) -{ - register struct v86_assist_state *v86 = &thread->top_act->mact.pcb->ims.v86s; - -/* - * Build an 8086 address. Use only when off is known to be 16 bits. - */ -#define Addr8086(seg,off) ((((seg) & 0xffff) << 4) + (off)) - -#define EFL_V86_SAFE ( EFL_OF | EFL_DF | EFL_TF \ - | EFL_SF | EFL_ZF | EFL_AF \ - | EFL_PF | EFL_CF ) - struct iret_32 { - int eip; - int cs; - int eflags; - }; - struct iret_16 { - unsigned short ip; - unsigned short cs; - unsigned short flags; - }; - union iret_struct { - struct iret_32 iret_32; - struct iret_16 iret_16; - }; - - struct int_vec { - unsigned short ip; - unsigned short cs; - }; - - if (!v86_assist_on) - return FALSE; - - /* - * If delayed STI pending, enable interrupts. - * Turn off tracing if on only to delay STI. - */ - if (v86->flags & V86_IF_PENDING) { - v86->flags &= ~V86_IF_PENDING; - v86->flags |= EFL_IF; - if ((v86->flags & EFL_TF) == 0) - regs->efl &= ~EFL_TF; - } - - if (regs->trapno == T_DEBUG) { - - if (v86->flags & EFL_TF) { /* - * Trace flag was also set - it has priority + * For a user trap, vm_fault() should never return KERN_FAILURE. + * If it does, we're leaking preemption disables somewhere in the kernel. */ - return FALSE; /* handle as single-step */ - } - /* - * Fall through to check for interrupts. - */ - } - else if (regs->trapno == T_GENERAL_PROTECTION) { - /* - * General protection error - must be an 8086 instruction - * to emulate. - */ - register int eip; - boolean_t addr_32 = FALSE; - boolean_t data_32 = FALSE; - int io_port; - - /* - * Set up error handler for bad instruction/data - * fetches. 
- */ - __asm__("movl $(addr_error), %0" : : "m" (thread->recover)); - - eip = regs->eip; - while (TRUE) { - unsigned char opcode; - - if (eip > 0xFFFF) { - thread->recover = 0; - return FALSE; /* GP fault: IP out of range */ + if (__improbable(kret == KERN_FAILURE)) { + panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread); } - opcode = *(unsigned char *)Addr8086(regs->cs,eip); - eip++; - switch (opcode) { - case 0xf0: /* lock */ - case 0xf2: /* repne */ - case 0xf3: /* repe */ - case 0x2e: /* cs */ - case 0x36: /* ss */ - case 0x3e: /* ds */ - case 0x26: /* es */ - case 0x64: /* fs */ - case 0x65: /* gs */ - /* ignore prefix */ - continue; - - case 0x66: /* data size */ - data_32 = TRUE; - continue; - - case 0x67: /* address size */ - addr_32 = TRUE; - continue; - - case 0xe4: /* inb imm */ - case 0xe5: /* inw imm */ - case 0xe6: /* outb imm */ - case 0xe7: /* outw imm */ - io_port = *(unsigned char *)Addr8086(regs->cs, eip); - eip++; - goto do_in_out; - - case 0xec: /* inb dx */ - case 0xed: /* inw dx */ - case 0xee: /* outb dx */ - case 0xef: /* outw dx */ - case 0x6c: /* insb */ - case 0x6d: /* insw */ - case 0x6e: /* outsb */ - case 0x6f: /* outsw */ - io_port = regs->edx & 0xffff; - - do_in_out: - if (!data_32) - opcode |= 0x6600; /* word IO */ - - switch (emulate_io(regs, opcode, io_port)) { - case EM_IO_DONE: - /* instruction executed */ - break; - case EM_IO_RETRY: - /* port mapped, retry instruction */ - thread->recover = 0; - return TRUE; - case EM_IO_ERROR: - /* port not mapped */ - thread->recover = 0; - return FALSE; - } - break; - - case 0xfa: /* cli */ - if (!v86_do_sti_cli) { - thread->recover = 0; - return (FALSE); - } - - v86->flags &= ~EFL_IF; - /* disable simulated interrupts */ - cli_count++; - break; - - case 0xfb: /* sti */ - if (!v86_do_sti_cli) { - thread->recover = 0; - return (FALSE); - } + user_page_fault_continue(kret); + } /* NOTREACHED */ + break; - if ((v86->flags & EFL_IF) == 0) { - if (v86_do_sti_immediate) { - 
v86->flags |= EFL_IF; - } else { - v86->flags |= V86_IF_PENDING; - regs->efl |= EFL_TF; - } - /* single step to set IF next inst. */ - } - sti_count++; - break; - - case 0x9c: /* pushf */ - { - int flags; - vm_offset_t sp; - int size; - - flags = regs->efl; - if ((v86->flags & EFL_IF) == 0) - flags &= ~EFL_IF; - - if ((v86->flags & EFL_TF) == 0) - flags &= ~EFL_TF; - else flags |= EFL_TF; - - sp = regs->uesp; - if (!addr_32) - sp &= 0xffff; - else if (sp > 0xffff) - goto stack_error; - size = (data_32) ? 4 : 2; - if (sp < size) - goto stack_error; - sp -= size; - if (copyout((char *)&flags, - (char *)Addr8086(regs->ss,sp), - size)) - goto addr_error; - if (addr_32) - regs->uesp = sp; - else - regs->uesp = (regs->uesp & 0xffff0000) | sp; - break; - } - - case 0x9d: /* popf */ - { - vm_offset_t sp; - int nflags; - - sp = regs->uesp; - if (!addr_32) - sp &= 0xffff; - else if (sp > 0xffff) - goto stack_error; - - if (data_32) { - if (sp > 0xffff - sizeof(int)) - goto stack_error; - nflags = *(int *)Addr8086(regs->ss,sp); - sp += sizeof(int); - } - else { - if (sp > 0xffff - sizeof(short)) - goto stack_error; - nflags = *(unsigned short *) - Addr8086(regs->ss,sp); - sp += sizeof(short); - } - if (addr_32) - regs->uesp = sp; - else - regs->uesp = (regs->uesp & 0xffff0000) | sp; - - if (v86->flags & V86_IRET_PENDING) { - v86->flags = nflags & (EFL_TF | EFL_IF); - v86->flags |= V86_IRET_PENDING; - } else { - v86->flags = nflags & (EFL_TF | EFL_IF); - } - regs->efl = (regs->efl & ~EFL_V86_SAFE) - | (nflags & EFL_V86_SAFE); - break; - } - case 0xcf: /* iret */ - { - vm_offset_t sp; - int nflags; - int size; - union iret_struct iret_struct; - - v86->flags &= ~V86_IRET_PENDING; - sp = regs->uesp; - if (!addr_32) - sp &= 0xffff; - else if (sp > 0xffff) - goto stack_error; - - if (data_32) { - if (sp > 0xffff - sizeof(struct iret_32)) - goto stack_error; - iret_struct.iret_32 = - *(struct iret_32 *) Addr8086(regs->ss,sp); - sp += sizeof(struct iret_32); - } - else { - if (sp > 
0xffff - sizeof(struct iret_16)) - goto stack_error; - iret_struct.iret_16 = - *(struct iret_16 *) Addr8086(regs->ss,sp); - sp += sizeof(struct iret_16); - } - if (addr_32) - regs->uesp = sp; - else - regs->uesp = (regs->uesp & 0xffff0000) | sp; - - if (data_32) { - eip = iret_struct.iret_32.eip; - regs->cs = iret_struct.iret_32.cs & 0xffff; - nflags = iret_struct.iret_32.eflags; - } - else { - eip = iret_struct.iret_16.ip; - regs->cs = iret_struct.iret_16.cs; - nflags = iret_struct.iret_16.flags; - } - - v86->flags = nflags & (EFL_TF | EFL_IF); - regs->efl = (regs->efl & ~EFL_V86_SAFE) - | (nflags & EFL_V86_SAFE); - break; - } - default: - /* - * Instruction not emulated here. - */ - thread->recover = 0; - return FALSE; - } - break; /* exit from 'while TRUE' */ - } - regs->eip = (regs->eip & 0xffff0000 | eip); - } - else { - /* - * Not a trap we handle. - */ - thread->recover = 0; - return FALSE; - } - - if ((v86->flags & EFL_IF) && ((v86->flags & V86_IRET_PENDING)==0)) { + case T_SSE_FLOAT_ERROR: + fpSSEexterrflt(); /* Propagates exception directly, doesn't return */ + return; - struct v86_interrupt_table *int_table; - int int_count; - int vec; - int i; - int_table = (struct v86_interrupt_table *) v86->int_table; - int_count = v86->int_count; + case T_FLOATING_POINT_ERROR: + fpexterrflt(); /* Propagates exception directly, doesn't return */ + return; - vec = 0; - for (i = 0; i < int_count; int_table++, i++) { - if (!int_table->mask && int_table->count > 0) { - int_table->count--; - vec = int_table->vec; - break; + case T_DTRACE_RET: +#if CONFIG_DTRACE + if (dtrace_user_probe(saved_state) == KERN_SUCCESS) { + return; /* If it succeeds, we are done... 
*/ } - } - if (vec != 0) { +#endif /* - * Take this interrupt + * If we get an INT 0x7f when we do not expect to, + * treat it as an illegal instruction */ - vm_offset_t sp; - struct iret_16 iret_16; - struct int_vec int_vec; - - sp = regs->uesp & 0xffff; - if (sp < sizeof(struct iret_16)) - goto stack_error; - sp -= sizeof(struct iret_16); - iret_16.ip = regs->eip; - iret_16.cs = regs->cs; - iret_16.flags = regs->efl & 0xFFFF; - if ((v86->flags & EFL_TF) == 0) - iret_16.flags &= ~EFL_TF; - else iret_16.flags |= EFL_TF; - - (void) memcpy((char *) &int_vec, - (char *) (sizeof(struct int_vec) * vec), - sizeof (struct int_vec)); - if (copyout((char *)&iret_16, - (char *)Addr8086(regs->ss,sp), - sizeof(struct iret_16))) - goto addr_error; - regs->uesp = (regs->uesp & 0xFFFF0000) | (sp & 0xffff); - regs->eip = int_vec.ip; - regs->cs = int_vec.cs; - regs->efl &= ~EFL_TF; - v86->flags &= ~(EFL_IF | EFL_TF); - v86->flags |= V86_IRET_PENDING; - } - } - - thread->recover = 0; - return TRUE; - - /* - * On address error, report a page fault. - * XXX report GP fault - we don`t save - * the faulting address. - */ - addr_error: - __asm__("addr_error:;"); - thread->recover = 0; - return FALSE; - - /* - * On stack address error, return stack fault (12). - */ - stack_error: - thread->recover = 0; - regs->trapno = T_STACK_FAULT; - return FALSE; -} - -/* - * Handle AST traps for i386. - * Check for delayed floating-point exception from - * AT-bus machines. - */ - -extern void log_thread_action (thread_t, char *); - -void -i386_astintr(int preemption) -{ - int mycpu; - ast_t mask = AST_ALL; - spl_t s; - thread_t self = current_thread(); - - s = splsched(); /* block interrupts to check reasons */ - mp_disable_preemption(); - mycpu = cpu_number(); - if (need_ast[mycpu] & AST_I386_FP) { - /* - * AST was for delayed floating-point exception - - * FP interrupt occured while in kernel. - * Turn off this AST reason and handle the FPU error. 
- */ - - ast_off(AST_I386_FP); - mp_enable_preemption(); - splx(s); - - fpexterrflt(); - } - else { - /* - * Not an FPU trap. Handle the AST. - * Interrupts are still blocked. - */ - -#ifdef XXX - if (preemption) { - - /* - * We don't want to process any AST if we were in - * kernel-mode and the current thread is in any - * funny state (waiting and/or suspended). - */ - - thread_lock (self); - - if (thread_not_preemptable(self) || self->preempt) { - ast_off(AST_URGENT); - thread_unlock (self); - mp_enable_preemption(); - splx(s); - return; - } - else mask = AST_PREEMPTION; - mp_enable_preemption(); - -/* - self->preempt = TH_NOT_PREEMPTABLE; -*/ - - thread_unlock (self); - } else { - mp_enable_preemption(); - } -#else - mp_enable_preemption(); -#endif + exc = EXC_BAD_INSTRUCTION; + code = EXC_I386_INVOP; + break; - ast_taken(mask, s -#if FAST_IDLE - ,NO_IDLE_THREAD -#endif /* FAST_IDLE */ - ); -/* - self->preempt = TH_PREEMPTABLE; -*/ + default: + panic("Unexpected user trap, type %d", type); } + /* Note: Codepaths that directly return from user_trap() have pending + * ASTs processed in locore + */ + i386_exception(exc, code, subcode); + /* NOTREACHED */ } /* @@ -1146,151 +1175,132 @@ i386_astintr(int preemption) */ void i386_exception( - int exc, - int code, - int subcode) + int exc, + mach_exception_code_t code, + mach_exception_subcode_t subcode) { - spl_t s; - exception_data_type_t codes[EXCEPTION_CODE_MAX]; + mach_exception_data_type_t codes[EXCEPTION_CODE_MAX]; - /* - * Turn off delayed FPU error handling. 
- */ - s = splsched(); - mp_disable_preemption(); - ast_off(AST_I386_FP); - mp_enable_preemption(); - splx(s); - - codes[0] = code; /* new exception interface */ + DEBUG_KPRINT_SYSCALL_MACH("i386_exception: exc=%d code=0x%llx subcode=0x%llx\n", + exc, code, subcode); + codes[0] = code; /* new exception interface */ codes[1] = subcode; - exception(exc, codes, 2); + exception_triage(exc, codes, 2); /*NOTREACHED*/ } -boolean_t -check_io_fault( - struct i386_saved_state *regs) -{ - int eip, opcode, io_port; - boolean_t data_16 = FALSE; - /* - * Get the instruction. - */ - eip = regs->eip; - - for (;;) { - opcode = inst_fetch(eip, regs->cs); - eip++; - switch (opcode) { - case 0x66: /* data-size prefix */ - data_16 = TRUE; - continue; - - case 0xf3: /* rep prefix */ - case 0x26: /* es */ - case 0x2e: /* cs */ - case 0x36: /* ss */ - case 0x3e: /* ds */ - case 0x64: /* fs */ - case 0x65: /* gs */ - continue; - - case 0xE4: /* inb imm */ - case 0xE5: /* inl imm */ - case 0xE6: /* outb imm */ - case 0xE7: /* outl imm */ - /* port is immediate byte */ - io_port = inst_fetch(eip, regs->cs); - eip++; - break; - - case 0xEC: /* inb dx */ - case 0xED: /* inl dx */ - case 0xEE: /* outb dx */ - case 0xEF: /* outl dx */ - case 0x6C: /* insb */ - case 0x6D: /* insl */ - case 0x6E: /* outsb */ - case 0x6F: /* outsl */ - /* port is in DX register */ - io_port = regs->edx & 0xFFFF; - break; - - default: - return FALSE; - } - break; - } +/* Synchronize a thread's x86_kernel_state (if any) with the given + * x86_saved_state_t obtained from the trap/IPI handler; called in + * kernel_trap() prior to entering the debugger, and when receiving + * an "MP_KDP" IPI. Called with null saved_state if an incoming IPI + * was detected from the kernel while spinning with interrupts masked. 
+ */ - if (data_16) - opcode |= 0x6600; /* word IO */ +void +sync_iss_to_iks(x86_saved_state_t *saved_state) +{ + struct x86_kernel_state *iks = NULL; + vm_offset_t kstack; + boolean_t record_active_regs = FALSE; - switch (emulate_io(regs, opcode, io_port)) { - case EM_IO_DONE: - /* instruction executed */ - regs->eip = eip; - return TRUE; + /* The PAL may have a special way to sync registers */ + if (saved_state && saved_state->flavor == THREAD_STATE_NONE) { + pal_get_kern_regs( saved_state ); + } - case EM_IO_RETRY: - /* port mapped, retry instruction */ - return TRUE; + if (current_thread() != NULL && + (kstack = current_thread()->kernel_stack) != 0) { + x86_saved_state64_t *regs = saved_state64(saved_state); + + iks = STACK_IKS(kstack); + + /* Did we take the trap/interrupt in kernel mode? */ + if (saved_state == NULL || /* NULL => polling in kernel */ + regs == USER_REGS64(current_thread())) { + record_active_regs = TRUE; + } else { + iks->k_rbx = regs->rbx; + iks->k_rsp = regs->isf.rsp; + iks->k_rbp = regs->rbp; + iks->k_r12 = regs->r12; + iks->k_r13 = regs->r13; + iks->k_r14 = regs->r14; + iks->k_r15 = regs->r15; + iks->k_rip = regs->isf.rip; + } + } - case EM_IO_ERROR: - /* port not mapped */ - return FALSE; + if (record_active_regs == TRUE) { + /* Show the trap handler path */ + __asm__ volatile ("movq %%rbx, %0" : "=m" (iks->k_rbx)); + __asm__ volatile ("movq %%rsp, %0" : "=m" (iks->k_rsp)); + __asm__ volatile ("movq %%rbp, %0" : "=m" (iks->k_rbp)); + __asm__ volatile ("movq %%r12, %0" : "=m" (iks->k_r12)); + __asm__ volatile ("movq %%r13, %0" : "=m" (iks->k_r13)); + __asm__ volatile ("movq %%r14, %0" : "=m" (iks->k_r14)); + __asm__ volatile ("movq %%r15, %0" : "=m" (iks->k_r15)); + /* "Current" instruction pointer */ + __asm__ volatile ("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" + : "=m" (iks->k_rip) + : + : "rax"); } - return FALSE; } +/* + * This is used by the NMI interrupt handler (from mp.c) to + * unconditionally sync the trap handler context to the 
IKS + * irrespective of whether the NMI was fielded in kernel + * or user space. + */ void -kernel_preempt_check (void) +sync_iss_to_iks_unconditionally(__unused x86_saved_state_t *saved_state) { - mp_disable_preemption(); - if ((need_ast[cpu_number()] & AST_URGENT) && -#if NCPUS > 1 - get_interrupt_level() == 1 -#else /* NCPUS > 1 */ - get_interrupt_level() == 0 -#endif /* NCPUS > 1 */ - ) { - mp_enable_preemption_no_check(); - __asm__ volatile (" int $0xff"); - } else { - mp_enable_preemption_no_check(); + struct x86_kernel_state *iks; + vm_offset_t kstack; + + if ((kstack = current_thread()->kernel_stack) != 0) { + iks = STACK_IKS(kstack); + /* Display the trap handler path */ + __asm__ volatile ("movq %%rbx, %0" : "=m" (iks->k_rbx)); + __asm__ volatile ("movq %%rsp, %0" : "=m" (iks->k_rsp)); + __asm__ volatile ("movq %%rbp, %0" : "=m" (iks->k_rbp)); + __asm__ volatile ("movq %%r12, %0" : "=m" (iks->k_r12)); + __asm__ volatile ("movq %%r13, %0" : "=m" (iks->k_r13)); + __asm__ volatile ("movq %%r14, %0" : "=m" (iks->k_r14)); + __asm__ volatile ("movq %%r15, %0" : "=m" (iks->k_r15)); + /* "Current" instruction pointer */ + __asm__ volatile ("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" : "=m" (iks->k_rip)::"rax"); } } -#if MACH_KDB - -extern void db_i386_state(struct i386_saved_state *regs); +#if DEBUG +#define TERI 1 +#endif -#include +#if TERI +extern void thread_exception_return_internal(void) __dead2; -void -db_i386_state( - struct i386_saved_state *regs) +void +thread_exception_return(void) { - db_printf("eip %8x\n", regs->eip); - db_printf("trap %8x\n", regs->trapno); - db_printf("err %8x\n", regs->err); - db_printf("efl %8x\n", regs->efl); - db_printf("ebp %8x\n", regs->ebp); - db_printf("esp %8x\n", regs->esp); - db_printf("uesp %8x\n", regs->uesp); - db_printf("cs %8x\n", regs->cs & 0xff); - db_printf("ds %8x\n", regs->ds & 0xff); - db_printf("es %8x\n", regs->es & 0xff); - db_printf("fs %8x\n", regs->fs & 0xff); - db_printf("gs %8x\n", regs->gs & 0xff); - 
db_printf("ss %8x\n", regs->ss & 0xff); - db_printf("eax %8x\n", regs->eax); - db_printf("ebx %8x\n", regs->ebx); - db_printf("ecx %8x\n", regs->ecx); - db_printf("edx %8x\n", regs->edx); - db_printf("esi %8x\n", regs->esi); - db_printf("edi %8x\n", regs->edi); -} + thread_t thread = current_thread(); + ml_set_interrupts_enabled(FALSE); + if (thread_is_64bit_addr(thread) != task_has_64Bit_addr(thread->task)) { + panic("Task/thread bitness mismatch %p %p, task: %d, thread: %d", thread, thread->task, thread_is_64bit_addr(thread), task_has_64Bit_addr(thread->task)); + } -#endif /* MACH_KDB */ + if (thread_is_64bit_addr(thread)) { + if ((gdt_desc_p(USER64_CS)->access & ACC_PL_U) == 0) { + panic("64-GDT mismatch %p, descriptor: %p", thread, gdt_desc_p(USER64_CS)); + } + } else { + if ((gdt_desc_p(USER_CS)->access & ACC_PL_U) == 0) { + panic("32-GDT mismatch %p, descriptor: %p", thread, gdt_desc_p(USER_CS)); + } + } + assert(get_preemption_level() == 0); + thread_exception_return_internal(); +} +#endif