diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c
index 34bf4752a..478eb2b4e 100644
--- a/osfmk/i386/fpu.c
+++ b/osfmk/i386/fpu.c
@@ -1,16 +1,19 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
- *
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License.  Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
  *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
@@ -20,7 +23,7 @@
  * Please see the License for the specific language governing rights and
  * limitations under the License.
  *
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
@@ -53,71 +56,162 @@
 /*
  */

-#include <cpus.h>
+
 #include <mach/exception_types.h>
 #include <mach/i386/thread_status.h>
 #include <mach/i386/fp_reg.h>
+#include <mach/branch_predicates.h>

 #include <kern/mach_param.h>
+#include <kern/processor.h>
 #include <kern/thread.h>
 #include <kern/zalloc.h>
 #include <kern/misc_protos.h>
 #include <kern/spl.h>
 #include <kern/assert.h>
-#include <i386/thread.h>
+
+#include <libkern/OSAtomic.h>
+
+#include <architecture/i386/pio.h>
+#include <i386/cpuid.h>
 #include <i386/fpu.h>
-#include <i386/trap.h>
-#include <i386/pio.h>
+#include <i386/proc_reg.h>
 #include <i386/misc_protos.h>
+#include <i386/thread.h>
+#include <i386/trap.h>
+
+int		fp_kind = FP_NO;	/* not inited */
+zone_t		ifps_zone;		/* zone for FPU save area */
+
+#define ALIGNED(addr,size)	(((uintptr_t)(addr)&((size)-1))==0)
+
+/* Forward */
+
+extern void		fpinit(void);
+extern void		fp_save(
+				thread_t	thr_act);
+extern void		fp_load(
+				thread_t	thr_act);
+
+static void configure_mxcsr_capability_mask(struct x86_avx_thread_state *fps);
+
+struct x86_avx_thread_state initial_fp_state __attribute((aligned(64)));
+
+
+/* Global MXCSR capability bitmask */
+static unsigned int mxcsr_capability_mask;

-#if	0
-#include <i386/ipl.h>
-extern int curr_ipl;
-#define ASSERT_IPL(L) \
-{ \
-	if (curr_ipl != L) { \
-		printf("IPL is %d, expected %d\n", curr_ipl, L); \
-		panic("fpu: wrong ipl"); \
-	} \
+#define	fninit() \
+	__asm__ volatile("fninit")
+
+#define	fnstcw(control) \
+	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
+
+#define	fldcw(control) \
+	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
+
+#define	fnclex() \
+	__asm__ volatile("fnclex")
+
+#define	fnsave(state)  \
+	__asm__ volatile("fnsave %0" : "=m" (*state))
+
+#define	frstor(state) \
+	__asm__ volatile("frstor %0" : : "m" (state))
+
+#define fwait() \
+	__asm__("fwait");
+
+#define fxrstor(addr)	__asm__ __volatile__("fxrstor %0" : : "m" (*(addr)))
+#define fxsave(addr)	__asm__ __volatile__("fxsave %0" : "=m" (*(addr)))
+
+static uint32_t	fp_register_state_size = 0;
+static uint32_t fpu_YMM_present	= FALSE;
+static uint32_t	cpuid_reevaluated = 0;
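A note on the save-area macros above: FXSAVE writes a fixed 512-byte image and requires 16-byte alignment, while XSAVE (introduced below) requires 64 bytes, which is what the ALIGNED() checks and the aligned(64) attribute on initial_fp_state enforce. A user-space sketch (hypothetical demo program, not part of this patch) that exercises the same contract on an x86 machine and reads MXCSR and MXCSR_MASK back out of the image at their Intel SDM offsets, 24 and 28:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define ALIGNED(addr,size) (((uintptr_t)(addr)&((size)-1))==0)

    /* FXSAVE needs 16-byte alignment; XSAVE needs 64, so over-align */
    static uint8_t fxarea[512] __attribute__((aligned(64)));

    int main(void)
    {
        uint32_t mxcsr, mxcsr_mask;

        assert(ALIGNED(fxarea, 64));
        __asm__ volatile("fninit");
        __asm__ volatile("fxsave %0" : "=m" (*fxarea));

        memcpy(&mxcsr, fxarea + 24, sizeof(mxcsr));
        memcpy(&mxcsr_mask, fxarea + 28, sizeof(mxcsr_mask));
        /* A zero mask means "assume the default 0xffbf", as the kernel does */
        printf("MXCSR 0x%x, MXCSR_MASK 0x%x\n", mxcsr, mxcsr_mask);
        return 0;
    }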
+static void fpu_store_registers(void *, boolean_t); +static void fpu_load_registers(void *); + +extern void xsave64o(void); +extern void xrstor64o(void); + +#define XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM)) + +/* DRK: TODO replace opcodes with mnemonics when assembler support available */ + +static inline void xsetbv(uint32_t mask_hi, uint32_t mask_lo) { + __asm__ __volatile__(".short 0x010F\n\t.byte 0xD1" :: "a"(mask_lo), "d"(mask_hi), "c" (XCR0)); } -#else -#define ASSERT_IPL(L) -#endif -int fp_kind = FP_387; /* 80387 present */ -zone_t ifps_zone; /* zone for FPU save area */ +static inline void xsave(void *a) { + /* MOD 0x4, operand ECX 0x1 */ + __asm__ __volatile__(".short 0xAE0F\n\t.byte 0x21" :: "a"(XMASK), "d"(0), "c" (a)); +} -#if NCPUS == 1 -volatile thread_act_t fp_act = THR_ACT_NULL; - /* thread whose state is in FPU */ - /* always THR_ACT_NULL if emulating FPU */ -volatile thread_act_t fp_intr_act = THR_ACT_NULL; +static inline void xrstor(void *a) { + /* MOD 0x5, operand ECX 0x1 */ + __asm__ __volatile__(".short 0xAE0F\n\t.byte 0x29" :: "a"(XMASK), "d"(0), "c" (a)); +} +static inline void xsave64(void *a) { + /* Out of line call that executes in 64-bit mode on K32 */ + __asm__ __volatile__("call _xsave64o" :: "a"(XMASK), "d"(0), "c" (a)); +} -#define clear_fpu() \ - { \ - set_ts(); \ - fp_act = THR_ACT_NULL; \ - } +static inline void xrstor64(void *a) { + /* Out of line call that executes in 64-bit mode on K32 */ + __asm__ __volatile__("call _xrstor64o" :: "a"(XMASK), "d"(0), "c" (a)); +} -#else /* NCPUS > 1 */ -#define clear_fpu() \ - { \ - set_ts(); \ - } +static inline unsigned short +fnstsw(void) +{ + unsigned short status; + __asm__ volatile("fnstsw %0" : "=ma" (status)); + return(status); +} -#endif +/* + * Configure the initial FPU state presented to new threads. + * Determine the MXCSR capability mask, which allows us to mask off any + * potentially unsafe "reserved" bits before restoring the FPU context. + * *Not* per-cpu, assumes symmetry. + */ -/* Forward */ +static void +configure_mxcsr_capability_mask(struct x86_avx_thread_state *fps) +{ + /* XSAVE requires a 64 byte aligned store */ + assert(ALIGNED(fps, 64)); + /* Clear, to prepare for the diagnostic FXSAVE */ + bzero(fps, sizeof(*fps)); + + fpinit(); + fpu_store_registers(fps, FALSE); + + mxcsr_capability_mask = fps->fx_MXCSR_MASK; + + /* Set default mask value if necessary */ + if (mxcsr_capability_mask == 0) + mxcsr_capability_mask = 0xffbf; + + /* Clear vector register store */ + bzero(&fps->fx_XMM_reg[0][0], sizeof(fps->fx_XMM_reg)); + bzero(&fps->x_YMMH_reg[0][0], sizeof(fps->x_YMMH_reg)); + + fps->fp_valid = TRUE; + fps->fp_save_layout = fpu_YMM_present ? XSAVE32: FXSAVE32; + fpu_load_registers(fps); + + /* Poison values to trap unsafe usage */ + fps->fp_valid = 0xFFFFFFFF; + fps->fp_save_layout = FP_UNUSED; + + /* Re-enable FPU/SSE DNA exceptions */ + set_ts(); +} -extern void fpinit(void); -extern void fp_save( - thread_act_t thr_act); -extern void fp_load( - thread_act_t thr_act); /* * Look for FPU and initialize it. @@ -126,193 +220,371 @@ extern void fp_load( void init_fpu(void) { - unsigned short status, control; - +#if DEBUG + unsigned short status; + unsigned short control; +#endif /* * Check for FPU by initializing it, * then trying to read the correct bit patterns from * the control and status registers. 
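On the hand-assembled opcodes above: 0x0F 0x01 with ModRM 0xD1 is XSETBV (the .short 0x010F emits 0F 01 in little-endian order), and 0x0F 0xAE with ModRM 0x21 or 0x29 is XSAVE or XRSTOR with the memory operand in ECX, matching the "/4" and "/5" comments. With an assembler that knows the mnemonics, the equivalents would look like this (sketch; the helper names are illustrative, not part of this patch):

    #include <stdint.h>
    #include <stdio.h>

    /* XGETBV: read an extended control register (XCR0 holds the XFEM mask) */
    static inline uint64_t xgetbv_demo(uint32_t xcr)
    {
        uint32_t lo, hi;
        __asm__ __volatile__("xgetbv" : "=a" (lo), "=d" (hi) : "c" (xcr));
        return ((uint64_t)hi << 32) | lo;
    }

    /* XSETBV: ring-0 only; shown for symmetry with xsetbv() above */
    static inline void xsetbv_demo(uint32_t xcr, uint64_t value)
    {
        __asm__ __volatile__("xsetbv" : :
            "c" (xcr), "a" ((uint32_t)value), "d" ((uint32_t)(value >> 32)));
    }

    int main(void)
    {
        /* XGETBV executes only once the OS sets CR4.OSXSAVE, as init_fpu() does */
        printf("XCR0 = 0x%llx\n", (unsigned long long)xgetbv_demo(0));
        return 0;
    }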
*/ - set_cr0(get_cr0() & ~(CR0_EM|CR0_TS)); /* allow use of FPU */ - + set_cr0((get_cr0() & ~(CR0_EM|CR0_TS)) | CR0_NE); /* allow use of FPU */ fninit(); +#if DEBUG status = fnstsw(); fnstcw(&control); + + assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f)); +#endif + /* Advertise SSE support */ + if (cpuid_features() & CPUID_FEATURE_FXSR) { + fp_kind = FP_FXSR; + set_cr4(get_cr4() | CR4_OSFXS); + /* And allow SIMD exceptions if present */ + if (cpuid_features() & CPUID_FEATURE_SSE) { + set_cr4(get_cr4() | CR4_OSXMM); + } + fp_register_state_size = sizeof(struct x86_fx_thread_state); - if ((status & 0xff) == 0 && - (control & 0x103f) == 0x3f) - { -#if 0 - /* - * We have a FPU of some sort. - * Compare -infinity against +infinity - * to check whether we have a 287 or a 387. - */ - volatile double fp_infinity, fp_one, fp_zero; - fp_one = 1.0; - fp_zero = 0.0; - fp_infinity = fp_one / fp_zero; - if (fp_infinity == -fp_infinity) { - /* - * We have an 80287. - */ - fp_kind = FP_287; - __asm__ volatile(".byte 0xdb; .byte 0xe4"); /* fnsetpm */ - } - else + } else + panic("fpu is not FP_FXSR"); + + /* Configure the XSAVE context mechanism if the processor supports + * AVX/YMM registers + */ + if (cpuid_features() & CPUID_FEATURE_XSAVE) { + cpuid_xsave_leaf_t *xsp = &cpuid_info()->cpuid_xsave_leaf; + if (xsp->extended_state[0] & (uint32_t)XFEM_YMM) { + assert(xsp->extended_state[0] & (uint32_t) XFEM_SSE); + /* XSAVE container size for all features */ + assert(xsp->extended_state[2] == sizeof(struct x86_avx_thread_state)); + fp_register_state_size = sizeof(struct x86_avx_thread_state); + fpu_YMM_present = TRUE; + set_cr4(get_cr4() | CR4_OSXSAVE); + xsetbv(0, XMASK); + /* Re-evaluate CPUID, once, to reflect OSXSAVE */ + if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) + cpuid_set_info(); + /* DRK: consider verifying AVX offset with cpuid(d, ECX:2) */ + } + } + else + fpu_YMM_present = FALSE; + + fpinit(); + + /* + * Trap wait instructions. Turn off FPU for now. + */ + set_cr0(get_cr0() | CR0_TS | CR0_MP); +} + +/* + * Allocate and initialize FP state for current thread. + * Don't load state. + */ +static void * +fp_state_alloc(void) +{ + void *ifps = zalloc(ifps_zone); + +#if DEBUG + if (!(ALIGNED(ifps,64))) { + panic("fp_state_alloc: %p, %u, %p, %u", ifps, (unsigned) ifps_zone->elem_size, (void *) ifps_zone->free_elements, (unsigned) ifps_zone->alloc_size); + } #endif - { - /* - * We have a 387. - */ - fp_kind = FP_387; - } - /* - * Trap wait instructions. Turn off FPU for now. 
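The detection order in init_fpu() above mirrors what any code can observe through CPUID leaf 1: FXSR and SSE are EDX bits 24 and 25, and XSAVE, OSXSAVE, and AVX are ECX bits 26, 27, and 28. OSXSAVE only reads back as set after the kernel sets CR4.OSXSAVE, which is why the code re-runs cpuid_set_info() exactly once. A stand-alone probe (sketch, using the GCC/clang <cpuid.h> helper, outside XNU):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;
        printf("FXSR   : %s\n", (edx & (1u << 24)) ? "yes" : "no");
        printf("SSE    : %s\n", (edx & (1u << 25)) ? "yes" : "no");
        printf("XSAVE  : %s\n", (ecx & (1u << 26)) ? "yes" : "no");
        printf("OSXSAVE: %s\n", (ecx & (1u << 27)) ? "yes" : "no");
        printf("AVX    : %s\n", (ecx & (1u << 28)) ? "yes" : "no");
        return 0;
    }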
- */ - set_cr0(get_cr0() | CR0_TS | CR0_MP); + return ifps; +} + +static inline void +fp_state_free(void *ifps) +{ + zfree(ifps_zone, ifps); +} + +void clear_fpu(void) +{ + set_ts(); +} + + +static void fpu_load_registers(void *fstate) { + struct x86_fx_thread_state *ifps = fstate; + fp_save_layout_t layout = ifps->fp_save_layout; + + assert(layout == FXSAVE32 || layout == FXSAVE64 || layout == XSAVE32 || layout == XSAVE64); + assert(ALIGNED(ifps, 64)); + assert(ml_get_interrupts_enabled() == FALSE); + +#if DEBUG + if (layout == XSAVE32 || layout == XSAVE64) { + struct x86_avx_thread_state *iavx = fstate; + unsigned i; + /* Verify reserved bits in the XSAVE header*/ + if (iavx->_xh.xsbv & ~7) + panic("iavx->_xh.xsbv: 0x%llx", iavx->_xh.xsbv); + for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) + if (iavx->_xh.xhrsvd[i]) + panic("Reserved bit set"); + } + if (fpu_YMM_present) { + if (layout != XSAVE32 && layout != XSAVE64) + panic("Inappropriate layout: %u\n", layout); + } +#endif /* DEBUG */ + +#if defined(__i386__) + if (layout == FXSAVE32) { + /* Restore the compatibility/legacy mode XMM+x87 state */ + fxrstor(ifps); + } + else if (layout == FXSAVE64) { + fxrstor64(ifps); + } + else if (layout == XSAVE32) { + xrstor(ifps); } + else if (layout == XSAVE64) { + xrstor64(ifps); + } +#elif defined(__x86_64__) + if ((layout == XSAVE64) || (layout == XSAVE32)) + xrstor(ifps); else - { - /* - * NO FPU. - */ - fp_kind = FP_NO; - set_cr0(get_cr0() | CR0_EM); + fxrstor(ifps); +#endif +} + +static void fpu_store_registers(void *fstate, boolean_t is64) { + struct x86_fx_thread_state *ifps = fstate; + assert(ALIGNED(ifps, 64)); +#if defined(__i386__) + if (!is64) { + if (fpu_YMM_present) { + xsave(ifps); + ifps->fp_save_layout = XSAVE32; + } + else { + /* save the compatibility/legacy mode XMM+x87 state */ + fxsave(ifps); + ifps->fp_save_layout = FXSAVE32; + } + } + else { + if (fpu_YMM_present) { + xsave64(ifps); + ifps->fp_save_layout = XSAVE64; + } + else { + fxsave64(ifps); + ifps->fp_save_layout = FXSAVE64; + } } +#elif defined(__x86_64__) + if (fpu_YMM_present) { + xsave(ifps); + ifps->fp_save_layout = is64 ? XSAVE64 : XSAVE32; + } + else { + fxsave(ifps); + ifps->fp_save_layout = is64 ? FXSAVE64 : FXSAVE32; + } +#endif } /* * Initialize FP handling. */ + void fpu_module_init(void) { - ifps_zone = zinit(sizeof(struct i386_fpsave_state), - THREAD_MAX * sizeof(struct i386_fpsave_state), - THREAD_CHUNK * sizeof(struct i386_fpsave_state), - "i386 fpsave state"); + if ((fp_register_state_size != sizeof(struct x86_fx_thread_state)) && + (fp_register_state_size != sizeof(struct x86_avx_thread_state))) + panic("fpu_module_init: incorrect savearea size %u\n", fp_register_state_size); + + assert(fpu_YMM_present != 0xFFFFFFFF); + + /* We explicitly choose an allocation size of 64 + * to eliminate waste for the 832 byte sized + * AVX XSAVE register save area. + */ + ifps_zone = zinit(fp_register_state_size, + thread_max * fp_register_state_size, + 64 * fp_register_state_size, + "x86 fpsave state"); + +#if ZONE_DEBUG + /* To maintain the required alignment, disable + * zone debugging for this zone as that appends + * 16 bytes to each element. + */ + zone_debug_disable(ifps_zone); +#endif + /* Determine MXCSR reserved bits and configure initial FPU state*/ + configure_mxcsr_capability_mask(&initial_fp_state); } /* - * Free a FPU save area. - * Called only when thread terminating - no locking necessary. + * Save thread`s FPU context. 
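The 832-byte save area mentioned in fpu_module_init()'s comment is the legacy 512-byte FXSAVE image, plus the 64-byte XSAVE header, plus sixteen 16-byte YMM upper halves. A compile-time sanity check (illustrative constants, not XNU's struct definitions):

    enum {
        FXSAVE_IMAGE = 512,      /* x87 + XMM legacy region */
        XSAVE_HEADER = 64,       /* XSTATE_BV plus reserved bytes */
        YMMH_REGION  = 16 * 16   /* high 128 bits of YMM0..YMM15 */
    };

    _Static_assert(FXSAVE_IMAGE + XSAVE_HEADER + YMMH_REGION == 832,
        "AVX XSAVE area should be 832 bytes");

    int main(void) { return 0; }

The chunk size of 64 elements also works out neatly: 64 x 832 bytes is exactly thirteen 4 KB pages, so the zone's allocation chunks waste nothing.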
*/ void -fp_free(fps) - struct i386_fpsave_state *fps; +fpu_save_context(thread_t thread) { -ASSERT_IPL(SPL0); -#if NCPUS == 1 - if ((fp_act != THR_ACT_NULL) && (fp_act->mact.pcb->ims.ifps == fps)) { - /* - * Make sure we don't get FPU interrupts later for - * this thread - */ - fwait(); + struct x86_fx_thread_state *ifps; - /* Mark it free and disable access */ - clear_fpu(); + assert(ml_get_interrupts_enabled() == FALSE); + ifps = (thread)->machine.ifps; +#if DEBUG + if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) { + panic("ifps->fp_valid: %u\n", ifps->fp_valid); } -#endif /* NCPUS == 1 */ - zfree(ifps_zone, (vm_offset_t) fps); +#endif + if (ifps != 0 && (ifps->fp_valid == FALSE)) { + /* Clear CR0.TS in preparation for the FP context save. In + * theory, this shouldn't be necessary since a live FPU should + * indicate that TS is clear. However, various routines + * (such as sendsig & sigreturn) manipulate TS directly. + */ + clear_ts(); + /* registers are in FPU - save to memory */ + fpu_store_registers(ifps, (thread_is_64bit(thread) && is_saved_state64(thread->machine.iss))); + ifps->fp_valid = TRUE; + } + set_ts(); +} + + +/* + * Free a FPU save area. + * Called only when thread terminating - no locking necessary. + */ +void +fpu_free(void *fps) +{ + fp_state_free(fps); } /* - * Set the floating-point state for a thread. + * Set the floating-point state for a thread based + * on the FXSave formatted data. This is basically + * the same as fpu_set_state except it uses the + * expanded data structure. * If the thread is not the current thread, it is * not running (held). Locking needed against * concurrent fpu_set_state or fpu_get_state. */ kern_return_t -fpu_set_state( - thread_act_t thr_act, - struct i386_float_state *state) +fpu_set_fxstate( + thread_t thr_act, + thread_state_t tstate, + thread_flavor_t f) { - register pcb_t pcb; - register struct i386_fpsave_state *ifps; - register struct i386_fpsave_state *new_ifps; - -ASSERT_IPL(SPL0); + struct x86_fx_thread_state *ifps; + struct x86_fx_thread_state *new_ifps; + x86_float_state64_t *state; + pcb_t pcb; + size_t state_size = sizeof(struct x86_fx_thread_state); + boolean_t old_valid; if (fp_kind == FP_NO) return KERN_FAILURE; - assert(thr_act != THR_ACT_NULL); - pcb = thr_act->mact.pcb; - -#if NCPUS == 1 + state = (x86_float_state64_t *)tstate; - /* - * If this thread`s state is in the FPU, - * discard it; we are replacing the entire - * FPU state. - */ - if (fp_act == thr_act) { - fwait(); /* wait for possible interrupt */ - clear_fpu(); /* no state in FPU */ - } -#endif + assert(thr_act != THREAD_NULL); + pcb = THREAD_TO_PCB(thr_act); - if (state->initialized == 0) { + if (state == NULL) { /* * new FPU state is 'invalid'. * Deallocate the fp state if it exists. */ simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - pcb->ims.ifps = 0; + + ifps = pcb->ifps; + pcb->ifps = 0; + simple_unlock(&pcb->lock); - if (ifps != 0) { - zfree(ifps_zone, (vm_offset_t) ifps); - } - } - else { + if (ifps != 0) + fp_state_free(ifps); + } else { /* * Valid state. Allocate the fp state if there is none. 
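fpu_save_context() is one half of the lazy-switch protocol: fp_valid == FALSE means the newest state lives in the registers, and CR0.TS is set on the way out so the next FP touch by the incoming thread takes a device-not-available fault into fpnoextflt(). A plain-C model of the invariant (hypothetical names, no hardware involved):

    #include <assert.h>
    #include <stdbool.h>

    struct fpu_model {
        bool ts;        /* models CR0.TS: next FP use traps */
        bool fp_valid;  /* TRUE: memory copy current; FALSE: live in registers */
    };

    /* Switching away from a thread whose state is live in the registers */
    static void model_save_context(struct fpu_model *m)
    {
        if (!m->fp_valid) {
            m->ts = false;      /* clear_ts() before touching the FPU */
            /* fpu_store_registers() runs here in the real code */
            m->fp_valid = true;
        }
        m->ts = true;           /* set_ts(): arm the DNA trap */
    }

    int main(void)
    {
        struct fpu_model m = { .ts = false, .fp_valid = false };
        model_save_context(&m);
        assert(m.ts && m.fp_valid);  /* saved, and the trap is armed */
        return 0;
    }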
*/ - register struct i386_fp_save *user_fp_state; - register struct i386_fp_regs *user_fp_regs; - - user_fp_state = (struct i386_fp_save *) &state->hw_state[0]; - user_fp_regs = (struct i386_fp_regs *) - &state->hw_state[sizeof(struct i386_fp_save)]; - new_ifps = 0; Retry: simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; + + ifps = pcb->ifps; if (ifps == 0) { if (new_ifps == 0) { simple_unlock(&pcb->lock); - new_ifps = (struct i386_fpsave_state *) zalloc(ifps_zone); + new_ifps = fp_state_alloc(); goto Retry; } ifps = new_ifps; new_ifps = 0; - pcb->ims.ifps = ifps; + pcb->ifps = ifps; } + /* + * now copy over the new data. + */ + old_valid = ifps->fp_valid; +#if DEBUG + if ((old_valid == FALSE) && (thr_act != current_thread())) { + panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act); + } +#endif /* - * Ensure that reserved parts of the environment are 0. + * Clear any reserved bits in the MXCSR to prevent a GPF + * when issuing an FXRSTOR. */ - bzero((char *)&ifps->fp_save_state, sizeof(struct i386_fp_save)); - - ifps->fp_save_state.fp_control = user_fp_state->fp_control; - ifps->fp_save_state.fp_status = user_fp_state->fp_status; - ifps->fp_save_state.fp_tag = user_fp_state->fp_tag; - ifps->fp_save_state.fp_eip = user_fp_state->fp_eip; - ifps->fp_save_state.fp_cs = user_fp_state->fp_cs; - ifps->fp_save_state.fp_opcode = user_fp_state->fp_opcode; - ifps->fp_save_state.fp_dp = user_fp_state->fp_dp; - ifps->fp_save_state.fp_ds = user_fp_state->fp_ds; - ifps->fp_regs = *user_fp_regs; + + state->fpu_mxcsr &= mxcsr_capability_mask; + + bcopy((char *)&state->fpu_fcw, (char *)ifps, state_size); + + if (fpu_YMM_present) { + struct x86_avx_thread_state *iavx = (void *) ifps; + uint32_t fpu_nyreg = 0; + + if (f == x86_AVX_STATE32) + fpu_nyreg = 8; + else if (f == x86_AVX_STATE64) + fpu_nyreg = 16; + + if (fpu_nyreg) { + x86_avx_state64_t *ystate = (x86_avx_state64_t *) state; + bcopy(&ystate->__fpu_ymmh0, &iavx->x_YMMH_reg[0][0], fpu_nyreg * sizeof(_STRUCT_XMM_REG)); + } + + iavx->fp_save_layout = thread_is_64bit(thr_act) ? XSAVE64 : XSAVE32; + /* Sanitize XSAVE header */ + bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd)); + if (state_size == sizeof(struct x86_avx_thread_state)) + iavx->_xh.xsbv = (XFEM_YMM | XFEM_SSE | XFEM_X87); + else + iavx->_xh.xsbv = (XFEM_SSE | XFEM_X87); + } + else + ifps->fp_save_layout = thread_is_64bit(thr_act) ? FXSAVE64 : FXSAVE32; + ifps->fp_valid = old_valid; + + if (old_valid == FALSE) { + boolean_t istate = ml_set_interrupts_enabled(FALSE); + ifps->fp_valid = TRUE; + set_ts(); + ml_set_interrupts_enabled(istate); + } simple_unlock(&pcb->lock); + if (new_ifps != 0) - zfree(ifps_zone, (vm_offset_t) ifps); + fp_state_free(new_ifps); } - return KERN_SUCCESS; } @@ -323,98 +595,164 @@ ASSERT_IPL(SPL0); * concurrent fpu_set_state or fpu_get_state. 
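The MXCSR masking step in fpu_set_fxstate() above is load-bearing: FXRSTOR and XRSTOR take a #GP if any reserved MXCSR bit is set, and user-supplied state is untrusted. The step in isolation (sketch; 0xffbf is the pre-DAZ default that configure_mxcsr_capability_mask() also falls back to):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t sanitize_mxcsr(uint32_t user_mxcsr, uint32_t hw_mask)
    {
        if (hw_mask == 0)
            hw_mask = 0xffbf;   /* bit 6 (DAZ) missing on older parts */
        return user_mxcsr & hw_mask;
    }

    int main(void)
    {
        /* A hostile all-ones MXCSR is clipped to the supported bits */
        printf("0x%x\n", sanitize_mxcsr(0xffffffffu, 0));  /* prints ffbf */
        return 0;
    }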
*/ kern_return_t -fpu_get_state( - thread_act_t thr_act, - register struct i386_float_state *state) +fpu_get_fxstate( + thread_t thr_act, + thread_state_t tstate, + thread_flavor_t f) { - register pcb_t pcb; - register struct i386_fpsave_state *ifps; + struct x86_fx_thread_state *ifps; + x86_float_state64_t *state; + kern_return_t ret = KERN_FAILURE; + pcb_t pcb; + size_t state_size = sizeof(struct x86_fx_thread_state); -ASSERT_IPL(SPL0); if (fp_kind == FP_NO) - return KERN_FAILURE; + return KERN_FAILURE; + + state = (x86_float_state64_t *)tstate; - assert(thr_act != THR_ACT_NULL); - pcb = thr_act->mact.pcb; + assert(thr_act != THREAD_NULL); + pcb = THREAD_TO_PCB(thr_act); simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; + + ifps = pcb->ifps; if (ifps == 0) { - /* - * No valid floating-point state. - */ - simple_unlock(&pcb->lock); - bzero((char *)state, sizeof(struct i386_float_state)); - return KERN_SUCCESS; + /* + * No valid floating-point state. + */ + + bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw, + state_size); + + simple_unlock(&pcb->lock); + + return KERN_SUCCESS; } + /* + * Make sure we`ve got the latest fp state info + * If the live fpu state belongs to our target + */ + if (thr_act == current_thread()) { + boolean_t intr; - /* Make sure we`ve got the latest fp state info */ - /* If the live fpu state belongs to our target */ -#if NCPUS == 1 - if (thr_act == fp_act) -#else - if (thr_act == current_act()) -#endif - { - clear_ts(); - fp_save(thr_act); - clear_fpu(); + intr = ml_set_interrupts_enabled(FALSE); + + clear_ts(); + fp_save(thr_act); + clear_fpu(); + + (void)ml_set_interrupts_enabled(intr); } + if (ifps->fp_valid) { + bcopy((char *)ifps, (char *)&state->fpu_fcw, state_size); + if (fpu_YMM_present) { + struct x86_avx_thread_state *iavx = (void *) ifps; + uint32_t fpu_nyreg = 0; + + if (f == x86_AVX_STATE32) + fpu_nyreg = 8; + else if (f == x86_AVX_STATE64) + fpu_nyreg = 16; + + if (fpu_nyreg) { + x86_avx_state64_t *ystate = (x86_avx_state64_t *) state; + bcopy(&iavx->x_YMMH_reg[0][0], &ystate->__fpu_ymmh0, fpu_nyreg * sizeof(_STRUCT_XMM_REG)); + } + } - state->fpkind = fp_kind; - state->exc_status = 0; + ret = KERN_SUCCESS; + } + simple_unlock(&pcb->lock); - { - register struct i386_fp_save *user_fp_state; - register struct i386_fp_regs *user_fp_regs; + return ret; +} - state->initialized = ifps->fp_valid; - user_fp_state = (struct i386_fp_save *) &state->hw_state[0]; - user_fp_regs = (struct i386_fp_regs *) - &state->hw_state[sizeof(struct i386_fp_save)]; - /* - * Ensure that reserved parts of the environment are 0. 
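For orientation, fpu_get_fxstate() is what ultimately services a user-space thread_get_state() call with the x86_FLOAT_STATE64 flavor, and the current_thread() branch above exists precisely so a thread may query itself. A minimal caller (sketch, error handling elided):

    #include <mach/mach.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        x86_float_state64_t fs;
        mach_msg_type_number_t count = x86_FLOAT_STATE64_COUNT;

        kern_return_t kr = thread_get_state(mach_thread_self(),
            x86_FLOAT_STATE64, (thread_state_t)&fs, &count);
        if (kr != KERN_SUCCESS)
            return 1;
        /* A thread that never touched the FPU sees initial_fp_state here */
        printf("fcw 0x%x  mxcsr 0x%x\n",
            *(uint16_t *)&fs.__fpu_fcw, fs.__fpu_mxcsr);
        return 0;
    }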
- */ - bzero((char *)user_fp_state, sizeof(struct i386_fp_save)); - - user_fp_state->fp_control = ifps->fp_save_state.fp_control; - user_fp_state->fp_status = ifps->fp_save_state.fp_status; - user_fp_state->fp_tag = ifps->fp_save_state.fp_tag; - user_fp_state->fp_eip = ifps->fp_save_state.fp_eip; - user_fp_state->fp_cs = ifps->fp_save_state.fp_cs; - user_fp_state->fp_opcode = ifps->fp_save_state.fp_opcode; - user_fp_state->fp_dp = ifps->fp_save_state.fp_dp; - user_fp_state->fp_ds = ifps->fp_save_state.fp_ds; - *user_fp_regs = ifps->fp_regs; +/* + * the child thread is 'stopped' with the thread + * mutex held and is currently not known by anyone + * so no way for fpu state to get manipulated by an + * outside agency -> no need for pcb lock + */ + +void +fpu_dup_fxstate( + thread_t parent, + thread_t child) +{ + struct x86_fx_thread_state *new_ifps = NULL; + boolean_t intr; + pcb_t ppcb; + + ppcb = THREAD_TO_PCB(parent); + + if (ppcb->ifps == NULL) + return; + + if (child->machine.ifps) + panic("fpu_dup_fxstate: child's ifps non-null"); + + new_ifps = fp_state_alloc(); + + simple_lock(&ppcb->lock); + + if (ppcb->ifps != NULL) { + struct x86_fx_thread_state *ifps = ppcb->ifps; + /* + * Make sure we`ve got the latest fp state info + */ + intr = ml_set_interrupts_enabled(FALSE); + assert(current_thread() == parent); + clear_ts(); + fp_save(parent); + clear_fpu(); + + (void)ml_set_interrupts_enabled(intr); + + if (ifps->fp_valid) { + child->machine.ifps = new_ifps; + assert((fp_register_state_size == sizeof(struct x86_fx_thread_state)) || + (fp_register_state_size == sizeof(struct x86_avx_thread_state))); + bcopy((char *)(ppcb->ifps), + (char *)(child->machine.ifps), fp_register_state_size); + + /* Mark the new fp saved state as non-live. */ + /* Temporarily disabled: radar 4647827 + * new_ifps->fp_valid = TRUE; + */ + + /* + * Clear any reserved bits in the MXCSR to prevent a GPF + * when issuing an FXRSTOR. + */ + new_ifps->fx_MXCSR &= mxcsr_capability_mask; + new_ifps = NULL; + } } - simple_unlock(&pcb->lock); + simple_unlock(&ppcb->lock); - return KERN_SUCCESS; + if (new_ifps != NULL) + fp_state_free(new_ifps); } + /* * Initialize FPU. * - * Raise exceptions for: - * invalid operation - * divide by zero - * overflow - * - * Use 53-bit precision. */ + void fpinit(void) { unsigned short control; -ASSERT_IPL(SPL0); clear_ts(); fninit(); fnstcw(&control); control &= ~(FPC_PC|FPC_RC); /* Clear precision & rounding control */ - control |= (FPC_PC_53 | /* Set precision */ + control |= (FPC_PC_64 | /* Set precision */ FPC_RC_RN | /* round-to-nearest */ FPC_ZE | /* Suppress zero-divide */ FPC_OE | /* and overflow */ @@ -423,6 +761,9 @@ ASSERT_IPL(SPL0); FPC_DE | /* Allow denorms as operands */ FPC_PE); /* No trap for precision loss */ fldcw(control); + + /* Initialize SSE/SSE2 */ + __builtin_ia32_ldmxcsr(0x1f80); } /* @@ -432,41 +773,52 @@ ASSERT_IPL(SPL0); void fpnoextflt(void) { - /* - * Enable FPU use. - */ -ASSERT_IPL(SPL0); - clear_ts(); -#if NCPUS == 1 - - /* - * If this thread`s state is in the FPU, we are done. - */ - if (fp_act == current_act()) - return; - - /* Make sure we don't do fpsave() in fp_intr while doing fpsave() - * here if the current fpu instruction generates an error. - */ - fwait(); - /* - * If another thread`s state is in the FPU, save it. 
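The two initialization values fpinit() installs decode as follows: the x87 control word comes back to 0x037f (64-bit precision, round-to-nearest, all six exception masks set, matching the comments above), and 0x1f80 is the architectural MXCSR reset value (masks in bits 7-12 set, flags clear, round-to-nearest). A decode sketch:

    #include <stdio.h>

    int main(void)
    {
        unsigned short fcw = 0x037f;   /* what fpinit() reconstructs */
        unsigned int mxcsr = 0x1f80;   /* __builtin_ia32_ldmxcsr() argument */

        printf("x87 exception masks 0x%02x\n", fcw & 0x3f);          /* 0x3f */
        printf("x87 precision ctl   %u\n", (fcw >> 8) & 0x3);        /* 3 = 64-bit */
        printf("x87 rounding ctl    %u\n", (fcw >> 10) & 0x3);       /* 0 = nearest */
        printf("SSE exception masks 0x%02x\n", (mxcsr >> 7) & 0x3f); /* 0x3f */
        printf("SSE rounding ctl    %u\n", (mxcsr >> 13) & 0x3);     /* 0 = nearest */
        return 0;
    }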
- */ - if (fp_act != THR_ACT_NULL) { - fp_save(fp_act); + boolean_t intr; + thread_t thr_act; + pcb_t pcb; + struct x86_fx_thread_state *ifps = 0; + + thr_act = current_thread(); + pcb = THREAD_TO_PCB(thr_act); + + assert(fp_register_state_size != 0); + + if (pcb->ifps == 0 && !get_interrupt_level()) { + ifps = fp_state_alloc(); + bcopy((char *)&initial_fp_state, (char *)ifps, + fp_register_state_size); + if (!thread_is_64bit(thr_act)) { + ifps->fp_save_layout = fpu_YMM_present ? XSAVE32 : FXSAVE32; + } + else + ifps->fp_save_layout = fpu_YMM_present ? XSAVE64 : FXSAVE64; + ifps->fp_valid = TRUE; } + intr = ml_set_interrupts_enabled(FALSE); - /* - * Give this thread the FPU. - */ - fp_act = current_act(); + clear_ts(); /* Enable FPU use */ -#endif /* NCPUS == 1 */ + if (__improbable(get_interrupt_level())) { + /* + * Save current coprocessor context if valid + * Initialize coprocessor live context + */ + fp_save(thr_act); + fpinit(); + } else { + if (pcb->ifps == 0) { + pcb->ifps = ifps; + ifps = 0; + } + /* + * Load this thread`s state into coprocessor live context. + */ + fp_load(thr_act); + } + (void)ml_set_interrupts_enabled(intr); - /* - * Load this thread`s state into the FPU. - */ - fp_load(current_act()); + if (ifps) + fp_state_free(ifps); } /* @@ -477,29 +829,26 @@ ASSERT_IPL(SPL0); void fpextovrflt(void) { - register thread_act_t thr_act = current_act(); - register pcb_t pcb; - register struct i386_fpsave_state *ifps; + thread_t thr_act = current_thread(); + pcb_t pcb; + struct x86_fx_thread_state *ifps; + boolean_t intr; -#if NCPUS == 1 + intr = ml_set_interrupts_enabled(FALSE); - /* - * Is exception for the currently running thread? - */ - if (fp_act != thr_act) { - /* Uh oh... */ - panic("fpextovrflt"); - } -#endif + if (get_interrupt_level()) + panic("FPU segment overrun exception at interrupt context\n"); + if (current_task() == kernel_task) + panic("FPU segment overrun exception in kernel thread context\n"); /* * This is a non-recoverable error. * Invalidate the thread`s FPU state. */ - pcb = thr_act->mact.pcb; + pcb = THREAD_TO_PCB(thr_act); simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - pcb->ims.ifps = 0; + ifps = pcb->ifps; + pcb->ifps = 0; simple_unlock(&pcb->lock); /* @@ -513,8 +862,10 @@ fpextovrflt(void) */ clear_fpu(); + (void)ml_set_interrupts_enabled(intr); + if (ifps) - zfree(ifps_zone, (vm_offset_t) ifps); + zfree(ifps_zone, ifps); /* * Raise exception. @@ -530,52 +881,33 @@ fpextovrflt(void) void fpexterrflt(void) { - register thread_act_t thr_act = current_act(); + thread_t thr_act = current_thread(); + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; + boolean_t intr; -ASSERT_IPL(SPL0); -#if NCPUS == 1 - /* - * Since FPU errors only occur on ESC or WAIT instructions, - * the current thread should own the FPU. If it didn`t, - * we should have gotten the task-switched interrupt first. - */ - if (fp_act != THR_ACT_NULL) { - panic("fpexterrflt"); - return; - } + intr = ml_set_interrupts_enabled(FALSE); + + if (get_interrupt_level()) + panic("FPU error exception at interrupt context\n"); + if (current_task() == kernel_task) + panic("FPU error exception in kernel thread context\n"); - /* - * Check if we got a context switch between the interrupt and the AST - * This can happen if the interrupt arrived after the FPU AST was - * checked. In this case, raise the exception in fp_load when this - * thread next time uses the FPU. Remember exception condition in - * fp_valid (extended boolean 2). 
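fpnoextflt() above follows a classic allocate-early, publish-late pattern: zalloc() can block, so the save area is allocated before interrupts are masked, installed under the mask only if still needed, and the spare freed afterwards. The shape of it, boiled down (sketch with stand-ins for zalloc and the interrupt mask; none of these names are XNU's):

    #include <stdlib.h>

    static int  intr_disable(void)  { return 0; }  /* models ml_set_interrupts_enabled(FALSE) */
    static void intr_restore(int s) { (void)s; }   /* models restoring the old state */

    static void *published;                        /* models pcb->ifps */

    static void lazy_publish(void)
    {
        void *spare = NULL;

        if (published == NULL)
            spare = malloc(64);     /* may block: do it with interrupts enabled */

        int s = intr_disable();
        if (published == NULL) {
            published = spare;      /* we won the race: install it */
            spare = NULL;
        }
        intr_restore(s);

        free(spare);                /* non-NULL only if another path won */
    }

    int main(void) { lazy_publish(); return 0; }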
- */ - if (fp_intr_act != thr_act) { - if (fp_intr_act == THR_ACT_NULL) { - panic("fpexterrflt: fp_intr_act == THR_ACT_NULL"); - return; - } - fp_intr_act->mact.pcb->ims.ifps->fp_valid = 2; - fp_intr_act = THR_ACT_NULL; - return; - } - fp_intr_act = THR_ACT_NULL; -#else /* NCPUS == 1 */ /* * Save the FPU state and turn off the FPU. */ fp_save(thr_act); -#endif /* NCPUS == 1 */ + + (void)ml_set_interrupts_enabled(intr); /* * Raise FPU exception. - * Locking not needed on pcb->ims.ifps, + * Locking not needed on pcb->ifps, * since thread is running. */ i386_exception(EXC_ARITHMETIC, EXC_I386_EXTERR, - thr_act->mact.pcb->ims.ifps->fp_save_state.fp_status); + ifps->fx_status); + /*NOTREACHED*/ } @@ -586,19 +918,22 @@ ASSERT_IPL(SPL0); * . if called from fpu_get_state, pcb already locked. * . if called from fpnoextflt or fp_intr, we are single-cpu * . otherwise, thread is running. + * N.B.: Must be called with interrupts disabled */ void fp_save( - thread_act_t thr_act) + thread_t thr_act) { - register pcb_t pcb = thr_act->mact.pcb; - register struct i386_fpsave_state *ifps = pcb->ims.ifps; + pcb_t pcb = THREAD_TO_PCB(thr_act); + struct x86_fx_thread_state *ifps = pcb->ifps; + assert(ifps != 0); if (ifps != 0 && !ifps->fp_valid) { - /* registers are in FPU */ - ifps->fp_valid = TRUE; - fnsave(&ifps->fp_save_state); + assert((get_cr0() & CR0_TS) == 0); + /* registers are in FPU */ + ifps->fp_valid = TRUE; + fpu_store_registers(ifps, thread_is_64bit(thr_act)); } } @@ -610,157 +945,76 @@ fp_save( void fp_load( - thread_act_t thr_act) + thread_t thr_act) { - register pcb_t pcb = thr_act->mact.pcb; - register struct i386_fpsave_state *ifps; + pcb_t pcb = THREAD_TO_PCB(thr_act); + struct x86_fx_thread_state *ifps = pcb->ifps; -ASSERT_IPL(SPL0); - ifps = pcb->ims.ifps; - if (ifps == 0) { - ifps = (struct i386_fpsave_state *) zalloc(ifps_zone); - bzero((char *)ifps, sizeof *ifps); - pcb->ims.ifps = ifps; - fpinit(); -#if 1 -/* - * I'm not sure this is needed. Does the fpu regenerate the interrupt in - * frstor or not? Without this code we may miss some exceptions, with it - * we might send too many exceptions. - */ - } else if (ifps->fp_valid == 2) { - /* delayed exception pending */ + assert(ifps); + assert(ifps->fp_valid == FALSE || ifps->fp_valid == TRUE); - ifps->fp_valid = TRUE; - clear_fpu(); - /* - * Raise FPU exception. - * Locking not needed on pcb->ims.ifps, - * since thread is running. - */ - i386_exception(EXC_ARITHMETIC, - EXC_I386_EXTERR, - thr_act->mact.pcb->ims.ifps->fp_save_state.fp_status); - /*NOTREACHED*/ -#endif + if (ifps->fp_valid == FALSE) { + fpinit(); } else { - frstor(ifps->fp_save_state); + fpu_load_registers(ifps); } ifps->fp_valid = FALSE; /* in FPU */ } /* - * Allocate and initialize FP state for current thread. - * Don't load state. - * - * Locking not needed; always called on the current thread. + * SSE arithmetic exception handling code. 
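The status word fpexterrflt() hands to i386_exception() is the raw x87 fx_status; fpSSEexterrflt() just below does the same with fx_MXCSR, whose low six flag bits follow the same order (minus the stack-fault bit). A decode sketch:

    #include <stdio.h>

    static void decode_fp_flags(unsigned int sw)
    {
        static const struct { unsigned int bit; const char *name; } ex[] = {
            { 0x01, "invalid operation" },
            { 0x02, "denormal operand"  },
            { 0x04, "zero divide"       },
            { 0x08, "overflow"          },
            { 0x10, "underflow"         },
            { 0x20, "precision"         },
            { 0x40, "stack fault (x87 status word only)" },
        };
        for (unsigned int i = 0; i < sizeof(ex) / sizeof(ex[0]); i++)
            if (sw & ex[i].bit)
                printf("  %s\n", ex[i].name);
    }

    int main(void) { decode_fp_flags(0x05); return 0; }  /* invalid + zero divide */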
+ * Basically the same as the x87 exception handler with a different subtype */ -void -fp_state_alloc(void) -{ - pcb_t pcb = current_act()->mact.pcb; - struct i386_fpsave_state *ifps; - - ifps = (struct i386_fpsave_state *)zalloc(ifps_zone); - bzero((char *)ifps, sizeof *ifps); - pcb->ims.ifps = ifps; - - ifps->fp_valid = TRUE; - ifps->fp_save_state.fp_control = (0x037f - & ~(FPC_IM|FPC_ZM|FPC_OM|FPC_PC)) - | (FPC_PC_53|FPC_IC_AFF); - ifps->fp_save_state.fp_status = 0; - ifps->fp_save_state.fp_tag = 0xffff; /* all empty */ -} - -/* - * fpflush(thread_act_t) - * Flush the current act's state, if needed - * (used by thread_terminate_self to ensure fp faults - * aren't satisfied by overly general trap code in the - * context of the reaper thread) - */ void -fpflush(thread_act_t thr_act) +fpSSEexterrflt(void) { -#if NCPUS == 1 - if (fp_act && thr_act == fp_act) { - clear_ts(); - fwait(); - clear_fpu(); - } -#else - /* not needed on MP x86s; fp not lazily evaluated */ -#endif -} + thread_t thr_act = current_thread(); + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; + boolean_t intr; + intr = ml_set_interrupts_enabled(FALSE); -/* - * Handle a coprocessor error interrupt on the AT386. - * This comes in on line 5 of the slave PIC at SPL1. - */ + if (get_interrupt_level()) + panic("SSE exception at interrupt context\n"); + if (current_task() == kernel_task) + panic("SSE exception in kernel thread context\n"); -void -fpintr(void) -{ - spl_t s; - thread_act_t thr_act = current_act(); - -ASSERT_IPL(SPL1); /* - * Turn off the extended 'busy' line. + * Save the FPU state and turn off the FPU. */ - outb(0xf0, 0); + fp_save(thr_act); + (void)ml_set_interrupts_enabled(intr); /* - * Save the FPU context to the thread using it. + * Raise FPU exception. + * Locking not needed on pcb->ifps, + * since thread is running. */ -#if NCPUS == 1 - if (fp_act == THR_ACT_NULL) { - printf("fpintr: FPU not belonging to anyone!\n"); - clear_ts(); - fninit(); - clear_fpu(); - return; - } + assert(ifps->fp_save_layout == FXSAVE32 || ifps->fp_save_layout == FXSAVE64); + i386_exception(EXC_ARITHMETIC, + EXC_I386_SSEEXTERR, + ifps->fx_MXCSR); + /*NOTREACHED*/ +} - if (fp_act != thr_act) { - /* - * FPU exception is for a different thread. - * When that thread again uses the FPU an exception will be - * raised in fp_load. Remember the condition in fp_valid (== 2). - */ - clear_ts(); - fp_save(fp_act); - fp_act->mact.pcb->ims.ifps->fp_valid = 2; - fninit(); - clear_fpu(); - /* leave fp_intr_act THR_ACT_NULL */ - return; - } - if (fp_intr_act != THR_ACT_NULL) - panic("fp_intr: already caught intr"); - fp_intr_act = thr_act; -#endif /* NCPUS == 1 */ +void +fp_setvalid(boolean_t value) { + thread_t thr_act = current_thread(); + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; - clear_ts(); - fp_save(thr_act); - fninit(); - clear_fpu(); + if (ifps) { + ifps->fp_valid = value; - /* - * Since we are running on the interrupt stack, we must - * signal the thread to take the exception when we return - * to user mode. Use an AST to do this. - * - * Don`t set the thread`s AST field. If the thread is - * descheduled before it takes the AST, it will notice - * the FPU error when it reloads its FPU state. 
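What EXC_I386_SSEEXTERR looks like from the other side of the fence: unmask an exception in MXCSR and the fault raised here reaches the process as SIGFPE. A sketch (x86-64, where scalar float math is SSE; hypothetical demo program, not part of this patch):

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <xmmintrin.h>

    static void on_fpe(int sig)
    {
        (void)sig;
        printf("caught SIGFPE\n");
        _Exit(0);
    }

    int main(void)
    {
        volatile float num = 1.0f, den = 0.0f;  /* volatile defeats folding */

        signal(SIGFPE, on_fpe);
        _mm_setcsr(_mm_getcsr() & ~(1u << 9)); /* clear ZM: zero-divide traps */
        printf("%f\n", (double)(num / den));   /* divss faults into fpSSEexterrflt */
        return 1;                              /* not reached if the trap fired */
    }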
- */ - s = splsched(); - mp_disable_preemption(); - ast_on(AST_I386_FP); - mp_enable_preemption(); - splx(s); + if (value == TRUE) { + boolean_t istate = ml_set_interrupts_enabled(FALSE); + clear_fpu(); + ml_set_interrupts_enabled(istate); + } + } +} + +__private_extern__ boolean_t +ml_fpu_avx_enabled(void) { + return (fpu_YMM_present == TRUE); }
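Closing the loop on ml_fpu_avx_enabled(): user code can make the same determination by checking CPUID for AVX and OSXSAVE and then reading XCR0, which init_fpu() programmed with xsetbv(0, XMASK). Sketch:

    #include <cpuid.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        uint32_t lo, hi;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;
        if (!(ecx & (1u << 27)) || !(ecx & (1u << 28))) {
            printf("AVX absent or not enabled by the OS\n");
            return 1;
        }
        __asm__ __volatile__("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
        /* XFEM_X87|XFEM_SSE|XFEM_YMM == 0x7, the XMASK defined above */
        printf("AVX usable: %s\n", ((lo & 0x7) == 0x7) ? "yes" : "no");
        return 0;
    }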