X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/378393581903b274cb7a4d18e0d978071a6b592d..527f99514973766e9c0382a4d8550dfb00f54939:/osfmk/i386/fpu.c?ds=sidebyside diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index 30de466be..083882238 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ @@ -50,11 +56,11 @@ /* */ -#include #include #include #include +#include #include #include @@ -64,36 +70,20 @@ #include #include -#include -#include -#include -#include +#include + +#include #include +#include +#include #include +#include +#include -#if 0 -#include -extern int curr_ipl; -#define ASSERT_IPL(L) \ -{ \ - if (curr_ipl != L) { \ - printf("IPL is %d, expected %d\n", curr_ipl, L); \ - panic("fpu: wrong ipl"); \ - } \ -} -#else -#define ASSERT_IPL(L) -#endif - -int fp_kind = FP_387; /* 80387 present */ -zone_t ifps_zone; /* zone for FPU save area */ - -#define clear_fpu() \ - { \ - set_ts(); \ - } +xstate_t fpu_capability = UNDEFINED; /* extended state capability */ +xstate_t fpu_default = UNDEFINED; /* default extended state */ -#define ALIGNED(addr,size) (((unsigned)(addr)&((size)-1))==0) +#define ALIGNED(addr,size) (((uintptr_t)(addr)&((size)-1))==0) /* Forward */ @@ -103,6 +93,327 @@ extern void fp_save( extern void fp_load( thread_t thr_act); +static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps); +static xstate_t thread_xstate(thread_t); + +x86_ext_thread_state_t initial_fp_state __attribute((aligned(64))); + + +/* Global MXCSR capability bitmask */ +static unsigned int mxcsr_capability_mask; + +#define 
fninit() \ + __asm__ volatile("fninit") + +#define fnstcw(control) \ + __asm__("fnstcw %0" : "=m" (*(unsigned short *)(control))) + +#define fldcw(control) \ + __asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) ) + +#define fnclex() \ + __asm__ volatile("fnclex") + +#define fnsave(state) \ + __asm__ volatile("fnsave %0" : "=m" (*state)) + +#define frstor(state) \ + __asm__ volatile("frstor %0" : : "m" (state)) + +#define fwait() \ + __asm__("fwait"); + +static inline void fxrstor(struct x86_fx_thread_state *a) { + __asm__ __volatile__("fxrstor %0" :: "m" (*a)); +} + +static inline void fxsave(struct x86_fx_thread_state *a) { + __asm__ __volatile__("fxsave %0" : "=m" (*a)); +} + +static inline void fxrstor64(struct x86_fx_thread_state *a) { + __asm__ __volatile__("fxrstor64 %0" :: "m" (*a)); +} + +static inline void fxsave64(struct x86_fx_thread_state *a) { + __asm__ __volatile__("fxsave64 %0" : "=m" (*a)); +} + +#if !defined(RC_HIDE_XNU_J137) +#define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX || (x) == AVX512) +#else +#define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX) +#endif + +zone_t ifps_zone[] = { + [FP] = NULL, + [AVX] = NULL, +#if !defined(RC_HIDE_XNU_J137) + [AVX512] = NULL +#endif +}; +static uint32_t fp_state_size[] = { + [FP] = sizeof(struct x86_fx_thread_state), + [AVX] = sizeof(struct x86_avx_thread_state), +#if !defined(RC_HIDE_XNU_J137) + [AVX512] = sizeof(struct x86_avx512_thread_state) +#endif +}; + +static const char *xstate_name[] = { + [UNDEFINED] = "UNDEFINED", + [FP] = "FP", + [AVX] = "AVX", +#if !defined(RC_HIDE_XNU_J137) + [AVX512] = "AVX512" +#endif +}; + +#if !defined(RC_HIDE_XNU_J137) +#define fpu_ZMM_capable (fpu_capability == AVX512) +#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512) +/* + * On-demand AVX512 support + * ------------------------ + * On machines with AVX512 support, by default, threads are created with + * AVX512 masked off in XCR0 and an AVX-sized savearea is used. 
However, AVX512 + * capabilities are advertised in the commpage and via sysctl. If a thread + * opts to use AVX512 instructions, the first will result in a #UD exception. + * Faulting AVX512 instructions are recognizable by their unique prefix. + * This exception results in the thread being promoted to use an AVX512-sized + * savearea and in the AVX512 bit masks being set in its XCR0. The faulting + * instruction is re-driven and the thread can proceed to perform AVX512 + * operations. + * + * In addition to AVX512 instructions causing promotion, the thread_set_state() + * primitive with an AVX512 state flavor results in promotion. + * + * AVX512 promotion of the first thread in a task causes the default xstate + * of the task to be promoted so that any subsequently created or subsequently + * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in + * a promoted xstate. + * + * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas + * and a second pool of larger AVX512-sized (2688 byte) areas. + * + * Note the initial state value is an AVX512 object but that the AVX initial + * value is a subset of it. 
+ */ +#else +#define fpu_YMM_capable (fpu_capability == AVX) +#endif +static uint32_t cpuid_reevaluated = 0; + +static void fpu_store_registers(void *, boolean_t); +static void fpu_load_registers(void *); + +#define FP_XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE)) +#define AVX_XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM)) +#if !defined(RC_HIDE_XNU_J137) +#define AVX512_XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM | XFEM_ZMM)) +static const uint32_t xstate_xmask[] = { + [FP] = FP_XMASK, + [AVX] = AVX_XMASK, + [AVX512] = AVX512_XMASK +}; +#else +static const uint32_t xstate_xmask[] = { + [FP] = FP_XMASK, + [AVX] = AVX_XMASK, +}; +#endif + +static inline void xsetbv(uint32_t mask_hi, uint32_t mask_lo) { + __asm__ __volatile__("xsetbv" :: "a"(mask_lo), "d"(mask_hi), "c" (XCR0)); +} + +static inline void xsave(struct x86_fx_thread_state *a, uint32_t rfbm) { + __asm__ __volatile__("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0)); +} + +static inline void xsave64(struct x86_fx_thread_state *a, uint32_t rfbm) { + __asm__ __volatile__("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0)); +} + +static inline void xrstor(struct x86_fx_thread_state *a, uint32_t rfbm) { + __asm__ __volatile__("xrstor %0" :: "m" (*a), "a"(rfbm), "d"(0)); +} + +static inline void xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm) { + __asm__ __volatile__("xrstor64 %0" :: "m" (*a), "a"(rfbm), "d"(0)); +} + +#if !defined(RC_HIDE_XNU_J137) +static inline void vzeroupper(void) { + __asm__ __volatile__("vzeroupper" ::); +} +#if DEVELOPMENT || DEBUG +static inline uint64_t xgetbv(uint32_t c) { + uint32_t mask_hi, mask_lo; + __asm__ __volatile__("xgetbv" : "=a"(mask_lo), "=d"(mask_hi) : "c" (c)); + return ((uint64_t) mask_hi<<32) + (uint64_t) mask_lo; +} +#endif + +static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */ + +/* + * Define a wrapper for bcopy to defeat destination size checka. + * This is needed to treat repeated objects such as + * _STRUCT_XMM_REG fpu_ymmh0; + * ... 
+ * _STRUCT_XMM_REG fpu_ymmh7; + * as an array and to copy like so: + * bcopy_nockch(src,&dst->fpu_ymmh0,8*sizeof(_STRUCT_XMM_REG)); + * without the compiler throwing a __builtin__memmove_chk error. + */ +static inline void bcopy_nochk(void *_src, void *_dst, size_t _len) { + bcopy(_src, _dst, _len); +} + +/* + * Furthermore, make compile-time asserts that no padding creeps into structures + * for which we're doing this. + */ +#define ASSERT_PACKED(t, m1, m2, n, mt) \ +extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \ + [(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1] + +ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG); + +ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG); + +ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG); +ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG); +ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG); + +ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG); +ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG); +ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG); +ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG); + +#if defined(DEBUG_AVX512) + +#define DBG(x...) 
kprintf("DBG: " x) + +typedef struct { uint8_t byte[8]; } opmask_t; +typedef struct { uint8_t byte[16]; } xmm_t; +typedef struct { uint8_t byte[32]; } ymm_t; +typedef struct { uint8_t byte[64]; } zmm_t; + +static void +DBG_AVX512_STATE(struct x86_avx512_thread_state *sp) +{ + int i, j; + xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg; + xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128; + ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256; + zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM; + opmask_t *k = (opmask_t *) &sp->x_Opmask; + + kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128)); + kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask)); + kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256)); + kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM)); + + kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0)); + kprintf("XINUSE: 0x%016llx\n", xgetbv(1)); + + /* Print all ZMM registers */ + for (i = 0; i < 16; i++) { + kprintf("zmm%d:\t0x", i); + for (j = 0; j < 16; j++) + kprintf("%02x", xmm[i].byte[j]); + for (j = 0; j < 16; j++) + kprintf("%02x", ymmh[i].byte[j]); + for (j = 0; j < 32; j++) + kprintf("%02x", zmmh[i].byte[j]); + kprintf("\n"); + } + for (i = 0; i < 16; i++) { + kprintf("zmm%d:\t0x", 16+i); + for (j = 0; j < 64; j++) + kprintf("%02x", zmm[i].byte[j]); + kprintf("\n"); + } + for (i = 0; i < 8; i++) { + kprintf("k%d:\t0x", i); + for (j = 0; j < 8; j++) + kprintf("%02x", k[i].byte[j]); + kprintf("\n"); + } + + kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv); + kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv); +} +#else +#define DBG(x...) 
+static void +DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp) +{ + return; +} +#endif /* DEBUG_AVX512 */ + +#endif + +#if DEBUG +static inline unsigned short +fnstsw(void) +{ + unsigned short status; + __asm__ volatile("fnstsw %0" : "=ma" (status)); + return(status); +} +#endif + +/* + * Configure the initial FPU state presented to new threads. + * Determine the MXCSR capability mask, which allows us to mask off any + * potentially unsafe "reserved" bits before restoring the FPU context. + * *Not* per-cpu, assumes symmetry. + */ + +static void +configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps) +{ + /* XSAVE requires a 64 byte aligned store */ + assert(ALIGNED(fps, 64)); + /* Clear, to prepare for the diagnostic FXSAVE */ + bzero(fps, sizeof(*fps)); + + fpinit(); + fpu_store_registers(fps, FALSE); + + mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK; + + /* Set default mask value if necessary */ + if (mxcsr_capability_mask == 0) + mxcsr_capability_mask = 0xffbf; + + /* Clear vector register store */ + bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg)); + bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128)); +#if !defined(RC_HIDE_XNU_J137) + if (fpu_ZMM_capable) { + bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256)); + bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM)); + bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask)); + } +#endif + + fps->fx.fp_valid = TRUE; + fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32; + fpu_load_registers(fps); + + /* Poison values to trap unsafe usage */ + fps->fx.fp_valid = 0xFFFFFFFF; + fps->fx.fp_save_layout = FP_UNUSED; + + /* Re-enable FPU/SSE DNA exceptions */ + set_ts(); +} + /* * Look for FPU and initialize it. * Called on each CPU. 
@@ -110,73 +421,326 @@ extern void fp_load( void init_fpu(void) { - unsigned short status, control; - +#if DEBUG + unsigned short status; + unsigned short control; +#endif /* * Check for FPU by initializing it, * then trying to read the correct bit patterns from * the control and status registers. */ set_cr0((get_cr0() & ~(CR0_EM|CR0_TS)) | CR0_NE); /* allow use of FPU */ - fninit(); +#if DEBUG status = fnstsw(); fnstcw(&control); - - if ((status & 0xff) == 0 && - (control & 0x103f) == 0x3f) - { - fp_kind = FP_387; /* assume we have a 387 compatible instruction set */ - /* Use FPU save/restore instructions if available */ - if (cpuid_features() & CPUID_FEATURE_FXSR) { - fp_kind = FP_FXSR; - set_cr4(get_cr4() | CR4_FXS); - printf("Enabling XMM register save/restore"); - /* And allow SIMD instructions if present */ + + assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f)); +#endif + /* Advertise SSE support */ + if (cpuid_features() & CPUID_FEATURE_FXSR) { + set_cr4(get_cr4() | CR4_OSFXS); + /* And allow SIMD exceptions if present */ if (cpuid_features() & CPUID_FEATURE_SSE) { - printf(" and SSE/SSE2"); - set_cr4(get_cr4() | CR4_XMM); + set_cr4(get_cr4() | CR4_OSXMM); + } + } else + panic("fpu is not FP_FXSR"); + + fpu_capability = fpu_default = FP; + +#if !defined(RC_HIDE_XNU_J137) + static boolean_t is_avx512_enabled = TRUE; + if (cpu_number() == master_cpu) { + if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) { + PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t)); + kprintf("AVX512 supported %s\n", + is_avx512_enabled ? 
"and enabled" : "but disabled"); + } + } +#endif + + /* Configure the XSAVE context mechanism if the processor supports + * AVX/YMM registers + */ + if (cpuid_features() & CPUID_FEATURE_XSAVE) { + cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0]; +#if !defined(RC_HIDE_XNU_J137) + if (is_avx512_enabled && + (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) { + assert(xs0p->extended_state[eax] & XFEM_SSE); + assert(xs0p->extended_state[eax] & XFEM_YMM); + fpu_capability = AVX512; + /* XSAVE container size for all features */ + set_cr4(get_cr4() | CR4_OSXSAVE); + xsetbv(0, AVX512_XMASK); + /* Re-evaluate CPUID, once, to reflect OSXSAVE */ + if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) + cpuid_set_info(); + /* Verify that now selected state can be accommodated */ + assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]); + /* + * AVX set until AVX512 is used. + * See comment above about on-demand AVX512 support. + */ + xsetbv(0, AVX_XMASK); + fpu_default = AVX; + } else +#endif + if (xs0p->extended_state[eax] & XFEM_YMM) { + assert(xs0p->extended_state[eax] & XFEM_SSE); + fpu_capability = AVX; + fpu_default = AVX; + /* XSAVE container size for all features */ + set_cr4(get_cr4() | CR4_OSXSAVE); + xsetbv(0, AVX_XMASK); + /* Re-evaluate CPUID, once, to reflect OSXSAVE */ + if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) + cpuid_set_info(); + /* Verify that now selected state can be accommodated */ + assert(xs0p->extended_state[ebx] == fp_state_size[AVX]); } - printf(" opcodes\n"); - } + } + + if (cpu_number() == master_cpu) + kprintf("fpu_state: %s, state_size: %d\n", + xstate_name[fpu_capability], + fp_state_size[fpu_capability]); + + fpinit(); + + /* + * Trap wait instructions. Turn off FPU for now. + */ + set_cr0(get_cr0() | CR0_TS | CR0_MP); +} + +/* + * Allocate and initialize FP state for specified xstate. + * Don't load state. 
+ */ +static void * +fp_state_alloc(xstate_t xs) +{ + struct x86_fx_thread_state *ifps; - /* - * Trap wait instructions. Turn off FPU for now. - */ - set_cr0(get_cr0() | CR0_TS | CR0_MP); + assert(ifps_zone[xs] != NULL); + ifps = zalloc(ifps_zone[xs]); + +#if DEBUG + if (!(ALIGNED(ifps,64))) { + panic("fp_state_alloc: %p, %u, %p, %u", + ifps, (unsigned) ifps_zone[xs]->elem_size, + (void *) ifps_zone[xs]->free_elements, + (unsigned) ifps_zone[xs]->alloc_size); } - else - { - /* - * NO FPU. - */ - fp_kind = FP_NO; - set_cr0(get_cr0() | CR0_EM); +#endif + bzero(ifps, fp_state_size[xs]); + + return ifps; +} + +static inline void +fp_state_free(void *ifps, xstate_t xs) +{ + assert(ifps_zone[xs] != NULL); + zfree(ifps_zone[xs], ifps); +} + +void clear_fpu(void) +{ + set_ts(); +} + + +static void fpu_load_registers(void *fstate) { + struct x86_fx_thread_state *ifps = fstate; + fp_save_layout_t layout = ifps->fp_save_layout; + + assert(current_task() == NULL || \ + (thread_is_64bit(current_thread()) ? 
\ + (layout == FXSAVE64 || layout == XSAVE64) : \ + (layout == FXSAVE32 || layout == XSAVE32))); + assert(ALIGNED(ifps, 64)); + assert(ml_get_interrupts_enabled() == FALSE); + +#if DEBUG + if (layout == XSAVE32 || layout == XSAVE64) { + struct x86_avx_thread_state *iavx = fstate; + unsigned i; + /* Verify reserved bits in the XSAVE header*/ + if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) + panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv); + for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) + if (iavx->_xh.xhrsvd[i]) + panic("Reserved bit set"); + } + if (fpu_YMM_capable) { + if (layout != XSAVE32 && layout != XSAVE64) + panic("Inappropriate layout: %u\n", layout); + } +#endif /* DEBUG */ + + switch (layout) { + case FXSAVE64: + fxrstor64(ifps); + break; + case FXSAVE32: + fxrstor(ifps); + break; + case XSAVE64: + xrstor64(ifps, xstate_xmask[current_xstate()]); + break; + case XSAVE32: + xrstor(ifps, xstate_xmask[current_xstate()]); + break; + default: + panic("fpu_load_registers() bad layout: %d\n", layout); + } +} + +static void fpu_store_registers(void *fstate, boolean_t is64) { + struct x86_fx_thread_state *ifps = fstate; + assert(ALIGNED(ifps, 64)); + xstate_t xs = current_xstate(); + switch (xs) { + case FP: + if (is64) { + fxsave64(fstate); + ifps->fp_save_layout = FXSAVE64; + } else { + fxsave(fstate); + ifps->fp_save_layout = FXSAVE32; + } + break; + case AVX: +#if !defined(RC_HIDE_XNU_J137) + case AVX512: +#endif + if (is64) { + xsave64(ifps, xstate_xmask[xs]); + ifps->fp_save_layout = XSAVE64; + } else { + xsave(ifps, xstate_xmask[xs]); + ifps->fp_save_layout = XSAVE32; + } + break; + default: + panic("fpu_store_registers() bad xstate: %d\n", xs); } } /* * Initialize FP handling. 
*/ + void fpu_module_init(void) { - ifps_zone = zinit(sizeof(struct i386_fpsave_state), - THREAD_MAX * sizeof(struct i386_fpsave_state), - THREAD_CHUNK * sizeof(struct i386_fpsave_state), - "i386 fpsave state"); + if (!IS_VALID_XSTATE(fpu_default)) + panic("fpu_module_init: invalid extended state %u\n", + fpu_default); + + /* We explicitly choose an allocation size of 13 pages = 64 * 832 + * to eliminate waste for the 832 byte sized + * AVX XSAVE register save area. + */ + ifps_zone[fpu_default] = zinit(fp_state_size[fpu_default], + thread_max * fp_state_size[fpu_default], + 64 * fp_state_size[fpu_default], + "x86 fpsave state"); + + /* To maintain the required alignment, disable + * zone debugging for this zone as that appends + * 16 bytes to each element. + */ + zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE); + +#if !defined(RC_HIDE_XNU_J137) + /* + * If AVX512 is supported, create a separate savearea zone. + * with allocation size: 19 pages = 32 * 2668 + */ + if (fpu_capability == AVX512) { + ifps_zone[AVX512] = zinit(fp_state_size[AVX512], + thread_max * fp_state_size[AVX512], + 32 * fp_state_size[AVX512], + "x86 avx512 save state"); + zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE); + } +#endif + + /* Determine MXCSR reserved bits and configure initial FPU state*/ + configure_mxcsr_capability_mask(&initial_fp_state); } +/* + * Context switch fpu state. + * Always save old thread`s FPU context but don't load new .. allow that to fault-in. + * Switch to the new task's xstate. + */ +void +fpu_switch_context(thread_t old, thread_t new) +{ + struct x86_fx_thread_state *ifps; + boolean_t is_ts_cleared = FALSE; + + assert(ml_get_interrupts_enabled() == FALSE); + ifps = (old)->machine.ifps; +#if DEBUG + if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) { + panic("ifps->fp_valid: %u\n", ifps->fp_valid); + } +#endif + if (ifps != 0 && (ifps->fp_valid == FALSE)) { + /* Clear CR0.TS in preparation for the FP context save. 
In + * theory, this shouldn't be necessary since a live FPU should + * indicate that TS is clear. However, various routines + * (such as sendsig & sigreturn) manipulate TS directly. + */ + clear_ts(); + is_ts_cleared = TRUE; + /* registers are in FPU - save to memory */ + fpu_store_registers(ifps, (thread_is_64bit(old) && is_saved_state64(old->machine.iss))); + ifps->fp_valid = TRUE; + } +#if !defined(RC_HIDE_XNU_J137) + xstate_t old_xstate = thread_xstate(old); + xstate_t new_xstate = new ? thread_xstate(new) : fpu_default; + if (old_xstate == AVX512 && ifps != 0) { + DBG_AVX512_STATE((struct x86_avx512_thread_state *) ifps); + /* + * Clear upper bits for potential power-saving + * but first ensure the TS bit is clear. + */ + if (!is_ts_cleared) + clear_ts(); + vzeroupper(); + } + if (new_xstate != old_xstate) { + DBG("fpu_switch_context(%p,%p) new xstate: %s\n", + old, new, xstate_name[new_xstate]); + xsetbv(0, xstate_xmask[new_xstate]); + } +#else +#pragma unused(new) +#endif + set_ts(); +} + + /* * Free a FPU save area. * Called only when thread terminating - no locking necessary. */ void -fpu_free(fps) - struct i386_fpsave_state *fps; +fpu_free(thread_t thread, void *fps) { -ASSERT_IPL(SPL0); - zfree(ifps_zone, fps); + pcb_t pcb = THREAD_TO_PCB(thread); + + fp_state_free(fps, pcb->xstate); + pcb->xstate = UNDEFINED; } /* @@ -190,71 +754,178 @@ ASSERT_IPL(SPL0); */ kern_return_t fpu_set_fxstate( - thread_t thr_act, - struct i386_float_state *state) + thread_t thr_act, + thread_state_t tstate, + thread_flavor_t f) { - register pcb_t pcb; - register struct i386_fpsave_state *ifps; - register struct i386_fpsave_state *new_ifps; - -ASSERT_IPL(SPL0); - if (fp_kind == FP_NO) - return KERN_FAILURE; - - if (state->fpkind != FP_FXSR) { - /* strange if this happens, but in case someone builds one of these manually... 
*/ - return fpu_set_state(thr_act, state); - } - + struct x86_fx_thread_state *ifps; + struct x86_fx_thread_state *new_ifps; + x86_float_state64_t *state; + pcb_t pcb; + boolean_t old_valid, fresh_state = FALSE; + + if (fpu_capability == UNDEFINED) + return KERN_FAILURE; + + if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) && + fpu_capability < AVX) + return KERN_FAILURE; + +#if !defined(RC_HIDE_XNU_J137) + if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) && + thread_xstate(thr_act) == AVX) + if (!fpu_thread_promote_avx512(thr_act)) + return KERN_FAILURE; +#endif + + state = (x86_float_state64_t *)tstate; + assert(thr_act != THREAD_NULL); - pcb = thr_act->machine.pcb; - - if (state->initialized == 0) { - /* - * new FPU state is 'invalid'. - * Deallocate the fp state if it exists. - */ - simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - pcb->ims.ifps = 0; - simple_unlock(&pcb->lock); - - if (ifps != 0) { - zfree(ifps_zone, ifps); - } - } - else { - /* - * Valid state. Allocate the fp state if there is none. - */ - - new_ifps = 0; - Retry: - simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - if (ifps == 0) { - if (new_ifps == 0) { - simple_unlock(&pcb->lock); - new_ifps = (struct i386_fpsave_state *) zalloc(ifps_zone); - assert(ALIGNED(new_ifps,16)); - goto Retry; + pcb = THREAD_TO_PCB(thr_act); + + if (state == NULL) { + /* + * new FPU state is 'invalid'. + * Deallocate the fp state if it exists. + */ + simple_lock(&pcb->lock); + + ifps = pcb->ifps; + pcb->ifps = 0; + + simple_unlock(&pcb->lock); + + if (ifps != 0) { + fp_state_free(ifps, thread_xstate(thr_act)); } - ifps = new_ifps; + } else { + /* + * Valid incoming state. Allocate the fp state if there is none. 
+ */ new_ifps = 0; - bzero((char *)ifps, sizeof *ifps); - pcb->ims.ifps = ifps; - } + Retry: + simple_lock(&pcb->lock); + + ifps = pcb->ifps; + if (ifps == 0) { + if (new_ifps == 0) { + simple_unlock(&pcb->lock); + new_ifps = fp_state_alloc(thread_xstate(thr_act)); + goto Retry; + } + ifps = new_ifps; + new_ifps = 0; + pcb->ifps = ifps; + pcb->xstate = thread_xstate(thr_act); + fresh_state = TRUE; + } - /* - * now copy over the new data. - */ - bcopy((char *)&state->hw_state[0], (char *)&ifps->fx_save_state, sizeof(struct i386_fx_save)); - ifps->fp_save_flavor = FP_FXSR; - simple_unlock(&pcb->lock); - if (new_ifps != 0) - zfree(ifps_zone, ifps); - } + /* + * now copy over the new data. + */ + + old_valid = ifps->fp_valid; + +#if DEBUG || DEVELOPMENT + if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) { + panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act); + } +#endif + /* + * Clear any reserved bits in the MXCSR to prevent a GPF + * when issuing an FXRSTOR. + */ + + state->fpu_mxcsr &= mxcsr_capability_mask; + + bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]); + + switch (thread_xstate(thr_act)) { + case UNDEFINED: + panic("fpu_set_fxstate() UNDEFINED xstate"); + break; + case FP: + ifps->fp_save_layout = thread_is_64bit(thr_act) ? FXSAVE64 : FXSAVE32; + break; + case AVX: { + struct x86_avx_thread_state *iavx = (void *) ifps; + x86_avx_state64_t *xs = (x86_avx_state64_t *) state; + + iavx->fp.fp_save_layout = thread_is_64bit(thr_act) ? 
XSAVE64 : XSAVE32; + + /* Sanitize XSAVE header */ + bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd)); + iavx->_xh.xstate_bv = AVX_XMASK; + iavx->_xh.xcomp_bv = 0; + + if (f == x86_AVX_STATE32) { + bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); + } else if (f == x86_AVX_STATE64) { + bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); + } else { + iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87); + } + break; + } +#if !defined(RC_HIDE_XNU_J137) + case AVX512: { + struct x86_avx512_thread_state *iavx = (void *) ifps; + union { + thread_state_t ts; + x86_avx512_state32_t *s32; + x86_avx512_state64_t *s64; + } xs = { .ts = tstate }; + + iavx->fp.fp_save_layout = thread_is_64bit(thr_act) ? XSAVE64 : XSAVE32; + + /* Sanitize XSAVE header */ + bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd)); + iavx->_xh.xstate_bv = AVX512_XMASK; + iavx->_xh.xcomp_bv = 0; + + switch (f) { + case x86_AVX512_STATE32: + bcopy_nochk(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)); + bcopy_nochk(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)); + bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); + DBG_AVX512_STATE(iavx); + break; + case x86_AVX_STATE32: + bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)); + break; + case x86_AVX512_STATE64: + bcopy_nochk(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)); + bcopy_nochk(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)); + bcopy_nochk(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)); + bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); + DBG_AVX512_STATE(iavx); + break; + case x86_AVX_STATE64: + bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)); + break; + } + break; + } +#endif + } + + ifps->fp_valid = old_valid; + if (old_valid == FALSE) { + boolean_t istate = 
ml_set_interrupts_enabled(FALSE); + ifps->fp_valid = TRUE; + /* If altering the current thread's state, disable FPU */ + if (thr_act == current_thread()) + set_ts(); + + ml_set_interrupts_enabled(istate); + } + + simple_unlock(&pcb->lock); + + if (new_ifps != 0) + fp_state_free(new_ifps, thread_xstate(thr_act)); + } return KERN_SUCCESS; } @@ -266,234 +937,210 @@ ASSERT_IPL(SPL0); */ kern_return_t fpu_get_fxstate( - thread_t thr_act, - register struct i386_float_state *state) + thread_t thr_act, + thread_state_t tstate, + thread_flavor_t f) { - register pcb_t pcb; - register struct i386_fpsave_state *ifps; + struct x86_fx_thread_state *ifps; + x86_float_state64_t *state; + kern_return_t ret = KERN_FAILURE; + pcb_t pcb; + + if (fpu_capability == UNDEFINED) + return KERN_FAILURE; + + if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) && + fpu_capability < AVX) + return KERN_FAILURE; + +#if !defined(RC_HIDE_XNU_J137) + if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) && + thread_xstate(thr_act) != AVX512) + return KERN_FAILURE; +#endif -ASSERT_IPL(SPL0); - if (fp_kind == FP_NO) { - return KERN_FAILURE; - } else if (fp_kind == FP_387) { - return fpu_get_state(thr_act, state); - } + state = (x86_float_state64_t *)tstate; assert(thr_act != THREAD_NULL); - pcb = thr_act->machine.pcb; + pcb = THREAD_TO_PCB(thr_act); simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; + + ifps = pcb->ifps; if (ifps == 0) { - /* - * No valid floating-point state. - */ - simple_unlock(&pcb->lock); - bzero((char *)state, sizeof(struct i386_float_state)); - return KERN_SUCCESS; - } + /* + * No valid floating-point state. 
+ */ + + bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw, + fp_state_size[FP]); + + simple_unlock(&pcb->lock); - /* Make sure we`ve got the latest fp state info */ - /* If the live fpu state belongs to our target */ - if (thr_act == current_thread()) - { - clear_ts(); - fp_save(thr_act); - clear_fpu(); + return KERN_SUCCESS; } + /* + * Make sure we`ve got the latest fp state info + * If the live fpu state belongs to our target + */ + if (thr_act == current_thread()) { + boolean_t intr; - state->fpkind = fp_kind; - state->exc_status = 0; - state->initialized = ifps->fp_valid; - bcopy( (char *)&ifps->fx_save_state, (char *)&state->hw_state[0], sizeof(struct i386_fx_save)); + intr = ml_set_interrupts_enabled(FALSE); + + clear_ts(); + fp_save(thr_act); + clear_fpu(); + (void)ml_set_interrupts_enabled(intr); + } + if (ifps->fp_valid) { + bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]); + switch (thread_xstate(thr_act)) { + case UNDEFINED: + panic("fpu_get_fxstate() UNDEFINED xstate"); + break; + case FP: + break; /* already done */ + case AVX: { + struct x86_avx_thread_state *iavx = (void *) ifps; + x86_avx_state64_t *xs = (x86_avx_state64_t *) state; + if (f == x86_AVX_STATE32) { + bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); + } else if (f == x86_AVX_STATE64) { + bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); + } + break; + } +#if !defined(RC_HIDE_XNU_J137) + case AVX512: { + struct x86_avx512_thread_state *iavx = (void *) ifps; + union { + thread_state_t ts; + x86_avx512_state32_t *s32; + x86_avx512_state64_t *s64; + } xs = { .ts = tstate }; + switch (f) { + case x86_AVX512_STATE32: + bcopy_nochk(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG)); + bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG)); + bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); + DBG_AVX512_STATE(iavx); + break; + case 
x86_AVX_STATE32: + bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG)); + break; + case x86_AVX512_STATE64: + bcopy_nochk(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG)); + bcopy_nochk(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG)); + bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG)); + bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); + DBG_AVX512_STATE(iavx); + break; + case x86_AVX_STATE64: + bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG)); + break; + } + break; + } +#endif + } + + ret = KERN_SUCCESS; + } simple_unlock(&pcb->lock); - return KERN_SUCCESS; + return ret; } + + /* - * Set the floating-point state for a thread. - * If the thread is not the current thread, it is - * not running (held). Locking needed against - * concurrent fpu_set_state or fpu_get_state. + * the child thread is 'stopped' with the thread + * mutex held and is currently not known by anyone + * so no way for fpu state to get manipulated by an + * outside agency -> no need for pcb lock */ -kern_return_t -fpu_set_state( - thread_t thr_act, - struct i386_float_state *state) + +void +fpu_dup_fxstate( + thread_t parent, + thread_t child) { - register pcb_t pcb; - register struct i386_fpsave_state *ifps; - register struct i386_fpsave_state *new_ifps; + struct x86_fx_thread_state *new_ifps = NULL; + boolean_t intr; + pcb_t ppcb; + xstate_t xstate = thread_xstate(parent); -ASSERT_IPL(SPL0); - if (fp_kind == FP_NO) - return KERN_FAILURE; + ppcb = THREAD_TO_PCB(parent); - assert(thr_act != THREAD_NULL); - pcb = thr_act->machine.pcb; - - if (state->initialized == 0) { - /* - * new FPU state is 'invalid'. - * Deallocate the fp state if it exists. - */ - simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - pcb->ims.ifps = 0; - simple_unlock(&pcb->lock); - - if (ifps != 0) { - zfree(ifps_zone, ifps); - } - } - else { - /* - * Valid state. 
Allocate the fp state if there is none. - */ - register struct i386_fp_save *user_fp_state; - register struct i386_fp_regs *user_fp_regs; - - user_fp_state = (struct i386_fp_save *) &state->hw_state[0]; - user_fp_regs = (struct i386_fp_regs *) - &state->hw_state[sizeof(struct i386_fp_save)]; - - new_ifps = 0; - Retry: - simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - if (ifps == 0) { - if (new_ifps == 0) { - simple_unlock(&pcb->lock); - new_ifps = (struct i386_fpsave_state *) zalloc(ifps_zone); - assert(ALIGNED(new_ifps,16)); - goto Retry; - } - ifps = new_ifps; - new_ifps = 0; - bzero((char *)ifps, sizeof *ifps); // zero ALL fields first - pcb->ims.ifps = ifps; - } - - /* - * Ensure that reserved parts of the environment are 0. - */ - bzero((char *)&ifps->fp_save_state, sizeof(struct i386_fp_save)); - - ifps->fp_save_state.fp_control = user_fp_state->fp_control; - ifps->fp_save_state.fp_status = user_fp_state->fp_status; - ifps->fp_save_state.fp_tag = user_fp_state->fp_tag; - ifps->fp_save_state.fp_eip = user_fp_state->fp_eip; - ifps->fp_save_state.fp_cs = user_fp_state->fp_cs; - ifps->fp_save_state.fp_opcode = user_fp_state->fp_opcode; - ifps->fp_save_state.fp_dp = user_fp_state->fp_dp; - ifps->fp_save_state.fp_ds = user_fp_state->fp_ds; - ifps->fp_regs = *user_fp_regs; - ifps->fp_save_flavor = FP_387; - simple_unlock(&pcb->lock); - if (new_ifps != 0) - zfree(ifps_zone, ifps); - } + if (ppcb->ifps == NULL) + return; - return KERN_SUCCESS; -} + if (child->machine.ifps) + panic("fpu_dup_fxstate: child's ifps non-null"); -/* - * Get the floating-point state for a thread. - * If the thread is not the current thread, it is - * not running (held). Locking needed against - * concurrent fpu_set_state or fpu_get_state. 
- */ -kern_return_t -fpu_get_state( - thread_t thr_act, - register struct i386_float_state *state) -{ - register pcb_t pcb; - register struct i386_fpsave_state *ifps; + new_ifps = fp_state_alloc(xstate); -ASSERT_IPL(SPL0); - if (fp_kind == FP_NO) - return KERN_FAILURE; + simple_lock(&ppcb->lock); - assert(thr_act != THREAD_NULL); - pcb = thr_act->machine.pcb; + if (ppcb->ifps != NULL) { + struct x86_fx_thread_state *ifps = ppcb->ifps; + /* + * Make sure we`ve got the latest fp state info + */ + if (current_thread() == parent) { + intr = ml_set_interrupts_enabled(FALSE); + assert(current_thread() == parent); + clear_ts(); + fp_save(parent); + clear_fpu(); + + (void)ml_set_interrupts_enabled(intr); + } - simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - if (ifps == 0) { - /* - * No valid floating-point state. - */ - simple_unlock(&pcb->lock); - bzero((char *)state, sizeof(struct i386_float_state)); - return KERN_SUCCESS; - } - - /* Make sure we`ve got the latest fp state info */ - /* If the live fpu state belongs to our target */ - if (thr_act == current_thread()) - { - clear_ts(); - fp_save(thr_act); - clear_fpu(); - } - - state->fpkind = fp_kind; - state->exc_status = 0; - - { - register struct i386_fp_save *user_fp_state; - register struct i386_fp_regs *user_fp_regs; - - state->initialized = ifps->fp_valid; - - user_fp_state = (struct i386_fp_save *) &state->hw_state[0]; - user_fp_regs = (struct i386_fp_regs *) - &state->hw_state[sizeof(struct i386_fp_save)]; - - /* - * Ensure that reserved parts of the environment are 0. 
- */ - bzero((char *)user_fp_state, sizeof(struct i386_fp_save)); - - user_fp_state->fp_control = ifps->fp_save_state.fp_control; - user_fp_state->fp_status = ifps->fp_save_state.fp_status; - user_fp_state->fp_tag = ifps->fp_save_state.fp_tag; - user_fp_state->fp_eip = ifps->fp_save_state.fp_eip; - user_fp_state->fp_cs = ifps->fp_save_state.fp_cs; - user_fp_state->fp_opcode = ifps->fp_save_state.fp_opcode; - user_fp_state->fp_dp = ifps->fp_save_state.fp_dp; - user_fp_state->fp_ds = ifps->fp_save_state.fp_ds; - *user_fp_regs = ifps->fp_regs; + if (ifps->fp_valid) { + child->machine.ifps = new_ifps; + child->machine.xstate = xstate; + bcopy((char *)(ppcb->ifps), + (char *)(child->machine.ifps), + fp_state_size[xstate]); + + /* Mark the new fp saved state as non-live. */ + /* Temporarily disabled: radar 4647827 + * new_ifps->fp_valid = TRUE; + */ + + /* + * Clear any reserved bits in the MXCSR to prevent a GPF + * when issuing an FXRSTOR. + */ + new_ifps->fx_MXCSR &= mxcsr_capability_mask; + new_ifps = NULL; + } } - simple_unlock(&pcb->lock); + simple_unlock(&ppcb->lock); - return KERN_SUCCESS; + if (new_ifps != NULL) + fp_state_free(new_ifps, xstate); } + /* * Initialize FPU. * - * Raise exceptions for: - * invalid operation - * divide by zero - * overflow - * - * Use 53-bit precision. */ + void fpinit(void) { unsigned short control; -ASSERT_IPL(SPL0); clear_ts(); fninit(); fnstcw(&control); control &= ~(FPC_PC|FPC_RC); /* Clear precision & rounding control */ - control |= (FPC_PC_53 | /* Set precision */ + control |= (FPC_PC_64 | /* Set precision */ FPC_RC_RN | /* round-to-nearest */ FPC_ZE | /* Suppress zero-divide */ FPC_OE | /* and overflow */ @@ -502,25 +1149,72 @@ ASSERT_IPL(SPL0); FPC_DE | /* Allow denorms as operands */ FPC_PE); /* No trap for precision loss */ fldcw(control); + + /* Initialize SSE/SSE2 */ + __builtin_ia32_ldmxcsr(0x1f80); } /* * Coprocessor not present. */ +uint64_t x86_isr_fp_simd_use; + void fpnoextflt(void) { - /* - * Enable FPU use. 
- */ -ASSERT_IPL(SPL0); - clear_ts(); + boolean_t intr; + thread_t thr_act; + pcb_t pcb; + struct x86_fx_thread_state *ifps = 0; + xstate_t xstate = current_xstate(); + + thr_act = current_thread(); + pcb = THREAD_TO_PCB(thr_act); + + if (pcb->ifps == 0 && !get_interrupt_level()) { + ifps = fp_state_alloc(xstate); + bcopy((char *)&initial_fp_state, (char *)ifps, + fp_state_size[xstate]); + if (!thread_is_64bit(thr_act)) { + ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32; + } + else + ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64; + ifps->fp_valid = TRUE; + } + intr = ml_set_interrupts_enabled(FALSE); - /* - * Load this thread`s state into the FPU. - */ - fp_load(current_thread()); + clear_ts(); /* Enable FPU use */ + + if (__improbable(get_interrupt_level())) { + /* Track number of #DNA traps at interrupt context, + * which is likely suboptimal. Racy, but good enough. + */ + x86_isr_fp_simd_use++; + /* + * Save current FP/SIMD context if valid + * Initialize live FP/SIMD registers + */ + if (pcb->ifps) { + fp_save(thr_act); + } + fpinit(); + } else { + if (pcb->ifps == 0) { + pcb->ifps = ifps; + pcb->xstate = xstate; + ifps = 0; + } + /* + * Load this thread`s state into coprocessor live context. + */ + fp_load(thr_act); + } + (void)ml_set_interrupts_enabled(intr); + + if (ifps) + fp_state_free(ifps, xstate); } /* @@ -531,18 +1225,27 @@ ASSERT_IPL(SPL0); void fpextovrflt(void) { - register thread_t thr_act = current_thread(); - register pcb_t pcb; - register struct i386_fpsave_state *ifps; + thread_t thr_act = current_thread(); + pcb_t pcb; + struct x86_fx_thread_state *ifps; + boolean_t intr; + xstate_t xstate = current_xstate(); + + intr = ml_set_interrupts_enabled(FALSE); + + if (get_interrupt_level()) + panic("FPU segment overrun exception at interrupt context\n"); + if (current_task() == kernel_task) + panic("FPU segment overrun exception in kernel thread context\n"); /* * This is a non-recoverable error. 
* Invalidate the thread`s FPU state. */ - pcb = thr_act->machine.pcb; + pcb = THREAD_TO_PCB(thr_act); simple_lock(&pcb->lock); - ifps = pcb->ims.ifps; - pcb->ims.ifps = 0; + ifps = pcb->ifps; + pcb->ifps = 0; simple_unlock(&pcb->lock); /* @@ -556,8 +1259,10 @@ fpextovrflt(void) */ clear_fpu(); + (void)ml_set_interrupts_enabled(intr); + if (ifps) - zfree(ifps_zone, ifps); + fp_state_free(ifps, xstate); /* * Raise exception. @@ -566,6 +1271,8 @@ fpextovrflt(void) /*NOTREACHED*/ } +extern void fpxlog(int, uint32_t, uint32_t, uint32_t); + /* * FPU error. Called by AST. */ @@ -573,22 +1280,38 @@ fpextovrflt(void) void fpexterrflt(void) { - register thread_t thr_act = current_thread(); + thread_t thr_act = current_thread(); + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; + boolean_t intr; + + intr = ml_set_interrupts_enabled(FALSE); + + if (get_interrupt_level()) + panic("FPU error exception at interrupt context\n"); + if (current_task() == kernel_task) + panic("FPU error exception in kernel thread context\n"); -ASSERT_IPL(SPL0); /* * Save the FPU state and turn off the FPU. */ fp_save(thr_act); + (void)ml_set_interrupts_enabled(intr); + + const uint32_t mask = ifps->fx_control & + (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE); + const uint32_t xcpt = ~mask & (ifps->fx_status & + (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE)); + fpxlog(EXC_I386_EXTERR, ifps->fx_status, ifps->fx_control, xcpt); /* * Raise FPU exception. - * Locking not needed on pcb->ims.ifps, + * Locking not needed on pcb->ifps, * since thread is running. */ i386_exception(EXC_ARITHMETIC, EXC_I386_EXTERR, - thr_act->machine.pcb->ims.ifps->fp_save_state.fp_status); + ifps->fx_status); + /*NOTREACHED*/ } @@ -599,22 +1322,22 @@ ASSERT_IPL(SPL0); * . if called from fpu_get_state, pcb already locked. * . if called from fpnoextflt or fp_intr, we are single-cpu * . otherwise, thread is running. 
+ * N.B.: Must be called with interrupts disabled */ + void fp_save( thread_t thr_act) { - register pcb_t pcb = thr_act->machine.pcb; - register struct i386_fpsave_state *ifps = pcb->ims.ifps; + pcb_t pcb = THREAD_TO_PCB(thr_act); + struct x86_fx_thread_state *ifps = pcb->ifps; + + assert(ifps != 0); if (ifps != 0 && !ifps->fp_valid) { - /* registers are in FPU */ - ifps->fp_valid = TRUE; - ifps->fp_save_flavor = FP_387; - if (FXSAFE()) { - fxsave(&ifps->fx_save_state); // save the SSE2/Fp state in addition is enabled - ifps->fp_save_flavor = FP_FXSR; - } - fnsave(&ifps->fp_save_state); // also update the old save area for now... + assert((get_cr0() & CR0_TS) == 0); + /* registers are in FPU */ + ifps->fp_valid = TRUE; + fpu_store_registers(ifps, thread_is_64bit(thr_act)); } } @@ -628,128 +1351,307 @@ void fp_load( thread_t thr_act) { - register pcb_t pcb = thr_act->machine.pcb; - register struct i386_fpsave_state *ifps; - -ASSERT_IPL(SPL0); - ifps = pcb->ims.ifps; - if (ifps == 0) { - ifps = (struct i386_fpsave_state *) zalloc(ifps_zone); - assert(ALIGNED(ifps,16)); - bzero((char *)ifps, sizeof *ifps); - pcb->ims.ifps = ifps; - fpinit(); -#if 1 -/* - * I'm not sure this is needed. Does the fpu regenerate the interrupt in - * frstor or not? Without this code we may miss some exceptions, with it - * we might send too many exceptions. - */ - } else if (ifps->fp_valid == 2) { - /* delayed exception pending */ - - ifps->fp_valid = TRUE; - clear_fpu(); - /* - * Raise FPU exception. - * Locking not needed on pcb->ims.ifps, - * since thread is running. 
- */ - i386_exception(EXC_ARITHMETIC, - EXC_I386_EXTERR, - thr_act->machine.pcb->ims.ifps->fp_save_state.fp_status); - /*NOTREACHED*/ + pcb_t pcb = THREAD_TO_PCB(thr_act); + struct x86_fx_thread_state *ifps = pcb->ifps; + + assert(ifps); +#if DEBUG + if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) { + panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u\n", + ifps->fp_valid, ifps->fp_save_layout); + } #endif + + if (ifps->fp_valid == FALSE) { + fpinit(); } else { - if (ifps->fp_save_flavor == FP_FXSR) fxrstor(&ifps->fx_save_state); - else frstor(ifps->fp_save_state); + fpu_load_registers(ifps); } ifps->fp_valid = FALSE; /* in FPU */ } - /* - * Allocate and initialize FP state for current thread. - * Don't load state. - * - * Locking not needed; always called on the current thread. + * SSE arithmetic exception handling code. + * Basically the same as the x87 exception handler with a different subtype */ + void -fp_state_alloc(void) +fpSSEexterrflt(void) { - pcb_t pcb = current_thread()->machine.pcb; - struct i386_fpsave_state *ifps; + thread_t thr_act = current_thread(); + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; + boolean_t intr; - ifps = (struct i386_fpsave_state *)zalloc(ifps_zone); - assert(ALIGNED(ifps,16)); - bzero((char *)ifps, sizeof *ifps); - pcb->ims.ifps = ifps; + intr = ml_set_interrupts_enabled(FALSE); + + if (get_interrupt_level()) + panic("SSE exception at interrupt context\n"); + if (current_task() == kernel_task) + panic("SSE exception in kernel thread context\n"); + + /* + * Save the FPU state and turn off the FPU. + */ + fp_save(thr_act); + + (void)ml_set_interrupts_enabled(intr); + /* + * Raise FPU exception. + * Locking not needed on pcb->ifps, + * since thread is running. 
+ */ + const uint32_t mask = (ifps->fx_MXCSR >> 7) & + (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE); + const uint32_t xcpt = ~mask & (ifps->fx_MXCSR & + (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE)); + fpxlog(EXC_I386_SSEEXTERR, ifps->fx_MXCSR, ifps->fx_MXCSR, xcpt); - ifps->fp_valid = TRUE; - ifps->fp_save_state.fp_control = (0x037f - & ~(FPC_IM|FPC_ZM|FPC_OM|FPC_PC)) - | (FPC_PC_53|FPC_IC_AFF); - ifps->fp_save_state.fp_status = 0; - ifps->fp_save_state.fp_tag = 0xffff; /* all empty */ - ifps->fx_save_state.fx_control = ifps->fp_save_state.fp_control; - ifps->fx_save_state.fx_status = ifps->fp_save_state.fp_status; - ifps->fx_save_state.fx_tag = 0x00; - ifps->fx_save_state.fx_MXCSR = 0x1f80; - + i386_exception(EXC_ARITHMETIC, + EXC_I386_SSEEXTERR, + ifps->fx_MXCSR); + /*NOTREACHED*/ } +#if !defined(RC_HIDE_XNU_J137) /* - * fpflush(thread_t) - * Flush the current act's state, if needed - * (used by thread_terminate_self to ensure fp faults - * aren't satisfied by overly general trap code in the - * context of the reaper thread) + * If a thread is using an AVX-sized savearea: + * - allocate a new AVX512-sized area, + * - copy the 256-bit state into the 512-bit area, + * - deallocate the smaller area */ -void -fpflush(__unused thread_t thr_act) +static void +fpu_savearea_promote_avx512(thread_t thread) { - /* not needed on MP x86s; fp not lazily evaluated */ -} + struct x86_avx_thread_state *ifps = NULL; + struct x86_avx512_thread_state *ifps512 = NULL; + pcb_t pcb = THREAD_TO_PCB(thread); + boolean_t do_avx512_alloc = FALSE; + DBG("fpu_upgrade_savearea(%p)\n", thread); + + simple_lock(&pcb->lock); + + ifps = pcb->ifps; + if (ifps == NULL) { + pcb->xstate = AVX512; + simple_unlock(&pcb->lock); + if (thread != current_thread()) { + /* nothing to be done */ + + return; + } + fpnoextflt(); + return; + } + + if (pcb->xstate != AVX512) { + do_avx512_alloc = TRUE; + } + simple_unlock(&pcb->lock); + + if (do_avx512_alloc == TRUE) { + ifps512 = 
fp_state_alloc(AVX512); + } + + simple_lock(&pcb->lock); + if (thread == current_thread()) { + boolean_t intr; + + intr = ml_set_interrupts_enabled(FALSE); + + clear_ts(); + fp_save(thread); + clear_fpu(); + + xsetbv(0, AVX512_XMASK); + + (void)ml_set_interrupts_enabled(intr); + } + assert(ifps->fp.fp_valid); + + /* Allocate an AVX512 savearea and copy AVX state into it */ + if (pcb->xstate != AVX512) { + bcopy(ifps, ifps512, fp_state_size[AVX]); + pcb->ifps = ifps512; + pcb->xstate = AVX512; + ifps512 = NULL; + } else { + ifps = NULL; + } + /* The PCB lock is redundant in some scenarios given the higher level + * thread mutex, but its pre-emption disablement is relied upon here + */ + simple_unlock(&pcb->lock); + + if (ifps) { + fp_state_free(ifps, AVX); + } + if (ifps512) { + fp_state_free(ifps, AVX512); + } +} /* - * Handle a coprocessor error interrupt on the AT386. - * This comes in on line 5 of the slave PIC at SPL1. + * Upgrade the calling thread to AVX512. */ +boolean_t +fpu_thread_promote_avx512(thread_t thread) +{ + task_t task = current_task(); + + if (thread != current_thread()) + return FALSE; + if (!ml_fpu_avx512_enabled()) + return FALSE; + fpu_savearea_promote_avx512(thread); + + /* Racy but the task's xstate is only a hint */ + task->xstate = AVX512; + + return TRUE; +} + + +/* + * Called from user_trap() when an invalid opcode fault is taken. + * If the user is attempting an AVX512 instruction on a machine + * that supports this, we switch the calling thread to use + * a larger savearea, set its XCR0 bit mask to enable AVX512 and + * return directly via thread_exception_return(). + * Otherwise simply return. 
+ */ +#define MAX_X86_INSN_LENGTH (16) void -fpintr(void) +fpUDflt(user_addr_t rip) { - spl_t s; - thread_t thr_act = current_thread(); + uint8_t instruction_prefix; + boolean_t is_AVX512_instruction = FALSE; + user_addr_t original_rip = rip; + do { + /* TODO: as an optimisation, copy up to the lesser of the + * next page boundary or maximal prefix length in one pass + * rather than issue multiple copyins + */ + if (copyin(rip, (char *) &instruction_prefix, 1)) { + return; + } + DBG("fpUDflt(0x%016llx) prefix: 0x%x\n", + rip, instruction_prefix); + /* TODO: determine more specifically which prefixes + * are sane possibilities for AVX512 insns + */ + switch (instruction_prefix) { + case 0x2E: /* CS segment override */ + case 0x36: /* SS segment override */ + case 0x3E: /* DS segment override */ + case 0x26: /* ES segment override */ + case 0x64: /* FS segment override */ + case 0x65: /* GS segment override */ + case 0x66: /* Operand-size override */ + case 0x67: /* address-size override */ + /* Skip optional prefixes */ + rip++; + if ((rip - original_rip) > MAX_X86_INSN_LENGTH) { + return; + } + break; + case 0x62: /* EVEX */ + case 0xC5: /* VEX 2-byte */ + case 0xC4: /* VEX 3-byte */ + is_AVX512_instruction = TRUE; + break; + default: + return; + } + } while (!is_AVX512_instruction); -ASSERT_IPL(SPL1); - /* - * Turn off the extended 'busy' line. - */ - outb(0xf0, 0); + /* Here if we detect attempted execution of an AVX512 instruction */ /* - * Save the FPU context to the thread using it. + * Fail if this machine doesn't support AVX512 */ - clear_ts(); - fp_save(thr_act); - fninit(); - clear_fpu(); + if (fpu_capability != AVX512) + return; - /* - * Since we are running on the interrupt stack, we must - * signal the thread to take the exception when we return - * to user mode. Use an AST to do this. - * - * Don`t set the thread`s AST field. If the thread is - * descheduled before it takes the AST, it will notice - * the FPU error when it reloads its FPU state. 
- */ - s = splsched(); - mp_disable_preemption(); - ast_on(AST_I386_FP); - mp_enable_preemption(); - splx(s); + assert(xgetbv(XCR0) == AVX_XMASK); + + DBG("fpUDflt() switching xstate to AVX512\n"); + (void) fpu_thread_promote_avx512(current_thread()); + + thread_exception_return(); + /* NOT REACHED */ +} +#endif /* !defined(RC_HIDE_XNU_J137) */ + +void +fp_setvalid(boolean_t value) { + thread_t thr_act = current_thread(); + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; + + if (ifps) { + ifps->fp_valid = value; + + if (value == TRUE) { + boolean_t istate = ml_set_interrupts_enabled(FALSE); + clear_fpu(); + ml_set_interrupts_enabled(istate); + } + } +} + +boolean_t +ml_fpu_avx_enabled(void) { + return (fpu_capability >= AVX); +} + +#if !defined(RC_HIDE_XNU_J137) +boolean_t +ml_fpu_avx512_enabled(void) { + return (fpu_capability == AVX512); +} +#endif + +static xstate_t +task_xstate(task_t task) +{ + if (task == TASK_NULL) + return fpu_default; + else + return task->xstate; +} + +static xstate_t +thread_xstate(thread_t thread) +{ + xstate_t xs = THREAD_TO_PCB(thread)->xstate; + if (xs == UNDEFINED) + return task_xstate(thread->task); + else + return xs; +} + +xstate_t +current_xstate(void) +{ + return thread_xstate(current_thread()); +} + +/* + * Called when exec'ing between bitnesses. + * If valid FPU state exists, adjust the layout. + */ +void +fpu_switch_addrmode(thread_t thread, boolean_t is_64bit) +{ + struct x86_fx_thread_state *ifps = thread->machine.ifps; + + if (ifps && ifps->fp_valid) { + if (thread_xstate(thread) == FP) { + ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32; + } else { + ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32; + } + } }