+/* Forward declarations of file-local (static) helpers. */
+static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
+static xstate_t thread_xstate(thread_t);
+
+/*
+ * Global extended FP state object, aligned to 64 bytes. That alignment matches
+ * the 64-byte requirement configure_mxcsr_capability_mask() asserts on its
+ * argument for XSAVE stores.
+ * NOTE(review): presumably the template state for new threads and the buffer
+ * handed to configure_mxcsr_capability_mask() -- confirm at the call site.
+ */
+x86_ext_thread_state_t initial_fp_state __attribute((aligned(64)));
+
+
+/*
+ * Global MXCSR capability bitmask.
+ * Written by configure_mxcsr_capability_mask() from the fx_MXCSR_MASK field of
+ * a freshly stored FP state; set to 0xffbf when that field reads back as 0.
+ */
+static unsigned int mxcsr_capability_mask;
+
+/* Reinitialize the x87 FPU (FNINIT, the no-wait form). */
+#define fninit() \
+	__asm__ volatile("fninit")
+
+/*
+ * Store the x87 control word through the pointer `control`.
+ * The asm is volatile because the control word is hardware state the compiler
+ * cannot track: without it, an asm with an output operand may be dropped when
+ * the result looks unused, merged with an earlier identical asm, or moved
+ * across an intervening fldcw()/fninit().
+ */
+#define fnstcw(control) \
+	__asm__ volatile("fnstcw %0" : "=m" (*(unsigned short *)(control)))
+
+/*
+ * Load the x87 control word from `control`.
+ * Unlike fnstcw(), the argument is an lvalue (its address is taken here),
+ * not a pointer.
+ */
+#define fldcw(control) \
+	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
+
+/* Clear the x87 exception flags (FNCLEX, the no-wait form). */
+#define fnclex() \
+	__asm__ volatile("fnclex")
+
+/*
+ * Save the x87 state to the object that the pointer `state` refers to
+ * (FNSAVE). The argument is parenthesized before being dereferenced so that
+ * an expression argument such as `base + 1` is dereferenced as a whole.
+ */
+#define fnsave(state) \
+	__asm__ volatile("fnsave %0" : "=m" (*(state)))
+
+/*
+ * Restore the x87 state from `state` (FRSTOR).
+ * Note the asymmetry with fnsave(): the argument here is used directly as the
+ * memory operand, i.e. it is the state object (lvalue) itself, not a pointer.
+ */
+#define frstor(state) \
+	__asm__ volatile("frstor %0" : : "m" (state))
+
+/*
+ * Wait for pending x87 exceptions to be handled (FWAIT).
+ * The expansion deliberately carries no trailing semicolon so that
+ * `fwait();` is exactly one statement and remains usable as the body of an
+ * unbraced if/else.
+ */
+#define fwait() \
+	__asm__ volatile("fwait")
+
+/* Load FP state from the image at `a` using FXRSTOR. */
+static inline void
+fxrstor(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile("fxrstor %0" : : "m" (*a));
+}
+
+/* Store the current FP state into the image at `a` using FXSAVE. */
+static inline void
+fxsave(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile("fxsave %0" : "=m" (*a));
+}
+
+/* Load FP state from the image at `a` using the FXRSTOR64 form. */
+static inline void
+fxrstor64(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile("fxrstor64 %0" : : "m" (*a));
+}
+
+/* Store the current FP state into the image at `a` using the FXSAVE64 form. */
+static inline void
+fxsave64(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile("fxsave64 %0" : "=m" (*a));
+}
+
+/*
+ * True when `x` names an xstate kind supported by this build: FP and AVX
+ * always, AVX512 only when RC_HIDE_XNU_J137 is not defined.
+ * `x` is evaluated more than once.
+ */
+#if defined(RC_HIDE_XNU_J137)
+#define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX)
+#else
+#define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX || (x) == AVX512)
+#endif
+
+/*
+ * Zone pointer per xstate kind, indexed by the xstate value.
+ * Every slot starts out NULL.
+ */
+zone_t ifps_zone[] = {
+	[FP]     = NULL,
+	[AVX]    = NULL,
+#if !defined(RC_HIDE_XNU_J137)
+	[AVX512] = NULL,
+#endif
+};
+/* Size in bytes of the savearea structure for each xstate kind. */
+static uint32_t fp_state_size[] = {
+	[FP]     = sizeof(struct x86_fx_thread_state),
+	[AVX]    = sizeof(struct x86_avx_thread_state),
+#if !defined(RC_HIDE_XNU_J137)
+	[AVX512] = sizeof(struct x86_avx512_thread_state),
+#endif
+};
+
+/* Printable name for each xstate kind, indexed by the xstate value. */
+static const char *xstate_name[] = {
+	[UNDEFINED] = "UNDEFINED",
+	[FP]        = "FP",
+	[AVX]       = "AVX",
+#if !defined(RC_HIDE_XNU_J137)
+	[AVX512]    = "AVX512",
+#endif
+};
+
+#if !defined(RC_HIDE_XNU_J137)
+/* True when fpu_capability is AVX512. */
+#define fpu_ZMM_capable (fpu_capability == AVX512)
+/* True when fpu_capability is AVX or AVX512. */
+#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
+/*
+ * On-demand AVX512 support
+ * ------------------------
+ * On machines with AVX512 support, by default, threads are created with
+ * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
+ * capabilities are advertised in the commpage and via sysctl. If a thread
+ * opts to use AVX512 instructions, the first will result in a #UD exception.
+ * Faulting AVX512 instructions are recognizable by their unique prefix.
+ * This exception results in the thread being promoted to use an AVX512-sized
+ * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
+ * instruction is re-driven and the thread can proceed to perform AVX512
+ * operations.
+ *
+ * In addition to AVX512 instructions causing promotion, the thread_set_state()
+ * primitive with an AVX512 state flavor results in promotion.
+ *
+ * AVX512 promotion of the first thread in a task causes the default xstate
+ * of the task to be promoted so that any subsequently created or subsequently
+ * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
+ * a promoted xstate.
+ *
+ * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
+ * and a second pool of larger AVX512-sized (2688 byte) areas.
+ *
+ * Note the initial state value is an AVX512 object but that the AVX initial
+ * value is a subset of it.
+ */
+#else
+/* Without AVX512 support compiled in, only AVX counts as YMM capable. */
+#define fpu_YMM_capable (fpu_capability == AVX)
+#endif
+/*
+ * NOTE(review): not referenced in the visible part of this file; presumably a
+ * flag recording that CPUID information was re-evaluated -- confirm against
+ * its users.
+ */
+static uint32_t cpuid_reevaluated = 0;
+
+/*
+ * Forward declarations: configure_mxcsr_capability_mask() calls both of these
+ * helpers.
+ */
+static void fpu_store_registers(void *, boolean_t);
+static void fpu_load_registers(void *);
+
+/* Feature-enable bit masks (XFEM_*) associated with each xstate kind. */
+#define FP_XMASK	((uint32_t) (XFEM_X87 | XFEM_SSE))
+#define AVX_XMASK	((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM))
+#if !defined(RC_HIDE_XNU_J137)
+#define AVX512_XMASK	((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM | XFEM_ZMM))
+#endif
+
+/* Feature mask for each xstate kind, indexed by the xstate value. */
+static const uint32_t xstate_xmask[] = {
+	[FP]     = FP_XMASK,
+	[AVX]    = AVX_XMASK,
+#if !defined(RC_HIDE_XNU_J137)
+	[AVX512] = AVX512_XMASK,
+#endif
+};
+
+/*
+ * Write XCR0 with XSETBV: `mask_hi` is passed in EDX, `mask_lo` in EAX, and
+ * the register selector XCR0 in ECX.
+ */
+static inline void
+xsetbv(uint32_t mask_hi, uint32_t mask_lo)
+{
+	__asm__ volatile("xsetbv" : : "a"(mask_lo), "d"(mask_hi), "c" (XCR0));
+}
+
+/*
+ * Save the processor state components selected by `rfbm` (passed in EAX, with
+ * EDX = 0) into the save area at `a` using XSAVE.
+ *
+ * The output operand is typed as struct x86_fx_thread_state, but XSAVE may
+ * also write the XSAVE header and the extended components that follow it in
+ * the larger AVX/AVX512 save areas. The "memory" clobber makes those stores
+ * visible to the compiler so it cannot cache or reorder accesses to that
+ * region across the instruction.
+ */
+static inline void
+xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ volatile("xsave %0" : "=m" (*a) : "a"(rfbm), "d"(0) : "memory");
+}
+
+/*
+ * Save the processor state components selected by `rfbm` (passed in EAX, with
+ * EDX = 0) into the save area at `a` using the XSAVE64 form.
+ *
+ * The output operand is typed as struct x86_fx_thread_state, but the
+ * instruction may also write the XSAVE header and the extended components
+ * that follow it in the larger AVX/AVX512 save areas. The "memory" clobber
+ * makes those stores visible to the compiler so it cannot cache or reorder
+ * accesses to that region across the instruction.
+ */
+static inline void
+xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ volatile("xsave64 %0" : "=m" (*a) : "a"(rfbm), "d"(0) : "memory");
+}
+
+/*
+ * Restore the processor state components selected by `rfbm` (passed in EAX,
+ * with EDX = 0) from the save area at `a` using XRSTOR.
+ *
+ * The input operand is typed as struct x86_fx_thread_state, but XRSTOR also
+ * reads the XSAVE header and any selected extended components stored after
+ * it. The "memory" clobber forces pending stores to that region to be
+ * completed before the instruction executes.
+ */
+static inline void
+xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ volatile("xrstor %0" : : "m" (*a), "a"(rfbm), "d"(0) : "memory");
+}
+
+/*
+ * Restore the processor state components selected by `rfbm` (passed in EAX,
+ * with EDX = 0) from the save area at `a` using the XRSTOR64 form.
+ *
+ * The input operand is typed as struct x86_fx_thread_state, but the
+ * instruction also reads the XSAVE header and any selected extended
+ * components stored after it. The "memory" clobber forces pending stores to
+ * that region to be completed before the instruction executes.
+ */
+static inline void
+xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ volatile("xrstor64 %0" : : "m" (*a), "a"(rfbm), "d"(0) : "memory");
+}
+
+#if !defined(RC_HIDE_XNU_J137)
+/* Execute VZEROUPPER. */
+static inline void
+vzeroupper(void)
+{
+	__asm__ volatile("vzeroupper" : : );
+}
+#if DEVELOPMENT || DEBUG
+/*
+ * Execute XGETBV with selector `c` in ECX and return the result as a single
+ * 64-bit value (EDX in the upper half, EAX in the lower half).
+ */
+static inline uint64_t
+xgetbv(uint32_t c)
+{
+	uint32_t lo, hi;
+
+	__asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c" (c));
+	return ((uint64_t) hi << 32) | (uint64_t) lo;
+}
+#endif
+
+static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */
+
+/*
+ * Wrapper around bcopy that defeats the compiler's destination size checks.
+ * It lets a run of consecutive, identically typed members such as
+ *	_STRUCT_XMM_REG		fpu_ymmh0;
+ *	...
+ *	_STRUCT_XMM_REG		fpu_ymmh7;
+ * be treated as an array and copied in one call:
+ *	bcopy_nochk(src, &dst->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
+ * without the compiler raising a __builtin__memmove_chk error.
+ * Argument order follows bcopy: source, destination, length in bytes.
+ */
+static inline void
+bcopy_nochk(void *_src, void *_dst, size_t _len)
+{
+	bcopy(_src, _dst, _len);
+}
+
+/*
+ * Furthermore, make compile-time asserts that no padding creeps into structures
+ * for which we're doing this.
+ *
+ * ASSERT_PACKED(t, m1, m2, n, mt) checks that members m1..m2 of type `t` form
+ * `n` contiguous elements of type `mt`: the distance between the first and
+ * last member must be exactly (n - 1) * sizeof(mt). On failure the extern
+ * array is declared with a negative size, which is a compile error.
+ * `n` is parenthesized so that an expression argument is handled correctly.
+ */
+#define ASSERT_PACKED(t, m1, m2, n, mt)			\
+extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2	\
+	[(offsetof(t,m2) - offsetof(t,m1) == ((n) - 1)*sizeof(mt)) ? 1 : -1]
+
+ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
+
+ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
+
+ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
+ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
+ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
+
+ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
+
+#if defined(DEBUG_AVX512)
+
+/* Debug trace: forwards to kprintf with a "DBG: " prefix on the format. */
+#define DBG(args...)	kprintf("DBG: " args)
+
+/* Byte-array views used to hex-dump registers of each width. */
+typedef struct { uint8_t byte[8];  } opmask_t;	/* 8 bytes */
+typedef struct { uint8_t byte[16]; } xmm_t;	/* 16 bytes */
+typedef struct { uint8_t byte[32]; } ymm_t;	/* 32 bytes */
+typedef struct { uint8_t byte[64]; } zmm_t;	/* 64 bytes */
+
+/* Print `count` bytes starting at `bytes` as two-digit hex, in memory order. */
+static void
+dbg_avx512_hex(const uint8_t *bytes, int count)
+{
+	int n;
+
+	for (n = 0; n < count; n++) {
+		kprintf("%02x", bytes[n]);
+	}
+}
+
+/*
+ * Dump an AVX512 save area via kprintf: the offsets of its extended
+ * components, the values returned by xgetbv(XCR0) and xgetbv(1), the
+ * contents of zmm0-zmm31 and k0-k7 as raw hex bytes, and the xstate_bv and
+ * xcomp_bv header fields.
+ *
+ * For zmm0-zmm15 each line is assembled from three separately stored pieces:
+ * 16 bytes from fp.fx_XMM_reg, 16 bytes from x_YMM_Hi128 and 32 bytes from
+ * x_ZMM_Hi256. zmm16-zmm31 are stored whole in x_Hi16_ZMM.
+ *
+ * NOTE(review): relies on xgetbv(), which in this file is only defined under
+ * DEVELOPMENT || DEBUG.
+ */
+static void
+DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
+{
+	xmm_t		*xmm_lo   = (xmm_t *) &sp->fp.fx_XMM_reg;
+	xmm_t		*ymm_hi   = (xmm_t *) &sp->x_YMM_Hi128;
+	ymm_t		*zmm_hi   = (ymm_t *) &sp->x_ZMM_Hi256;
+	zmm_t		*zmm_hi16 = (zmm_t *) &sp->x_Hi16_ZMM;
+	opmask_t	*opmask   = (opmask_t *) &sp->x_Opmask;
+	int		reg;
+
+	/* Layout of the extended components within the save area */
+	kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
+	kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
+	kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
+	kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
+
+	kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
+	kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
+
+	/* zmm0-zmm15: low 128 bits, next 128 bits, then the upper 256 bits */
+	for (reg = 0; reg < 16; reg++) {
+		kprintf("zmm%d:\t0x", reg);
+		dbg_avx512_hex(xmm_lo[reg].byte, 16);
+		dbg_avx512_hex(ymm_hi[reg].byte, 16);
+		dbg_avx512_hex(zmm_hi[reg].byte, 32);
+		kprintf("\n");
+	}
+
+	/* zmm16-zmm31: stored as full 64-byte registers */
+	for (reg = 16; reg < 32; reg++) {
+		kprintf("zmm%d:\t0x", reg);
+		dbg_avx512_hex(zmm_hi16[reg - 16].byte, 64);
+		kprintf("\n");
+	}
+
+	/* Opmask registers k0-k7 */
+	for (reg = 0; reg < 8; reg++) {
+		kprintf("k%d:\t0x", reg);
+		dbg_avx512_hex(opmask[reg].byte, 8);
+		kprintf("\n");
+	}
+
+	kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
+	kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
+}
+#else
+/* DEBUG_AVX512 is off: DBG() expands to nothing and the state dump is a no-op. */
+#define DBG(args...)
+static void
+DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
+{
+}
+#endif /* DEBUG_AVX512 */
+
+#endif
+
+#if DEBUG
+/* Read and return the x87 status word (FNSTSW, the no-wait form). */
+static inline unsigned short
+fnstsw(void)
+{
+	unsigned short sw;
+
+	/* "=ma": the result may be placed either in memory or in AX */
+	__asm__ volatile("fnstsw %0" : "=ma" (sw));
+	return sw;
+}
+#endif
+
+/*
+ * Configure the initial FPU state presented to new threads.
+ * Determine the MXCSR capability mask, which allows us to mask off any
+ * potentially unsafe "reserved" bits before restoring the FPU context.
+ * *Not* per-cpu, assumes symmetry.
+ *
+ * fps: save area to use; must be 64-byte aligned (asserted). It is zeroed,
+ *      filled by a register store, has its vector-register areas cleared,
+ *      and is then loaded back into the FPU.
+ *
+ * Side effects: sets the global mxcsr_capability_mask, and leaves
+ * fps->fx.fp_valid and fps->fx.fp_save_layout holding poison values
+ * (0xFFFFFFFF and FP_UNUSED) on return. Ends by calling set_ts().
+ *
+ * The order of the steps below matters: each operates on the hardware or
+ * save-area state left by the previous one.
+ */
+
+static void
+configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
+{
+ /* XSAVE requires a 64 byte aligned store */
+ assert(ALIGNED(fps, 64));
+ /* Clear, to prepare for the diagnostic FXSAVE */
+ bzero(fps, sizeof(*fps));
+
+ /*
+  * Store the register state into fps so the MXCSR_MASK field can be read.
+  * NOTE(review): fpinit() is defined elsewhere; presumably it puts the FPU
+  * into its initial state first. The meaning of the FALSE argument to
+  * fpu_store_registers() is not visible here -- confirm at its definition.
+  */
+ fpinit();
+ fpu_store_registers(fps, FALSE);
+
+ mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;
+
+ /* Set default mask value if necessary */
+ if (mxcsr_capability_mask == 0)
+ mxcsr_capability_mask = 0xffbf;
+
+ /* Clear vector register store */
+ bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
+ bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
+#if !defined(RC_HIDE_XNU_J137)
+ /* The AVX512 areas are only cleared when the FPU capability is AVX512 */
+ if (fpu_ZMM_capable) {
+ bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
+ bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
+ bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
+ }
+#endif
+
+ /*
+  * Mark the area valid and tag its layout (XSAVE32 when YMM capable,
+  * otherwise FXSAVE32) before loading the cleaned state into the FPU.
+  */
+ fps->fx.fp_valid = TRUE;
+ fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
+ fpu_load_registers(fps);
+
+ /* Poison values to trap unsafe usage */
+ fps->fx.fp_valid = 0xFFFFFFFF;
+ fps->fx.fp_save_layout = FP_UNUSED;
+
+ /* Re-enable FPU/SSE DNA exceptions */
+ set_ts();
+}
+