+#include <i386/thread.h>
+#include <i386/trap.h>
+
+xstate_t fpu_capability = UNDEFINED; /* extended state capability */
+xstate_t fpu_default = UNDEFINED; /* default extended state */
+
+/*
+ * ALIGNED(addr, size): evaluates to 1 when none of the low bits selected by
+ * (size - 1) are set in addr, and to 0 otherwise. The mask test is only a
+ * true alignment check when size is a power of two.
+ */
+#define ALIGNED(addr, size) (!((uintptr_t)(addr) & ((size) - 1)))
+/*
+ * VERIFY_SAVEAREA_ALIGNED(p, a): assert that pointer p is aligned to a bytes
+ * (a must be a power of two for the mask test to be meaningful).
+ * The failure message reports both the offending address and the alignment
+ * that was actually requested, rather than a fixed "8-byte" figure.
+ * Note that both arguments are evaluated more than once.
+ */
+#define VERIFY_SAVEAREA_ALIGNED(p, a) \
+	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
+	    "FP save area component @ 0x%lx not %lu-byte aligned", \
+	    ((uintptr_t)(p)), ((unsigned long)(a)))
+
+/* Forward declarations */
+
+/* Routines with external linkage. */
+extern void fpinit(void);
+extern void fp_save(thread_t thr_act);
+extern void fp_load(thread_t thr_act);
+
+/* File-local helpers. */
+static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
+static xstate_t thread_xstate(thread_t);
+
+/*
+ * Statically allocated extended-state objects, each aligned to 64 bytes.
+ * NOTE(review): judging by the names these hold the initial FP state and the
+ * per-flavor (FX / AVX / AVX512) default states -- confirm against the code
+ * that fills them in.
+ */
+x86_ext_thread_state_t initial_fp_state __attribute__((aligned(64)));
+x86_ext_thread_state_t default_avx512_state __attribute__((aligned(64)));
+x86_ext_thread_state_t default_avx_state __attribute__((aligned(64)));
+x86_ext_thread_state_t default_fx_state __attribute__((aligned(64)));
+
+/*
+ * Global MXCSR capability bitmask (file scope, zero until assigned).
+ */
+static unsigned int mxcsr_capability_mask;
+
+/*
+ * Thin wrappers around individual x87 instructions.
+ *
+ * Each macro expands to a single asm expression WITHOUT a trailing
+ * semicolon, so it can be used like a function call in any statement
+ * context (including an unbraced if/else arm).
+ */
+
+/* Emit "fninit". */
+#define fninit() \
+	__asm__ volatile("fninit")
+
+/* Emit "fnstcw"; control is a POINTER to the 16-bit destination. */
+#define fnstcw(control) \
+	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
+
+/* Emit "fldcw"; control is an LVALUE holding the 16-bit source. */
+#define fldcw(control) \
+	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
+
+/* Emit "fnclex". */
+#define fnclex() \
+	__asm__ volatile("fnclex")
+
+/* Emit "fnsave"; state is a pointer to the destination save area. */
+#define fnsave(state) \
+	__asm__ volatile("fnsave %0" : "=m" (*(state)))
+
+/* Emit "frstor"; state is the source save area object itself. */
+#define frstor(state) \
+	__asm__ volatile("frstor %0" : : "m" (state))
+
+/*
+ * Emit "fwait". An asm with no output operands is implicitly volatile;
+ * the qualifier is spelled out only for consistency with its siblings.
+ */
+#define fwait() \
+	__asm__ volatile("fwait")
+
+/*
+ * Execute "fxrstor" with *a as its memory source operand.
+ */
+static inline void
+fxrstor(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile (
+		"fxrstor %0"
+		: /* no outputs */
+		: "m" (*a));
+}
+
+/*
+ * Execute "fxsave" with *a as its memory destination operand.
+ */
+static inline void
+fxsave(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile (
+		"fxsave %0"
+		: "=m" (*a));
+}
+
+/*
+ * Execute "fxrstor64" with *a as its memory source operand.
+ */
+static inline void
+fxrstor64(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile (
+		"fxrstor64 %0"
+		: /* no outputs */
+		: "m" (*a));
+}
+
+/*
+ * Execute "fxsave64" with *a as its memory destination operand.
+ */
+static inline void
+fxsave64(struct x86_fx_thread_state *a)
+{
+	__asm__ volatile (
+		"fxsave64 %0"
+		: "=m" (*a));
+}
+
+/*
+ * True when x names one of the three concrete xstate flavors.
+ * UNDEFINED (or any other value) is rejected. x is evaluated up to three
+ * times.
+ */
+#define IS_VALID_XSTATE(x) \
+	(((x) == FP) || ((x) == AVX) || ((x) == AVX512))
+
+/*
+ * One zone_t slot per xstate flavor, indexed by the xstate value.
+ * All slots start out NULL.
+ */
+zone_t ifps_zone[] = {
+	[FP]     = NULL,
+	[AVX]    = NULL,
+	[AVX512] = NULL,
+};
+/*
+ * Size in bytes of the thread-state structure used for each xstate flavor,
+ * indexed by the xstate value.
+ */
+static uint32_t fp_state_size[] = {
+	[FP]     = sizeof(struct x86_fx_thread_state),
+	[AVX]    = sizeof(struct x86_avx_thread_state),
+	[AVX512] = sizeof(struct x86_avx512_thread_state),
+};
+
+/*
+ * Printable name of each xstate value (including UNDEFINED), indexed by the
+ * xstate value itself.
+ */
+static const char *xstate_name[] = {
+	[UNDEFINED] = "UNDEFINED",
+	[FP]        = "FP",
+	[AVX]       = "AVX",
+	[AVX512]    = "AVX512",
+};
+
+/* True when the detected capability is AVX512. */
+#define fpu_ZMM_capable \
+	(fpu_capability == AVX512)
+/* True when the detected capability is AVX or AVX512. */
+#define fpu_YMM_capable \
+	((fpu_capability == AVX) || (fpu_capability == AVX512))
+/*
+ * On-demand AVX512 support
+ * ------------------------
+ * On machines with AVX512 support, by default, threads are created with
+ * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
+ * capabilities are advertised in the commpage and via sysctl. If a thread
+ * opts to use AVX512 instructions, the first will result in a #UD exception.
+ * Faulting AVX512 instructions are recognizable by their unique prefix.
+ * This exception results in the thread being promoted to use an AVX512-sized
+ * savearea and in the AVX512 bit masks being set in its XCR0. The faulting
+ * instruction is then re-driven and the thread can proceed to perform AVX512
+ * operations.
+ *
+ * In addition to AVX512 instructions causing promotion, calling the
+ * thread_set_state() primitive with an AVX512 state flavor also results in
+ * promotion.
+ *
+ * AVX512 promotion of the first thread in a task causes the default xstate
+ * of the task to be promoted so that any subsequently created or subsequently
+ * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
+ * a promoted xstate.
+ *
+ * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
+ * and a second pool of larger AVX512-sized (2688 byte) areas.
+ *
+ * Note the initial state value is an AVX512 object but that the AVX initial
+ * value is a subset of it.
+ */
+/*
+ * File-local flag, initially 0.
+ * NOTE(review): presumably records that CPUID has been re-evaluated --
+ * confirm against the code that sets and reads it.
+ */
+static uint32_t cpuid_reevaluated = 0;
+
+/* Forward declarations of the file-local register store/load helpers. */
+static void fpu_store_registers(void *, boolean_t);
+static void fpu_load_registers(void *);
+
+/*
+ * Component mask associated with each xstate flavor, indexed by the xstate
+ * value.
+ */
+static const uint32_t xstate_xmask[] = {
+	[FP]     = FP_XMASK,
+	[AVX]    = AVX_XMASK,
+	[AVX512] = AVX512_XMASK,
+};
+
+/*
+ * Execute "xsave" into the save area at a.
+ *
+ * a	destination save area
+ * rfbm	low 32 bits of the requested-feature bitmap (passed in EAX);
+ *	the high 32 bits (EDX) are always 0
+ *
+ * The "=m" operand only describes an object of type
+ * struct x86_fx_thread_state, whereas the instruction stores every state
+ * component selected by rfbm. The "memory" clobber tells the compiler that
+ * storage outside *a may be written as well, so it cannot cache or reorder
+ * accesses to the rest of the save area across this instruction.
+ */
+static inline void
+xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ __volatile__ ("xsave %0"
+	    : "=m" (*a)
+	    : "a"(rfbm), "d"(0)
+	    : "memory");
+}
+
+/*
+ * Execute "xsave64" into the save area at a.
+ *
+ * a	destination save area
+ * rfbm	low 32 bits of the requested-feature bitmap (passed in EAX);
+ *	the high 32 bits (EDX) are always 0
+ *
+ * The "=m" operand only describes an object of type
+ * struct x86_fx_thread_state, whereas the instruction stores every state
+ * component selected by rfbm. The "memory" clobber tells the compiler that
+ * storage outside *a may be written as well, so it cannot cache or reorder
+ * accesses to the rest of the save area across this instruction.
+ */
+static inline void
+xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ __volatile__ ("xsave64 %0"
+	    : "=m" (*a)
+	    : "a"(rfbm), "d"(0)
+	    : "memory");
+}
+
+/*
+ * Execute "xrstor" from the save area at a.
+ *
+ * a	source save area
+ * rfbm	low 32 bits of the requested-feature bitmap (passed in EAX);
+ *	the high 32 bits (EDX) are always 0
+ *
+ * The "m" operand only describes an object of type
+ * struct x86_fx_thread_state, whereas the instruction loads every state
+ * component selected by rfbm. The "memory" clobber forces any pending
+ * stores to the rest of the save area to be completed before the
+ * instruction executes.
+ */
+static inline void
+xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ __volatile__ ("xrstor %0"
+	    : /* no outputs */
+	    : "m" (*a), "a"(rfbm), "d"(0)
+	    : "memory");
+}
+
+/*
+ * Execute "xrstor64" from the save area at a.
+ *
+ * a	source save area
+ * rfbm	low 32 bits of the requested-feature bitmap (passed in EAX);
+ *	the high 32 bits (EDX) are always 0
+ *
+ * The "m" operand only describes an object of type
+ * struct x86_fx_thread_state, whereas the instruction loads every state
+ * component selected by rfbm. The "memory" clobber forces any pending
+ * stores to the rest of the save area to be completed before the
+ * instruction executes.
+ */
+static inline void
+xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+	__asm__ __volatile__ ("xrstor64 %0"
+	    : /* no outputs */
+	    : "m" (*a), "a"(rfbm), "d"(0)
+	    : "memory");
+}
+
+/*
+ * Execute "vzeroupper". Takes no operands; marked __unused because it may
+ * have no callers in a given configuration.
+ */
+__unused static inline void
+vzeroupper(void)
+{
+	__asm__ volatile (
+		"vzeroupper"
+		: /* no outputs */
+		: /* no inputs */);
+}
+
+/* Forward declaration. */
+static boolean_t
+fpu_thread_promote_avx512(thread_t);
+
+
+/*
+ * Furthermore, make compile-time asserts that no padding creeps into structures
+ * for which we're doing this.
+ *
+ * ASSERT_PACKED(t, m1, m2, n, mt) checks that, within type t, member m2
+ * starts exactly (n - 1) objects of type mt after member m1, i.e. that
+ * the n members m1..m2 are laid out back to back. When the check fails the
+ * declared array gets size -1, which is a compile error; when it passes
+ * the macro merely declares an extern char array of size 1.
+ */
+#define ASSERT_PACKED(t, m1, m2, n, mt) \
+extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \
+	[(offsetof(t, m2) - offsetof(t, m1) == ((n) - 1) * sizeof(mt)) ? 1 : -1]
+
+/* AVX, 32-bit flavor: 8 upper-half YMM slots. */
+ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
+
+/* AVX, 64-bit flavor: 16 upper-half YMM slots. */
+ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
+
+/* AVX512, 32-bit flavor: opmask, YMM-high and ZMM-high ranges. */
+ASSERT_PACKED(x86_avx512_state32_t, fpu_k0,    fpu_k7,    8, _STRUCT_OPMASK_REG);
+ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
+ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
+
+/* AVX512, 64-bit flavor: opmask, YMM-high, ZMM-high and ZMM16-31 ranges. */
+ASSERT_PACKED(x86_avx512_state64_t, fpu_k0,    fpu_k7,     8,  _STRUCT_OPMASK_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31,  16, _STRUCT_ZMM_REG);
+
+#if defined(DEBUG_AVX512)
+
+/*
+ * Debug print: forwards to kprintf() with "DBG: " pasted in front of the
+ * caller's format string, which must therefore be a string literal.
+ */
+#define DBG(...) kprintf("DBG: " __VA_ARGS__)
+
+/*
+ * Raw byte views used when dumping register contents: 8, 16, 32 and 64
+ * bytes wide respectively.
+ */
+typedef struct {
+	uint8_t byte[8];
+} opmask_t;
+
+typedef struct {
+	uint8_t byte[16];
+} xmm_t;
+
+typedef struct {
+	uint8_t byte[32];
+} ymm_t;
+
+typedef struct {
+	uint8_t byte[64];
+} zmm_t;
+
+static void
+DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
+{
+ int i, j;
+ xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg;
+ xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
+ ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
+ zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM;
+ opmask_t *k = (opmask_t *) &sp->x_Opmask;
+
+ kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
+ kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
+ kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
+ kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
+
+ kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
+ kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
+
+ /* Print all ZMM registers */
+ for (i = 0; i < 16; i++) {
+ kprintf("zmm%d:\t0x", i);
+ for (j = 0; j < 16; j++) {
+ kprintf("%02x", xmm[i].byte[j]);
+ }
+ for (j = 0; j < 16; j++) {
+ kprintf("%02x", ymmh[i].byte[j]);
+ }
+ for (j = 0; j < 32; j++) {
+ kprintf("%02x", zmmh[i].byte[j]);
+ }
+ kprintf("\n");
+ }
+ for (i = 0; i < 16; i++) {
+ kprintf("zmm%d:\t0x", 16 + i);
+ for (j = 0; j < 64; j++) {
+ kprintf("%02x", zmm[i].byte[j]);
+ }
+ kprintf("\n");
+ }
+ for (i = 0; i < 8; i++) {
+ kprintf("k%d:\t0x", i);
+ for (j = 0; j < 8; j++) {
+ kprintf("%02x", k[i].byte[j]);
+ }
+ kprintf("\n");
+ }