+#if !defined(RC_HIDE_XNU_J137)
+static const uint32_t xstate_xmask[] = {
+ [FP] = FP_XMASK,
+ [AVX] = AVX_XMASK,
+ [AVX512] = AVX512_XMASK
+};
+#else
+static const uint32_t xstate_xmask[] = {
+ [FP] = FP_XMASK,
+ [AVX] = AVX_XMASK,
+};
+#endif
+
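+/*
+ * Thin wrappers around the XSAVE/XRSTOR instruction family.  The
+ * requested-feature bitmap (RFBM) is supplied in EDX:EAX; EDX is forced
+ * to zero, so only the low 32 state-component bits, taken from
+ * xstate_xmask[], are ever requested.  The save area must be 64-byte
+ * aligned.
+ */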
+static inline void
+xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+ __asm__ __volatile__ ("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0));
+}
+
+static inline void
+xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+ __asm__ __volatile__ ("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0));
+}
+
+static inline void
+xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+ __asm__ __volatile__ ("xrstor %0" :: "m" (*a), "a"(rfbm), "d"(0));
+}
+
+static inline void
+xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
+{
+ __asm__ __volatile__ ("xrstor64 %0" :: "m" (*a), "a"(rfbm), "d"(0));
+}
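+
+/*
+ * Illustrative usage sketch only (the identifier 'ifps' is assumed here,
+ * not defined in this hunk): a save path hands the thread's 64-byte
+ * aligned extended save area to the wrapper together with the component
+ * mask for its xstate level, e.g.
+ *
+ *	xsave64((struct x86_fx_thread_state *)ifps, xstate_xmask[AVX]);
+ */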
+
+#if !defined(RC_HIDE_XNU_J137)
+__unused static inline void
+vzeroupper(void)
+{
+ __asm__ __volatile__ ("vzeroupper" ::);
+}
+
+static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */
+
+/*
+ * Define a wrapper for bcopy to defeat destination size checks.
+ * This is needed to treat repeated objects such as
+ * _STRUCT_XMM_REG fpu_ymmh0;
+ * ...
+ * _STRUCT_XMM_REG fpu_ymmh7;
+ * as an array and to copy like so:
+ *	bcopy_nochk(src,&dst->fpu_ymmh0,8*sizeof(_STRUCT_XMM_REG));
+ * without the compiler throwing a __builtin___memmove_chk error.
+ */
+static inline void
+bcopy_nochk(void *_src, void *_dst, size_t _len)
+{
+ bcopy(_src, _dst, _len);
+}
+
+/*
+ * Furthermore, make compile-time asserts that no padding creeps into structures
+ * for which we're doing this.
+ */
+#define ASSERT_PACKED(t, m1, m2, n, mt) \
+extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \
+ [(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]
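+
+/*
+ * The macro declares an extern char array whose size is 1 when members
+ * m1..m2 of type t occupy exactly n contiguous elements of type mt, and
+ * -1 (a compile-time error) otherwise.
+ */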
+
+ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
+
+ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
+
+ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
+ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
+ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
+
+ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
+ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
+
+#if defined(DEBUG_AVX512)
+
+#define DBG(x...) kprintf("DBG: " x)
+
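+/* Fixed-size register images used to hex-dump the save area byte by byte */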
+typedef struct { uint8_t byte[8]; } opmask_t;
+typedef struct { uint8_t byte[16]; } xmm_t;
+typedef struct { uint8_t byte[32]; } ymm_t;
+typedef struct { uint8_t byte[64]; } zmm_t;
+
+static void
+DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
+{
+ int i, j;
+ xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg;
+ xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
+ ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
+ zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM;
+ opmask_t *k = (opmask_t *) &sp->x_Opmask;
+
+ kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
+ kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
+ kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
+ kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
+
+ kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
+ kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
+
+	/* zmm0-15, assembled from their XMM, YMM_Hi128 and ZMM_Hi256 components */
+ for (i = 0; i < 16; i++) {
+ kprintf("zmm%d:\t0x", i);
+ for (j = 0; j < 16; j++) {
+ kprintf("%02x", xmm[i].byte[j]);
+ }
+ for (j = 0; j < 16; j++) {
+ kprintf("%02x", ymmh[i].byte[j]);
+ }
+ for (j = 0; j < 32; j++) {
+ kprintf("%02x", zmmh[i].byte[j]);
+ }
+ kprintf("\n");
+ }
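+	/* zmm16-31 are stored whole in the Hi16_ZMM component */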
+ for (i = 0; i < 16; i++) {
+ kprintf("zmm%d:\t0x", 16 + i);
+ for (j = 0; j < 64; j++) {
+ kprintf("%02x", zmm[i].byte[j]);
+ }
+ kprintf("\n");
+ }
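+	/* Opmask registers k0-k7 */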
+ for (i = 0; i < 8; i++) {
+ kprintf("k%d:\t0x", i);
+ for (j = 0; j < 8; j++) {
+ kprintf("%02x", k[i].byte[j]);
+ }
+ kprintf("\n");
+ }
+
+ kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
+ kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
+}
+#else
+#define DBG(x...)
+static void
+DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
+{
+ return;
+}
+#endif /* DEBUG_AVX512 */
+
+#endif
+
+#if DEBUG
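+/* Read the x87 FPU status word (DEBUG builds only) */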
+static inline unsigned short
+fnstsw(void)
+{
+ unsigned short status;
+ __asm__ volatile ("fnstsw %0" : "=ma" (status));
+ return status;
+}
+#endif
+
+/*
+ * Configure the initial FPU state presented to new threads.
+ * Determine the MXCSR capability mask, which allows us to mask off any
+ * potentially unsafe "reserved" bits before restoring the FPU context.
+ * *Not* per-cpu, assumes symmetry.
+ */
+
+static void
+configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
+{
+	/* XSAVE requires a 64-byte-aligned store */
+ assert(ALIGNED(fps, 64));
+ /* Clear, to prepare for the diagnostic FXSAVE */
+ bzero(fps, sizeof(*fps));
+
+ fpinit();
+ fpu_store_registers(fps, FALSE);
+
+ mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;
+
+ /* Set default mask value if necessary */
+ if (mxcsr_capability_mask == 0) {
+ mxcsr_capability_mask = 0xffbf;
+ }
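+	/*
+	 * Sketch of the mask's intended use (the call site is not in this
+	 * hunk): a user-supplied MXCSR is sanitized before being restored,
+	 * roughly:  fx_MXCSR &= mxcsr_capability_mask;
+	 */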
+
+ /* Clear vector register store */
+ bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
+ bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
+#if !defined(RC_HIDE_XNU_J137)
+ if (fpu_ZMM_capable) {
+ bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
+ bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
+ bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
+ }
+#endif
+
+ fps->fx.fp_valid = TRUE;
+ fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
+ fpu_load_registers(fps);
+
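+	/*
+	 * Capture the just-initialized register state as the default save
+	 * images (FX, AVX and, when supported, AVX512) used to seed new
+	 * threads.
+	 */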
+ if (fpu_ZMM_capable) {
+ xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
+ }
+ if (fpu_YMM_capable) {
+ xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
+ } else {
+ fxsave64((struct x86_fx_thread_state *)&default_fx_state);
+ }