]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/i386/fpu.c
xnu-7195.60.75.tar.gz
[apple/xnu.git] / osfmk / i386 / fpu.c
index 82ab4423f91e3a704bfb6c94f1a62f6099d997ba..e960ee4efb3a0f1c7fe8af694afd8044d2c26ffe 100644 (file)
@@ -1,5 +1,5 @@
 /*
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,9 @@ xstate_t        fpu_capability = UNDEFINED;     /* extended state capability */
 xstate_t        fpu_default = UNDEFINED;        /* default extended state */
 
 #define ALIGNED(addr, size)      (((uintptr_t)(addr)&((size)-1))==0)
 xstate_t        fpu_default = UNDEFINED;        /* default extended state */
 
 #define ALIGNED(addr, size)      (((uintptr_t)(addr)&((size)-1))==0)
+#define VERIFY_SAVEAREA_ALIGNED(p, a) \
+       assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
+           "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
 
 /* Forward */
 
 
 /* Forward */
 
@@ -145,37 +148,26 @@ fxsave64(struct x86_fx_thread_state *a)
        __asm__ __volatile__ ("fxsave64 %0" : "=m" (*a));
 }
 
        __asm__ __volatile__ ("fxsave64 %0" : "=m" (*a));
 }
 
-#if !defined(RC_HIDE_XNU_J137)
 #define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX || (x) == AVX512)
 #define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX || (x) == AVX512)
-#else
-#define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX)
-#endif
 
 
-zone_t          ifps_zone[] = {
+SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
        [FP]     = NULL,
        [AVX]    = NULL,
        [FP]     = NULL,
        [AVX]    = NULL,
-#if !defined(RC_HIDE_XNU_J137)
        [AVX512] = NULL
        [AVX512] = NULL
-#endif
 };
 };
-static uint32_t fp_state_size[] = {
+static const uint32_t fp_state_size[] = {
        [FP]     = sizeof(struct x86_fx_thread_state),
        [AVX]    = sizeof(struct x86_avx_thread_state),
        [FP]     = sizeof(struct x86_fx_thread_state),
        [AVX]    = sizeof(struct x86_avx_thread_state),
-#if !defined(RC_HIDE_XNU_J137)
        [AVX512] = sizeof(struct x86_avx512_thread_state)
        [AVX512] = sizeof(struct x86_avx512_thread_state)
-#endif
 };
 
 };
 
-static const char *xstate_name[] = {
+static const char *const xstate_name[] = {
        [UNDEFINED] = "UNDEFINED",
        [FP] = "FP",
        [AVX] = "AVX",
        [UNDEFINED] = "UNDEFINED",
        [FP] = "FP",
        [AVX] = "AVX",
-#if !defined(RC_HIDE_XNU_J137)
        [AVX512] = "AVX512"
        [AVX512] = "AVX512"
-#endif
 };
 
 };
 
-#if !defined(RC_HIDE_XNU_J137)
 #define fpu_ZMM_capable (fpu_capability == AVX512)
 #define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
 /*
 #define fpu_ZMM_capable (fpu_capability == AVX512)
 #define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
 /*
@@ -205,26 +197,16 @@ static const char *xstate_name[] = {
  * Note the initial state value is an AVX512 object but that the AVX initial
  * value is a subset of it.
  */
  * Note the initial state value is an AVX512 object but that the AVX initial
  * value is a subset of it.
  */
-#else
-#define fpu_YMM_capable (fpu_capability == AVX)
-#endif
 static uint32_t cpuid_reevaluated = 0;
 
 static void fpu_store_registers(void *, boolean_t);
 static void fpu_load_registers(void *);
 
 static uint32_t cpuid_reevaluated = 0;
 
 static void fpu_store_registers(void *, boolean_t);
 static void fpu_load_registers(void *);
 
-#if !defined(RC_HIDE_XNU_J137)
 static const uint32_t xstate_xmask[] = {
        [FP] =          FP_XMASK,
        [AVX] =         AVX_XMASK,
        [AVX512] =      AVX512_XMASK
 };
 static const uint32_t xstate_xmask[] = {
        [FP] =          FP_XMASK,
        [AVX] =         AVX_XMASK,
        [AVX512] =      AVX512_XMASK
 };
-#else
-static const uint32_t xstate_xmask[] = {
-       [FP] =          FP_XMASK,
-       [AVX] =         AVX_XMASK,
-};
-#endif
 
 static inline void
 xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
 
 static inline void
 xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
@@ -250,7 +232,6 @@ xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
        __asm__ __volatile__ ("xrstor64 %0" ::  "m" (*a), "a"(rfbm), "d"(0));
 }
 
        __asm__ __volatile__ ("xrstor64 %0" ::  "m" (*a), "a"(rfbm), "d"(0));
 }
 
-#if !defined(RC_HIDE_XNU_J137)
 __unused static inline void
 vzeroupper(void)
 {
 __unused static inline void
 vzeroupper(void)
 {
@@ -349,8 +330,6 @@ DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
 }
 #endif /* DEBUG_AVX512 */
 
 }
 #endif /* DEBUG_AVX512 */
 
-#endif
-
 #if     DEBUG
 static inline unsigned short
 fnstsw(void)
 #if     DEBUG
 static inline unsigned short
 fnstsw(void)
@@ -389,13 +368,11 @@ configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
        /* Clear vector register store */
        bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
        bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
        /* Clear vector register store */
        bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
        bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
-#if !defined(RC_HIDE_XNU_J137)
        if (fpu_ZMM_capable) {
                bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
                bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
                bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
        }
        if (fpu_ZMM_capable) {
                bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
                bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
                bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
        }
-#endif
 
        fps->fx.fp_valid = TRUE;
        fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
 
        fps->fx.fp_valid = TRUE;
        fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
@@ -458,7 +435,6 @@ init_fpu(void)
 
        PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
 
 
        PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
 
-#if !defined(RC_HIDE_XNU_J137)
        static boolean_t is_avx512_enabled = TRUE;
        if (cpu_number() == master_cpu) {
                if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
        static boolean_t is_avx512_enabled = TRUE;
        if (cpu_number() == master_cpu) {
                if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
@@ -467,14 +443,12 @@ init_fpu(void)
                            is_avx512_enabled ? "and enabled" : "but disabled");
                }
        }
                            is_avx512_enabled ? "and enabled" : "but disabled");
                }
        }
-#endif
 
        /* Configure the XSAVE context mechanism if the processor supports
         * AVX/YMM registers
         */
        if (cpuid_features() & CPUID_FEATURE_XSAVE) {
                cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
 
        /* Configure the XSAVE context mechanism if the processor supports
         * AVX/YMM registers
         */
        if (cpuid_features() & CPUID_FEATURE_XSAVE) {
                cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
-#if !defined(RC_HIDE_XNU_J137)
                if (is_avx512_enabled &&
                    (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) {
                        assert(xs0p->extended_state[eax] & XFEM_SSE);
                if (is_avx512_enabled &&
                    (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) {
                        assert(xs0p->extended_state[eax] & XFEM_SSE);
@@ -495,9 +469,7 @@ init_fpu(void)
                         */
                        xsetbv(0, AVX_XMASK);
                        fpu_default = AVX;
                         */
                        xsetbv(0, AVX_XMASK);
                        fpu_default = AVX;
-               } else
-#endif
-               if (xs0p->extended_state[eax] & XFEM_YMM) {
+               } else if (xs0p->extended_state[eax] & XFEM_YMM) {
                        assert(xs0p->extended_state[eax] & XFEM_SSE);
                        fpu_capability = AVX;
                        fpu_default = AVX;
                        assert(xs0p->extended_state[eax] & XFEM_SSE);
                        fpu_capability = AVX;
                        fpu_default = AVX;
@@ -535,22 +507,8 @@ init_fpu(void)
 static void *
 fp_state_alloc(xstate_t xs)
 {
 static void *
 fp_state_alloc(xstate_t xs)
 {
-       struct x86_fx_thread_state *ifps;
-
        assert(ifps_zone[xs] != NULL);
        assert(ifps_zone[xs] != NULL);
-       ifps = zalloc(ifps_zone[xs]);
-
-#if     DEBUG
-       if (!(ALIGNED(ifps, 64))) {
-               panic("fp_state_alloc: %p, %u, %p, %u",
-                   ifps, (unsigned) ifps_zone[xs]->elem_size,
-                   (void *) ifps_zone[xs]->free_elements,
-                   (unsigned) ifps_zone[xs]->alloc_size);
-       }
-#endif
-       bzero(ifps, fp_state_size[xs]);
-
-       return ifps;
+       return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
 }
 
 static inline void
 }
 
 static inline void
@@ -566,6 +524,19 @@ clear_fpu(void)
        set_ts();
 }
 
        set_ts();
 }
 
+static boolean_t
+fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
+{
+       VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
+       assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
+
+       for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
+               if (ptr[count] != 0) {
+                       return FALSE;
+               }
+       }
+       return TRUE;
+}
 
 static void
 fpu_load_registers(void *fstate)
 
 static void
 fpu_load_registers(void *fstate)
@@ -636,9 +607,7 @@ fpu_store_registers(void *fstate, boolean_t is64)
                }
                break;
        case AVX:
                }
                break;
        case AVX:
-#if !defined(RC_HIDE_XNU_J137)
        case AVX512:
        case AVX512:
-#endif
                if (is64) {
                        xsave64(ifps, xstate_xmask[xs]);
                        ifps->fp_save_layout = XSAVE64;
                if (is64) {
                        xsave64(ifps, xstate_xmask[xs]);
                        ifps->fp_save_layout = XSAVE64;
@@ -664,34 +633,20 @@ fpu_module_init(void)
                    fpu_default);
        }
 
                    fpu_default);
        }
 
-       /* We explicitly choose an allocation size of 13 pages = 64 * 832
-        * to eliminate waste for the 832 byte sized
-        * AVX XSAVE register save area.
-        */
-       ifps_zone[fpu_default] = zinit(fp_state_size[fpu_default],
-           thread_max * fp_state_size[fpu_default],
-           64 * fp_state_size[fpu_default],
-           "x86 fpsave state");
-
        /* To maintain the required alignment, disable
         * zone debugging for this zone as that appends
         * 16 bytes to each element.
         */
        /* To maintain the required alignment, disable
         * zone debugging for this zone as that appends
         * 16 bytes to each element.
         */
-       zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE);
+       ifps_zone[fpu_default] = zone_create("x86 fpsave state",
+           fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
 
 
-#if !defined(RC_HIDE_XNU_J137)
        /*
         * If AVX512 is supported, create a separate savearea zone.
        /*
         * If AVX512 is supported, create a separate savearea zone.
-        * with allocation size: 19 pages = 32 * 2668
         */
        if (fpu_capability == AVX512) {
         */
        if (fpu_capability == AVX512) {
-               ifps_zone[AVX512] = zinit(fp_state_size[AVX512],
-                   thread_max * fp_state_size[AVX512],
-                   32 * fp_state_size[AVX512],
-                   "x86 avx512 save state");
-               zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE);
+               ifps_zone[AVX512] = zone_create("x86 avx512 save state",
+                   fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
        }
        }
-#endif
 
        /* Determine MXCSR reserved bits and configure initial FPU state*/
        configure_mxcsr_capability_mask(&initial_fp_state);
 
        /* Determine MXCSR reserved bits and configure initial FPU state*/
        configure_mxcsr_capability_mask(&initial_fp_state);
@@ -765,13 +720,19 @@ fpu_free(thread_t thread, void *fps)
 }
 
 /*
 }
 
 /*
- * Set the floating-point state for a thread based
- * on the FXSave formatted data. This is basically
- * the same as fpu_set_state except it uses the
- * expanded data structure.
- * If the thread is not the current thread, it is
- * not running (held).  Locking needed against
- * concurrent fpu_set_state or fpu_get_state.
+ * Set the floating-point state for a thread based on the FXSave formatted data.
+ * This is basically the same as fpu_set_state except it uses the expanded data
+ * structure.
+ * If the thread is not the current thread, it is not running (held).  Locking
+ * needed against concurrent fpu_set_state or fpu_get_state.
+ *
+ * While translating between XNU FP state structures and the CPU-native XSAVE area,
+ * if we detect state components that are all zeroes, we clear the corresponding
+ * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
+ * be initialized to a "clean" state.  That's most important when clearing the YMM
+ * bit, since an initialized "upper clean" state results in a massive performance
+ * improvement due to elimination of false dependencies between the XMMs and the
+ * upper bits of the YMMs.
  */
 kern_return_t
 fpu_set_fxstate(
  */
 kern_return_t
 fpu_set_fxstate(
@@ -784,6 +745,7 @@ fpu_set_fxstate(
        x86_float_state64_t             *state;
        pcb_t                           pcb;
        boolean_t                       old_valid, fresh_state = FALSE;
        x86_float_state64_t             *state;
        pcb_t                           pcb;
        boolean_t                       old_valid, fresh_state = FALSE;
+       xstate_t                        thr_xstate;
 
        if (fpu_capability == UNDEFINED) {
                return KERN_FAILURE;
 
        if (fpu_capability == UNDEFINED) {
                return KERN_FAILURE;
@@ -794,18 +756,22 @@ fpu_set_fxstate(
                return KERN_FAILURE;
        }
 
                return KERN_FAILURE;
        }
 
-#if !defined(RC_HIDE_XNU_J137)
+       assert(thr_act != THREAD_NULL);
+
+       thr_xstate = thread_xstate(thr_act);
+
        if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
        if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
-           thread_xstate(thr_act) == AVX) {
+           thr_xstate == AVX) {
                if (!fpu_thread_promote_avx512(thr_act)) {
                        return KERN_FAILURE;
                if (!fpu_thread_promote_avx512(thr_act)) {
                        return KERN_FAILURE;
+               } else {
+                       /* Reload thr_xstate after successful promotion */
+                       thr_xstate = thread_xstate(thr_act);
                }
        }
                }
        }
-#endif
 
        state = (x86_float_state64_t *)tstate;
 
 
        state = (x86_float_state64_t *)tstate;
 
-       assert(thr_act != THREAD_NULL);
        pcb = THREAD_TO_PCB(thr_act);
 
        if (state == NULL) {
        pcb = THREAD_TO_PCB(thr_act);
 
        if (state == NULL) {
@@ -821,7 +787,7 @@ fpu_set_fxstate(
                simple_unlock(&pcb->lock);
 
                if (ifps != 0) {
                simple_unlock(&pcb->lock);
 
                if (ifps != 0) {
-                       fp_state_free(ifps, thread_xstate(thr_act));
+                       fp_state_free(ifps, thr_xstate);
                }
        } else {
                /*
                }
        } else {
                /*
@@ -835,13 +801,13 @@ Retry:
                if (ifps == 0) {
                        if (new_ifps == 0) {
                                simple_unlock(&pcb->lock);
                if (ifps == 0) {
                        if (new_ifps == 0) {
                                simple_unlock(&pcb->lock);
-                               new_ifps = fp_state_alloc(thread_xstate(thr_act));
+                               new_ifps = fp_state_alloc(thr_xstate);
                                goto Retry;
                        }
                        ifps = new_ifps;
                        new_ifps = 0;
                        pcb->ifps = ifps;
                                goto Retry;
                        }
                        ifps = new_ifps;
                        new_ifps = 0;
                        pcb->ifps = ifps;
-                       pcb->xstate = thread_xstate(thr_act);
+                       pcb->xstate = thr_xstate;
                        fresh_state = TRUE;
                }
 
                        fresh_state = TRUE;
                }
 
@@ -865,12 +831,12 @@ Retry:
 
                __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
 
 
                __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
 
-               switch (thread_xstate(thr_act)) {
+               switch (thr_xstate) {
                case UNDEFINED_FULL:
                case FP_FULL:
                case AVX_FULL:
                case AVX512_FULL:
                case UNDEFINED_FULL:
                case FP_FULL:
                case AVX_FULL:
                case AVX512_FULL:
-                       panic("fpu_set_fxstate() INVALID xstate: 0x%x", thread_xstate(thr_act));
+                       panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
                        break;
 
                case UNDEFINED:
                        break;
 
                case UNDEFINED:
@@ -890,16 +856,25 @@ Retry:
                        iavx->_xh.xstate_bv = AVX_XMASK;
                        iavx->_xh.xcomp_bv  = 0;
 
                        iavx->_xh.xstate_bv = AVX_XMASK;
                        iavx->_xh.xcomp_bv  = 0;
 
+                       /*
+                        * See the block comment at the top of the function for a description of why we're clearing
+                        * xstate_bv bits.
+                        */
                        if (f == x86_AVX_STATE32) {
                                __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
                        if (f == x86_AVX_STATE32) {
                                __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_YMM;
+                               }
                        } else if (f == x86_AVX_STATE64) {
                                __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
                        } else if (f == x86_AVX_STATE64) {
                                __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_YMM;
+                               }
                        } else {
                                iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
                        }
                        break;
                }
                        } else {
                                iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
                        }
                        break;
                }
-#if !defined(RC_HIDE_XNU_J137)
                case AVX512: {
                        struct x86_avx512_thread_state *iavx = (void *) ifps;
                        union {
                case AVX512: {
                        struct x86_avx512_thread_state *iavx = (void *) ifps;
                        union {
@@ -915,30 +890,59 @@ Retry:
                        iavx->_xh.xstate_bv = AVX512_XMASK;
                        iavx->_xh.xcomp_bv  = 0;
 
                        iavx->_xh.xstate_bv = AVX512_XMASK;
                        iavx->_xh.xcomp_bv  = 0;
 
+                       /*
+                        * See the block comment at the top of the function for a description of why we're clearing
+                        * xstate_bv bits.
+                        */
                        switch (f) {
                        case x86_AVX512_STATE32:
                                __nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
                                __nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
                        switch (f) {
                        case x86_AVX512_STATE32:
                                __nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
                                __nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_ZMM;
+                               }
                                __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
                                __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_YMM;
+                               }
+
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE32:
                                __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE32:
                                __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_YMM;
+                               }
                                break;
                        case x86_AVX512_STATE64:
                                __nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
                                __nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
                                __nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
                                break;
                        case x86_AVX512_STATE64:
                                __nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
                                __nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
                                __nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
+                               /*
+                                * Note that it is valid to have XFEM_ZMM set but XFEM_YMM cleared.  In that case,
+                                * the upper bits of the YMMs would be cleared and would result in a clean-upper
+                                * state, allowing SSE instruction to avoid false dependencies.
+                                */
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
+                                   fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_ZMM;
+                               }
+
                                __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
                                __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_YMM;
+                               }
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE64:
                                __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE64:
                                __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+                               if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+                                       iavx->_xh.xstate_bv &= ~XFEM_YMM;
+                               }
                                break;
                        }
                        break;
                }
                                break;
                        }
                        break;
                }
-#endif
                }
 
                ifps->fp_valid = old_valid;
                }
 
                ifps->fp_valid = old_valid;
@@ -957,7 +961,7 @@ Retry:
                simple_unlock(&pcb->lock);
 
                if (new_ifps != 0) {
                simple_unlock(&pcb->lock);
 
                if (new_ifps != 0) {
-                       fp_state_free(new_ifps, thread_xstate(thr_act));
+                       fp_state_free(new_ifps, thr_xstate);
                }
        }
        return KERN_SUCCESS;
                }
        }
        return KERN_SUCCESS;
@@ -979,6 +983,7 @@ fpu_get_fxstate(
        x86_float_state64_t             *state;
        kern_return_t                   ret = KERN_FAILURE;
        pcb_t                           pcb;
        x86_float_state64_t             *state;
        kern_return_t                   ret = KERN_FAILURE;
        pcb_t                           pcb;
+       xstate_t                        thr_xstate = thread_xstate(thr_act);
 
        if (fpu_capability == UNDEFINED) {
                return KERN_FAILURE;
 
        if (fpu_capability == UNDEFINED) {
                return KERN_FAILURE;
@@ -989,12 +994,10 @@ fpu_get_fxstate(
                return KERN_FAILURE;
        }
 
                return KERN_FAILURE;
        }
 
-#if !defined(RC_HIDE_XNU_J137)
        if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
        if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
-           thread_xstate(thr_act) != AVX512) {
+           thr_xstate != AVX512) {
                return KERN_FAILURE;
        }
                return KERN_FAILURE;
        }
-#endif
 
        state = (x86_float_state64_t *)tstate;
 
 
        state = (x86_float_state64_t *)tstate;
 
@@ -1033,12 +1036,12 @@ fpu_get_fxstate(
        }
        if (ifps->fp_valid) {
                __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
        }
        if (ifps->fp_valid) {
                __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
-               switch (thread_xstate(thr_act)) {
+               switch (thr_xstate) {
                case UNDEFINED_FULL:
                case FP_FULL:
                case AVX_FULL:
                case AVX512_FULL:
                case UNDEFINED_FULL:
                case FP_FULL:
                case AVX_FULL:
                case AVX512_FULL:
-                       panic("fpu_get_fxstate() INVALID xstate: 0x%x", thread_xstate(thr_act));
+                       panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
                        break;
 
                case UNDEFINED:
                        break;
 
                case UNDEFINED:
@@ -1056,7 +1059,6 @@ fpu_get_fxstate(
                        }
                        break;
                }
                        }
                        break;
                }
-#if !defined(RC_HIDE_XNU_J137)
                case AVX512: {
                        struct x86_avx512_thread_state *iavx = (void *) ifps;
                        union {
                case AVX512: {
                        struct x86_avx512_thread_state *iavx = (void *) ifps;
                        union {
@@ -1087,7 +1089,6 @@ fpu_get_fxstate(
                        }
                        break;
                }
                        }
                        break;
                }
-#endif
                }
 
                ret = KERN_SUCCESS;
                }
 
                ret = KERN_SUCCESS;
@@ -1295,7 +1296,7 @@ fpextovrflt(void)
        intr = ml_set_interrupts_enabled(FALSE);
 
        if (get_interrupt_level()) {
        intr = ml_set_interrupts_enabled(FALSE);
 
        if (get_interrupt_level()) {
-               panic("FPU segment overrun exception  at interrupt context\n");
+               panic("FPU segment overrun exception at interrupt context\n");
        }
        if (current_task() == kernel_task) {
                panic("FPU segment overrun exception in kernel thread context\n");
        }
        if (current_task() == kernel_task) {
                panic("FPU segment overrun exception in kernel thread context\n");
@@ -1327,12 +1328,6 @@ fpextovrflt(void)
        if (ifps) {
                fp_state_free(ifps, xstate);
        }
        if (ifps) {
                fp_state_free(ifps, xstate);
        }
-
-       /*
-        * Raise exception.
-        */
-       i386_exception(EXC_BAD_ACCESS, VM_PROT_READ | VM_PROT_EXECUTE, 0);
-       /*NOTREACHED*/
 }
 
 extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
 }
 
 extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
@@ -1369,16 +1364,6 @@ fpexterrflt(void)
        const uint32_t xcpt = ~mask & (ifps->fx_status &
            (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
        fpxlog(EXC_I386_EXTERR, ifps->fx_status, ifps->fx_control, xcpt);
        const uint32_t xcpt = ~mask & (ifps->fx_status &
            (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
        fpxlog(EXC_I386_EXTERR, ifps->fx_status, ifps->fx_control, xcpt);
-       /*
-        * Raise FPU exception.
-        * Locking not needed on pcb->ifps,
-        * since thread is running.
-        */
-       i386_exception(EXC_ARITHMETIC,
-           EXC_I386_EXTERR,
-           ifps->fx_status);
-
-       /*NOTREACHED*/
 }
 
 /*
 }
 
 /*
@@ -1473,20 +1458,15 @@ fpSSEexterrflt(void)
        const uint32_t xcpt = ~mask & (ifps->fx_MXCSR &
            (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
        fpxlog(EXC_I386_SSEEXTERR, ifps->fx_MXCSR, ifps->fx_MXCSR, xcpt);
        const uint32_t xcpt = ~mask & (ifps->fx_MXCSR &
            (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
        fpxlog(EXC_I386_SSEEXTERR, ifps->fx_MXCSR, ifps->fx_MXCSR, xcpt);
-
-       i386_exception(EXC_ARITHMETIC,
-           EXC_I386_SSEEXTERR,
-           ifps->fx_MXCSR);
-       /*NOTREACHED*/
 }
 
 
 }
 
 
-#if !defined(RC_HIDE_XNU_J137)
 /*
  * If a thread is using an AVX-sized savearea:
  * - allocate a new AVX512-sized  area,
  * - copy the 256-bit state into the 512-bit area,
  * - deallocate the smaller area
 /*
  * If a thread is using an AVX-sized savearea:
  * - allocate a new AVX512-sized  area,
  * - copy the 256-bit state into the 512-bit area,
  * - deallocate the smaller area
+ * ASSUMES: thread is the current thread.
  */
 static void
 fpu_savearea_promote_avx512(thread_t thread)
  */
 static void
 fpu_savearea_promote_avx512(thread_t thread)
@@ -1495,8 +1475,11 @@ fpu_savearea_promote_avx512(thread_t thread)
        struct x86_avx512_thread_state  *ifps512 = NULL;
        pcb_t                           pcb = THREAD_TO_PCB(thread);
        boolean_t                       do_avx512_alloc = FALSE;
        struct x86_avx512_thread_state  *ifps512 = NULL;
        pcb_t                           pcb = THREAD_TO_PCB(thread);
        boolean_t                       do_avx512_alloc = FALSE;
+       boolean_t                       intr;
+
+       assert(thread == current_thread());
 
 
-       DBG("fpu_upgrade_savearea(%p)\n", thread);
+       DBG("fpu_savearea_promote_avx512(%p)\n", thread);
 
        simple_lock(&pcb->lock, LCK_GRP_NULL);
 
 
        simple_lock(&pcb->lock, LCK_GRP_NULL);
 
@@ -1504,11 +1487,21 @@ fpu_savearea_promote_avx512(thread_t thread)
        if (ifps == NULL) {
                pcb->xstate = AVX512;
                simple_unlock(&pcb->lock);
        if (ifps == NULL) {
                pcb->xstate = AVX512;
                simple_unlock(&pcb->lock);
-               if (thread != current_thread()) {
-                       /* nothing to be done */
+               /*
+                * Now that the PCB xstate has been promoted, set XCR0 so
+                * that we don't re-trip #UD on the next AVX-512 instruction.
+                *
+                * Since this branch is taken when the first FP instruction
+                * attempted by this thread is an AVX-512 instruction, we
+                * call fpnoextflt() to allocate an appropriately-sized
+                * AVX-512 save-area, thereby avoiding the overhead of another
+                * fault that would be triggered immediately on return.
+                */
+               intr = ml_set_interrupts_enabled(FALSE);
+               xsetbv(0, AVX512_XMASK);
+               current_cpu_datap()->cpu_xstate = AVX512;
+               (void)ml_set_interrupts_enabled(intr);
 
 
-                       return;
-               }
                fpnoextflt();
                return;
        }
                fpnoextflt();
                return;
        }
@@ -1516,6 +1509,7 @@ fpu_savearea_promote_avx512(thread_t thread)
        if (pcb->xstate != AVX512) {
                do_avx512_alloc = TRUE;
        }
        if (pcb->xstate != AVX512) {
                do_avx512_alloc = TRUE;
        }
+
        simple_unlock(&pcb->lock);
 
        if (do_avx512_alloc == TRUE) {
        simple_unlock(&pcb->lock);
 
        if (do_avx512_alloc == TRUE) {
@@ -1523,19 +1517,17 @@ fpu_savearea_promote_avx512(thread_t thread)
        }
 
        simple_lock(&pcb->lock, LCK_GRP_NULL);
        }
 
        simple_lock(&pcb->lock, LCK_GRP_NULL);
-       if (thread == current_thread()) {
-               boolean_t       intr;
 
 
-               intr = ml_set_interrupts_enabled(FALSE);
+       intr = ml_set_interrupts_enabled(FALSE);
 
 
-               clear_ts();
-               fp_save(thread);
-               clear_fpu();
+       clear_ts();
+       fp_save(thread);
+       clear_fpu();
+
+       xsetbv(0, AVX512_XMASK);
+       current_cpu_datap()->cpu_xstate = AVX512;
+       (void)ml_set_interrupts_enabled(intr);
 
 
-               xsetbv(0, AVX512_XMASK);
-               current_cpu_datap()->cpu_xstate = AVX512;
-               (void)ml_set_interrupts_enabled(intr);
-       }
        assert(ifps->fp.fp_valid);
 
        /* Allocate an AVX512 savearea and copy AVX state into it */
        assert(ifps->fp.fp_valid);
 
        /* Allocate an AVX512 savearea and copy AVX state into it */
@@ -1589,11 +1581,12 @@ fpu_thread_promote_avx512(thread_t thread)
  * If the user is attempting an AVX512 instruction on a machine
  * that supports this, we switch the calling thread to use
  * a larger savearea, set its XCR0 bit mask to enable AVX512 and
  * If the user is attempting an AVX512 instruction on a machine
  * that supports this, we switch the calling thread to use
  * a larger savearea, set its XCR0 bit mask to enable AVX512 and
- * return directly via thread_exception_return().
- * Otherwise simply return.
+ * return to user_trap() with a 0 return value.
+ * Otherwise, simply return a nonzero value.
  */
  */
-#define MAX_X86_INSN_LENGTH (16)
-void
+
+#define MAX_X86_INSN_LENGTH (15)
+int
 fpUDflt(user_addr_t rip)
 {
        uint8_t         instruction_prefix;
 fpUDflt(user_addr_t rip)
 {
        uint8_t         instruction_prefix;
@@ -1605,7 +1598,7 @@ fpUDflt(user_addr_t rip)
                 * rather than issue multiple copyins
                 */
                if (copyin(rip, (char *) &instruction_prefix, 1)) {
                 * rather than issue multiple copyins
                 */
                if (copyin(rip, (char *) &instruction_prefix, 1)) {
-                       return;
+                       return 1;
                }
                DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
                    rip, instruction_prefix);
                }
                DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
                    rip, instruction_prefix);
@@ -1624,7 +1617,7 @@ fpUDflt(user_addr_t rip)
                        /* Skip optional prefixes */
                        rip++;
                        if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
                        /* Skip optional prefixes */
                        rip++;
                        if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
-                               return;
+                               return 1;
                        }
                        break;
                case 0x62:      /* EVEX */
                        }
                        break;
                case 0x62:      /* EVEX */
@@ -1633,7 +1626,7 @@ fpUDflt(user_addr_t rip)
                        is_AVX512_instruction = TRUE;
                        break;
                default:
                        is_AVX512_instruction = TRUE;
                        break;
                default:
-                       return;
+                       return 1;
                }
        } while (!is_AVX512_instruction);
 
                }
        } while (!is_AVX512_instruction);
 
@@ -1643,7 +1636,7 @@ fpUDflt(user_addr_t rip)
         * Fail if this machine doesn't support AVX512
         */
        if (fpu_capability != AVX512) {
         * Fail if this machine doesn't support AVX512
         */
        if (fpu_capability != AVX512) {
-               return;
+               return 1;
        }
 
        assert(xgetbv(XCR0) == AVX_XMASK);
        }
 
        assert(xgetbv(XCR0) == AVX_XMASK);
@@ -1651,10 +1644,8 @@ fpUDflt(user_addr_t rip)
        DBG("fpUDflt() switching xstate to AVX512\n");
        (void) fpu_thread_promote_avx512(current_thread());
 
        DBG("fpUDflt() switching xstate to AVX512\n");
        (void) fpu_thread_promote_avx512(current_thread());
 
-       thread_exception_return();
-       /* NOT REACHED */
+       return 0;
 }
 }
-#endif /* !defined(RC_HIDE_XNU_J137) */
 
 void
 fp_setvalid(boolean_t value)
 
 void
 fp_setvalid(boolean_t value)
@@ -1679,13 +1670,11 @@ ml_fpu_avx_enabled(void)
        return fpu_capability >= AVX;
 }
 
        return fpu_capability >= AVX;
 }
 
-#if !defined(RC_HIDE_XNU_J137)
 boolean_t
 ml_fpu_avx512_enabled(void)
 {
        return fpu_capability == AVX512;
 }
 boolean_t
 ml_fpu_avx512_enabled(void)
 {
        return fpu_capability == AVX512;
 }
-#endif
 
 static xstate_t
 task_xstate(task_t task)
 
 static xstate_t
 task_xstate(task_t task)