X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/ea3f04195ba4a5034c9c8e9b726d4f7ce96f1832..2a1bd2d3eef5c7a7bb14f4bb9fdbca9a96ee4752:/osfmk/i386/fpu.c?ds=sidebyside

diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c
index 1d7475429..e960ee4ef 100644
--- a/osfmk/i386/fpu.c
+++ b/osfmk/i386/fpu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,9 @@ xstate_t fpu_capability = UNDEFINED;	/* extended state capability */
 xstate_t fpu_default = UNDEFINED;	/* default extended state */
 
 #define ALIGNED(addr, size)	(((uintptr_t)(addr)&((size)-1))==0)
+#define VERIFY_SAVEAREA_ALIGNED(p, a) \
+	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
+	    "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
 
 /* Forward */
 
@@ -147,18 +150,18 @@ fxsave64(struct x86_fx_thread_state *a)
 
 #define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX || (x) == AVX512)
 
-zone_t ifps_zone[] = {
+SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
 	[FP] = NULL,
 	[AVX] = NULL,
 	[AVX512] = NULL
 };
-static uint32_t fp_state_size[] = {
+static const uint32_t fp_state_size[] = {
 	[FP] = sizeof(struct x86_fx_thread_state),
 	[AVX] = sizeof(struct x86_avx_thread_state),
 	[AVX512] = sizeof(struct x86_avx512_thread_state)
 };
 
-static const char *xstate_name[] = {
+static const char *const xstate_name[] = {
 	[UNDEFINED] = "UNDEFINED",
 	[FP] = "FP",
 	[AVX] = "AVX",
@@ -504,22 +507,8 @@ init_fpu(void)
 static void *
 fp_state_alloc(xstate_t xs)
 {
-	struct x86_fx_thread_state *ifps;
-
 	assert(ifps_zone[xs] != NULL);
-	ifps = zalloc(ifps_zone[xs]);
-
-#if	DEBUG
-	if (!(ALIGNED(ifps, 64))) {
-		panic("fp_state_alloc: %p, %u, %p, %u",
-		    ifps, (unsigned) ifps_zone[xs]->elem_size,
-		    (void *) ifps_zone[xs]->free_elements,
-		    (unsigned) ifps_zone[xs]->alloc_size);
-	}
-#endif
-	bzero(ifps, fp_state_size[xs]);
-
-	return ifps;
+	return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
 }
 
 static inline void
@@ -535,6 +524,19 @@ clear_fpu(void)
 	set_ts();
 }
 
+static boolean_t
+fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
+{
+	VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
+	assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
+
+	for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
+		if (ptr[count] != 0) {
+			return FALSE;
+		}
+	}
+	return TRUE;
+}
 
 static void
 fpu_load_registers(void *fstate)
@@ -631,31 +633,19 @@ fpu_module_init(void)
 		    fpu_default);
 	}
 
-	/* We explicitly choose an allocation size of 13 pages = 64 * 832
-	 * to eliminate waste for the 832 byte sized
-	 * AVX XSAVE register save area.
-	 */
-	ifps_zone[fpu_default] = zinit(fp_state_size[fpu_default],
-	    thread_max * fp_state_size[fpu_default],
-	    64 * fp_state_size[fpu_default],
-	    "x86 fpsave state");
-
 	/* To maintain the required alignment, disable
 	 * zone debugging for this zone as that appends
 	 * 16 bytes to each element.
 	 */
-	zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE);
+	ifps_zone[fpu_default] = zone_create("x86 fpsave state",
+	    fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
 
 	/*
 	 * If AVX512 is supported, create a separate savearea zone.
-	 * with allocation size: 19 pages = 32 * 2668
 	 */
 	if (fpu_capability == AVX512) {
-		ifps_zone[AVX512] = zinit(fp_state_size[AVX512],
-		    thread_max * fp_state_size[AVX512],
-		    32 * fp_state_size[AVX512],
-		    "x86 avx512 save state");
-		zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE);
+		ifps_zone[AVX512] = zone_create("x86 avx512 save state",
+		    fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
 	}
 
 	/* Determine MXCSR reserved bits and configure initial FPU state*/
@@ -730,13 +720,19 @@ fpu_free(thread_t thread, void *fps)
 }
 
 /*
- * Set the floating-point state for a thread based
- * on the FXSave formatted data. This is basically
- * the same as fpu_set_state except it uses the
- * expanded data structure.
- * If the thread is not the current thread, it is
- * not running (held). Locking needed against
- * concurrent fpu_set_state or fpu_get_state.
+ * Set the floating-point state for a thread based on the FXSave formatted data.
+ * This is basically the same as fpu_set_state except it uses the expanded data
+ * structure.
+ * If the thread is not the current thread, it is not running (held). Locking
+ * needed against concurrent fpu_set_state or fpu_get_state.
+ *
+ * While translating between XNU FP state structures and the CPU-native XSAVE area,
+ * if we detect state components that are all zeroes, we clear the corresponding
+ * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
+ * be initialized to a "clean" state. That's most important when clearing the YMM
+ * bit, since an initialized "upper clean" state results in a massive performance
+ * improvement due to elimination of false dependencies between the XMMs and the
+ * upper bits of the YMMs.
  */
 kern_return_t
 fpu_set_fxstate(
@@ -860,10 +856,20 @@ Retry:
 	iavx->_xh.xstate_bv = AVX_XMASK;
 	iavx->_xh.xcomp_bv = 0;
 
+	/*
+	 * See the block comment at the top of the function for a description of why we're clearing
+	 * xstate_bv bits.
+	 */
 	if (f == x86_AVX_STATE32) {
 		__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 	} else if (f == x86_AVX_STATE64) {
 		__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 	} else {
 		iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
 	}
@@ -884,25 +890,55 @@ Retry:
 	iavx->_xh.xstate_bv = AVX512_XMASK;
 	iavx->_xh.xcomp_bv = 0;
 
+	/*
+	 * See the block comment at the top of the function for a description of why we're clearing
+	 * xstate_bv bits.
+	 */
 	switch (f) {
 	case x86_AVX512_STATE32:
 		__nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
 		__nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_ZMM;
+		}
 		__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
+
 		DBG_AVX512_STATE(iavx);
 		break;
 	case x86_AVX_STATE32:
 		__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 		break;
 	case x86_AVX512_STATE64:
 		__nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
 		__nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
 		__nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
+		/*
+		 * Note that it is valid to have XFEM_ZMM set but XFEM_YMM cleared. In that case,
+		 * the upper bits of the YMMs would be cleared and would result in a clean-upper
+		 * state, allowing SSE instructions to avoid false dependencies.
+		 */
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
+		    fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_ZMM;
+		}
+
 		__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 		DBG_AVX512_STATE(iavx);
 		break;
 	case x86_AVX_STATE64:
 		__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 		break;
 	}
 	break;
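
The xstate_bv clearing in the hunks above leans on the XSAVE/XRSTOR contract: when a component's bit is clear in the save area's xstate_bv header field, XRSTOR places that component in its initial configuration instead of loading it from memory, so dropping the bit for an all-zeroes component preserves the architectural state while letting the CPU take the fast "init" (upper-clean) path. Below is a minimal user-space sketch of that pattern; demo_allzeroes, DEMO_XFEM_YMM and the modeled header word are hypothetical stand-ins for the kernel's fpu_allzeroes() and XFEM_* masks, not the actual definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the kernel's XFEM_YMM feature bit. */
#define DEMO_XFEM_YMM   (1ULL << 2)

/* Mirrors the shape of fpu_allzeroes(): scan a save-area component as
 * 8-byte words and report whether every word is zero. */
static int
demo_allzeroes(const uint64_t *ptr, uint32_t size)
{
	assert((((uintptr_t)ptr) & (sizeof(uint64_t) - 1)) == 0);
	assert((size & (sizeof(uint64_t) - 1)) == 0);

	for (uint32_t i = 0; i < size / sizeof(uint64_t); i++) {
		if (ptr[i] != 0) {
			return 0;
		}
	}
	return 1;
}

int
main(void)
{
	/* Modeled XSAVE header word plus one component: the upper 128 bits
	 * of YMM0..YMM15, as in the 64-bit cases above. */
	uint64_t xstate_bv = DEMO_XFEM_YMM;
	uint64_t ymm_hi128[16 * 2];

	memset(ymm_hi128, 0, sizeof(ymm_hi128));

	/* All-zeroes component: drop its bit so a subsequent XRSTOR would
	 * initialize the component ("upper clean") instead of loading it. */
	if (demo_allzeroes(ymm_hi128, (uint32_t)sizeof(ymm_hi128))) {
		xstate_bv &= ~DEMO_XFEM_YMM;
	}
	printf("xstate_bv = 0x%llx\n", (unsigned long long)xstate_bv);
	return 0;
}

The same shape recurs in every case of the switch above: copy the component into the save area, then clear its xstate_bv bit only if the copied bytes turned out to be all zeroes.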
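
Separately, the VERIFY_SAVEAREA_ALIGNED() macro added near the top of the diff uses the standard power-of-two alignment test: for an alignment a that is a power of two, (p & (a - 1)) is nonzero exactly when p has misaligned low bits. A small stand-alone illustration, with is_aligned as a hypothetical helper rather than anything from fpu.c:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical user-space version of the check inside
 * VERIFY_SAVEAREA_ALIGNED(): for a power-of-two alignment a,
 * (p & (a - 1)) == 0 iff p is a-byte aligned. */
static int
is_aligned(const void *p, uintptr_t a)
{
	assert(a != 0 && (a & (a - 1)) == 0);   /* power of two only */
	return (((uintptr_t)p) & (a - 1)) == 0;
}

int
main(void)
{
	uint64_t buf[4];
	char *base = (char *)buf;

	printf("%d\n", is_aligned(base, 8));        /* 1: a uint64_t[] is 8-byte aligned */
	printf("%d\n", is_aligned(base + 1, 8));    /* 0: off by one byte */
	return 0;
}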