X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/ea3f04195ba4a5034c9c8e9b726d4f7ce96f1832..2a1bd2d3eef5c7a7bb14f4bb9fdbca9a96ee4752:/osfmk/i386/fpu.c?ds=sidebyside

diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c
index 1d7475429..e960ee4ef 100644
--- a/osfmk/i386/fpu.c
+++ b/osfmk/i386/fpu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,9 @@ xstate_t fpu_capability = UNDEFINED;	/* extended state capability */
 xstate_t fpu_default = UNDEFINED;	/* default extended state */
 
 #define ALIGNED(addr, size)	(((uintptr_t)(addr)&((size)-1))==0)
+#define VERIFY_SAVEAREA_ALIGNED(p, a) \
+	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
+	    "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
 
 /* Forward */
 
@@ -147,18 +150,18 @@ fxsave64(struct x86_fx_thread_state *a)
 
 #define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX || (x) == AVX512)
 
-zone_t ifps_zone[] = {
+SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
 	[FP] = NULL,
 	[AVX] = NULL,
 	[AVX512] = NULL
 };
-static uint32_t fp_state_size[] = {
+static const uint32_t fp_state_size[] = {
 	[FP] = sizeof(struct x86_fx_thread_state),
 	[AVX] = sizeof(struct x86_avx_thread_state),
 	[AVX512] = sizeof(struct x86_avx512_thread_state)
 };
 
-static const char *xstate_name[] = {
+static const char *const xstate_name[] = {
 	[UNDEFINED] = "UNDEFINED",
 	[FP] = "FP",
 	[AVX] = "AVX",
@@ -504,22 +507,8 @@ init_fpu(void)
 static void *
 fp_state_alloc(xstate_t xs)
 {
-	struct x86_fx_thread_state *ifps;
-
 	assert(ifps_zone[xs] != NULL);
-	ifps = zalloc(ifps_zone[xs]);
-
-#if	DEBUG
-	if (!(ALIGNED(ifps, 64))) {
-		panic("fp_state_alloc: %p, %u, %p, %u",
-		    ifps, (unsigned) ifps_zone[xs]->elem_size,
-		    (void *) ifps_zone[xs]->free_elements,
-		    (unsigned) ifps_zone[xs]->alloc_size);
-	}
-#endif
-	bzero(ifps, fp_state_size[xs]);
-
-	return ifps;
+	return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
 }
 
 static inline void
@@ -535,6 +524,19 @@ clear_fpu(void)
 	set_ts();
 }
 
+static boolean_t
+fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
+{
+	VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
+	assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
+
+	for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
+		if (ptr[count] != 0) {
+			return FALSE;
+		}
+	}
+	return TRUE;
+}
 
 static void
 fpu_load_registers(void *fstate)
@@ -631,31 +633,19 @@ fpu_module_init(void)
 		    fpu_default);
 	}
 
-	/* We explicitly choose an allocation size of 13 pages = 64 * 832
-	 * to eliminate waste for the 832 byte sized
-	 * AVX XSAVE register save area.
-	 */
-	ifps_zone[fpu_default] = zinit(fp_state_size[fpu_default],
-	    thread_max * fp_state_size[fpu_default],
-	    64 * fp_state_size[fpu_default],
-	    "x86 fpsave state");
-
 	/* To maintain the required alignment, disable
 	 * zone debugging for this zone as that appends
 	 * 16 bytes to each element.
 	 */
-	zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE);
+	ifps_zone[fpu_default] = zone_create("x86 fpsave state",
+	    fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
 
 	/*
 	 * If AVX512 is supported, create a separate savearea zone.
-	 * with allocation size: 19 pages = 32 * 2668
 	 */
 	if (fpu_capability == AVX512) {
-		ifps_zone[AVX512] = zinit(fp_state_size[AVX512],
-		    thread_max * fp_state_size[AVX512],
-		    32 * fp_state_size[AVX512],
-		    "x86 avx512 save state");
-		zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE);
+		ifps_zone[AVX512] = zone_create("x86 avx512 save state",
+		    fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
 	}
 
 	/* Determine MXCSR reserved bits and configure initial FPU state*/
@@ -730,13 +720,19 @@ fpu_free(thread_t thread, void *fps)
 }
 
 /*
- * Set the floating-point state for a thread based
- * on the FXSave formatted data. This is basically
- * the same as fpu_set_state except it uses the
- * expanded data structure.
- * If the thread is not the current thread, it is
- * not running (held). Locking needed against
- * concurrent fpu_set_state or fpu_get_state.
+ * Set the floating-point state for a thread based on the FXSave formatted data.
+ * This is basically the same as fpu_set_state except it uses the expanded data
+ * structure.
+ * If the thread is not the current thread, it is not running (held). Locking
+ * needed against concurrent fpu_set_state or fpu_get_state.
+ *
+ * While translating between XNU FP state structures and the CPU-native XSAVE area,
+ * if we detect state components that are all zeroes, we clear the corresponding
+ * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
+ * be initialized to a "clean" state. That's most important when clearing the YMM
+ * bit, since an initialized "upper clean" state results in a massive performance
+ * improvement due to elimination of false dependencies between the XMMs and the
+ * upper bits of the YMMs.
  */
 kern_return_t
 fpu_set_fxstate(
@@ -860,10 +856,20 @@ Retry:
 	iavx->_xh.xstate_bv = AVX_XMASK;
 	iavx->_xh.xcomp_bv = 0;
 
+	/*
+	 * See the block comment at the top of the function for a description of why we're clearing
+	 * xstate_bv bits.
+	 */
 	if (f == x86_AVX_STATE32) {
 		__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 	} else if (f == x86_AVX_STATE64) {
 		__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 	} else {
 		iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
 	}
@@ -884,25 +890,55 @@ Retry:
 	iavx->_xh.xstate_bv = AVX512_XMASK;
 	iavx->_xh.xcomp_bv = 0;
 
+	/*
+	 * See the block comment at the top of the function for a description of why we're clearing
+	 * xstate_bv bits.
+	 */
 	switch (f) {
 	case x86_AVX512_STATE32:
 		__nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
 		__nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_ZMM;
+		}
 		__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
+
 		DBG_AVX512_STATE(iavx);
 		break;
 	case x86_AVX_STATE32:
 		__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 		break;
 	case x86_AVX512_STATE64:
 		__nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
 		__nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
 		__nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
+		/*
+		 * Note that it is valid to have XFEM_ZMM set but XFEM_YMM cleared. In that case,
+		 * the upper bits of the YMMs would be cleared and would result in a clean-upper
+		 * state, allowing SSE instructions to avoid false dependencies.
+		 */
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
+		    fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_ZMM;
+		}
+
 		__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 		DBG_AVX512_STATE(iavx);
 		break;
 	case x86_AVX_STATE64:
 		__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+		if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
+			iavx->_xh.xstate_bv &= ~XFEM_YMM;
+		}
 		break;
 	}
 	break;
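
The xstate_bv clearing in the hunks above leans on the XSAVE/XRSTOR contract: when a component's bit is clear in the save area's xstate_bv header field, XRSTOR places that component in its initial configuration instead of loading it from memory, so dropping the bit for an all-zeroes component preserves the architectural state while letting the CPU take the fast "init" (upper-clean) path. Below is a minimal user-space sketch of that pattern; demo_allzeroes, DEMO_XFEM_YMM and the modeled header word are hypothetical stand-ins for the kernel's fpu_allzeroes() and XFEM_* masks, not the actual definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the kernel's XFEM_YMM feature bit. */
#define DEMO_XFEM_YMM   (1ULL << 2)

/* Mirrors the shape of fpu_allzeroes(): scan a save-area component as
 * 8-byte words and report whether every word is zero. */
static int
demo_allzeroes(const uint64_t *ptr, uint32_t size)
{
	assert((((uintptr_t)ptr) & (sizeof(uint64_t) - 1)) == 0);
	assert((size & (sizeof(uint64_t) - 1)) == 0);

	for (uint32_t i = 0; i < size / sizeof(uint64_t); i++) {
		if (ptr[i] != 0) {
			return 0;
		}
	}
	return 1;
}

int
main(void)
{
	/* Modeled XSAVE header word plus one component: the upper 128 bits
	 * of YMM0..YMM15, as in the 64-bit cases above. */
	uint64_t xstate_bv = DEMO_XFEM_YMM;
	uint64_t ymm_hi128[16 * 2];

	memset(ymm_hi128, 0, sizeof(ymm_hi128));

	/* All-zeroes component: drop its bit so a subsequent XRSTOR would
	 * initialize the component ("upper clean") instead of loading it. */
	if (demo_allzeroes(ymm_hi128, (uint32_t)sizeof(ymm_hi128))) {
		xstate_bv &= ~DEMO_XFEM_YMM;
	}
	printf("xstate_bv = 0x%llx\n", (unsigned long long)xstate_bv);
	return 0;
}

The same shape recurs in every case of the switch above: copy the component into the save area, then clear its xstate_bv bit only if the copied bytes turned out to be all zeroes.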
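
Separately, the VERIFY_SAVEAREA_ALIGNED() macro added near the top of the diff uses the standard power-of-two alignment test: for an alignment a that is a power of two, (p & (a - 1)) is nonzero exactly when p has misaligned low bits. A small stand-alone illustration, with is_aligned as a hypothetical helper rather than anything from fpu.c:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical user-space version of the check inside
 * VERIFY_SAVEAREA_ALIGNED(): for a power-of-two alignment a,
 * (p & (a - 1)) == 0 iff p is a-byte aligned. */
static int
is_aligned(const void *p, uintptr_t a)
{
	assert(a != 0 && (a & (a - 1)) == 0);   /* power of two only */
	return (((uintptr_t)p) & (a - 1)) == 0;
}

int
main(void)
{
	uint64_t buf[4];
	char *base = (char *)buf;

	printf("%d\n", is_aligned(base, 8));        /* 1: a uint64_t[] is 8-byte aligned */
	printf("%d\n", is_aligned(base + 1, 8));    /* 0: off by one byte */
	return 0;
}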