2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
57 #include <mach/exception_types.h>
58 #include <mach/i386/thread_status.h>
59 #include <mach/i386/fp_reg.h>
61 #include <kern/mach_param.h>
62 #include <kern/processor.h>
63 #include <kern/thread.h>
64 #include <kern/zalloc.h>
65 #include <kern/misc_protos.h>
67 #include <kern/assert.h>
69 #include <libkern/OSAtomic.h>
71 #include <architecture/i386/pio.h>
72 #include <i386/cpuid.h>
74 #include <i386/proc_reg.h>
75 #include <i386/misc_protos.h>
76 #include <i386/thread.h>
77 #include <i386/trap.h>
79 xstate_t fpu_capability
= UNDEFINED
; /* extended state capability */
80 xstate_t fpu_default
= UNDEFINED
; /* default extended state */
/* True when addr is a multiple of size; the mask test requires size to be a power of two. */
#define ALIGNED(addr, size) \
	((((uintptr_t)(addr)) & ((size) - 1)) == 0)
/*
 * Assert that save-area component p is aligned to a bytes (a must be a
 * power of two for the mask test to be meaningful).  On failure the
 * message reports the offending address and the alignment that was
 * actually requested, rather than assuming 8 bytes.
 */
#define VERIFY_SAVEAREA_ALIGNED(p, a) \
	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
	    "FP save area component @ 0x%lx not %lu-byte aligned", \
	    ((uintptr_t)(p)), (unsigned long)(a))
89 extern void fpinit(void);
95 static void configure_mxcsr_capability_mask(x86_ext_thread_state_t
*fps
);
96 static xstate_t
thread_xstate(thread_t
);
98 x86_ext_thread_state_t initial_fp_state
__attribute((aligned(64)));
99 x86_ext_thread_state_t default_avx512_state
__attribute((aligned(64)));
100 x86_ext_thread_state_t default_avx_state
__attribute((aligned(64)));
101 x86_ext_thread_state_t default_fx_state
__attribute((aligned(64)));
103 /* Global MXCSR capability bitmask */
104 static unsigned int mxcsr_capability_mask
;
107 __asm__ volatile("fninit")
109 #define fnstcw(control) \
110 __asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
112 #define fldcw(control) \
113 __asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
116 __asm__ volatile("fnclex")
118 #define fnsave(state) \
119 __asm__ volatile("fnsave %0" : "=m" (*state))
121 #define frstor(state) \
122 __asm__ volatile("frstor %0" : : "m" (state))
128 fxrstor(struct x86_fx_thread_state
*a
)
130 __asm__
__volatile__ ("fxrstor %0" :: "m" (*a
));
134 fxsave(struct x86_fx_thread_state
*a
)
136 __asm__
__volatile__ ("fxsave %0" : "=m" (*a
));
140 fxrstor64(struct x86_fx_thread_state
*a
)
142 __asm__
__volatile__ ("fxrstor64 %0" :: "m" (*a
));
146 fxsave64(struct x86_fx_thread_state
*a
)
148 __asm__
__volatile__ ("fxsave64 %0" : "=m" (*a
));
/* True when x is one of FP, AVX or AVX512.  Note: x is evaluated more than once. */
#define IS_VALID_XSTATE(x) \
	(((x) == FP) || ((x) == AVX) || ((x) == AVX512))
153 SECURITY_READ_ONLY_LATE(zone_t
) ifps_zone
[] = {
158 static const uint32_t fp_state_size
[] = {
159 [FP
] = sizeof(struct x86_fx_thread_state
),
160 [AVX
] = sizeof(struct x86_avx_thread_state
),
161 [AVX512
] = sizeof(struct x86_avx512_thread_state
)
164 static const char *const xstate_name
[] = {
165 [UNDEFINED
] = "UNDEFINED",
/*
 * Capability predicates derived from the global fpu_capability:
 * ZMM requires AVX512; YMM is available with either AVX or AVX512.
 */
#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable ((fpu_capability == AVX) || (fpu_capability == AVX512))
174 * On-demand AVX512 support
175 * ------------------------
176 * On machines with AVX512 support, by default, threads are created with
177 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
178 * capabilities are advertised in the commpage and via sysctl. If a thread
179 * opts to use AVX512 instructions, the first will result in a #UD exception.
180 * Faulting AVX512 instructions are recognizable by their unique prefix.
181 * This exception results in the thread being promoted to use an AVX512-sized
182 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
183 * instruction is re-driven and the thread can proceed to perform AVX512
186 * In addition to AVX512 instructions causing promotion, the thread_set_state()
187 * primitive with an AVX512 state flavor results in promotion.
189 * AVX512 promotion of the first thread in a task causes the default xstate
190 * of the task to be promoted so that any subsequently created or subsequently
191 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
194 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
195 * and a second pool of larger AVX512-sized (2688 byte) areas.
197 * Note the initial state value is an AVX512 object but that the AVX initial
198 * value is a subset of it.
200 static uint32_t cpuid_reevaluated
= 0;
202 static void fpu_store_registers(void *, boolean_t
);
203 static void fpu_load_registers(void *);
205 static const uint32_t xstate_xmask
[] = {
208 [AVX512
] = AVX512_XMASK
212 xsave(struct x86_fx_thread_state
*a
, uint32_t rfbm
)
214 __asm__
__volatile__ ("xsave %0" :"=m" (*a
) : "a"(rfbm
), "d"(0));
218 xsave64(struct x86_fx_thread_state
*a
, uint32_t rfbm
)
220 __asm__
__volatile__ ("xsave64 %0" :"=m" (*a
) : "a"(rfbm
), "d"(0));
224 xrstor(struct x86_fx_thread_state
*a
, uint32_t rfbm
)
226 __asm__
__volatile__ ("xrstor %0" :: "m" (*a
), "a"(rfbm
), "d"(0));
230 xrstor64(struct x86_fx_thread_state
*a
, uint32_t rfbm
)
232 __asm__
__volatile__ ("xrstor64 %0" :: "m" (*a
), "a"(rfbm
), "d"(0));
235 __unused
static inline void
238 __asm__
__volatile__ ("vzeroupper" ::);
241 static boolean_t
fpu_thread_promote_avx512(thread_t
); /* Forward */
245 * Furthermore, make compile-time asserts that no padding creeps into structures
246 * for which we're doing this.
/*
 * Compile-time check that members m1..m2 of type t span exactly n
 * back-to-back elements of type mt, i.e. that no padding has crept in.
 * A mismatch produces a negative array size, which fails the build.
 *
 * n is parenthesized so that expression arguments with lower precedence
 * than '-' (a shift or a conditional, for instance) are evaluated as a
 * unit before the subtraction.
 */
#define ASSERT_PACKED(t, m1, m2, n, mt) \
extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \
	[(offsetof(t, m2) - offsetof(t, m1) == ((n) - 1) * sizeof(mt)) ? 1 : -1]
252 ASSERT_PACKED(x86_avx_state32_t
, fpu_ymmh0
, fpu_ymmh7
, 8, _STRUCT_XMM_REG
);
254 ASSERT_PACKED(x86_avx_state64_t
, fpu_ymmh0
, fpu_ymmh15
, 16, _STRUCT_XMM_REG
);
256 ASSERT_PACKED(x86_avx512_state32_t
, fpu_k0
, fpu_k7
, 8, _STRUCT_OPMASK_REG
);
257 ASSERT_PACKED(x86_avx512_state32_t
, fpu_ymmh0
, fpu_ymmh7
, 8, _STRUCT_XMM_REG
);
258 ASSERT_PACKED(x86_avx512_state32_t
, fpu_zmmh0
, fpu_zmmh7
, 8, _STRUCT_YMM_REG
);
260 ASSERT_PACKED(x86_avx512_state64_t
, fpu_k0
, fpu_k7
, 8, _STRUCT_OPMASK_REG
);
261 ASSERT_PACKED(x86_avx512_state64_t
, fpu_ymmh0
, fpu_ymmh15
, 16, _STRUCT_XMM_REG
);
262 ASSERT_PACKED(x86_avx512_state64_t
, fpu_zmmh0
, fpu_zmmh15
, 16, _STRUCT_YMM_REG
);
263 ASSERT_PACKED(x86_avx512_state64_t
, fpu_zmm16
, fpu_zmm31
, 16, _STRUCT_ZMM_REG
);
265 #if defined(DEBUG_AVX512)
267 #define DBG(x...) kprintf("DBG: " x)
/* Byte-addressable register images used by the AVX512 debug dump. */
typedef struct {
	uint8_t byte[8];        /* 8-byte image */
} opmask_t;

typedef struct {
	uint8_t byte[16];       /* 16-byte image */
} xmm_t;

typedef struct {
	uint8_t byte[32];       /* 32-byte image */
} ymm_t;

typedef struct {
	uint8_t byte[64];       /* 64-byte image */
} zmm_t;
275 DBG_AVX512_STATE(struct x86_avx512_thread_state
*sp
)
278 xmm_t
*xmm
= (xmm_t
*) &sp
->fp
.fx_XMM_reg
;
279 xmm_t
*ymmh
= (xmm_t
*) &sp
->x_YMM_Hi128
;
280 ymm_t
*zmmh
= (ymm_t
*) &sp
->x_ZMM_Hi256
;
281 zmm_t
*zmm
= (zmm_t
*) &sp
->x_Hi16_ZMM
;
282 opmask_t
*k
= (opmask_t
*) &sp
->x_Opmask
;
284 kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state
, x_YMM_Hi128
));
285 kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state
, x_Opmask
));
286 kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state
, x_ZMM_Hi256
));
287 kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state
, x_Hi16_ZMM
));
289 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0
));
290 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
292 /* Print all ZMM registers */
293 for (i
= 0; i
< 16; i
++) {
294 kprintf("zmm%d:\t0x", i
);
295 for (j
= 0; j
< 16; j
++) {
296 kprintf("%02x", xmm
[i
].byte
[j
]);
298 for (j
= 0; j
< 16; j
++) {
299 kprintf("%02x", ymmh
[i
].byte
[j
]);
301 for (j
= 0; j
< 32; j
++) {
302 kprintf("%02x", zmmh
[i
].byte
[j
]);
306 for (i
= 0; i
< 16; i
++) {
307 kprintf("zmm%d:\t0x", 16 + i
);
308 for (j
= 0; j
< 64; j
++) {
309 kprintf("%02x", zmm
[i
].byte
[j
]);
313 for (i
= 0; i
< 8; i
++) {
314 kprintf("k%d:\t0x", i
);
315 for (j
= 0; j
< 8; j
++) {
316 kprintf("%02x", k
[i
].byte
[j
]);
321 kprintf("xstate_bv: 0x%016llx\n", sp
->_xh
.xstate_bv
);
322 kprintf("xcomp_bv: 0x%016llx\n", sp
->_xh
.xcomp_bv
);
327 DBG_AVX512_STATE(__unused
struct x86_avx512_thread_state
*sp
)
331 #endif /* DEBUG_AVX512 */
334 static inline unsigned short
337 unsigned short status
;
338 __asm__
volatile ("fnstsw %0" : "=ma" (status
));
344 * Configure the initial FPU state presented to new threads.
345 * Determine the MXCSR capability mask, which allows us to mask off any
346 * potentially unsafe "reserved" bits before restoring the FPU context.
347 * *Not* per-cpu, assumes symmetry.
351 configure_mxcsr_capability_mask(x86_ext_thread_state_t
*fps
)
353 /* XSAVE requires a 64 byte aligned store */
354 assert(ALIGNED(fps
, 64));
355 /* Clear, to prepare for the diagnostic FXSAVE */
356 bzero(fps
, sizeof(*fps
));
359 fpu_store_registers(fps
, FALSE
);
361 mxcsr_capability_mask
= fps
->fx
.fx_MXCSR_MASK
;
363 /* Set default mask value if necessary */
364 if (mxcsr_capability_mask
== 0) {
365 mxcsr_capability_mask
= 0xffbf;
368 /* Clear vector register store */
369 bzero(&fps
->fx
.fx_XMM_reg
[0][0], sizeof(fps
->fx
.fx_XMM_reg
));
370 bzero(fps
->avx
.x_YMM_Hi128
, sizeof(fps
->avx
.x_YMM_Hi128
));
371 if (fpu_ZMM_capable
) {
372 bzero(fps
->avx512
.x_ZMM_Hi256
, sizeof(fps
->avx512
.x_ZMM_Hi256
));
373 bzero(fps
->avx512
.x_Hi16_ZMM
, sizeof(fps
->avx512
.x_Hi16_ZMM
));
374 bzero(fps
->avx512
.x_Opmask
, sizeof(fps
->avx512
.x_Opmask
));
377 fps
->fx
.fp_valid
= TRUE
;
378 fps
->fx
.fp_save_layout
= fpu_YMM_capable
? XSAVE32
: FXSAVE32
;
379 fpu_load_registers(fps
);
381 if (fpu_ZMM_capable
) {
382 xsave64((struct x86_fx_thread_state
*)&default_avx512_state
, xstate_xmask
[AVX512
]);
384 if (fpu_YMM_capable
) {
385 xsave64((struct x86_fx_thread_state
*)&default_avx_state
, xstate_xmask
[AVX
]);
387 fxsave64((struct x86_fx_thread_state
*)&default_fx_state
);
390 /* Poison values to trap unsafe usage */
391 fps
->fx
.fp_valid
= 0xFFFFFFFF;
392 fps
->fx
.fp_save_layout
= FP_UNUSED
;
394 /* Re-enable FPU/SSE DNA exceptions */
398 int fpsimd_fault_popc
= 0;
400 * Look for FPU and initialize it.
401 * Called on each CPU.
407 unsigned short status
;
408 unsigned short control
;
411 * Check for FPU by initializing it,
412 * then trying to read the correct bit patterns from
413 * the control and status registers.
415 set_cr0((get_cr0() & ~(CR0_EM
| CR0_TS
)) | CR0_NE
); /* allow use of FPU */
421 assert(((status
& 0xff) == 0) && ((control
& 0x103f) == 0x3f));
423 /* Advertise SSE support */
424 if (cpuid_features() & CPUID_FEATURE_FXSR
) {
425 set_cr4(get_cr4() | CR4_OSFXS
);
426 /* And allow SIMD exceptions if present */
427 if (cpuid_features() & CPUID_FEATURE_SSE
) {
428 set_cr4(get_cr4() | CR4_OSXMM
);
431 panic("fpu is not FP_FXSR");
434 fpu_capability
= fpu_default
= FP
;
436 PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc
, sizeof(fpsimd_fault_popc
));
438 static boolean_t is_avx512_enabled
= TRUE
;
439 if (cpu_number() == master_cpu
) {
440 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F
) {
441 PE_parse_boot_argn("avx512", &is_avx512_enabled
, sizeof(boolean_t
));
442 kprintf("AVX512 supported %s\n",
443 is_avx512_enabled
? "and enabled" : "but disabled");
447 /* Configure the XSAVE context mechanism if the processor supports
450 if (cpuid_features() & CPUID_FEATURE_XSAVE
) {
451 cpuid_xsave_leaf_t
*xs0p
= &cpuid_info()->cpuid_xsave_leaf
[0];
452 if (is_avx512_enabled
&&
453 (xs0p
->extended_state
[eax
] & XFEM_ZMM
) == XFEM_ZMM
) {
454 assert(xs0p
->extended_state
[eax
] & XFEM_SSE
);
455 assert(xs0p
->extended_state
[eax
] & XFEM_YMM
);
456 fpu_capability
= AVX512
;
457 /* XSAVE container size for all features */
458 set_cr4(get_cr4() | CR4_OSXSAVE
);
459 xsetbv(0, AVX512_XMASK
);
460 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
461 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated
)) {
464 /* Verify that now selected state can be accommodated */
465 assert(xs0p
->extended_state
[ebx
] == fp_state_size
[AVX512
]);
467 * AVX set until AVX512 is used.
468 * See comment above about on-demand AVX512 support.
470 xsetbv(0, AVX_XMASK
);
472 } else if (xs0p
->extended_state
[eax
] & XFEM_YMM
) {
473 assert(xs0p
->extended_state
[eax
] & XFEM_SSE
);
474 fpu_capability
= AVX
;
476 /* XSAVE container size for all features */
477 set_cr4(get_cr4() | CR4_OSXSAVE
);
478 xsetbv(0, AVX_XMASK
);
479 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
480 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated
)) {
483 /* Verify that now selected state can be accommodated */
484 assert(xs0p
->extended_state
[ebx
] == fp_state_size
[AVX
]);
488 if (cpu_number() == master_cpu
) {
489 kprintf("fpu_state: %s, state_size: %d\n",
490 xstate_name
[fpu_capability
],
491 fp_state_size
[fpu_capability
]);
495 current_cpu_datap()->cpu_xstate
= fpu_default
;
498 * Trap wait instructions. Turn off FPU for now.
500 set_cr0(get_cr0() | CR0_TS
| CR0_MP
);
504 * Allocate and initialize FP state for specified xstate.
508 fp_state_alloc(xstate_t xs
)
510 assert(ifps_zone
[xs
] != NULL
);
511 return zalloc_flags(ifps_zone
[xs
], Z_WAITOK
| Z_ZERO
);
515 fp_state_free(void *ifps
, xstate_t xs
)
517 assert(ifps_zone
[xs
] != NULL
);
518 zfree(ifps_zone
[xs
], ifps
);
528 fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr
, uint32_t size
)
530 VERIFY_SAVEAREA_ALIGNED(ptr
, sizeof(uint64_t));
531 assertf((size
& (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
533 for (uint32_t count
= 0; count
< (size
/ sizeof(uint64_t)); count
++) {
534 if (ptr
[count
] != 0) {
542 fpu_load_registers(void *fstate
)
544 struct x86_fx_thread_state
*ifps
= fstate
;
545 fp_save_layout_t layout
= ifps
->fp_save_layout
;
547 assert(current_task() == NULL
|| \
548 (thread_is_64bit_addr(current_thread()) ? \
549 (layout
== FXSAVE64
|| layout
== XSAVE64
) : \
550 (layout
== FXSAVE32
|| layout
== XSAVE32
)));
551 assert(ALIGNED(ifps
, 64));
552 assert(ml_get_interrupts_enabled() == FALSE
);
555 if (layout
== XSAVE32
|| layout
== XSAVE64
) {
556 struct x86_avx_thread_state
*iavx
= fstate
;
558 /* Verify reserved bits in the XSAVE header*/
559 if (iavx
->_xh
.xstate_bv
& ~xstate_xmask
[current_xstate()]) {
560 panic("iavx->_xh.xstate_bv: 0x%llx", iavx
->_xh
.xstate_bv
);
562 for (i
= 0; i
< sizeof(iavx
->_xh
.xhrsvd
); i
++) {
563 if (iavx
->_xh
.xhrsvd
[i
]) {
564 panic("Reserved bit set");
568 if (fpu_YMM_capable
) {
569 if (layout
!= XSAVE32
&& layout
!= XSAVE64
) {
570 panic("Inappropriate layout: %u\n", layout
);
583 xrstor64(ifps
, xstate_xmask
[current_xstate()]);
586 xrstor(ifps
, xstate_xmask
[current_xstate()]);
589 panic("fpu_load_registers() bad layout: %d\n", layout
);
594 fpu_store_registers(void *fstate
, boolean_t is64
)
596 struct x86_fx_thread_state
*ifps
= fstate
;
597 assert(ALIGNED(ifps
, 64));
598 xstate_t xs
= current_xstate();
603 ifps
->fp_save_layout
= FXSAVE64
;
606 ifps
->fp_save_layout
= FXSAVE32
;
612 xsave64(ifps
, xstate_xmask
[xs
]);
613 ifps
->fp_save_layout
= XSAVE64
;
615 xsave(ifps
, xstate_xmask
[xs
]);
616 ifps
->fp_save_layout
= XSAVE32
;
620 panic("fpu_store_registers() bad xstate: %d\n", xs
);
625 * Initialize FP handling.
629 fpu_module_init(void)
631 if (!IS_VALID_XSTATE(fpu_default
)) {
632 panic("fpu_module_init: invalid extended state %u\n",
636 /* To maintain the required alignment, disable
637 * zone debugging for this zone as that appends
638 * 16 bytes to each element.
640 ifps_zone
[fpu_default
] = zone_create("x86 fpsave state",
641 fp_state_size
[fpu_default
], ZC_ALIGNMENT_REQUIRED
| ZC_ZFREE_CLEARMEM
);
644 * If AVX512 is supported, create a separate savearea zone.
646 if (fpu_capability
== AVX512
) {
647 ifps_zone
[AVX512
] = zone_create("x86 avx512 save state",
648 fp_state_size
[AVX512
], ZC_ALIGNMENT_REQUIRED
| ZC_ZFREE_CLEARMEM
);
651 /* Determine MXCSR reserved bits and configure initial FPU state*/
652 configure_mxcsr_capability_mask(&initial_fp_state
);
656 * Context switch fpu state.
657 * Always save the old thread's FPU context but don't load the new one; allow that to fault in.
658 * Switch to the new task's xstate.
662 fpu_switch_context(thread_t old
, thread_t
new)
664 struct x86_fx_thread_state
*ifps
;
665 cpu_data_t
*cdp
= current_cpu_datap();
666 xstate_t new_xstate
= new ? thread_xstate(new) : fpu_default
;
668 assert(ml_get_interrupts_enabled() == FALSE
);
669 ifps
= (old
)->machine
.ifps
;
671 if (ifps
&& ((ifps
->fp_valid
!= FALSE
) && (ifps
->fp_valid
!= TRUE
))) {
672 panic("ifps->fp_valid: %u\n", ifps
->fp_valid
);
675 if (ifps
!= 0 && (ifps
->fp_valid
== FALSE
)) {
676 /* Clear CR0.TS in preparation for the FP context save. In
677 * theory, this shouldn't be necessary since a live FPU should
678 * indicate that TS is clear. However, various routines
679 * (such as sendsig & sigreturn) manipulate TS directly.
682 /* registers are in FPU - save to memory */
683 boolean_t is64
= (thread_is_64bit_addr(old
) &&
684 is_saved_state64(old
->machine
.iss
));
686 fpu_store_registers(ifps
, is64
);
687 ifps
->fp_valid
= TRUE
;
689 if (fpu_ZMM_capable
&& (cdp
->cpu_xstate
== AVX512
)) {
690 xrstor64((struct x86_fx_thread_state
*)&default_avx512_state
, xstate_xmask
[AVX512
]);
691 } else if (fpu_YMM_capable
) {
692 xrstor64((struct x86_fx_thread_state
*) &default_avx_state
, xstate_xmask
[AVX
]);
694 fxrstor64((struct x86_fx_thread_state
*)&default_fx_state
);
698 assertf(fpu_YMM_capable
? (xgetbv(XCR0
) == xstate_xmask
[cdp
->cpu_xstate
]) : TRUE
, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0
), cdp
->cpu_xstate
, xstate_xmask
[cdp
->cpu_xstate
]);
699 if (new_xstate
!= (xstate_t
) cdp
->cpu_xstate
) {
700 DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
701 old
, new, xstate_name
[new_xstate
]);
702 xsetbv(0, xstate_xmask
[new_xstate
]);
703 cdp
->cpu_xstate
= new_xstate
;
710 * Free a FPU save area.
711 * Called only when thread terminating - no locking necessary.
714 fpu_free(thread_t thread
, void *fps
)
716 pcb_t pcb
= THREAD_TO_PCB(thread
);
718 fp_state_free(fps
, pcb
->xstate
);
719 pcb
->xstate
= UNDEFINED
;
723 * Set the floating-point state for a thread based on the FXSave formatted data.
724 * This is basically the same as fpu_set_state except it uses the expanded data
726 * If the thread is not the current thread, it is not running (held). Locking
727 * needed against concurrent fpu_set_state or fpu_get_state.
729 * While translating between XNU FP state structures and the CPU-native XSAVE area,
730 * if we detect state components that are all zeroes, we clear the corresponding
731 * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
732 * be initialized to a "clean" state. That's most important when clearing the YMM
733 * bit, since an initialized "upper clean" state results in a massive performance
734 * improvement due to elimination of false dependencies between the XMMs and the
735 * upper bits of the YMMs.
740 thread_state_t tstate
,
743 struct x86_fx_thread_state
*ifps
;
744 struct x86_fx_thread_state
*new_ifps
;
745 x86_float_state64_t
*state
;
747 boolean_t old_valid
, fresh_state
= FALSE
;
750 if (fpu_capability
== UNDEFINED
) {
754 if ((f
== x86_AVX_STATE32
|| f
== x86_AVX_STATE64
) &&
755 fpu_capability
< AVX
) {
759 assert(thr_act
!= THREAD_NULL
);
761 thr_xstate
= thread_xstate(thr_act
);
763 if ((f
== x86_AVX512_STATE32
|| f
== x86_AVX512_STATE64
) &&
765 if (!fpu_thread_promote_avx512(thr_act
)) {
768 /* Reload thr_xstate after successful promotion */
769 thr_xstate
= thread_xstate(thr_act
);
773 state
= (x86_float_state64_t
*)tstate
;
775 pcb
= THREAD_TO_PCB(thr_act
);
779 * new FPU state is 'invalid'.
780 * Deallocate the fp state if it exists.
782 simple_lock(&pcb
->lock
, LCK_GRP_NULL
);
787 simple_unlock(&pcb
->lock
);
790 fp_state_free(ifps
, thr_xstate
);
794 * Valid incoming state. Allocate the fp state if there is none.
798 simple_lock(&pcb
->lock
, LCK_GRP_NULL
);
803 simple_unlock(&pcb
->lock
);
804 new_ifps
= fp_state_alloc(thr_xstate
);
810 pcb
->xstate
= thr_xstate
;
815 * now copy over the new data.
818 old_valid
= ifps
->fp_valid
;
820 #if DEBUG || DEVELOPMENT
821 if ((fresh_state
== FALSE
) && (old_valid
== FALSE
) && (thr_act
!= current_thread())) {
822 panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act
);
826 * Clear any reserved bits in the MXCSR to prevent a GPF
827 * when issuing an FXRSTOR.
830 state
->fpu_mxcsr
&= mxcsr_capability_mask
;
832 __nochk_bcopy((char *)&state
->fpu_fcw
, (char *)ifps
, fp_state_size
[FP
]);
834 switch (thr_xstate
) {
839 panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate
);
843 panic("fpu_set_fxstate() UNDEFINED xstate");
846 ifps
->fp_save_layout
= thread_is_64bit_addr(thr_act
) ? FXSAVE64
: FXSAVE32
;
849 struct x86_avx_thread_state
*iavx
= (void *) ifps
;
850 x86_avx_state64_t
*xs
= (x86_avx_state64_t
*) state
;
852 iavx
->fp
.fp_save_layout
= thread_is_64bit_addr(thr_act
) ? XSAVE64
: XSAVE32
;
854 /* Sanitize XSAVE header */
855 bzero(&iavx
->_xh
.xhrsvd
[0], sizeof(iavx
->_xh
.xhrsvd
));
856 iavx
->_xh
.xstate_bv
= AVX_XMASK
;
857 iavx
->_xh
.xcomp_bv
= 0;
860 * See the block comment at the top of the function for a description of why we're clearing
863 if (f
== x86_AVX_STATE32
) {
864 __nochk_bcopy(&xs
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
));
865 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
)) == TRUE
) {
866 iavx
->_xh
.xstate_bv
&= ~XFEM_YMM
;
868 } else if (f
== x86_AVX_STATE64
) {
869 __nochk_bcopy(&xs
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
));
870 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
)) == TRUE
) {
871 iavx
->_xh
.xstate_bv
&= ~XFEM_YMM
;
874 iavx
->_xh
.xstate_bv
= (XFEM_SSE
| XFEM_X87
);
879 struct x86_avx512_thread_state
*iavx
= (void *) ifps
;
882 x86_avx512_state32_t
*s32
;
883 x86_avx512_state64_t
*s64
;
884 } xs
= { .ts
= tstate
};
886 iavx
->fp
.fp_save_layout
= thread_is_64bit_addr(thr_act
) ? XSAVE64
: XSAVE32
;
888 /* Sanitize XSAVE header */
889 bzero(&iavx
->_xh
.xhrsvd
[0], sizeof(iavx
->_xh
.xhrsvd
));
890 iavx
->_xh
.xstate_bv
= AVX512_XMASK
;
891 iavx
->_xh
.xcomp_bv
= 0;
894 * See the block comment at the top of the function for a description of why we're clearing
898 case x86_AVX512_STATE32
:
899 __nochk_bcopy(&xs
.s32
->fpu_k0
, iavx
->x_Opmask
, 8 * sizeof(_STRUCT_OPMASK_REG
));
900 __nochk_bcopy(&xs
.s32
->fpu_zmmh0
, iavx
->x_ZMM_Hi256
, 8 * sizeof(_STRUCT_YMM_REG
));
901 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_ZMM_Hi256
, 8 * sizeof(_STRUCT_YMM_REG
)) == TRUE
) {
902 iavx
->_xh
.xstate_bv
&= ~XFEM_ZMM
;
904 __nochk_bcopy(&xs
.s32
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
));
905 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
)) == TRUE
) {
906 iavx
->_xh
.xstate_bv
&= ~XFEM_YMM
;
909 DBG_AVX512_STATE(iavx
);
911 case x86_AVX_STATE32
:
912 __nochk_bcopy(&xs
.s32
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
));
913 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
)) == TRUE
) {
914 iavx
->_xh
.xstate_bv
&= ~XFEM_YMM
;
917 case x86_AVX512_STATE64
:
918 __nochk_bcopy(&xs
.s64
->fpu_k0
, iavx
->x_Opmask
, 8 * sizeof(_STRUCT_OPMASK_REG
));
919 __nochk_bcopy(&xs
.s64
->fpu_zmm16
, iavx
->x_Hi16_ZMM
, 16 * sizeof(_STRUCT_ZMM_REG
));
920 __nochk_bcopy(&xs
.s64
->fpu_zmmh0
, iavx
->x_ZMM_Hi256
, 16 * sizeof(_STRUCT_YMM_REG
));
922 * Note that it is valid to have XFEM_ZMM set but XFEM_YMM cleared. In that case,
923 * the upper bits of the YMMs would be cleared and would result in a clean-upper
924 * state, allowing SSE instructions to avoid false dependencies.
926 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_Hi16_ZMM
, 16 * sizeof(_STRUCT_ZMM_REG
)) == TRUE
&&
927 fpu_allzeroes((uint64_t *)(void *)iavx
->x_ZMM_Hi256
, 16 * sizeof(_STRUCT_YMM_REG
)) == TRUE
) {
928 iavx
->_xh
.xstate_bv
&= ~XFEM_ZMM
;
931 __nochk_bcopy(&xs
.s64
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
));
932 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
)) == TRUE
) {
933 iavx
->_xh
.xstate_bv
&= ~XFEM_YMM
;
935 DBG_AVX512_STATE(iavx
);
937 case x86_AVX_STATE64
:
938 __nochk_bcopy(&xs
.s64
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
));
939 if (fpu_allzeroes((uint64_t *)(void *)iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
)) == TRUE
) {
940 iavx
->_xh
.xstate_bv
&= ~XFEM_YMM
;
948 ifps
->fp_valid
= old_valid
;
950 if (old_valid
== FALSE
) {
951 boolean_t istate
= ml_set_interrupts_enabled(FALSE
);
952 ifps
->fp_valid
= TRUE
;
953 /* If altering the current thread's state, disable FPU */
954 if (thr_act
== current_thread()) {
958 ml_set_interrupts_enabled(istate
);
961 simple_unlock(&pcb
->lock
);
964 fp_state_free(new_ifps
, thr_xstate
);
971 * Get the floating-point state for a thread.
972 * If the thread is not the current thread, it is
973 * not running (held). Locking needed against
974 * concurrent fpu_set_state or fpu_get_state.
979 thread_state_t tstate
,
982 struct x86_fx_thread_state
*ifps
;
983 x86_float_state64_t
*state
;
984 kern_return_t ret
= KERN_FAILURE
;
986 xstate_t thr_xstate
= thread_xstate(thr_act
);
988 if (fpu_capability
== UNDEFINED
) {
992 if ((f
== x86_AVX_STATE32
|| f
== x86_AVX_STATE64
) &&
993 fpu_capability
< AVX
) {
997 if ((f
== x86_AVX512_STATE32
|| f
== x86_AVX512_STATE64
) &&
998 thr_xstate
!= AVX512
) {
1002 state
= (x86_float_state64_t
*)tstate
;
1004 assert(thr_act
!= THREAD_NULL
);
1005 pcb
= THREAD_TO_PCB(thr_act
);
1007 simple_lock(&pcb
->lock
, LCK_GRP_NULL
);
1012 * No valid floating-point state.
1015 __nochk_bcopy((char *)&initial_fp_state
, (char *)&state
->fpu_fcw
,
1018 simple_unlock(&pcb
->lock
);
1020 return KERN_SUCCESS
;
1023 * Make sure we`ve got the latest fp state info
1024 * If the live fpu state belongs to our target
1026 if (thr_act
== current_thread()) {
1029 intr
= ml_set_interrupts_enabled(FALSE
);
1035 (void)ml_set_interrupts_enabled(intr
);
1037 if (ifps
->fp_valid
) {
1038 __nochk_bcopy((char *)ifps
, (char *)&state
->fpu_fcw
, fp_state_size
[FP
]);
1039 switch (thr_xstate
) {
1040 case UNDEFINED_FULL
:
1044 panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate
);
1048 panic("fpu_get_fxstate() UNDEFINED xstate");
1051 break; /* already done */
1053 struct x86_avx_thread_state
*iavx
= (void *) ifps
;
1054 x86_avx_state64_t
*xs
= (x86_avx_state64_t
*) state
;
1055 if (f
== x86_AVX_STATE32
) {
1056 __nochk_bcopy(iavx
->x_YMM_Hi128
, &xs
->fpu_ymmh0
, 8 * sizeof(_STRUCT_XMM_REG
));
1057 } else if (f
== x86_AVX_STATE64
) {
1058 __nochk_bcopy(iavx
->x_YMM_Hi128
, &xs
->fpu_ymmh0
, 16 * sizeof(_STRUCT_XMM_REG
));
1063 struct x86_avx512_thread_state
*iavx
= (void *) ifps
;
1066 x86_avx512_state32_t
*s32
;
1067 x86_avx512_state64_t
*s64
;
1068 } xs
= { .ts
= tstate
};
1070 case x86_AVX512_STATE32
:
1071 __nochk_bcopy(iavx
->x_Opmask
, &xs
.s32
->fpu_k0
, 8 * sizeof(_STRUCT_OPMASK_REG
));
1072 __nochk_bcopy(iavx
->x_ZMM_Hi256
, &xs
.s32
->fpu_zmmh0
, 8 * sizeof(_STRUCT_YMM_REG
));
1073 __nochk_bcopy(iavx
->x_YMM_Hi128
, &xs
.s32
->fpu_ymmh0
, 8 * sizeof(_STRUCT_XMM_REG
));
1074 DBG_AVX512_STATE(iavx
);
1076 case x86_AVX_STATE32
:
1077 __nochk_bcopy(iavx
->x_YMM_Hi128
, &xs
.s32
->fpu_ymmh0
, 8 * sizeof(_STRUCT_XMM_REG
));
1079 case x86_AVX512_STATE64
:
1080 __nochk_bcopy(iavx
->x_Opmask
, &xs
.s64
->fpu_k0
, 8 * sizeof(_STRUCT_OPMASK_REG
));
1081 __nochk_bcopy(iavx
->x_Hi16_ZMM
, &xs
.s64
->fpu_zmm16
, 16 * sizeof(_STRUCT_ZMM_REG
));
1082 __nochk_bcopy(iavx
->x_ZMM_Hi256
, &xs
.s64
->fpu_zmmh0
, 16 * sizeof(_STRUCT_YMM_REG
));
1083 __nochk_bcopy(iavx
->x_YMM_Hi128
, &xs
.s64
->fpu_ymmh0
, 16 * sizeof(_STRUCT_XMM_REG
));
1084 DBG_AVX512_STATE(iavx
);
1086 case x86_AVX_STATE64
:
1087 __nochk_bcopy(iavx
->x_YMM_Hi128
, &xs
.s64
->fpu_ymmh0
, 16 * sizeof(_STRUCT_XMM_REG
));
1096 simple_unlock(&pcb
->lock
);
1104 * the child thread is 'stopped' with the thread
1105 * mutex held and is currently not known by anyone
1106 * so no way for fpu state to get manipulated by an
1107 * outside agency -> no need for pcb lock
1115 struct x86_fx_thread_state
*new_ifps
= NULL
;
1118 xstate_t xstate
= thread_xstate(parent
);
1120 ppcb
= THREAD_TO_PCB(parent
);
1122 if (ppcb
->ifps
== NULL
) {
1126 if (child
->machine
.ifps
) {
1127 panic("fpu_dup_fxstate: child's ifps non-null");
1130 new_ifps
= fp_state_alloc(xstate
);
1132 simple_lock(&ppcb
->lock
, LCK_GRP_NULL
);
1134 if (ppcb
->ifps
!= NULL
) {
1135 struct x86_fx_thread_state
*ifps
= ppcb
->ifps
;
1137 * Make sure we`ve got the latest fp state info
1139 if (current_thread() == parent
) {
1140 intr
= ml_set_interrupts_enabled(FALSE
);
1141 assert(current_thread() == parent
);
1146 (void)ml_set_interrupts_enabled(intr
);
1149 if (ifps
->fp_valid
) {
1150 child
->machine
.ifps
= new_ifps
;
1151 child
->machine
.xstate
= xstate
;
1152 __nochk_bcopy((char *)(ppcb
->ifps
),
1153 (char *)(child
->machine
.ifps
),
1154 fp_state_size
[xstate
]);
1156 /* Mark the new fp saved state as non-live. */
1157 /* Temporarily disabled: radar 4647827
1158 * new_ifps->fp_valid = TRUE;
1162 * Clear any reserved bits in the MXCSR to prevent a GPF
1163 * when issuing an FXRSTOR.
1165 new_ifps
->fx_MXCSR
&= mxcsr_capability_mask
;
1169 simple_unlock(&ppcb
->lock
);
1171 if (new_ifps
!= NULL
) {
1172 fp_state_free(new_ifps
, xstate
);
1178 * FNINIT programs the x87 control word to 0x37f, which matches
1179 * the desired default for macOS.
1185 boolean_t istate
= ml_set_interrupts_enabled(FALSE
);
1189 /* We skip this power-on-default verification sequence on
1190 * non-DEBUG, as dirtying the x87 control word may slow down
1191 * xsave/xrstor and affect energy use.
1193 unsigned short control
, control2
;
1196 control
&= ~(FPC_PC
| FPC_RC
); /* Clear precision & rounding control */
1197 control
|= (FPC_PC_64
| /* Set precision */
1198 FPC_RC_RN
| /* round-to-nearest */
1199 FPC_ZE
| /* Suppress zero-divide */
1200 FPC_OE
| /* and overflow */
1201 FPC_UE
| /* underflow */
1202 FPC_IE
| /* Allow NaNQs and +-INF */
1203 FPC_DE
| /* Allow denorms as operands */
1204 FPC_PE
); /* No trap for precision loss */
1205 assert(control
== control2
);
1208 /* Initialize SSE/SSE2 */
1209 __builtin_ia32_ldmxcsr(0x1f80);
1210 if (fpu_YMM_capable
) {
1215 ml_set_interrupts_enabled(istate
);
1219 * Coprocessor not present.
1222 uint64_t x86_isr_fp_simd_use
;
1230 struct x86_fx_thread_state
*ifps
= 0;
1231 xstate_t xstate
= current_xstate();
1233 thr_act
= current_thread();
1234 pcb
= THREAD_TO_PCB(thr_act
);
1236 if (pcb
->ifps
== 0 && !get_interrupt_level()) {
1237 ifps
= fp_state_alloc(xstate
);
1238 __nochk_bcopy((char *)&initial_fp_state
, (char *)ifps
,
1239 fp_state_size
[xstate
]);
1240 if (!thread_is_64bit_addr(thr_act
)) {
1241 ifps
->fp_save_layout
= fpu_YMM_capable
? XSAVE32
: FXSAVE32
;
1243 ifps
->fp_save_layout
= fpu_YMM_capable
? XSAVE64
: FXSAVE64
;
1245 ifps
->fp_valid
= TRUE
;
1247 intr
= ml_set_interrupts_enabled(FALSE
);
1249 clear_ts(); /* Enable FPU use */
1251 if (__improbable(get_interrupt_level())) {
1252 /* Track number of #DNA traps at interrupt context,
1253 * which is likely suboptimal. Racy, but good enough.
1255 x86_isr_fp_simd_use
++;
1257 * Save current FP/SIMD context if valid
1258 * Initialize live FP/SIMD registers
1265 if (pcb
->ifps
== 0) {
1267 pcb
->xstate
= xstate
;
1271 * Load this thread's state into coprocessor live context.
1275 (void)ml_set_interrupts_enabled(intr
);
1278 fp_state_free(ifps
, xstate
);
1283 * FPU overran end of segment.
1284 * Re-initialize FPU. Floating point state is not valid.
1290 thread_t thr_act
= current_thread();
1292 struct x86_fx_thread_state
*ifps
;
1294 xstate_t xstate
= current_xstate();
1296 intr
= ml_set_interrupts_enabled(FALSE
);
1298 if (get_interrupt_level()) {
1299 panic("FPU segment overrun exception at interrupt context\n");
1301 if (current_task() == kernel_task
) {
1302 panic("FPU segment overrun exception in kernel thread context\n");
1306 * This is a non-recoverable error.
1307 * Invalidate the thread's FPU state.
1309 pcb
= THREAD_TO_PCB(thr_act
);
1310 simple_lock(&pcb
->lock
, LCK_GRP_NULL
);
1313 simple_unlock(&pcb
->lock
);
1316 * Re-initialize the FPU.
1322 * And disable access.
1326 (void)ml_set_interrupts_enabled(intr
);
1329 fp_state_free(ifps
, xstate
);
1333 extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
1336 * FPU error. Called by AST.
1342 thread_t thr_act
= current_thread();
1343 struct x86_fx_thread_state
*ifps
= thr_act
->machine
.ifps
;
1346 intr
= ml_set_interrupts_enabled(FALSE
);
1348 if (get_interrupt_level()) {
1349 panic("FPU error exception at interrupt context\n");
1351 if (current_task() == kernel_task
) {
1352 panic("FPU error exception in kernel thread context\n");
1356 * Save the FPU state and turn off the FPU.
1360 (void)ml_set_interrupts_enabled(intr
);
1362 const uint32_t mask
= ifps
->fx_control
&
1363 (FPC_IM
| FPC_DM
| FPC_ZM
| FPC_OM
| FPC_UE
| FPC_PE
);
1364 const uint32_t xcpt
= ~mask
& (ifps
->fx_status
&
1365 (FPS_IE
| FPS_DE
| FPS_ZE
| FPS_OE
| FPS_UE
| FPS_PE
));
1366 fpxlog(EXC_I386_EXTERR
, ifps
->fx_status
, ifps
->fx_control
, xcpt
);
1372 * Locking not needed:
1373 * . if called from fpu_get_state, pcb already locked.
1374 * . if called from fpnoextflt or fp_intr, we are single-cpu
1375 * . otherwise, thread is running.
1376 * N.B.: Must be called with interrupts disabled
1383 pcb_t pcb
= THREAD_TO_PCB(thr_act
);
1384 struct x86_fx_thread_state
*ifps
= pcb
->ifps
;
1387 if (ifps
!= 0 && !ifps
->fp_valid
) {
1388 assert((get_cr0() & CR0_TS
) == 0);
1389 /* registers are in FPU */
1390 ifps
->fp_valid
= TRUE
;
1391 fpu_store_registers(ifps
, thread_is_64bit_addr(thr_act
));
1396 * Restore FPU state from PCB.
1398 * Locking not needed; always called on the current thread.
1405 pcb_t pcb
= THREAD_TO_PCB(thr_act
);
1406 struct x86_fx_thread_state
*ifps
= pcb
->ifps
;
1410 if (ifps
->fp_valid
!= FALSE
&& ifps
->fp_valid
!= TRUE
) {
1411 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u\n",
1412 ifps
->fp_valid
, ifps
->fp_save_layout
);
1416 if (ifps
->fp_valid
== FALSE
) {
1419 fpu_load_registers(ifps
);
1421 ifps
->fp_valid
= FALSE
; /* in FPU */
1425 * SSE arithmetic exception handling code.
1426 * Basically the same as the x87 exception handler with a different subtype
1430 fpSSEexterrflt(void)
1432 thread_t thr_act
= current_thread();
1433 struct x86_fx_thread_state
*ifps
= thr_act
->machine
.ifps
;
1436 intr
= ml_set_interrupts_enabled(FALSE
);
1438 if (get_interrupt_level()) {
1439 panic("SSE exception at interrupt context\n");
1441 if (current_task() == kernel_task
) {
1442 panic("SSE exception in kernel thread context\n");
1446 * Save the FPU state and turn off the FPU.
1450 (void)ml_set_interrupts_enabled(intr
);
1452 * Raise FPU exception.
1453 * Locking not needed on pcb->ifps,
1454 * since thread is running.
1456 const uint32_t mask
= (ifps
->fx_MXCSR
>> 7) &
1457 (FPC_IM
| FPC_DM
| FPC_ZM
| FPC_OM
| FPC_UE
| FPC_PE
);
1458 const uint32_t xcpt
= ~mask
& (ifps
->fx_MXCSR
&
1459 (FPS_IE
| FPS_DE
| FPS_ZE
| FPS_OE
| FPS_UE
| FPS_PE
));
1460 fpxlog(EXC_I386_SSEEXTERR
, ifps
->fx_MXCSR
, ifps
->fx_MXCSR
, xcpt
);
1465 * If a thread is using an AVX-sized savearea:
1466 * - allocate a new AVX512-sized area,
1467 * - copy the 256-bit state into the 512-bit area,
1468 * - deallocate the smaller area
1469 * ASSUMES: thread is the current thread.
1472 fpu_savearea_promote_avx512(thread_t thread
)
1474 struct x86_avx_thread_state
*ifps
= NULL
;
1475 struct x86_avx512_thread_state
*ifps512
= NULL
;
1476 pcb_t pcb
= THREAD_TO_PCB(thread
);
1477 boolean_t do_avx512_alloc
= FALSE
;
1480 assert(thread
== current_thread());
1482 DBG("fpu_savearea_promote_avx512(%p)\n", thread
);
1484 simple_lock(&pcb
->lock
, LCK_GRP_NULL
);
1488 pcb
->xstate
= AVX512
;
1489 simple_unlock(&pcb
->lock
);
1491 * Now that the PCB xstate has been promoted, set XCR0 so
1492 * that we don't re-trip #UD on the next AVX-512 instruction.
1494 * Since this branch is taken when the first FP instruction
1495 * attempted by this thread is an AVX-512 instruction, we
1496 * call fpnoextflt() to allocate an appropriately-sized
1497 * AVX-512 save-area, thereby avoiding the overhead of another
1498 * fault that would be triggered immediately on return.
1500 intr
= ml_set_interrupts_enabled(FALSE
);
1501 xsetbv(0, AVX512_XMASK
);
1502 current_cpu_datap()->cpu_xstate
= AVX512
;
1503 (void)ml_set_interrupts_enabled(intr
);
1509 if (pcb
->xstate
!= AVX512
) {
1510 do_avx512_alloc
= TRUE
;
1513 simple_unlock(&pcb
->lock
);
1515 if (do_avx512_alloc
== TRUE
) {
1516 ifps512
= fp_state_alloc(AVX512
);
1519 simple_lock(&pcb
->lock
, LCK_GRP_NULL
);
1521 intr
= ml_set_interrupts_enabled(FALSE
);
1527 xsetbv(0, AVX512_XMASK
);
1528 current_cpu_datap()->cpu_xstate
= AVX512
;
1529 (void)ml_set_interrupts_enabled(intr
);
1531 assert(ifps
->fp
.fp_valid
);
1533 /* Allocate an AVX512 savearea and copy AVX state into it */
1534 if (pcb
->xstate
!= AVX512
) {
1535 __nochk_bcopy(ifps
, ifps512
, fp_state_size
[AVX
]);
1536 pcb
->ifps
= ifps512
;
1537 pcb
->xstate
= AVX512
;
1542 /* The PCB lock is redundant in some scenarios given the higher level
1543 * thread mutex, but its pre-emption disablement is relied upon here
1545 simple_unlock(&pcb
->lock
);
1548 fp_state_free(ifps
, AVX
);
1551 fp_state_free(ifps
, AVX512
);
1556 * Upgrade the calling thread to AVX512.
1559 fpu_thread_promote_avx512(thread_t thread
)
1561 task_t task
= current_task();
1563 if (thread
!= current_thread()) {
1566 if (!ml_fpu_avx512_enabled()) {
1570 fpu_savearea_promote_avx512(thread
);
1572 /* Racy but the task's xstate is only a hint */
1573 task
->xstate
= AVX512
;
1580 * Called from user_trap() when an invalid opcode fault is taken.
1581 * If the user is attempting an AVX512 instruction on a machine
1582 * that supports this, we switch the calling thread to use
1583 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
1584 * return to user_trap() with a 0 return value.
1585 * Otherwise, simply return a nonzero value.
1588 #define MAX_X86_INSN_LENGTH (15)
1590 fpUDflt(user_addr_t rip
)
1592 uint8_t instruction_prefix
;
1593 boolean_t is_AVX512_instruction
= FALSE
;
1594 user_addr_t original_rip
= rip
;
1596 /* TODO: as an optimisation, copy up to the lesser of the
1597 * next page boundary or maximal prefix length in one pass
1598 * rather than issue multiple copyins
1600 if (copyin(rip
, (char *) &instruction_prefix
, 1)) {
1603 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1604 rip
, instruction_prefix
);
1605 /* TODO: determine more specifically which prefixes
1606 * are sane possibilities for AVX512 insns
1608 switch (instruction_prefix
) {
1609 case 0x2E: /* CS segment override */
1610 case 0x36: /* SS segment override */
1611 case 0x3E: /* DS segment override */
1612 case 0x26: /* ES segment override */
1613 case 0x64: /* FS segment override */
1614 case 0x65: /* GS segment override */
1615 case 0x66: /* Operand-size override */
1616 case 0x67: /* address-size override */
1617 /* Skip optional prefixes */
1619 if ((rip
- original_rip
) > MAX_X86_INSN_LENGTH
) {
1623 case 0x62: /* EVEX */
1624 case 0xC5: /* VEX 2-byte */
1625 case 0xC4: /* VEX 3-byte */
1626 is_AVX512_instruction
= TRUE
;
1631 } while (!is_AVX512_instruction
);
1633 /* Here if we detect attempted execution of an AVX512 instruction */
1636 * Fail if this machine doesn't support AVX512
1638 if (fpu_capability
!= AVX512
) {
1642 assert(xgetbv(XCR0
) == AVX_XMASK
);
1644 DBG("fpUDflt() switching xstate to AVX512\n");
1645 (void) fpu_thread_promote_avx512(current_thread());
1651 fp_setvalid(boolean_t value
)
1653 thread_t thr_act
= current_thread();
1654 struct x86_fx_thread_state
*ifps
= thr_act
->machine
.ifps
;
1657 ifps
->fp_valid
= value
;
1659 if (value
== TRUE
) {
1660 boolean_t istate
= ml_set_interrupts_enabled(FALSE
);
1662 ml_set_interrupts_enabled(istate
);
1668 ml_fpu_avx_enabled(void)
1670 return fpu_capability
>= AVX
;
1674 ml_fpu_avx512_enabled(void)
1676 return fpu_capability
== AVX512
;
1680 task_xstate(task_t task
)
1682 if (task
== TASK_NULL
) {
1685 return task
->xstate
;
1690 thread_xstate(thread_t thread
)
1692 xstate_t xs
= THREAD_TO_PCB(thread
)->xstate
;
1693 if (xs
== UNDEFINED
) {
1694 return task_xstate(thread
->task
);
1701 current_xstate(void)
1703 return thread_xstate(current_thread());
1707 * Called when exec'ing between bitnesses.
1708 * If valid FPU state exists, adjust the layout.
1711 fpu_switch_addrmode(thread_t thread
, boolean_t is_64bit
)
1713 struct x86_fx_thread_state
*ifps
= thread
->machine
.ifps
;
1714 mp_disable_preemption();
1716 if (ifps
&& ifps
->fp_valid
) {
1717 if (thread_xstate(thread
) == FP
) {
1718 ifps
->fp_save_layout
= is_64bit
? FXSAVE64
: FXSAVE32
;
1720 ifps
->fp_save_layout
= is_64bit
? XSAVE64
: XSAVE32
;
1723 mp_enable_preemption();
/*
 * Count the set bits in the `sz` bytes starting at address `ins`.
 *
 * The buffer is consumed in 16-byte steps (two 64-bit popcounts), then
 * 4-byte steps, then single bytes. A zero or negative `sz` yields 0.
 * The wide loads go through pointer casts, so `ins` is expected to be
 * suitably aligned for them.
 */
static inline uint32_t
fpsimd_pop(uintptr_t ins, int sz)
{
	uint32_t rv = 0;

	while (sz >= 16) {
		const uint64_t *lo = (const uint64_t *) ins;
		const uint64_t *hi = (const uint64_t *) (ins + 8);

		rv += (uint32_t)__builtin_popcountll(*lo);
		rv += (uint32_t)__builtin_popcountll(*hi);
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		const uint32_t *word = (const uint32_t *) ins;

		rv += (uint32_t)__builtin_popcount(*word);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		/*
		 * Read the tail as unsigned bytes. With a plain (signed) char,
		 * a byte >= 0x80 is sign-extended on promotion and would add
		 * 24 spurious set bits to the count.
		 */
		const uint8_t *byte = (const uint8_t *) ins;

		rv += (uint32_t)__builtin_popcount(*byte);
		sz--;
		ins++;
	}

	return rv;
}
1760 thread_fpsimd_hash(thread_t ft
)
1762 if (fpsimd_fault_popc
== 0) {
1767 boolean_t istate
= ml_set_interrupts_enabled(FALSE
);
1768 struct x86_fx_thread_state
*pifps
= THREAD_TO_PCB(ft
)->ifps
;
1771 if (pifps
->fp_valid
) {
1772 prv
= fpsimd_pop((uintptr_t) &pifps
->fx_XMM_reg
[0][0],
1773 sizeof(pifps
->fx_XMM_reg
));
1775 uintptr_t cr0
= get_cr0();
1778 prv
= fpsimd_pop((uintptr_t) &pifps
->fx_XMM_reg
[0][0],
1779 sizeof(pifps
->fx_XMM_reg
));
1780 pifps
->fp_valid
= FALSE
;
1786 ml_set_interrupts_enabled(istate
);