2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
60 #include <mach/exception_types.h>
61 #include <mach/i386/thread_status.h>
62 #include <mach/i386/fp_reg.h>
63 #include <mach/branch_predicates.h>
65 #include <kern/mach_param.h>
66 #include <kern/processor.h>
67 #include <kern/thread.h>
68 #include <kern/zalloc.h>
69 #include <kern/misc_protos.h>
71 #include <kern/assert.h>
73 #include <libkern/OSAtomic.h>
75 #include <architecture/i386/pio.h>
76 #include <i386/cpuid.h>
78 #include <i386/proc_reg.h>
79 #include <i386/misc_protos.h>
80 #include <i386/thread.h>
81 #include <i386/trap.h>
83 xstate_t fpu_capability
= UNDEFINED
; /* extended state capability */
84 xstate_t fpu_default
= UNDEFINED
; /* default extended state */
/*
 * Evaluates to non-zero when addr is a multiple of size.
 * The test masks addr with (size)-1, so size is expected to be a power of two.
 */
#define ALIGNED(addr, size)	(!((uintptr_t)(addr) & ((size) - 1)))
90 extern void fpinit(void);
96 static void configure_mxcsr_capability_mask(x86_ext_thread_state_t
*fps
);
97 static xstate_t
thread_xstate(thread_t
);
99 x86_ext_thread_state_t initial_fp_state
__attribute((aligned(64)));
102 /* Global MXCSR capability bitmask */
103 static unsigned int mxcsr_capability_mask
;
106 __asm__ volatile("fninit")
/*
 * fnstcw(control): execute FNSTCW, storing the x87 control word into the
 * unsigned short that `control` points to (the argument is a pointer).
 */
108 #define fnstcw(control) \
109 __asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
/*
 * fldcw(control): execute FLDCW, loading the x87 control word from `control`.
 * Unlike fnstcw(), the argument is an lvalue: the macro takes its address.
 */
111 #define fldcw(control) \
112 __asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
115 __asm__ volatile("fnclex")
/*
 * fnsave(state): execute FNSAVE with *state as the memory output operand.
 * `state` is a pointer to the save area (the macro dereferences it).
 */
117 #define fnsave(state) \
118 __asm__ volatile("fnsave %0" : "=m" (*state))
/*
 * frstor(state): execute FRSTOR with `state` as the memory input operand.
 * Note the asymmetry with fnsave(): here `state` is the object itself
 * (an lvalue), not a pointer to it.
 */
120 #define frstor(state) \
121 __asm__ volatile("frstor %0" : : "m" (state))
/*
 * Execute FXRSTOR with the save area *a as its memory input operand.
 */
126 static inline void fxrstor(struct x86_fx_thread_state
*a
) {
127 __asm__
__volatile__("fxrstor %0" :: "m" (*a
));
/*
 * Execute FXSAVE with the save area *a as its memory output operand.
 */
130 static inline void fxsave(struct x86_fx_thread_state
*a
) {
131 __asm__
__volatile__("fxsave %0" : "=m" (*a
));
/*
 * Execute FXRSTOR64 (the 64-bit form of FXRSTOR) with the save area *a as
 * its memory input operand.
 */
134 static inline void fxrstor64(struct x86_fx_thread_state
*a
) {
135 __asm__
__volatile__("fxrstor64 %0" :: "m" (*a
));
/*
 * Execute FXSAVE64 (the 64-bit form of FXSAVE) with the save area *a as
 * its memory output operand.
 */
138 static inline void fxsave64(struct x86_fx_thread_state
*a
) {
139 __asm__
__volatile__("fxsave64 %0" : "=m" (*a
));
142 #if !defined(RC_HIDE_XNU_J137)
143 #define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX || (x) == AVX512)
145 #define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX)
148 zone_t ifps_zone
[] = {
151 #if !defined(RC_HIDE_XNU_J137)
155 static uint32_t fp_state_size
[] = {
156 [FP
] = sizeof(struct x86_fx_thread_state
),
157 [AVX
] = sizeof(struct x86_avx_thread_state
),
158 #if !defined(RC_HIDE_XNU_J137)
159 [AVX512
] = sizeof(struct x86_avx512_thread_state
)
163 static const char *xstate_name
[] = {
164 [UNDEFINED
] = "UNDEFINED",
167 #if !defined(RC_HIDE_XNU_J137)
172 #if !defined(RC_HIDE_XNU_J137)
173 #define fpu_ZMM_capable (fpu_capability == AVX512)
174 #define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
176 * On-demand AVX512 support
177 * ------------------------
178 * On machines with AVX512 support, by default, threads are created with
179 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
180 * capabilities are advertised in the commpage and via sysctl. If a thread
181 * opts to use AVX512 instructions, the first will result in a #UD exception.
182 * Faulting AVX512 instructions are recognizable by their unique prefix.
183 * This exception results in the thread being promoted to use an AVX512-sized
184 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
185 * instruction is re-driven and the thread can proceed to perform AVX512
188 * In addition to AVX512 instructions causing promotion, the thread_set_state()
189 * primitive with an AVX512 state flavor results in promotion.
191 * AVX512 promotion of the first thread in a task causes the default xstate
192 * of the task to be promoted so that any subsequently created or subsequently
193 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
196 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
197 * and a second pool of larger AVX512-sized (2688 byte) areas.
199 * Note the initial state value is an AVX512 object but that the AVX initial
200 * value is a subset of it.
203 #define fpu_YMM_capable (fpu_capability == AVX)
205 static uint32_t cpuid_reevaluated
= 0;
207 static void fpu_store_registers(void *, boolean_t
);
208 static void fpu_load_registers(void *);
/*
 * Extended-state feature masks, one per xstate:
 *   FP_XMASK  - x87 and SSE components.
 *   AVX_XMASK - x87, SSE and YMM components.
 * Elsewhere in this file these values are passed to xsetbv() and stored in
 * the XSAVE header's xstate_bv field.
 */
210 #define FP_XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE))
211 #define AVX_XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM))
212 #if !defined(RC_HIDE_XNU_J137)
213 #define AVX512_XMASK ((uint32_t) (XFEM_X87 | XFEM_SSE | XFEM_YMM | XFEM_ZMM))
214 static const uint32_t xstate_xmask
[] = {
217 [AVX512
] = AVX512_XMASK
220 static const uint32_t xstate_xmask
[] = {
/*
 * Execute XSETBV with EAX = mask_lo, EDX = mask_hi and ECX = XCR0, i.e.
 * write the 64-bit value mask_hi:mask_lo to extended control register XCR0.
 * Note the parameter order: high half first.
 */
226 static inline void xsetbv(uint32_t mask_hi
, uint32_t mask_lo
) {
227 __asm__
__volatile__("xsetbv" :: "a"(mask_lo
), "d"(mask_hi
), "c" (XCR0
));
/*
 * Execute XSAVE, storing to the save area *a.
 * rfbm is placed in EAX and EDX is forced to 0, so only the low 32 bits of
 * the requested-feature bitmap can be selected through this wrapper.
 */
230 static inline void xsave(struct x86_fx_thread_state
*a
, uint32_t rfbm
) {
231 __asm__
__volatile__("xsave %0" :"=m" (*a
) : "a"(rfbm
), "d"(0));
/*
 * Execute XSAVE64 (the 64-bit form of XSAVE), storing to the save area *a.
 * rfbm is placed in EAX and EDX is forced to 0, so only the low 32 bits of
 * the requested-feature bitmap can be selected through this wrapper.
 */
234 static inline void xsave64(struct x86_fx_thread_state
*a
, uint32_t rfbm
) {
235 __asm__
__volatile__("xsave64 %0" :"=m" (*a
) : "a"(rfbm
), "d"(0));
/*
 * Execute XRSTOR, loading from the save area *a.
 * rfbm is placed in EAX and EDX is forced to 0, so only the low 32 bits of
 * the requested-feature bitmap can be selected through this wrapper.
 */
238 static inline void xrstor(struct x86_fx_thread_state
*a
, uint32_t rfbm
) {
239 __asm__
__volatile__("xrstor %0" :: "m" (*a
), "a"(rfbm
), "d"(0));
/*
 * Execute XRSTOR64 (the 64-bit form of XRSTOR), loading from the save
 * area *a. rfbm is placed in EAX and EDX is forced to 0, so only the low
 * 32 bits of the requested-feature bitmap can be selected through this
 * wrapper.
 */
242 static inline void xrstor64(struct x86_fx_thread_state
*a
, uint32_t rfbm
) {
243 __asm__
__volatile__("xrstor64 %0" :: "m" (*a
), "a"(rfbm
), "d"(0));
246 #if !defined(RC_HIDE_XNU_J137)
/*
 * Execute VZEROUPPER (no operands). The context-switch path in this file
 * describes its use as clearing upper register bits for potential
 * power saving.
 */
247 static inline void vzeroupper(void) {
248 __asm__
__volatile__("vzeroupper" ::);
250 #if DEVELOPMENT || DEBUG
/*
 * Execute XGETBV with ECX = c and return the 64-bit result, assembled from
 * EDX (upper 32 bits) and EAX (lower 32 bits).
 */
static inline uint64_t xgetbv(uint32_t c) {
	uint32_t lo_half;
	uint32_t hi_half;

	__asm__ __volatile__("xgetbv" : "=a"(lo_half), "=d"(hi_half) : "c" (c));

	/* The shifted high half has zero low bits, so OR-ing equals adding. */
	return ((uint64_t) hi_half << 32) | (uint64_t) lo_half;
}
258 static boolean_t
fpu_thread_promote_avx512(thread_t
); /* Forward */
261 * Define a wrapper for bcopy to defeat destination size checks.
262 * This is needed to treat repeated objects such as
263 * _STRUCT_XMM_REG fpu_ymmh0;
265 * _STRUCT_XMM_REG fpu_ymmh7;
266 * as an array and to copy like so:
267 * bcopy_nochk(src,&dst->fpu_ymmh0,8*sizeof(_STRUCT_XMM_REG));
268 * without the compiler throwing a __builtin__memmove_chk error.
/*
 * Plain forwarding wrapper around bcopy(): copies _len bytes from _src to
 * _dst. The source is only read, so it is const-qualified; callers holding
 * read-only data can pass it without a cast, and existing callers passing
 * non-const pointers are unaffected.
 */
static inline void bcopy_nochk(const void *_src, void *_dst, size_t _len) {
	bcopy(_src, _dst, _len);
}
275 * Furthermore, make compile-time asserts that no padding creeps into structures
276 * for which we're doing this.
/*
 * ASSERT_PACKED(t, m1, m2, n, mt)
 * Compile-time check that, in type t, the offset from member m1 to member m2
 * equals (n - 1) elements of type mt, i.e. that m1..m2 can be treated as an
 * array of n mt objects with no padding in between.
 * It declares an extern char array whose size is 1 when the check holds and
 * -1 (a compile error) when it does not. `t` must be a single identifier
 * because it is pasted into the array name.
 * `n` is parenthesized so that an expression argument is evaluated as a
 * whole before the subtraction.
 */
#define ASSERT_PACKED(t, m1, m2, n, mt)					\
extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2			\
	[(offsetof(t,m2) - offsetof(t,m1) == ((n) - 1)*sizeof(mt)) ? 1 : -1]
282 ASSERT_PACKED(x86_avx_state32_t
, fpu_ymmh0
, fpu_ymmh7
, 8, _STRUCT_XMM_REG
);
284 ASSERT_PACKED(x86_avx_state64_t
, fpu_ymmh0
, fpu_ymmh15
, 16, _STRUCT_XMM_REG
);
286 ASSERT_PACKED(x86_avx512_state32_t
, fpu_k0
, fpu_k7
, 8, _STRUCT_OPMASK_REG
);
287 ASSERT_PACKED(x86_avx512_state32_t
, fpu_ymmh0
, fpu_ymmh7
, 8, _STRUCT_XMM_REG
);
288 ASSERT_PACKED(x86_avx512_state32_t
, fpu_zmmh0
, fpu_zmmh7
, 8, _STRUCT_YMM_REG
);
290 ASSERT_PACKED(x86_avx512_state64_t
, fpu_k0
, fpu_k7
, 8, _STRUCT_OPMASK_REG
);
291 ASSERT_PACKED(x86_avx512_state64_t
, fpu_ymmh0
, fpu_ymmh15
, 16, _STRUCT_XMM_REG
);
292 ASSERT_PACKED(x86_avx512_state64_t
, fpu_zmmh0
, fpu_zmmh15
, 16, _STRUCT_YMM_REG
);
293 ASSERT_PACKED(x86_avx512_state64_t
, fpu_zmm16
, fpu_zmm31
, 16, _STRUCT_ZMM_REG
);
295 #if defined(DEBUG_AVX512)
297 #define DBG(x...) kprintf("DBG: " x)
/*
 * Fixed-size byte-array views used by the AVX512 debug dump to print
 * register contents one byte at a time.
 */
typedef struct {
	uint8_t byte[8];	/* 8 bytes: one opmask (k) register */
} opmask_t;

typedef struct {
	uint8_t byte[16];	/* 16 bytes: one XMM-sized chunk */
} xmm_t;

typedef struct {
	uint8_t byte[32];	/* 32 bytes: one YMM-sized chunk */
} ymm_t;

typedef struct {
	uint8_t byte[64];	/* 64 bytes: one full ZMM register */
} zmm_t;
305 DBG_AVX512_STATE(struct x86_avx512_thread_state
*sp
)
308 xmm_t
*xmm
= (xmm_t
*) &sp
->fp
.fx_XMM_reg
;
309 xmm_t
*ymmh
= (xmm_t
*) &sp
->x_YMM_Hi128
;
310 ymm_t
*zmmh
= (ymm_t
*) &sp
->x_ZMM_Hi256
;
311 zmm_t
*zmm
= (zmm_t
*) &sp
->x_Hi16_ZMM
;
312 opmask_t
*k
= (opmask_t
*) &sp
->x_Opmask
;
314 kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state
, x_YMM_Hi128
));
315 kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state
, x_Opmask
));
316 kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state
, x_ZMM_Hi256
));
317 kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state
, x_Hi16_ZMM
));
319 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0
));
320 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
322 /* Print all ZMM registers */
323 for (i
= 0; i
< 16; i
++) {
324 kprintf("zmm%d:\t0x", i
);
325 for (j
= 0; j
< 16; j
++)
326 kprintf("%02x", xmm
[i
].byte
[j
]);
327 for (j
= 0; j
< 16; j
++)
328 kprintf("%02x", ymmh
[i
].byte
[j
]);
329 for (j
= 0; j
< 32; j
++)
330 kprintf("%02x", zmmh
[i
].byte
[j
]);
333 for (i
= 0; i
< 16; i
++) {
334 kprintf("zmm%d:\t0x", 16+i
);
335 for (j
= 0; j
< 64; j
++)
336 kprintf("%02x", zmm
[i
].byte
[j
]);
339 for (i
= 0; i
< 8; i
++) {
340 kprintf("k%d:\t0x", i
);
341 for (j
= 0; j
< 8; j
++)
342 kprintf("%02x", k
[i
].byte
[j
]);
346 kprintf("xstate_bv: 0x%016llx\n", sp
->_xh
.xstate_bv
);
347 kprintf("xcomp_bv: 0x%016llx\n", sp
->_xh
.xcomp_bv
);
352 DBG_AVX512_STATE(__unused
struct x86_avx512_thread_state
*sp
)
356 #endif /* DEBUG_AVX512 */
361 static inline unsigned short
364 unsigned short status
;
365 __asm__
volatile("fnstsw %0" : "=ma" (status
));
371 * Configure the initial FPU state presented to new threads.
372 * Determine the MXCSR capability mask, which allows us to mask off any
373 * potentially unsafe "reserved" bits before restoring the FPU context.
374 * *Not* per-cpu, assumes symmetry.
378 configure_mxcsr_capability_mask(x86_ext_thread_state_t
*fps
)
380 /* XSAVE requires a 64 byte aligned store */
381 assert(ALIGNED(fps
, 64));
382 /* Clear, to prepare for the diagnostic FXSAVE */
383 bzero(fps
, sizeof(*fps
));
386 fpu_store_registers(fps
, FALSE
);
388 mxcsr_capability_mask
= fps
->fx
.fx_MXCSR_MASK
;
390 /* Set default mask value if necessary */
391 if (mxcsr_capability_mask
== 0)
392 mxcsr_capability_mask
= 0xffbf;
394 /* Clear vector register store */
395 bzero(&fps
->fx
.fx_XMM_reg
[0][0], sizeof(fps
->fx
.fx_XMM_reg
));
396 bzero(fps
->avx
.x_YMM_Hi128
, sizeof(fps
->avx
.x_YMM_Hi128
));
397 #if !defined(RC_HIDE_XNU_J137)
398 if (fpu_ZMM_capable
) {
399 bzero(fps
->avx512
.x_ZMM_Hi256
, sizeof(fps
->avx512
.x_ZMM_Hi256
));
400 bzero(fps
->avx512
.x_Hi16_ZMM
, sizeof(fps
->avx512
.x_Hi16_ZMM
));
401 bzero(fps
->avx512
.x_Opmask
, sizeof(fps
->avx512
.x_Opmask
));
405 fps
->fx
.fp_valid
= TRUE
;
406 fps
->fx
.fp_save_layout
= fpu_YMM_capable
? XSAVE32
: FXSAVE32
;
407 fpu_load_registers(fps
);
409 /* Poison values to trap unsafe usage */
410 fps
->fx
.fp_valid
= 0xFFFFFFFF;
411 fps
->fx
.fp_save_layout
= FP_UNUSED
;
413 /* Re-enable FPU/SSE DNA exceptions */
418 * Look for FPU and initialize it.
419 * Called on each CPU.
425 unsigned short status
;
426 unsigned short control
;
429 * Check for FPU by initializing it,
430 * then trying to read the correct bit patterns from
431 * the control and status registers.
433 set_cr0((get_cr0() & ~(CR0_EM
|CR0_TS
)) | CR0_NE
); /* allow use of FPU */
439 assert(((status
& 0xff) == 0) && ((control
& 0x103f) == 0x3f));
441 /* Advertise SSE support */
442 if (cpuid_features() & CPUID_FEATURE_FXSR
) {
443 set_cr4(get_cr4() | CR4_OSFXS
);
444 /* And allow SIMD exceptions if present */
445 if (cpuid_features() & CPUID_FEATURE_SSE
) {
446 set_cr4(get_cr4() | CR4_OSXMM
);
449 panic("fpu is not FP_FXSR");
451 fpu_capability
= fpu_default
= FP
;
453 #if !defined(RC_HIDE_XNU_J137)
454 static boolean_t is_avx512_enabled
= TRUE
;
455 if (cpu_number() == master_cpu
) {
456 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F
) {
457 PE_parse_boot_argn("avx512", &is_avx512_enabled
, sizeof(boolean_t
));
458 kprintf("AVX512 supported %s\n",
459 is_avx512_enabled
? "and enabled" : "but disabled");
464 /* Configure the XSAVE context mechanism if the processor supports
467 if (cpuid_features() & CPUID_FEATURE_XSAVE
) {
468 cpuid_xsave_leaf_t
*xs0p
= &cpuid_info()->cpuid_xsave_leaf
[0];
469 #if !defined(RC_HIDE_XNU_J137)
470 if (is_avx512_enabled
&&
471 (xs0p
->extended_state
[eax
] & XFEM_ZMM
) == XFEM_ZMM
) {
472 assert(xs0p
->extended_state
[eax
] & XFEM_SSE
);
473 assert(xs0p
->extended_state
[eax
] & XFEM_YMM
);
474 fpu_capability
= AVX512
;
475 /* XSAVE container size for all features */
476 set_cr4(get_cr4() | CR4_OSXSAVE
);
477 xsetbv(0, AVX512_XMASK
);
478 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
479 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated
))
481 /* Verify that now selected state can be accommodated */
482 assert(xs0p
->extended_state
[ebx
] == fp_state_size
[AVX512
]);
484 * AVX set until AVX512 is used.
485 * See comment above about on-demand AVX512 support.
487 xsetbv(0, AVX_XMASK
);
491 if (xs0p
->extended_state
[eax
] & XFEM_YMM
) {
492 assert(xs0p
->extended_state
[eax
] & XFEM_SSE
);
493 fpu_capability
= AVX
;
495 /* XSAVE container size for all features */
496 set_cr4(get_cr4() | CR4_OSXSAVE
);
497 xsetbv(0, AVX_XMASK
);
498 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
499 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated
))
501 /* Verify that now selected state can be accommodated */
502 assert(xs0p
->extended_state
[ebx
] == fp_state_size
[AVX
]);
506 if (cpu_number() == master_cpu
)
507 kprintf("fpu_state: %s, state_size: %d\n",
508 xstate_name
[fpu_capability
],
509 fp_state_size
[fpu_capability
]);
514 * Trap wait instructions. Turn off FPU for now.
516 set_cr0(get_cr0() | CR0_TS
| CR0_MP
);
520 * Allocate and initialize FP state for specified xstate.
524 fp_state_alloc(xstate_t xs
)
526 struct x86_fx_thread_state
*ifps
;
528 assert(ifps_zone
[xs
] != NULL
);
529 ifps
= zalloc(ifps_zone
[xs
]);
532 if (!(ALIGNED(ifps
,64))) {
533 panic("fp_state_alloc: %p, %u, %p, %u",
534 ifps
, (unsigned) ifps_zone
[xs
]->elem_size
,
535 (void *) ifps_zone
[xs
]->free_elements
,
536 (unsigned) ifps_zone
[xs
]->alloc_size
);
539 bzero(ifps
, fp_state_size
[xs
]);
545 fp_state_free(void *ifps
, xstate_t xs
)
547 assert(ifps_zone
[xs
] != NULL
);
548 zfree(ifps_zone
[xs
], ifps
);
557 static void fpu_load_registers(void *fstate
) {
558 struct x86_fx_thread_state
*ifps
= fstate
;
559 fp_save_layout_t layout
= ifps
->fp_save_layout
;
561 assert(current_task() == NULL
|| \
562 (thread_is_64bit(current_thread()) ? \
563 (layout
== FXSAVE64
|| layout
== XSAVE64
) : \
564 (layout
== FXSAVE32
|| layout
== XSAVE32
)));
565 assert(ALIGNED(ifps
, 64));
566 assert(ml_get_interrupts_enabled() == FALSE
);
569 if (layout
== XSAVE32
|| layout
== XSAVE64
) {
570 struct x86_avx_thread_state
*iavx
= fstate
;
572 /* Verify reserved bits in the XSAVE header*/
573 if (iavx
->_xh
.xstate_bv
& ~xstate_xmask
[current_xstate()])
574 panic("iavx->_xh.xstate_bv: 0x%llx", iavx
->_xh
.xstate_bv
);
575 for (i
= 0; i
< sizeof(iavx
->_xh
.xhrsvd
); i
++)
576 if (iavx
->_xh
.xhrsvd
[i
])
577 panic("Reserved bit set");
579 if (fpu_YMM_capable
) {
580 if (layout
!= XSAVE32
&& layout
!= XSAVE64
)
581 panic("Inappropriate layout: %u\n", layout
);
593 xrstor64(ifps
, xstate_xmask
[current_xstate()]);
596 xrstor(ifps
, xstate_xmask
[current_xstate()]);
599 panic("fpu_load_registers() bad layout: %d\n", layout
);
603 static void fpu_store_registers(void *fstate
, boolean_t is64
) {
604 struct x86_fx_thread_state
*ifps
= fstate
;
605 assert(ALIGNED(ifps
, 64));
606 xstate_t xs
= current_xstate();
611 ifps
->fp_save_layout
= FXSAVE64
;
614 ifps
->fp_save_layout
= FXSAVE32
;
618 #if !defined(RC_HIDE_XNU_J137)
622 xsave64(ifps
, xstate_xmask
[xs
]);
623 ifps
->fp_save_layout
= XSAVE64
;
625 xsave(ifps
, xstate_xmask
[xs
]);
626 ifps
->fp_save_layout
= XSAVE32
;
630 panic("fpu_store_registers() bad xstate: %d\n", xs
);
635 * Initialize FP handling.
639 fpu_module_init(void)
641 if (!IS_VALID_XSTATE(fpu_default
))
642 panic("fpu_module_init: invalid extended state %u\n",
645 /* We explicitly choose an allocation size of 13 pages = 64 * 832
646 * to eliminate waste for the 832 byte sized
647 * AVX XSAVE register save area.
649 ifps_zone
[fpu_default
] = zinit(fp_state_size
[fpu_default
],
650 thread_max
* fp_state_size
[fpu_default
],
651 64 * fp_state_size
[fpu_default
],
654 /* To maintain the required alignment, disable
655 * zone debugging for this zone as that appends
656 * 16 bytes to each element.
658 zone_change(ifps_zone
[fpu_default
], Z_ALIGNMENT_REQUIRED
, TRUE
);
660 #if !defined(RC_HIDE_XNU_J137)
662 * If AVX512 is supported, create a separate savearea zone.
663 * with allocation size: 21 pages = 32 * 2688
665 if (fpu_capability
== AVX512
) {
666 ifps_zone
[AVX512
] = zinit(fp_state_size
[AVX512
],
667 thread_max
* fp_state_size
[AVX512
],
668 32 * fp_state_size
[AVX512
],
669 "x86 avx512 save state");
670 zone_change(ifps_zone
[AVX512
], Z_ALIGNMENT_REQUIRED
, TRUE
);
674 /* Determine MXCSR reserved bits and configure initial FPU state*/
675 configure_mxcsr_capability_mask(&initial_fp_state
);
679 * Context switch fpu state.
680 * Always save old thread`s FPU context but don't load new .. allow that to fault-in.
681 * Switch to the new task's xstate.
684 fpu_switch_context(thread_t old
, thread_t
new)
686 struct x86_fx_thread_state
*ifps
;
687 boolean_t is_ts_cleared
= FALSE
;
689 assert(ml_get_interrupts_enabled() == FALSE
);
690 ifps
= (old
)->machine
.ifps
;
692 if (ifps
&& ((ifps
->fp_valid
!= FALSE
) && (ifps
->fp_valid
!= TRUE
))) {
693 panic("ifps->fp_valid: %u\n", ifps
->fp_valid
);
696 if (ifps
!= 0 && (ifps
->fp_valid
== FALSE
)) {
697 /* Clear CR0.TS in preparation for the FP context save. In
698 * theory, this shouldn't be necessary since a live FPU should
699 * indicate that TS is clear. However, various routines
700 * (such as sendsig & sigreturn) manipulate TS directly.
703 is_ts_cleared
= TRUE
;
704 /* registers are in FPU - save to memory */
705 fpu_store_registers(ifps
, (thread_is_64bit(old
) && is_saved_state64(old
->machine
.iss
)));
706 ifps
->fp_valid
= TRUE
;
708 #if !defined(RC_HIDE_XNU_J137)
709 xstate_t old_xstate
= thread_xstate(old
);
710 xstate_t new_xstate
= new ? thread_xstate(new) : fpu_default
;
711 if (old_xstate
== AVX512
&& ifps
!= 0) {
712 DBG_AVX512_STATE((struct x86_avx512_thread_state
*) ifps
);
714 * Clear upper bits for potential power-saving
715 * but first ensure the TS bit is clear.
721 if (new_xstate
!= old_xstate
) {
722 DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
723 old
, new, xstate_name
[new_xstate
]);
724 xsetbv(0, xstate_xmask
[new_xstate
]);
734 * Free a FPU save area.
735 * Called only when thread terminating - no locking necessary.
738 fpu_free(thread_t thread
, void *fps
)
740 pcb_t pcb
= THREAD_TO_PCB(thread
);
742 fp_state_free(fps
, pcb
->xstate
);
743 pcb
->xstate
= UNDEFINED
;
747 * Set the floating-point state for a thread based
748 * on the FXSave formatted data. This is basically
749 * the same as fpu_set_state except it uses the
750 * expanded data structure.
751 * If the thread is not the current thread, it is
752 * not running (held). Locking needed against
753 * concurrent fpu_set_state or fpu_get_state.
758 thread_state_t tstate
,
761 struct x86_fx_thread_state
*ifps
;
762 struct x86_fx_thread_state
*new_ifps
;
763 x86_float_state64_t
*state
;
765 boolean_t old_valid
, fresh_state
= FALSE
;
767 if (fpu_capability
== UNDEFINED
)
770 if ((f
== x86_AVX_STATE32
|| f
== x86_AVX_STATE64
) &&
771 fpu_capability
< AVX
)
774 #if !defined(RC_HIDE_XNU_J137)
775 if ((f
== x86_AVX512_STATE32
|| f
== x86_AVX512_STATE64
) &&
776 thread_xstate(thr_act
) == AVX
)
777 if (!fpu_thread_promote_avx512(thr_act
))
781 state
= (x86_float_state64_t
*)tstate
;
783 assert(thr_act
!= THREAD_NULL
);
784 pcb
= THREAD_TO_PCB(thr_act
);
788 * new FPU state is 'invalid'.
789 * Deallocate the fp state if it exists.
791 simple_lock(&pcb
->lock
);
796 simple_unlock(&pcb
->lock
);
799 fp_state_free(ifps
, thread_xstate(thr_act
));
803 * Valid incoming state. Allocate the fp state if there is none.
807 simple_lock(&pcb
->lock
);
812 simple_unlock(&pcb
->lock
);
813 new_ifps
= fp_state_alloc(thread_xstate(thr_act
));
819 pcb
->xstate
= thread_xstate(thr_act
);
824 * now copy over the new data.
827 old_valid
= ifps
->fp_valid
;
829 #if DEBUG || DEVELOPMENT
830 if ((fresh_state
== FALSE
) && (old_valid
== FALSE
) && (thr_act
!= current_thread())) {
831 panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act
);
835 * Clear any reserved bits in the MXCSR to prevent a GPF
836 * when issuing an FXRSTOR.
839 state
->fpu_mxcsr
&= mxcsr_capability_mask
;
841 bcopy((char *)&state
->fpu_fcw
, (char *)ifps
, fp_state_size
[FP
]);
843 switch (thread_xstate(thr_act
)) {
845 panic("fpu_set_fxstate() UNDEFINED xstate");
848 ifps
->fp_save_layout
= thread_is_64bit(thr_act
) ? FXSAVE64
: FXSAVE32
;
851 struct x86_avx_thread_state
*iavx
= (void *) ifps
;
852 x86_avx_state64_t
*xs
= (x86_avx_state64_t
*) state
;
854 iavx
->fp
.fp_save_layout
= thread_is_64bit(thr_act
) ? XSAVE64
: XSAVE32
;
856 /* Sanitize XSAVE header */
857 bzero(&iavx
->_xh
.xhrsvd
[0], sizeof(iavx
->_xh
.xhrsvd
));
858 iavx
->_xh
.xstate_bv
= AVX_XMASK
;
859 iavx
->_xh
.xcomp_bv
= 0;
861 if (f
== x86_AVX_STATE32
) {
862 bcopy_nochk(&xs
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
));
863 } else if (f
== x86_AVX_STATE64
) {
864 bcopy_nochk(&xs
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
));
866 iavx
->_xh
.xstate_bv
= (XFEM_SSE
| XFEM_X87
);
870 #if !defined(RC_HIDE_XNU_J137)
872 struct x86_avx512_thread_state
*iavx
= (void *) ifps
;
875 x86_avx512_state32_t
*s32
;
876 x86_avx512_state64_t
*s64
;
877 } xs
= { .ts
= tstate
};
879 iavx
->fp
.fp_save_layout
= thread_is_64bit(thr_act
) ? XSAVE64
: XSAVE32
;
881 /* Sanitize XSAVE header */
882 bzero(&iavx
->_xh
.xhrsvd
[0], sizeof(iavx
->_xh
.xhrsvd
));
883 iavx
->_xh
.xstate_bv
= AVX512_XMASK
;
884 iavx
->_xh
.xcomp_bv
= 0;
887 case x86_AVX512_STATE32
:
888 bcopy_nochk(&xs
.s32
->fpu_k0
, iavx
->x_Opmask
, 8 * sizeof(_STRUCT_OPMASK_REG
));
889 bcopy_nochk(&xs
.s32
->fpu_zmmh0
, iavx
->x_ZMM_Hi256
, 8 * sizeof(_STRUCT_YMM_REG
));
890 bcopy_nochk(&xs
.s32
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
));
891 DBG_AVX512_STATE(iavx
);
893 case x86_AVX_STATE32
:
894 bcopy_nochk(&xs
.s32
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 8 * sizeof(_STRUCT_XMM_REG
));
896 case x86_AVX512_STATE64
:
897 bcopy_nochk(&xs
.s64
->fpu_k0
, iavx
->x_Opmask
, 8 * sizeof(_STRUCT_OPMASK_REG
));
898 bcopy_nochk(&xs
.s64
->fpu_zmm16
, iavx
->x_Hi16_ZMM
, 16 * sizeof(_STRUCT_ZMM_REG
));
899 bcopy_nochk(&xs
.s64
->fpu_zmmh0
, iavx
->x_ZMM_Hi256
, 16 * sizeof(_STRUCT_YMM_REG
));
900 bcopy_nochk(&xs
.s64
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
));
901 DBG_AVX512_STATE(iavx
);
903 case x86_AVX_STATE64
:
904 bcopy_nochk(&xs
.s64
->fpu_ymmh0
, iavx
->x_YMM_Hi128
, 16 * sizeof(_STRUCT_XMM_REG
));
912 ifps
->fp_valid
= old_valid
;
914 if (old_valid
== FALSE
) {
915 boolean_t istate
= ml_set_interrupts_enabled(FALSE
);
916 ifps
->fp_valid
= TRUE
;
917 /* If altering the current thread's state, disable FPU */
918 if (thr_act
== current_thread())
921 ml_set_interrupts_enabled(istate
);
924 simple_unlock(&pcb
->lock
);
927 fp_state_free(new_ifps
, thread_xstate(thr_act
));
933 * Get the floating-point state for a thread.
934 * If the thread is not the current thread, it is
935 * not running (held). Locking needed against
936 * concurrent fpu_set_state or fpu_get_state.
941 thread_state_t tstate
,
944 struct x86_fx_thread_state
*ifps
;
945 x86_float_state64_t
*state
;
946 kern_return_t ret
= KERN_FAILURE
;
949 if (fpu_capability
== UNDEFINED
)
952 if ((f
== x86_AVX_STATE32
|| f
== x86_AVX_STATE64
) &&
953 fpu_capability
< AVX
)
956 #if !defined(RC_HIDE_XNU_J137)
957 if ((f
== x86_AVX512_STATE32
|| f
== x86_AVX512_STATE64
) &&
958 thread_xstate(thr_act
) != AVX512
)
962 state
= (x86_float_state64_t
*)tstate
;
964 assert(thr_act
!= THREAD_NULL
);
965 pcb
= THREAD_TO_PCB(thr_act
);
967 simple_lock(&pcb
->lock
);
972 * No valid floating-point state.
975 bcopy((char *)&initial_fp_state
, (char *)&state
->fpu_fcw
,
978 simple_unlock(&pcb
->lock
);
983 * Make sure we`ve got the latest fp state info
984 * If the live fpu state belongs to our target
986 if (thr_act
== current_thread()) {
989 intr
= ml_set_interrupts_enabled(FALSE
);
995 (void)ml_set_interrupts_enabled(intr
);
997 if (ifps
->fp_valid
) {
998 bcopy((char *)ifps
, (char *)&state
->fpu_fcw
, fp_state_size
[FP
]);
999 switch (thread_xstate(thr_act
)) {
1001 panic("fpu_get_fxstate() UNDEFINED xstate");
1004 break; /* already done */
1006 struct x86_avx_thread_state
*iavx
= (void *) ifps
;
1007 x86_avx_state64_t
*xs
= (x86_avx_state64_t
*) state
;
1008 if (f
== x86_AVX_STATE32
) {
1009 bcopy_nochk(iavx
->x_YMM_Hi128
, &xs
->fpu_ymmh0
, 8 * sizeof(_STRUCT_XMM_REG
));
1010 } else if (f
== x86_AVX_STATE64
) {
1011 bcopy_nochk(iavx
->x_YMM_Hi128
, &xs
->fpu_ymmh0
, 16 * sizeof(_STRUCT_XMM_REG
));
1015 #if !defined(RC_HIDE_XNU_J137)
1017 struct x86_avx512_thread_state
*iavx
= (void *) ifps
;
1020 x86_avx512_state32_t
*s32
;
1021 x86_avx512_state64_t
*s64
;
1022 } xs
= { .ts
= tstate
};
1024 case x86_AVX512_STATE32
:
1025 bcopy_nochk(iavx
->x_Opmask
, &xs
.s32
->fpu_k0
, 8 * sizeof(_STRUCT_OPMASK_REG
));
1026 bcopy_nochk(iavx
->x_ZMM_Hi256
, &xs
.s32
->fpu_zmmh0
, 8 * sizeof(_STRUCT_YMM_REG
));
1027 bcopy_nochk(iavx
->x_YMM_Hi128
, &xs
.s32
->fpu_ymmh0
, 8 * sizeof(_STRUCT_XMM_REG
));
1028 DBG_AVX512_STATE(iavx
);
1030 case x86_AVX_STATE32
:
1031 bcopy_nochk(iavx
->x_YMM_Hi128
, &xs
.s32
->fpu_ymmh0
, 8 * sizeof(_STRUCT_XMM_REG
));
1033 case x86_AVX512_STATE64
:
1034 bcopy_nochk(iavx
->x_Opmask
, &xs
.s64
->fpu_k0
, 8 * sizeof(_STRUCT_OPMASK_REG
));
1035 bcopy_nochk(iavx
->x_Hi16_ZMM
, &xs
.s64
->fpu_zmm16
, 16 * sizeof(_STRUCT_ZMM_REG
));
1036 bcopy_nochk(iavx
->x_ZMM_Hi256
, &xs
.s64
->fpu_zmmh0
, 16 * sizeof(_STRUCT_YMM_REG
));
1037 bcopy_nochk(iavx
->x_YMM_Hi128
, &xs
.s64
->fpu_ymmh0
, 16 * sizeof(_STRUCT_XMM_REG
));
1038 DBG_AVX512_STATE(iavx
);
1040 case x86_AVX_STATE64
:
1041 bcopy_nochk(iavx
->x_YMM_Hi128
, &xs
.s64
->fpu_ymmh0
, 16 * sizeof(_STRUCT_XMM_REG
));
1051 simple_unlock(&pcb
->lock
);
1059 * the child thread is 'stopped' with the thread
1060 * mutex held and is currently not known by anyone
1061 * so no way for fpu state to get manipulated by an
1062 * outside agency -> no need for pcb lock
1070 struct x86_fx_thread_state
*new_ifps
= NULL
;
1073 xstate_t xstate
= thread_xstate(parent
);
1075 ppcb
= THREAD_TO_PCB(parent
);
1077 if (ppcb
->ifps
== NULL
)
1080 if (child
->machine
.ifps
)
1081 panic("fpu_dup_fxstate: child's ifps non-null");
1083 new_ifps
= fp_state_alloc(xstate
);
1085 simple_lock(&ppcb
->lock
);
1087 if (ppcb
->ifps
!= NULL
) {
1088 struct x86_fx_thread_state
*ifps
= ppcb
->ifps
;
1090 * Make sure we`ve got the latest fp state info
1092 if (current_thread() == parent
) {
1093 intr
= ml_set_interrupts_enabled(FALSE
);
1094 assert(current_thread() == parent
);
1099 (void)ml_set_interrupts_enabled(intr
);
1102 if (ifps
->fp_valid
) {
1103 child
->machine
.ifps
= new_ifps
;
1104 child
->machine
.xstate
= xstate
;
1105 bcopy((char *)(ppcb
->ifps
),
1106 (char *)(child
->machine
.ifps
),
1107 fp_state_size
[xstate
]);
1109 /* Mark the new fp saved state as non-live. */
1110 /* Temporarily disabled: radar 4647827
1111 * new_ifps->fp_valid = TRUE;
1115 * Clear any reserved bits in the MXCSR to prevent a GPF
1116 * when issuing an FXRSTOR.
1118 new_ifps
->fx_MXCSR
&= mxcsr_capability_mask
;
1122 simple_unlock(&ppcb
->lock
);
1124 if (new_ifps
!= NULL
)
1125 fp_state_free(new_ifps
, xstate
);
1137 unsigned short control
;
1142 control
&= ~(FPC_PC
|FPC_RC
); /* Clear precision & rounding control */
1143 control
|= (FPC_PC_64
| /* Set precision */
1144 FPC_RC_RN
| /* round-to-nearest */
1145 FPC_ZE
| /* Suppress zero-divide */
1146 FPC_OE
| /* and overflow */
1147 FPC_UE
| /* underflow */
1148 FPC_IE
| /* Allow NaNQs and +-INF */
1149 FPC_DE
| /* Allow denorms as operands */
1150 FPC_PE
); /* No trap for precision loss */
1153 /* Initialize SSE/SSE2 */
1154 __builtin_ia32_ldmxcsr(0x1f80);
1158 * Coprocessor not present.
1161 uint64_t x86_isr_fp_simd_use
;
1169 struct x86_fx_thread_state
*ifps
= 0;
1170 xstate_t xstate
= current_xstate();
1172 thr_act
= current_thread();
1173 pcb
= THREAD_TO_PCB(thr_act
);
1175 if (pcb
->ifps
== 0 && !get_interrupt_level()) {
1176 ifps
= fp_state_alloc(xstate
);
1177 bcopy((char *)&initial_fp_state
, (char *)ifps
,
1178 fp_state_size
[xstate
]);
1179 if (!thread_is_64bit(thr_act
)) {
1180 ifps
->fp_save_layout
= fpu_YMM_capable
? XSAVE32
: FXSAVE32
;
1183 ifps
->fp_save_layout
= fpu_YMM_capable
? XSAVE64
: FXSAVE64
;
1184 ifps
->fp_valid
= TRUE
;
1186 intr
= ml_set_interrupts_enabled(FALSE
);
1188 clear_ts(); /* Enable FPU use */
1190 if (__improbable(get_interrupt_level())) {
1191 /* Track number of #DNA traps at interrupt context,
1192 * which is likely suboptimal. Racy, but good enough.
1194 x86_isr_fp_simd_use
++;
1196 * Save current FP/SIMD context if valid
1197 * Initialize live FP/SIMD registers
1204 if (pcb
->ifps
== 0) {
1206 pcb
->xstate
= xstate
;
1210 * Load this thread`s state into coprocessor live context.
1214 (void)ml_set_interrupts_enabled(intr
);
1217 fp_state_free(ifps
, xstate
);
1221 * FPU overran end of segment.
1222 * Re-initialize FPU. Floating point state is not valid.
// Body of the FPU segment-overrun handler. It panics when entered at
// interrupt level or on behalf of the kernel task, and ends by calling
// i386_exception with EXC_BAD_ACCESS.
// NOTE(review): the function header and several statements are not visible
// in this listing; only the lines shown are described.
1228 thread_t thr_act
= current_thread();
1230 struct x86_fx_thread_state
*ifps
;
1232 xstate_t xstate
= current_xstate();
// Interrupts are off until the state saved in intr is restored below.
1234 intr
= ml_set_interrupts_enabled(FALSE
);
1236 if (get_interrupt_level())
1237 panic("FPU segment overrun exception at interrupt context\n");
1238 if (current_task() == kernel_task
)
1239 panic("FPU segment overrun exception in kernel thread context\n");
1242 * This is a non-recoverable error.
1243 * Invalidate the thread`s FPU state.
// The PCB lock brackets statements that are not visible here.
1245 pcb
= THREAD_TO_PCB(thr_act
);
1246 simple_lock(&pcb
->lock
);
1249 simple_unlock(&pcb
->lock
);
1252 * Re-initialize the FPU.
1258 * And disable access.
1262 (void)ml_set_interrupts_enabled(intr
);
// Frees a save area using the xstate sampled on entry.
// NOTE(review): the assignment to ifps and any guard are not visible.
1265 fp_state_free(ifps
, xstate
);
1270 i386_exception(EXC_BAD_ACCESS
, VM_PROT_READ
|VM_PROT_EXECUTE
, 0);
// Logging hook defined outside this chunk. The exception handlers in this
// file pass it an exception subtype, two saved register words and the
// computed unmasked-exception bits, in that order.
1274 extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
/*
 * FPU error. Called by AST.
 */
// Body of the x87 FPU error handler for the current thread. It panics at
// interrupt level or for the kernel task, works out which exceptions are
// unmasked and pending, logs them through fpxlog with subtype
// EXC_I386_EXTERR, then calls i386_exception with EXC_ARITHMETIC.
// NOTE(review): the function header, the state-saving statements and the
// remaining i386_exception arguments are not visible in this listing.
1283 thread_t thr_act
= current_thread();
1284 struct x86_fx_thread_state
*ifps
= thr_act
->machine
.ifps
;
// Interrupts are off until the state saved in intr is restored below.
1287 intr
= ml_set_interrupts_enabled(FALSE
);
1289 if (get_interrupt_level())
1290 panic("FPU error exception at interrupt context\n");
1291 if (current_task() == kernel_task
)
1292 panic("FPU error exception in kernel thread context\n");
1295 * Save the FPU state and turn off the FPU.
1299 (void)ml_set_interrupts_enabled(intr
);
// mask: the exception-mask bits currently set in the saved control word.
1301 const uint32_t mask
= ifps
->fx_control
&
1302 (FPC_IM
| FPC_DM
| FPC_ZM
| FPC_OM
| FPC_UE
| FPC_PE
);
// xcpt: status-word exception flags whose mask bit is clear, that is, the
// exceptions that are both raised and unmasked.
1303 const uint32_t xcpt
= ~mask
& (ifps
->fx_status
&
1304 (FPS_IE
| FPS_DE
| FPS_ZE
| FPS_OE
| FPS_UE
| FPS_PE
));
1305 fpxlog(EXC_I386_EXTERR
, ifps
->fx_status
, ifps
->fx_control
, xcpt
);
1307 * Raise FPU exception.
1308 * Locking not needed on pcb->ifps,
1309 * since thread is running.
1311 i386_exception(EXC_ARITHMETIC
,
/*
 * Locking not needed:
 * . if called from fpu_get_state, pcb already locked.
 * . if called from fpnoextflt or fp_intr, we are single-cpu.
 * . otherwise, thread is running.
 * N.B.: Must be called with interrupts disabled.
 */
// Saves the live FPU registers of thr_act into its PCB save area, but only
// when an area exists and is currently marked not valid.
// NOTE(review): the function header is not visible in this listing.
1332 pcb_t pcb
= THREAD_TO_PCB(thr_act
);
1333 struct x86_fx_thread_state
*ifps
= pcb
->ifps
;
// fp_valid being false means the register contents live in the FPU, so
// CR0.TS is asserted to be clear before they are stored.
1336 if (ifps
!= 0 && !ifps
->fp_valid
) {
1337 assert((get_cr0() & CR0_TS
) == 0);
1338 /* registers are in FPU */
// Mark the area valid first, then store the registers; the second argument
// tells the store routine whether the thread is 64-bit.
1339 ifps
->fp_valid
= TRUE
;
1340 fpu_store_registers(ifps
, thread_is_64bit(thr_act
));
/*
 * Restore FPU state from PCB.
 *
 * Locking not needed; always called on the current thread.
 */
// Loads FPU state for thr_act from its PCB save area and then marks the
// area not valid, meaning the state now lives in the FPU.
// NOTE(review): the function header and some statements are not visible in
// this listing; ifps is dereferenced here without a visible null check.
1354 pcb_t pcb
= THREAD_TO_PCB(thr_act
);
1355 struct x86_fx_thread_state
*ifps
= pcb
->ifps
;
// Sanity check: fp_valid must hold exactly TRUE or FALSE, otherwise panic
// and report both fp_valid and fp_save_layout.
1359 if (ifps
->fp_valid
!= FALSE
&& ifps
->fp_valid
!= TRUE
) {
1360 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u\n",
1361 ifps
->fp_valid
, ifps
->fp_save_layout
);
// NOTE(review): the statement run when fp_valid is FALSE is not visible;
// fpu_load_registers presumably belongs to the other arm.
1365 if (ifps
->fp_valid
== FALSE
) {
1368 fpu_load_registers(ifps
);
1370 ifps
->fp_valid
= FALSE
; /* in FPU */
/*
 * SSE arithmetic exception handling code.
 * Basically the same as the x87 exception handler with a different subtype.
 */
// SSE floating-point exception handler for the current thread. It panics at
// interrupt level or for the kernel task, derives the unmasked pending
// exceptions from the saved MXCSR, logs them through fpxlog with subtype
// EXC_I386_SSEEXTERR, then calls i386_exception with EXC_ARITHMETIC.
// NOTE(review): the return type, braces, the state-saving statements and the
// remaining i386_exception arguments are not visible in this listing.
1379 fpSSEexterrflt(void)
1381 thread_t thr_act
= current_thread();
1382 struct x86_fx_thread_state
*ifps
= thr_act
->machine
.ifps
;
// Interrupts are off until the state saved in intr is restored below.
1385 intr
= ml_set_interrupts_enabled(FALSE
);
1387 if (get_interrupt_level())
1388 panic("SSE exception at interrupt context\n");
1389 if (current_task() == kernel_task
)
1390 panic("SSE exception in kernel thread context\n");
1393 * Save the FPU state and turn off the FPU.
1397 (void)ml_set_interrupts_enabled(intr
);
1399 * Raise FPU exception.
1400 * Locking not needed on pcb->ifps,
1401 * since thread is running.
// MXCSR keeps its mask bits seven positions above the matching flag bits,
// so the right shift by 7 lines the masks up with the FPC bit constants.
1403 const uint32_t mask
= (ifps
->fx_MXCSR
>> 7) &
1404 (FPC_IM
| FPC_DM
| FPC_ZM
| FPC_OM
| FPC_UE
| FPC_PE
);
// xcpt: MXCSR exception flags whose mask bit is clear.
1405 const uint32_t xcpt
= ~mask
& (ifps
->fx_MXCSR
&
1406 (FPS_IE
| FPS_DE
| FPS_ZE
| FPS_OE
| FPS_UE
| FPS_PE
));
// MXCSR is passed in both register slots, presumably because it carries the
// flag bits and the mask bits in one word.
1407 fpxlog(EXC_I386_SSEEXTERR
, ifps
->fx_MXCSR
, ifps
->fx_MXCSR
, xcpt
);
1409 i386_exception(EXC_ARITHMETIC
,
1416 #if !defined(RC_HIDE_XNU_J137)
/*
 * If a thread is using an AVX-sized savearea:
 * - allocate a new AVX512-sized area,
 * - copy the 256-bit state into the 512-bit area,
 * - deallocate the smaller area.
 */
// Moves thread from an AVX-sized save area to an AVX512-sized one.
// Visible flow: inspect the PCB under its lock, allocate the larger area
// with the lock dropped, retake the lock, copy the AVX-sized contents
// across, repoint pcb->ifps and pcb->xstate, then free the unused area.
// ifps names the AVX-sized area; ifps512 names the AVX512-sized one.
// NOTE(review): the return type, braces, early returns and several guards
// are not visible in this listing; only the lines shown are described.
1424 fpu_savearea_promote_avx512(thread_t thread
)
1426 struct x86_avx_thread_state
*ifps
= NULL
;
1427 struct x86_avx512_thread_state
*ifps512
= NULL
;
1428 pcb_t pcb
= THREAD_TO_PCB(thread
);
1429 boolean_t do_avx512_alloc
= FALSE
;
1431 DBG("fpu_upgrade_savearea(%p)\n", thread
);
1433 simple_lock(&pcb
->lock
);
// Early path (its guard is not visible): only the xstate is switched.
1437 pcb
->xstate
= AVX512
;
1438 simple_unlock(&pcb
->lock
);
1439 if (thread
!= current_thread()) {
1440 /* nothing to be done */
// Decide under the lock whether a larger area is needed, then allocate it
// with the lock released.
1448 if (pcb
->xstate
!= AVX512
) {
1449 do_avx512_alloc
= TRUE
;
1451 simple_unlock(&pcb
->lock
);
1453 if (do_avx512_alloc
== TRUE
) {
1454 ifps512
= fp_state_alloc(AVX512
);
1457 simple_lock(&pcb
->lock
);
// For the running thread, XCR0 is switched to the AVX512 mask with
// interrupts disabled.
1458 if (thread
== current_thread()) {
1461 intr
= ml_set_interrupts_enabled(FALSE
);
1467 xsetbv(0, AVX512_XMASK
);
1469 (void)ml_set_interrupts_enabled(intr
);
1471 assert(ifps
->fp
.fp_valid
);
1473 /* Allocate an AVX512 savearea and copy AVX state into it */
// xstate is checked again because the lock was dropped for the allocation.
// Only the AVX-sized prefix is copied into the new area.
1474 if (pcb
->xstate
!= AVX512
) {
1475 bcopy(ifps
, ifps512
, fp_state_size
[AVX
]);
1476 pcb
->ifps
= ifps512
;
1477 pcb
->xstate
= AVX512
;
1482 /* The PCB lock is redundant in some scenarios given the higher level
1483 * thread mutex, but its pre-emption disablement is relied upon here
1485 simple_unlock(&pcb
->lock
);
// Release the old AVX-sized area.
1488 fp_state_free(ifps
, AVX
);
// The AVX512-sized allocation lives in ifps512, so that is the pointer
// released with the AVX512 tag; ifps only ever names the AVX-sized area.
1491 fp_state_free(ifps512
, AVX512
);
/*
 * Upgrade the calling thread to AVX512.
 */
// Promotes thread to AVX512 state: calls fpu_savearea_promote_avx512 on it
// and then records AVX512 as the xstate of the current task.
// NOTE(review): the return type and the statements taken when thread is not
// the current thread, or when AVX512 is not enabled, are not visible here
// (presumably early returns).
1499 fpu_thread_promote_avx512(thread_t thread
)
1501 task_t task
= current_task();
1503 if (thread
!= current_thread())
1505 if (!ml_fpu_avx512_enabled())
1508 fpu_savearea_promote_avx512(thread
);
1510 /* Racy but the task's xstate is only a hint */
1511 task
->xstate
= AVX512
;
/*
 * Called from user_trap() when an invalid opcode fault is taken.
 * If the user is attempting an AVX512 instruction on a machine
 * that supports this, we switch the calling thread to use
 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
 * return directly via thread_exception_return().
 * Otherwise simply return.
 */
// Limit compared against the distance scanned from the faulting rip by the
// prefix loop in fpUDflt.
1525 #define MAX_X86_INSN_LENGTH (16)
// Invalid-opcode helper. It reads instruction bytes from user space one at
// a time, passing over legacy prefixes, until it meets an EVEX or VEX
// prefix byte. If fpu_capability is AVX512, the current thread is promoted
// to AVX512 state and thread_exception_return is called.
// NOTE(review): the return type, braces, the loop opening and the statements
// on the failure paths are not visible in this listing.
1527 fpUDflt(user_addr_t rip
)
1529 uint8_t instruction_prefix
;
1530 boolean_t is_AVX512_instruction
= FALSE
;
1531 user_addr_t original_rip
= rip
;
1533 /* TODO: as an optimisation, copy up to the lesser of the
1534 * next page boundary or maximal prefix length in one pass
1535 * rather than issue multiple copyins
1537 if (copyin(rip
, (char *) &instruction_prefix
, 1)) {
1540 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1541 rip
, instruction_prefix
);
1542 /* TODO: determine more specifically which prefixes
1543 * are sane possibilities for AVX512 insns
1545 switch (instruction_prefix
) {
1546 case 0x2E: /* CS segment override */
1547 case 0x36: /* SS segment override */
1548 case 0x3E: /* DS segment override */
1549 case 0x26: /* ES segment override */
1550 case 0x64: /* FS segment override */
1551 case 0x65: /* GS segment override */
1552 case 0x66: /* Operand-size override */
1553 case 0x67: /* address-size override */
1554 /* Skip optional prefixes */
// Bound the scan by how far rip has moved from where it started.
1556 if ((rip
- original_rip
) > MAX_X86_INSN_LENGTH
) {
// EVEX and both VEX forms end the scan; all three are treated as a possible
// AVX512 instruction.
1560 case 0x62: /* EVEX */
1561 case 0xC5: /* VEX 2-byte */
1562 case 0xC4: /* VEX 3-byte */
1563 is_AVX512_instruction
= TRUE
;
1568 } while (!is_AVX512_instruction
);
1570 /* Here if we detect attempted execution of an AVX512 instruction */
1573 * Fail if this machine doesn't support AVX512
1575 if (fpu_capability
!= AVX512
)
// Before promotion XCR0 is expected to hold exactly the AVX mask.
1578 assert(xgetbv(XCR0
) == AVX_XMASK
);
1580 DBG("fpUDflt() switching xstate to AVX512\n");
1581 (void) fpu_thread_promote_avx512(current_thread());
1583 thread_exception_return();
1586 #endif /* !defined(RC_HIDE_XNU_J137) */
// Sets fp_valid on the save area of the current thread to value. When value
// is TRUE, interrupts are disabled around a statement that is not visible
// in this listing and then restored to their previous state.
// NOTE(review): the return type and any null check on ifps are not visible.
1589 fp_setvalid(boolean_t value
) {
1590 thread_t thr_act
= current_thread();
1591 struct x86_fx_thread_state
*ifps
= thr_act
->machine
.ifps
;
1594 ifps
->fp_valid
= value
;
1596 if (value
== TRUE
) {
1597 boolean_t istate
= ml_set_interrupts_enabled(FALSE
);
1599 ml_set_interrupts_enabled(istate
);
// Reports whether fpu_capability is at least AVX.
// NOTE(review): the return type line is not visible in this listing.
1605 ml_fpu_avx_enabled(void) {
1606 return (fpu_capability
>= AVX
);
1609 #if !defined(RC_HIDE_XNU_J137)
// Reports whether fpu_capability is exactly AVX512.
// NOTE(review): the return type line is not visible in this listing.
1611 ml_fpu_avx512_enabled(void) {
1612 return (fpu_capability
== AVX512
);
// Returns the xstate recorded on task.
// NOTE(review): the value returned when task is TASK_NULL is not visible in
// this listing, and neither is the return type.
1617 task_xstate(task_t task
)
1619 if (task
== TASK_NULL
)
1622 return task
->xstate
;
// Returns the xstate that applies to thread. When the PCB value is
// UNDEFINED the task-level value from task_xstate is used instead.
// NOTE(review): the other return is not visible; presumably it yields xs.
1626 thread_xstate(thread_t thread
)
1628 xstate_t xs
= THREAD_TO_PCB(thread
)->xstate
;
1629 if (xs
== UNDEFINED
)
1630 return task_xstate(thread
->task
);
// Convenience wrapper: thread_xstate applied to the current thread.
1636 current_xstate(void)
1638 return thread_xstate(current_thread());
/*
 * Called when exec'ing between bitnesses.
 * If valid FPU state exists, adjust the layout.
 */
// Rewrites fp_save_layout in the save area of thread to match the new
// bitness given by is_64bit, but only when a save area exists and is valid.
// A thread whose xstate is FP gets the FXSAVE layouts; the second
// assignment (presumably the else arm) uses the XSAVE layouts.
// NOTE(review): the return type, braces and any statements outside the
// visible lines, including the end of the function, are not shown here.
1646 fpu_switch_addrmode(thread_t thread
, boolean_t is_64bit
)
1648 struct x86_fx_thread_state
*ifps
= thread
->machine
.ifps
;
1650 if (ifps
&& ifps
->fp_valid
) {
1651 if (thread_xstate(thread
) == FP
) {
1652 ifps
->fp_save_layout
= is_64bit
? FXSAVE64
: FXSAVE32
;
1654 ifps
->fp_save_layout
= is_64bit
? XSAVE64
: XSAVE32
;