]> git.saurik.com Git - apple/xnu.git/blame - osfmk/i386/fpu.c
xnu-6153.101.6.tar.gz
[apple/xnu.git] / osfmk / i386 / fpu.c
CommitLineData
1c79356b 1/*
ea3f0419 2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
0a7de745 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745 17 *
2d21ac55
A
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
0a7de745 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
0a7de745 31/*
1c79356b
A
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
0a7de745 35 *
1c79356b
A
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
0a7de745 41 *
1c79356b
A
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
0a7de745 45 *
1c79356b 46 * Carnegie Mellon requests users of this software to return to
0a7de745 47 *
1c79356b
A
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
0a7de745 52 *
1c79356b
A
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
1c79356b
A
56
57#include <mach/exception_types.h>
58#include <mach/i386/thread_status.h>
59#include <mach/i386/fp_reg.h>
60
61#include <kern/mach_param.h>
91447636 62#include <kern/processor.h>
1c79356b
A
63#include <kern/thread.h>
64#include <kern/zalloc.h>
65#include <kern/misc_protos.h>
66#include <kern/spl.h>
67#include <kern/assert.h>
68
060df5ea
A
69#include <libkern/OSAtomic.h>
70
0c530ab8 71#include <architecture/i386/pio.h>
55e303ae 72#include <i386/cpuid.h>
b0d623f7 73#include <i386/fpu.h>
0c530ab8 74#include <i386/proc_reg.h>
b0d623f7
A
75#include <i386/misc_protos.h>
76#include <i386/thread.h>
77#include <i386/trap.h>
1c79356b 78
0a7de745
A
/*
 * Both values start out UNDEFINED and are assigned by init_fpu():
 * fpu_capability is the most capable extended state selected for the
 * processor, fpu_default is the extended state used by default.
 */
xstate_t	fpu_capability = UNDEFINED;	/* extended state capability */
xstate_t	fpu_default = UNDEFINED;	/* default extended state */
1c79356b 81
/*
 * Non-zero when addr has none of the low bits selected by (size - 1) set,
 * i.e. addr is a multiple of size for a power-of-two size.
 */
#define ALIGNED(addr, size)	(0 == ((uintptr_t)(addr) & ((size) - 1)))
1c79356b
A
83
84/* Forward */
85
0a7de745
A
/* Routines defined outside this block and called from this file. */
extern void		fpinit(void);
extern void		fp_save(
	thread_t	thr_act);
extern void		fp_load(
	thread_t	thr_act);

/* File-local helpers that are used before their definitions. */
static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
static xstate_t thread_xstate(thread_t);
0c530ab8 94
0a7de745
A
/*
 * 64-byte aligned register-state images.
 *
 * initial_fp_state is the scratch/initial image handed to
 * configure_mxcsr_capability_mask() by fpu_module_init().
 * The default_* images are captured by configure_mxcsr_capability_mask()
 * and are reloaded by fpu_switch_context() right after it has saved the
 * outgoing thread's live registers.
 */
x86_ext_thread_state_t	initial_fp_state __attribute((aligned(64)));
x86_ext_thread_state_t	default_avx512_state __attribute((aligned(64)));
x86_ext_thread_state_t	default_avx_state __attribute((aligned(64)));
x86_ext_thread_state_t	default_fx_state __attribute((aligned(64)));

/*
 * Global MXCSR capability bitmask.
 * Set once by configure_mxcsr_capability_mask() and applied to incoming
 * MXCSR values in fpu_set_fxstate().
 */
static unsigned int mxcsr_capability_mask;
102
/*
 * Statement-like wrappers around individual x87 instructions.
 * None of them carries a trailing semicolon, so each can be used as an
 * ordinary statement (including as the body of an unbraced if/else).
 *
 * fnstcw() takes a pointer to the 16-bit destination, whereas fldcw()
 * takes the control-word object itself (its address is taken here).
 */
#define fninit() \
	__asm__ volatile("fninit")

#define fnstcw(control) \
	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))

#define fldcw(control) \
	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )

#define fnclex() \
	__asm__ volatile("fnclex")

#define fnsave(state) \
	__asm__ volatile("fnsave %0" : "=m" (*state))

/*
 * NOTE(review): unlike fnsave(), which dereferences its argument, frstor()
 * uses the argument itself as the memory operand -- confirm that callers
 * pass the state object rather than a pointer to it.
 */
#define frstor(state) \
	__asm__ volatile("frstor %0" : : "m" (state))

/*
 * The expansion previously ended in ';', which made "fwait();" expand to
 * two statements and broke use in an unbraced if/else.
 */
#define fwait() \
	__asm__ volatile("fwait")
060df5ea 123
0a7de745
A
/*
 * FXSAVE/FXRSTOR wrappers.  Each one issues exactly one instruction on the
 * FXSAVE-format image pointed to by 'a'; the "64" variants use the
 * fxsave64/fxrstor64 mnemonics.
 */

/* Load register state from the image at 'a' with fxrstor. */
static inline void
fxrstor(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxrstor %[image]" : : [image] "m" (*a));
}

/* Store register state to the image at 'a' with fxsave. */
static inline void
fxsave(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxsave %[image]" : [image] "=m" (*a));
}

/* Load register state from the image at 'a' with fxrstor64. */
static inline void
fxrstor64(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxrstor64 %[image]" : : [image] "m" (*a));
}

/* Store register state to the image at 'a' with fxsave64. */
static inline void
fxsave64(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxsave64 %[image]" : [image] "=m" (*a));
}
147
/*
 * True for the three concrete extended states (FP, AVX, AVX512).
 * Note that the argument is evaluated up to three times.
 */
#define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX || (x) == AVX512)

/*
 * Per-xstate save-area zones, indexed by xstate_t.
 * Entries are created in fpu_module_init(); fp_state_alloc() and
 * fp_state_free() assert that the entry they use is non-NULL.
 */
zone_t		ifps_zone[] = {
	[FP]     = NULL,
	[AVX]    = NULL,
	[AVX512] = NULL
};
/* Size in bytes of the save area used for each xstate. */
static uint32_t	fp_state_size[] = {
	[FP]     = sizeof(struct x86_fx_thread_state),
	[AVX]    = sizeof(struct x86_avx_thread_state),
	[AVX512] = sizeof(struct x86_avx512_thread_state)
};

/* Printable names for each xstate, used in kprintf/DBG output. */
static const char *xstate_name[] = {
	[UNDEFINED] = "UNDEFINED",
	[FP] = "FP",
	[AVX] = "AVX",
	[AVX512] = "AVX512"
};

/* Capability predicates derived from the global fpu_capability. */
#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
170/*
171 * On-demand AVX512 support
172 * ------------------------
173 * On machines with AVX512 support, by default, threads are created with
174 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
175 * capabilities are advertised in the commpage and via sysctl. If a thread
176 * opts to use AVX512 instructions, the first will result in a #UD exception.
177 * Faulting AVX512 instructions are recognizable by their unique prefix.
178 * This exception results in the thread being promoted to use an AVX512-sized
179 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
180 * instruction is re-driven and the thread can proceed to perform AVX512
181 * operations.
182 *
183 * In addition to AVX512 instructions causing promotion, the thread_set_state()
184 * primitive with an AVX512 state flavor results in promotion.
185 *
186 * AVX512 promotion of the first thread in a task causes the default xstate
187 * of the task to be promoted so that any subsequently created or subsequently
188 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
189 * a promoted xstate.
190 *
191 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
192 * and a second pool of larger AVX512-sized (2688 byte) areas.
193 *
194 * Note the initial state value is an AVX512 object but that the AVX initial
195 * value is a subset of it.
196 */
/*
 * Latch used with OSCompareAndSwap(0, 1, ...) in init_fpu() so that
 * cpuid_set_info() is re-run only once after OSXSAVE is enabled.
 */
static uint32_t	cpuid_reevaluated = 0;

static void fpu_store_registers(void *, boolean_t);
static void fpu_load_registers(void *);

/*
 * Feature mask associated with each xstate.  It is passed as the
 * requested-feature bitmap to the xsave/xrstor wrappers and written to
 * XCR0 via xsetbv() when the xstate changes.
 */
static const uint32_t xstate_xmask[] = {
	[FP] =		FP_XMASK,
	[AVX] =		AVX_XMASK,
	[AVX512] =	AVX512_XMASK
};
060df5ea 207
0a7de745
A
208static inline void
209xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
210{
211 __asm__ __volatile__ ("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0));
5ba3f43e
A
212}
213
0a7de745
A
214static inline void
215xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
216{
217 __asm__ __volatile__ ("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0));
5ba3f43e
A
218}
219
0a7de745
A
220static inline void
221xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
222{
223 __asm__ __volatile__ ("xrstor %0" :: "m" (*a), "a"(rfbm), "d"(0));
5ba3f43e
A
224}
225
0a7de745
A
226static inline void
227xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
228{
229 __asm__ __volatile__ ("xrstor64 %0" :: "m" (*a), "a"(rfbm), "d"(0));
060df5ea
A
230}
231
0a7de745
A
232__unused static inline void
233vzeroupper(void)
234{
235 __asm__ __volatile__ ("vzeroupper" ::);
5ba3f43e 236}
5ba3f43e 237
0a7de745 238static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */
5ba3f43e 239
5ba3f43e
A
240
241/*
242 * Furthermore, make compile-time asserts that no padding creeps into structures
243 * for which we're doing this.
244 */
0a7de745
A
245#define ASSERT_PACKED(t, m1, m2, n, mt) \
246extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \
5ba3f43e
A
247 [(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]
248
249ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
250
251ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
252
253ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
254ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
255ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
256
257ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
258ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
259ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
260ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
261
262#if defined(DEBUG_AVX512)
263
0a7de745 264#define DBG(x...) kprintf("DBG: " x)
5ba3f43e
A
265
266typedef struct { uint8_t byte[8]; } opmask_t;
267typedef struct { uint8_t byte[16]; } xmm_t;
268typedef struct { uint8_t byte[32]; } ymm_t;
269typedef struct { uint8_t byte[64]; } zmm_t;
270
271static void
272DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
273{
0a7de745 274 int i, j;
5ba3f43e
A
275 xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg;
276 xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
277 ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
278 zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM;
279 opmask_t *k = (opmask_t *) &sp->x_Opmask;
280
281 kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
282 kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
283 kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
284 kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
285
286 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
287 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
288
289 /* Print all ZMM registers */
290 for (i = 0; i < 16; i++) {
291 kprintf("zmm%d:\t0x", i);
0a7de745 292 for (j = 0; j < 16; j++) {
5ba3f43e 293 kprintf("%02x", xmm[i].byte[j]);
0a7de745
A
294 }
295 for (j = 0; j < 16; j++) {
5ba3f43e 296 kprintf("%02x", ymmh[i].byte[j]);
0a7de745
A
297 }
298 for (j = 0; j < 32; j++) {
5ba3f43e 299 kprintf("%02x", zmmh[i].byte[j]);
0a7de745 300 }
5ba3f43e
A
301 kprintf("\n");
302 }
303 for (i = 0; i < 16; i++) {
0a7de745
A
304 kprintf("zmm%d:\t0x", 16 + i);
305 for (j = 0; j < 64; j++) {
5ba3f43e 306 kprintf("%02x", zmm[i].byte[j]);
0a7de745 307 }
5ba3f43e
A
308 kprintf("\n");
309 }
310 for (i = 0; i < 8; i++) {
311 kprintf("k%d:\t0x", i);
0a7de745 312 for (j = 0; j < 8; j++) {
5ba3f43e 313 kprintf("%02x", k[i].byte[j]);
0a7de745 314 }
5ba3f43e
A
315 kprintf("\n");
316 }
317
318 kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
319 kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
320}
321#else
0a7de745 322#define DBG(x...)
5ba3f43e
A
323static void
324DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
325{
326 return;
327}
328#endif /* DEBUG_AVX512 */
329
#if DEBUG
/* Read the x87 status word with fnstsw and return it (DEBUG builds only). */
static inline unsigned short
fnstsw(void)
{
	unsigned short status_word;

	__asm__ volatile ("fnstsw %[sw]" : [sw] "=ma" (status_word));
	return status_word;
}
#endif
060df5ea 339
/*
 * Configure the initial FPU state presented to new threads.
 * Determine the MXCSR capability mask, which allows us to mask off any
 * potentially unsafe "reserved" bits before restoring the FPU context.
 * *Not* per-cpu, assumes symmetry.
 *
 * fps: 64-byte aligned scratch image.  It is zeroed, used to capture and
 *      reload a freshly initialized register state, and left poisoned on
 *      return so that it cannot be mistaken for a valid save area.
 *
 * Side effects: sets mxcsr_capability_mask, fills the default_*_state
 * images that match fpu_capability, and finishes with set_ts().
 */

static void
configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
{
	/* XSAVE requires a 64 byte aligned store */
	assert(ALIGNED(fps, 64));
	/* Clear, to prepare for the diagnostic FXSAVE */
	bzero(fps, sizeof(*fps));

	/* Capture a freshly initialized register state into fps */
	fpinit();
	fpu_store_registers(fps, FALSE);

	mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;

	/* Set default mask value if necessary */
	if (mxcsr_capability_mask == 0) {
		mxcsr_capability_mask = 0xffbf;
	}

	/* Clear vector register store */
	bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
	bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
	if (fpu_ZMM_capable) {
		bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
		bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
		bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
	}

	/*
	 * Mark the image valid and give it a layout only for the duration of
	 * the reload below; both fields are poisoned again afterwards.
	 */
	fps->fx.fp_valid = TRUE;
	fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
	fpu_load_registers(fps);

	/*
	 * Snapshot the now-loaded registers as the default images.
	 * A ZMM-capable machine is also YMM-capable, so it records both the
	 * AVX512 and the AVX image.
	 */
	if (fpu_ZMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
	}
	if (fpu_YMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
	} else {
		fxsave64((struct x86_fx_thread_state *)&default_fx_state);
	}

	/* Poison values to trap unsafe usage */
	fps->fx.fp_valid = 0xFFFFFFFF;
	fps->fx.fp_save_layout = FP_UNUSED;

	/* Re-enable FPU/SSE DNA exceptions */
	set_ts();
}
394
/* Set from the "fpsimd_fault_popc" boot-arg in init_fpu(); defaults to 0. */
int fpsimd_fault_popc = 0;
/*
 * Look for FPU and initialize it.
 * Called on each CPU.
 *
 * Enables the FPU in CR0, requires FXSR (panics otherwise), then selects
 * fpu_capability / fpu_default from the XSAVE feature bits:
 *   - AVX512 present and not disabled: capability AVX512, default AVX
 *   - otherwise AVX present:           capability AVX,    default AVX
 *   - otherwise:                       capability FP,     default FP
 * On exit the CPU's cpu_xstate is fpu_default and CR0.TS/CR0.MP are set.
 */
void
init_fpu(void)
{
#if DEBUG
	unsigned short	status;
	unsigned short	control;
#endif
	/*
	 * Check for FPU by initializing it,
	 * then trying to read the correct bit patterns from
	 * the control and status registers.
	 */
	set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE);	/* allow use of FPU */
	fninit();
#if DEBUG
	status = fnstsw();
	fnstcw(&control);

	assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
#endif
	/* Advertise SSE support */
	if (cpuid_features() & CPUID_FEATURE_FXSR) {
		set_cr4(get_cr4() | CR4_OSFXS);
		/* And allow SIMD exceptions if present */
		if (cpuid_features() & CPUID_FEATURE_SSE) {
			set_cr4(get_cr4() | CR4_OSXMM);
		}
	} else {
		panic("fpu is not FP_FXSR");
	}

	/* Baseline; upgraded below if XSAVE reports AVX or AVX512 */
	fpu_capability = fpu_default = FP;

	PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));

	/*
	 * Function-static: the "avx512" boot-arg is parsed only on the master
	 * CPU and the result is reused by every later call.
	 * NOTE(review): this presumes the master CPU runs init_fpu() before
	 * the other CPUs -- confirm against the caller.
	 */
	static boolean_t is_avx512_enabled = TRUE;
	if (cpu_number() == master_cpu) {
		if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
			PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
			kprintf("AVX512 supported %s\n",
			    is_avx512_enabled ? "and enabled" : "but disabled");
		}
	}

	/* Configure the XSAVE context mechanism if the processor supports
	 * AVX/YMM registers
	 */
	if (cpuid_features() & CPUID_FEATURE_XSAVE) {
		cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
		if (is_avx512_enabled &&
		    (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			assert(xs0p->extended_state[eax] & XFEM_YMM);
			fpu_capability = AVX512;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			xsetbv(0, AVX512_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]);
			/*
			 * AVX set until AVX512 is used.
			 * See comment above about on-demand AVX512 support.
			 */
			xsetbv(0, AVX_XMASK);
			fpu_default = AVX;
		} else if (xs0p->extended_state[eax] & XFEM_YMM) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			fpu_capability = AVX;
			fpu_default = AVX;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			xsetbv(0, AVX_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX]);
		}
	}

	if (cpu_number() == master_cpu) {
		kprintf("fpu_state: %s, state_size: %d\n",
		    xstate_name[fpu_capability],
		    fp_state_size[fpu_capability]);
	}

	fpinit();
	/* Record the xstate now programmed on this CPU */
	current_cpu_datap()->cpu_xstate = fpu_default;

	/*
	 * Trap wait instructions.  Turn off FPU for now.
	 */
	set_cr0(get_cr0() | CR0_TS | CR0_MP);
}
499
500/*
5ba3f43e 501 * Allocate and initialize FP state for specified xstate.
060df5ea
A
502 * Don't load state.
503 */
504static void *
5ba3f43e 505fp_state_alloc(xstate_t xs)
060df5ea 506{
5ba3f43e
A
507 struct x86_fx_thread_state *ifps;
508
509 assert(ifps_zone[xs] != NULL);
510 ifps = zalloc(ifps_zone[xs]);
0c530ab8 511
0a7de745
A
512#if DEBUG
513 if (!(ALIGNED(ifps, 64))) {
5ba3f43e 514 panic("fp_state_alloc: %p, %u, %p, %u",
0a7de745
A
515 ifps, (unsigned) ifps_zone[xs]->elem_size,
516 (void *) ifps_zone[xs]->free_elements,
517 (unsigned) ifps_zone[xs]->alloc_size);
1c79356b 518 }
060df5ea 519#endif
5ba3f43e
A
520 bzero(ifps, fp_state_size[xs]);
521
060df5ea
A
522 return ifps;
523}
524
525static inline void
5ba3f43e 526fp_state_free(void *ifps, xstate_t xs)
060df5ea 527{
5ba3f43e
A
528 assert(ifps_zone[xs] != NULL);
529 zfree(ifps_zone[xs], ifps);
060df5ea
A
530}
531
0a7de745
A
/*
 * Calls set_ts() and nothing else.  Other callers in this file describe
 * set_ts() as re-enabling FPU/SSE DNA exceptions, i.e. disabling the FPU
 * until the next fault.
 */
void
clear_fpu(void)
{
	set_ts();
}
537
538
0a7de745
A
539static void
540fpu_load_registers(void *fstate)
541{
060df5ea
A
542 struct x86_fx_thread_state *ifps = fstate;
543 fp_save_layout_t layout = ifps->fp_save_layout;
544
0a7de745
A
545 assert(current_task() == NULL || \
546 (thread_is_64bit_addr(current_thread()) ? \
547 (layout == FXSAVE64 || layout == XSAVE64) : \
548 (layout == FXSAVE32 || layout == XSAVE32)));
060df5ea
A
549 assert(ALIGNED(ifps, 64));
550 assert(ml_get_interrupts_enabled() == FALSE);
551
0a7de745 552#if DEBUG
060df5ea
A
553 if (layout == XSAVE32 || layout == XSAVE64) {
554 struct x86_avx_thread_state *iavx = fstate;
555 unsigned i;
556 /* Verify reserved bits in the XSAVE header*/
0a7de745 557 if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
5ba3f43e 558 panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
0a7de745
A
559 }
560 for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
561 if (iavx->_xh.xhrsvd[i]) {
060df5ea 562 panic("Reserved bit set");
0a7de745
A
563 }
564 }
060df5ea 565 }
5ba3f43e 566 if (fpu_YMM_capable) {
0a7de745 567 if (layout != XSAVE32 && layout != XSAVE64) {
060df5ea 568 panic("Inappropriate layout: %u\n", layout);
0a7de745 569 }
060df5ea 570 }
0a7de745 571#endif /* DEBUG */
060df5ea 572
5ba3f43e 573 switch (layout) {
0a7de745 574 case FXSAVE64:
5ba3f43e
A
575 fxrstor64(ifps);
576 break;
0a7de745 577 case FXSAVE32:
060df5ea 578 fxrstor(ifps);
5ba3f43e 579 break;
0a7de745 580 case XSAVE64:
5ba3f43e
A
581 xrstor64(ifps, xstate_xmask[current_xstate()]);
582 break;
0a7de745 583 case XSAVE32:
5ba3f43e
A
584 xrstor(ifps, xstate_xmask[current_xstate()]);
585 break;
0a7de745 586 default:
5ba3f43e
A
587 panic("fpu_load_registers() bad layout: %d\n", layout);
588 }
060df5ea
A
589}
590
0a7de745
A
591static void
592fpu_store_registers(void *fstate, boolean_t is64)
593{
060df5ea
A
594 struct x86_fx_thread_state *ifps = fstate;
595 assert(ALIGNED(ifps, 64));
5ba3f43e
A
596 xstate_t xs = current_xstate();
597 switch (xs) {
0a7de745 598 case FP:
5ba3f43e
A
599 if (is64) {
600 fxsave64(fstate);
601 ifps->fp_save_layout = FXSAVE64;
602 } else {
603 fxsave(fstate);
604 ifps->fp_save_layout = FXSAVE32;
605 }
606 break;
0a7de745 607 case AVX:
0a7de745 608 case AVX512:
5ba3f43e
A
609 if (is64) {
610 xsave64(ifps, xstate_xmask[xs]);
611 ifps->fp_save_layout = XSAVE64;
612 } else {
613 xsave(ifps, xstate_xmask[xs]);
614 ifps->fp_save_layout = XSAVE32;
615 }
616 break;
0a7de745 617 default:
5ba3f43e 618 panic("fpu_store_registers() bad xstate: %d\n", xs);
060df5ea 619 }
1c79356b
A
620}
621
622/*
623 * Initialize FP handling.
624 */
060df5ea 625
1c79356b
A
626void
627fpu_module_init(void)
628{
0a7de745 629 if (!IS_VALID_XSTATE(fpu_default)) {
5ba3f43e 630 panic("fpu_module_init: invalid extended state %u\n",
0a7de745
A
631 fpu_default);
632 }
060df5ea 633
5ba3f43e 634 /* We explicitly choose an allocation size of 13 pages = 64 * 832
060df5ea
A
635 * to eliminate waste for the 832 byte sized
636 * AVX XSAVE register save area.
637 */
5ba3f43e 638 ifps_zone[fpu_default] = zinit(fp_state_size[fpu_default],
0a7de745
A
639 thread_max * fp_state_size[fpu_default],
640 64 * fp_state_size[fpu_default],
641 "x86 fpsave state");
060df5ea 642
060df5ea
A
643 /* To maintain the required alignment, disable
644 * zone debugging for this zone as that appends
645 * 16 bytes to each element.
646 */
5ba3f43e
A
647 zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE);
648
5ba3f43e
A
649 /*
650 * If AVX512 is supported, create a separate savearea zone.
651 * with allocation size: 19 pages = 32 * 2668
652 */
653 if (fpu_capability == AVX512) {
654 ifps_zone[AVX512] = zinit(fp_state_size[AVX512],
0a7de745
A
655 thread_max * fp_state_size[AVX512],
656 32 * fp_state_size[AVX512],
657 "x86 avx512 save state");
5ba3f43e
A
658 zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE);
659 }
5ba3f43e 660
060df5ea
A
661 /* Determine MXCSR reserved bits and configure initial FPU state*/
662 configure_mxcsr_capability_mask(&initial_fp_state);
663}
664
/*
 * Context switch fpu state.
 * Always save old thread's FPU context but don't load new .. allow that to fault-in.
 * Switch to the new task's xstate.
 *
 * old: outgoing thread; it is dereferenced unconditionally.
 * new: incoming thread, or NULL, in which case fpu_default is used as the
 *      target xstate.
 *
 * Must be called with interrupts disabled.  Always ends with set_ts().
 */

void
fpu_switch_context(thread_t old, thread_t new)
{
	struct x86_fx_thread_state *ifps;
	cpu_data_t *cdp = current_cpu_datap();
	xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;

	assert(ml_get_interrupts_enabled() == FALSE);
	ifps = (old)->machine.ifps;
#if DEBUG
	if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
		panic("ifps->fp_valid: %u\n", ifps->fp_valid);
	}
#endif
	/* fp_valid == FALSE means the save area is stale and the registers are live */
	if (ifps != 0 && (ifps->fp_valid == FALSE)) {
		/* Clear CR0.TS in preparation for the FP context save. In
		 * theory, this shouldn't be necessary since a live FPU should
		 * indicate that TS is clear. However, various routines
		 * (such as sendsig & sigreturn) manipulate TS directly.
		 */
		clear_ts();
		/* registers are in FPU - save to memory */
		boolean_t is64 = (thread_is_64bit_addr(old) &&
		    is_saved_state64(old->machine.iss));

		fpu_store_registers(ifps, is64);
		ifps->fp_valid = TRUE;

		/*
		 * With the old state saved, reload the default image that
		 * matches this CPU's current xstate so the outgoing thread's
		 * register contents do not remain in the FPU.
		 */
		if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
			xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
		} else if (fpu_YMM_capable) {
			xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
		} else {
			fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
		}
	}

	/* XCR0 must agree with the xstate recorded for this CPU */
	assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
	/* Reprogram XCR0 only when the incoming thread's xstate differs */
	if (new_xstate != (xstate_t) cdp->cpu_xstate) {
		DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
		    old, new, xstate_name[new_xstate]);
		xsetbv(0, xstate_xmask[new_xstate]);
		cdp->cpu_xstate = new_xstate;
	}
	/* Leave the FPU disabled so the new thread's state is faulted in on first use */
	set_ts();
}
717
060df5ea 718
1c79356b
A
719/*
720 * Free a FPU save area.
721 * Called only when thread terminating - no locking necessary.
722 */
723void
5ba3f43e 724fpu_free(thread_t thread, void *fps)
1c79356b 725{
0a7de745
A
726 pcb_t pcb = THREAD_TO_PCB(thread);
727
5ba3f43e
A
728 fp_state_free(fps, pcb->xstate);
729 pcb->xstate = UNDEFINED;
1c79356b
A
730}
731
/*
 * Set the floating-point state for a thread based
 * on the FXSave formatted data. This is basically
 * the same as fpu_set_state except it uses the
 * expanded data structure.
 * If the thread is not the current thread, it is
 * not running (held).  Locking needed against
 * concurrent fpu_set_state or fpu_get_state.
 *
 * thr_act: target thread; must not be THREAD_NULL.
 * tstate:  incoming state in the layout implied by 'f', or NULL to discard
 *          the thread's save area.  Note that the MXCSR field of the
 *          caller's buffer is masked in place.
 * f:       state flavor (float, AVX or AVX512; 32- or 64-bit).
 *
 * Returns KERN_FAILURE when the FPU capability is undefined, when an AVX
 * flavor is requested on a machine without AVX, or when promotion of the
 * thread to AVX512 fails; KERN_SUCCESS otherwise.
 */
kern_return_t
fpu_set_fxstate(
	thread_t	thr_act,
	thread_state_t	tstate,
	thread_flavor_t	f)
{
	struct x86_fx_thread_state	*ifps;
	struct x86_fx_thread_state	*new_ifps;
	x86_float_state64_t		*state;
	pcb_t				pcb;
	boolean_t			old_valid, fresh_state = FALSE;
	xstate_t			thr_xstate;

	if (fpu_capability == UNDEFINED) {
		return KERN_FAILURE;
	}

	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
	    fpu_capability < AVX) {
		return KERN_FAILURE;
	}

	assert(thr_act != THREAD_NULL);

	thr_xstate = thread_xstate(thr_act);

	/* An AVX512 flavor aimed at an AVX thread promotes the thread first */
	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
	    thr_xstate == AVX) {
		if (!fpu_thread_promote_avx512(thr_act)) {
			return KERN_FAILURE;
		} else {
			/* Reload thr_xstate after successful promotion */
			thr_xstate = thread_xstate(thr_act);
		}
	}

	state = (x86_float_state64_t *)tstate;

	pcb = THREAD_TO_PCB(thr_act);

	if (state == NULL) {
		/*
		 * new FPU state is 'invalid'.
		 * Deallocate the fp state if it exists.
		 */
		simple_lock(&pcb->lock, LCK_GRP_NULL);

		ifps = pcb->ifps;
		pcb->ifps = 0;

		simple_unlock(&pcb->lock);

		/* The area is freed only after the lock has been dropped */
		if (ifps != 0) {
			fp_state_free(ifps, thr_xstate);
		}
	} else {
		/*
		 * Valid incoming state. Allocate the fp state if there is none.
		 */
		new_ifps = 0;
Retry:
		simple_lock(&pcb->lock, LCK_GRP_NULL);

		ifps = pcb->ifps;
		if (ifps == 0) {
			if (new_ifps == 0) {
				/*
				 * Allocation happens with the lock dropped;
				 * the PCB is re-examined after retaking it.
				 */
				simple_unlock(&pcb->lock);
				new_ifps = fp_state_alloc(thr_xstate);
				goto Retry;
			}
			ifps = new_ifps;
			new_ifps = 0;
			pcb->ifps = ifps;
			pcb->xstate = thr_xstate;
			fresh_state = TRUE;
		}

		/*
		 * now copy over the new data.
		 */

		old_valid = ifps->fp_valid;

#if DEBUG || DEVELOPMENT
		if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
			panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
		}
#endif
		/*
		 * Clear any reserved bits in the MXCSR to prevent a GPF
		 * when issuing an FXRSTOR.
		 */

		state->fpu_mxcsr &= mxcsr_capability_mask;

		/* The FXSAVE-format portion is common to every xstate */
		__nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);

		switch (thr_xstate) {
		case UNDEFINED_FULL:
		case FP_FULL:
		case AVX_FULL:
		case AVX512_FULL:
			panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
			break;

		case UNDEFINED:
			panic("fpu_set_fxstate() UNDEFINED xstate");
			break;
		case FP:
			ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
			break;
		case AVX: {
			struct x86_avx_thread_state *iavx = (void *) ifps;
			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;

			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;

			/* Sanitize XSAVE header */
			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
			iavx->_xh.xstate_bv = AVX_XMASK;
			iavx->_xh.xcomp_bv = 0;

			/*
			 * Copy the upper YMM halves for the AVX flavors; for
			 * any other flavor only the x87 and SSE components
			 * are marked present in the header.
			 */
			if (f == x86_AVX_STATE32) {
				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
			} else if (f == x86_AVX_STATE64) {
				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
			} else {
				iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
			}
			break;
		}
		case AVX512: {
			struct x86_avx512_thread_state *iavx = (void *) ifps;
			/* One incoming pointer viewed as either flavor's layout */
			union {
				thread_state_t       ts;
				x86_avx512_state32_t *s32;
				x86_avx512_state64_t *s64;
			} xs = { .ts = tstate };

			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;

			/* Sanitize XSAVE header */
			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
			iavx->_xh.xstate_bv = AVX512_XMASK;
			iavx->_xh.xcomp_bv = 0;

			/*
			 * Copy whichever extended components the flavor
			 * carries.  Flavors not listed here copy nothing more
			 * and leave xstate_bv at AVX512_XMASK.
			 */
			switch (f) {
			case x86_AVX512_STATE32:
				__nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE32:
				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				break;
			case x86_AVX512_STATE64:
				__nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
				__nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE64:
				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				break;
			}
			break;
		}
		}

		/* The bcopy above overwrote fp_valid with caller data; restore it */
		ifps->fp_valid = old_valid;

		if (old_valid == FALSE) {
			/*
			 * The save area now holds the authoritative state, so
			 * mark it valid with interrupts disabled.
			 */
			boolean_t istate = ml_set_interrupts_enabled(FALSE);
			ifps->fp_valid = TRUE;
			/* If altering the current thread's state, disable FPU */
			if (thr_act == current_thread()) {
				set_ts();
			}

			ml_set_interrupts_enabled(istate);
		}

		simple_unlock(&pcb->lock);

		/* A spare area remains if the PCB gained one while we were allocating */
		if (new_ifps != 0) {
			fp_state_free(new_ifps, thr_xstate);
		}
	}
	return KERN_SUCCESS;
}
933
934/*
935 * Get the floating-point state for a thread.
936 * If the thread is not the current thread, it is
937 * not running (held). Locking needed against
938 * concurrent fpu_set_state or fpu_get_state.
939 */
940kern_return_t
941fpu_get_fxstate(
0a7de745
A
942 thread_t thr_act,
943 thread_state_t tstate,
060df5ea 944 thread_flavor_t f)
55e303ae 945{
0a7de745
A
946 struct x86_fx_thread_state *ifps;
947 x86_float_state64_t *state;
948 kern_return_t ret = KERN_FAILURE;
949 pcb_t pcb;
ea3f0419 950 xstate_t thr_xstate = thread_xstate(thr_act);
55e303ae 951
0a7de745 952 if (fpu_capability == UNDEFINED) {
2d21ac55 953 return KERN_FAILURE;
0a7de745 954 }
0c530ab8 955
bd504ef0 956 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
0a7de745 957 fpu_capability < AVX) {
bd504ef0 958 return KERN_FAILURE;
0a7de745 959 }
bd504ef0 960
5ba3f43e 961 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
ea3f0419 962 thr_xstate != AVX512) {
5ba3f43e 963 return KERN_FAILURE;
0a7de745 964 }
5ba3f43e 965
0c530ab8 966 state = (x86_float_state64_t *)tstate;
55e303ae 967
91447636 968 assert(thr_act != THREAD_NULL);
6d2010ae 969 pcb = THREAD_TO_PCB(thr_act);
55e303ae 970
0a7de745 971 simple_lock(&pcb->lock, LCK_GRP_NULL);
0c530ab8
A
972
973 ifps = pcb->ifps;
55e303ae 974 if (ifps == 0) {
2d21ac55 975 /*
0c530ab8
A
976 * No valid floating-point state.
977 */
060df5ea 978
cb323159 979 __nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
5ba3f43e 980 fp_state_size[FP]);
0c530ab8
A
981
982 simple_unlock(&pcb->lock);
6601e61a 983
0c530ab8
A
984 return KERN_SUCCESS;
985 }
986 /*
987 * Make sure we`ve got the latest fp state info
988 * If the live fpu state belongs to our target
989 */
2d21ac55 990 if (thr_act == current_thread()) {
0a7de745 991 boolean_t intr;
8f6c56a5 992
0c530ab8 993 intr = ml_set_interrupts_enabled(FALSE);
89b3af67 994
0c530ab8
A
995 clear_ts();
996 fp_save(thr_act);
997 clear_fpu();
6601e61a 998
0c530ab8 999 (void)ml_set_interrupts_enabled(intr);
6601e61a 1000 }
0c530ab8 1001 if (ifps->fp_valid) {
cb323159 1002 __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
ea3f0419 1003 switch (thr_xstate) {
0a7de745
A
1004 case UNDEFINED_FULL:
1005 case FP_FULL:
1006 case AVX_FULL:
1007 case AVX512_FULL:
ea3f0419 1008 panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
0a7de745
A
1009 break;
1010
1011 case UNDEFINED:
5ba3f43e
A
1012 panic("fpu_get_fxstate() UNDEFINED xstate");
1013 break;
0a7de745
A
1014 case FP:
1015 break; /* already done */
1016 case AVX: {
7ddcb079 1017 struct x86_avx_thread_state *iavx = (void *) ifps;
5ba3f43e
A
1018 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
1019 if (f == x86_AVX_STATE32) {
cb323159 1020 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
5ba3f43e 1021 } else if (f == x86_AVX_STATE64) {
cb323159 1022 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
5ba3f43e
A
1023 }
1024 break;
0a7de745 1025 }
0a7de745 1026 case AVX512: {
5ba3f43e
A
1027 struct x86_avx512_thread_state *iavx = (void *) ifps;
1028 union {
1029 thread_state_t ts;
1030 x86_avx512_state32_t *s32;
1031 x86_avx512_state64_t *s64;
1032 } xs = { .ts = tstate };
1033 switch (f) {
0a7de745 1034 case x86_AVX512_STATE32:
cb323159
A
1035 __nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1036 __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
1037 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
5ba3f43e
A
1038 DBG_AVX512_STATE(iavx);
1039 break;
0a7de745 1040 case x86_AVX_STATE32:
cb323159 1041 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
5ba3f43e 1042 break;
0a7de745 1043 case x86_AVX512_STATE64:
cb323159
A
1044 __nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1045 __nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
1046 __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
1047 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
5ba3f43e
A
1048 DBG_AVX512_STATE(iavx);
1049 break;
0a7de745 1050 case x86_AVX_STATE64:
cb323159 1051 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
5ba3f43e 1052 break;
7ddcb079 1053 }
5ba3f43e 1054 break;
0a7de745 1055 }
7ddcb079
A
1056 }
1057
0c530ab8 1058 ret = KERN_SUCCESS;
6601e61a 1059 }
0c530ab8 1060 simple_unlock(&pcb->lock);
21362eb3 1061
0c530ab8 1062 return ret;
6601e61a 1063}
21362eb3 1064
0c530ab8 1065
2d21ac55 1066
6601e61a 1067/*
0c530ab8
A
1068 * the child thread is 'stopped' with the thread
1069 * mutex held and is currently not known by anyone
1070 * so no way for fpu state to get manipulated by an
1071 * outside agency -> no need for pcb lock
6601e61a 1072 */
0c530ab8
A
1073
1074void
1075fpu_dup_fxstate(
0a7de745
A
1076 thread_t parent,
1077 thread_t child)
6601e61a 1078{
060df5ea 1079 struct x86_fx_thread_state *new_ifps = NULL;
0a7de745
A
1080 boolean_t intr;
1081 pcb_t ppcb;
1082 xstate_t xstate = thread_xstate(parent);
21362eb3 1083
6d2010ae 1084 ppcb = THREAD_TO_PCB(parent);
21362eb3 1085
0a7de745
A
1086 if (ppcb->ifps == NULL) {
1087 return;
1088 }
4452a7af 1089
0a7de745
A
1090 if (child->machine.ifps) {
1091 panic("fpu_dup_fxstate: child's ifps non-null");
1092 }
4452a7af 1093
5ba3f43e 1094 new_ifps = fp_state_alloc(xstate);
5d5c5d0d 1095
0a7de745 1096 simple_lock(&ppcb->lock, LCK_GRP_NULL);
6601e61a 1097
0c530ab8 1098 if (ppcb->ifps != NULL) {
060df5ea 1099 struct x86_fx_thread_state *ifps = ppcb->ifps;
0a7de745 1100 /*
0c530ab8
A
1101 * Make sure we`ve got the latest fp state info
1102 */
39037602
A
1103 if (current_thread() == parent) {
1104 intr = ml_set_interrupts_enabled(FALSE);
1105 assert(current_thread() == parent);
1106 clear_ts();
1107 fp_save(parent);
1108 clear_fpu();
1109
1110 (void)ml_set_interrupts_enabled(intr);
1111 }
6601e61a 1112
060df5ea 1113 if (ifps->fp_valid) {
6d2010ae 1114 child->machine.ifps = new_ifps;
5ba3f43e 1115 child->machine.xstate = xstate;
cb323159 1116 __nochk_bcopy((char *)(ppcb->ifps),
0a7de745
A
1117 (char *)(child->machine.ifps),
1118 fp_state_size[xstate]);
0c530ab8 1119
2d21ac55
A
1120 /* Mark the new fp saved state as non-live. */
1121 /* Temporarily disabled: radar 4647827
1122 * new_ifps->fp_valid = TRUE;
1123 */
060df5ea 1124
0c530ab8
A
1125 /*
1126 * Clear any reserved bits in the MXCSR to prevent a GPF
1127 * when issuing an FXRSTOR.
1128 */
060df5ea 1129 new_ifps->fx_MXCSR &= mxcsr_capability_mask;
0c530ab8
A
1130 new_ifps = NULL;
1131 }
6601e61a 1132 }
0c530ab8 1133 simple_unlock(&ppcb->lock);
89b3af67 1134
0a7de745
A
1135 if (new_ifps != NULL) {
1136 fp_state_free(new_ifps, xstate);
1137 }
6601e61a 1138}
4452a7af 1139
1c79356b
A
1140/*
1141 * Initialize FPU.
d26ffc64
A
1142 * FNINIT programs the x87 control word to 0x37f, which matches
1143 * the desired default for macOS.
1c79356b 1144 */
060df5ea 1145
1c79356b 1146void
0a7de745
A
1147fpinit(void)
1148{
d26ffc64 1149 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1c79356b
A
1150 clear_ts();
1151 fninit();
d26ffc64
A
1152#if DEBUG
1153 /* We skip this power-on-default verification sequence on
1154 * non-DEBUG, as dirtying the x87 control word may slow down
1155 * xsave/xrstor and affect energy use.
1156 */
0a7de745 1157 unsigned short control, control2;
1c79356b 1158 fnstcw(&control);
d26ffc64 1159 control2 = control;
0a7de745
A
1160 control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
1161 control |= (FPC_PC_64 | /* Set precision */
1162 FPC_RC_RN | /* round-to-nearest */
1163 FPC_ZE | /* Suppress zero-divide */
1164 FPC_OE | /* and overflow */
1165 FPC_UE | /* underflow */
1166 FPC_IE | /* Allow NaNQs and +-INF */
1167 FPC_DE | /* Allow denorms as operands */
1168 FPC_PE); /* No trap for precision loss */
d26ffc64 1169 assert(control == control2);
1c79356b 1170 fldcw(control);
d26ffc64 1171#endif
0c530ab8 1172 /* Initialize SSE/SSE2 */
060df5ea 1173 __builtin_ia32_ldmxcsr(0x1f80);
d26ffc64
A
1174 if (fpu_YMM_capable) {
1175 vzeroall();
1176 } else {
1177 xmmzeroall();
1178 }
1179 ml_set_interrupts_enabled(istate);
b0d623f7 1180}
1c79356b
A
1181
1182/*
1183 * Coprocessor not present.
1184 */
1185
3e170ce0
A
1186uint64_t x86_isr_fp_simd_use;
1187
1c79356b
A
1188void
1189fpnoextflt(void)
1190{
0a7de745
A
1191 boolean_t intr;
1192 thread_t thr_act;
1193 pcb_t pcb;
060df5ea 1194 struct x86_fx_thread_state *ifps = 0;
0a7de745 1195 xstate_t xstate = current_xstate();
2d21ac55
A
1196
1197 thr_act = current_thread();
6d2010ae 1198 pcb = THREAD_TO_PCB(thr_act);
2d21ac55 1199
060df5ea 1200 if (pcb->ifps == 0 && !get_interrupt_level()) {
0a7de745 1201 ifps = fp_state_alloc(xstate);
cb323159 1202 __nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
5ba3f43e 1203 fp_state_size[xstate]);
d9a64523 1204 if (!thread_is_64bit_addr(thr_act)) {
5ba3f43e 1205 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
0a7de745 1206 } else {
5ba3f43e 1207 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
0a7de745 1208 }
060df5ea
A
1209 ifps->fp_valid = TRUE;
1210 }
0c530ab8
A
1211 intr = ml_set_interrupts_enabled(FALSE);
1212
0a7de745 1213 clear_ts(); /* Enable FPU use */
0c530ab8 1214
6d2010ae 1215 if (__improbable(get_interrupt_level())) {
3e170ce0
A
1216 /* Track number of #DNA traps at interrupt context,
1217 * which is likely suboptimal. Racy, but good enough.
1218 */
1219 x86_isr_fp_simd_use++;
0c530ab8 1220 /*
3e170ce0
A
1221 * Save current FP/SIMD context if valid
1222 * Initialize live FP/SIMD registers
0c530ab8 1223 */
3e170ce0
A
1224 if (pcb->ifps) {
1225 fp_save(thr_act);
1226 }
0c530ab8
A
1227 fpinit();
1228 } else {
0a7de745
A
1229 if (pcb->ifps == 0) {
1230 pcb->ifps = ifps;
1231 pcb->xstate = xstate;
2d21ac55
A
1232 ifps = 0;
1233 }
0c530ab8
A
1234 /*
1235 * Load this thread`s state into coprocessor live context.
1236 */
2d21ac55 1237 fp_load(thr_act);
0c530ab8 1238 }
0c530ab8 1239 (void)ml_set_interrupts_enabled(intr);
2d21ac55 1240
0a7de745
A
1241 if (ifps) {
1242 fp_state_free(ifps, xstate);
1243 }
1c79356b
A
1244}
1245
1246/*
1247 * FPU overran end of segment.
1248 * Re-initialize FPU. Floating point state is not valid.
1249 */
1250
1251void
1252fpextovrflt(void)
1253{
0a7de745
A
1254 thread_t thr_act = current_thread();
1255 pcb_t pcb;
060df5ea 1256 struct x86_fx_thread_state *ifps;
0a7de745
A
1257 boolean_t intr;
1258 xstate_t xstate = current_xstate();
0c530ab8
A
1259
1260 intr = ml_set_interrupts_enabled(FALSE);
1261
0a7de745 1262 if (get_interrupt_level()) {
94ff46dc 1263 panic("FPU segment overrun exception at interrupt context\n");
0a7de745
A
1264 }
1265 if (current_task() == kernel_task) {
0c530ab8 1266 panic("FPU segment overrun exception in kernel thread context\n");
0a7de745 1267 }
1c79356b 1268
1c79356b
A
1269 /*
1270 * This is a non-recoverable error.
1271 * Invalidate the thread`s FPU state.
1272 */
6d2010ae 1273 pcb = THREAD_TO_PCB(thr_act);
0a7de745 1274 simple_lock(&pcb->lock, LCK_GRP_NULL);
0c530ab8
A
1275 ifps = pcb->ifps;
1276 pcb->ifps = 0;
1c79356b
A
1277 simple_unlock(&pcb->lock);
1278
1279 /*
1280 * Re-initialize the FPU.
1281 */
1282 clear_ts();
1283 fninit();
1284
1285 /*
1286 * And disable access.
1287 */
1288 clear_fpu();
1289
0c530ab8
A
1290 (void)ml_set_interrupts_enabled(intr);
1291
0a7de745
A
1292 if (ifps) {
1293 fp_state_free(ifps, xstate);
1294 }
1c79356b
A
1295}
1296
cc8bc92a
A
1297extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
1298
1c79356b
A
1299/*
1300 * FPU error. Called by AST.
1301 */
1302
1303void
1304fpexterrflt(void)
1305{
0a7de745 1306 thread_t thr_act = current_thread();
6d2010ae 1307 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0a7de745 1308 boolean_t intr;
0c530ab8
A
1309
1310 intr = ml_set_interrupts_enabled(FALSE);
1311
0a7de745 1312 if (get_interrupt_level()) {
0c530ab8 1313 panic("FPU error exception at interrupt context\n");
0a7de745
A
1314 }
1315 if (current_task() == kernel_task) {
0c530ab8 1316 panic("FPU error exception in kernel thread context\n");
0a7de745 1317 }
1c79356b 1318
1c79356b
A
1319 /*
1320 * Save the FPU state and turn off the FPU.
1321 */
1322 fp_save(thr_act);
1c79356b 1323
0c530ab8
A
1324 (void)ml_set_interrupts_enabled(intr);
1325
cc8bc92a
A
1326 const uint32_t mask = ifps->fx_control &
1327 (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE);
1328 const uint32_t xcpt = ~mask & (ifps->fx_status &
1329 (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
1330 fpxlog(EXC_I386_EXTERR, ifps->fx_status, ifps->fx_control, xcpt);
1c79356b
A
1331}
1332
1333/*
1334 * Save FPU state.
1335 *
1336 * Locking not needed:
1337 * . if called from fpu_get_state, pcb already locked.
1338 * . if called from fpnoextflt or fp_intr, we are single-cpu
1339 * . otherwise, thread is running.
0c530ab8 1340 * N.B.: Must be called with interrupts disabled
1c79356b 1341 */
0c530ab8 1342
1c79356b
A
1343void
1344fp_save(
0a7de745 1345 thread_t thr_act)
1c79356b 1346{
6d2010ae 1347 pcb_t pcb = THREAD_TO_PCB(thr_act);
060df5ea 1348 struct x86_fx_thread_state *ifps = pcb->ifps;
0c530ab8 1349
060df5ea 1350 assert(ifps != 0);
1c79356b 1351 if (ifps != 0 && !ifps->fp_valid) {
0c530ab8
A
1352 assert((get_cr0() & CR0_TS) == 0);
1353 /* registers are in FPU */
1354 ifps->fp_valid = TRUE;
d9a64523 1355 fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1c79356b
A
1356 }
1357}
1358
1359/*
1360 * Restore FPU state from PCB.
1361 *
1362 * Locking not needed; always called on the current thread.
1363 */
1364
1365void
1366fp_load(
0a7de745 1367 thread_t thr_act)
1c79356b 1368{
6d2010ae 1369 pcb_t pcb = THREAD_TO_PCB(thr_act);
060df5ea 1370 struct x86_fx_thread_state *ifps = pcb->ifps;
0c530ab8 1371
060df5ea 1372 assert(ifps);
0a7de745 1373#if DEBUG
39236c6e
A
1374 if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1375 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u\n",
0a7de745 1376 ifps->fp_valid, ifps->fp_save_layout);
39236c6e
A
1377 }
1378#endif
060df5ea
A
1379
1380 if (ifps->fp_valid == FALSE) {
0c530ab8 1381 fpinit();
1c79356b 1382 } else {
060df5ea 1383 fpu_load_registers(ifps);
1c79356b 1384 }
0a7de745 1385 ifps->fp_valid = FALSE; /* in FPU */
1c79356b
A
1386}
1387
1c79356b 1388/*
0c530ab8
A
1389 * SSE arithmetic exception handling code.
1390 * Basically the same as the x87 exception handler with a different subtype
1c79356b
A
1391 */
1392
1393void
0c530ab8 1394fpSSEexterrflt(void)
1c79356b 1395{
0a7de745 1396 thread_t thr_act = current_thread();
6d2010ae 1397 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0a7de745 1398 boolean_t intr;
4452a7af 1399
0c530ab8
A
1400 intr = ml_set_interrupts_enabled(FALSE);
1401
0a7de745 1402 if (get_interrupt_level()) {
0c530ab8 1403 panic("SSE exception at interrupt context\n");
0a7de745
A
1404 }
1405 if (current_task() == kernel_task) {
0c530ab8 1406 panic("SSE exception in kernel thread context\n");
0a7de745 1407 }
1c79356b
A
1408
1409 /*
0c530ab8 1410 * Save the FPU state and turn off the FPU.
1c79356b 1411 */
1c79356b 1412 fp_save(thr_act);
1c79356b 1413
0c530ab8 1414 (void)ml_set_interrupts_enabled(intr);
1c79356b 1415 /*
0c530ab8
A
1416 * Raise FPU exception.
1417 * Locking not needed on pcb->ifps,
1418 * since thread is running.
1c79356b 1419 */
cc8bc92a 1420 const uint32_t mask = (ifps->fx_MXCSR >> 7) &
0a7de745 1421 (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE);
cc8bc92a 1422 const uint32_t xcpt = ~mask & (ifps->fx_MXCSR &
0a7de745 1423 (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
cc8bc92a 1424 fpxlog(EXC_I386_SSEEXTERR, ifps->fx_MXCSR, ifps->fx_MXCSR, xcpt);
0c530ab8
A
1425}
1426
5ba3f43e 1427
5ba3f43e
A
1428/*
1429 * If a thread is using an AVX-sized savearea:
1430 * - allocate a new AVX512-sized area,
1431 * - copy the 256-bit state into the 512-bit area,
1432 * - deallocate the smaller area
ea3f0419 1433 * ASSUMES: thread is the current thread.
5ba3f43e
A
1434 */
1435static void
1436fpu_savearea_promote_avx512(thread_t thread)
1437{
0a7de745
A
1438 struct x86_avx_thread_state *ifps = NULL;
1439 struct x86_avx512_thread_state *ifps512 = NULL;
1440 pcb_t pcb = THREAD_TO_PCB(thread);
1441 boolean_t do_avx512_alloc = FALSE;
ea3f0419 1442 boolean_t intr;
5ba3f43e 1443
ea3f0419
A
1444 assert(thread == current_thread());
1445
1446 DBG("fpu_savearea_promote_avx512(%p)\n", thread);
cc8bc92a 1447
0a7de745 1448 simple_lock(&pcb->lock, LCK_GRP_NULL);
cc8bc92a 1449
5ba3f43e
A
1450 ifps = pcb->ifps;
1451 if (ifps == NULL) {
cc8bc92a 1452 pcb->xstate = AVX512;
5ba3f43e 1453 simple_unlock(&pcb->lock);
ea3f0419
A
1454 /*
1455 * Now that the PCB xstate has been promoted, set XCR0 so
1456 * that we don't re-trip #UD on the next AVX-512 instruction.
1457 *
1458 * Since this branch is taken when the first FP instruction
1459 * attempted by this thread is an AVX-512 instruction, we
1460 * call fpnoextflt() to allocate an appropriately-sized
1461 * AVX-512 save-area, thereby avoiding the overhead of another
1462 * fault that would be triggered immediately on return.
1463 */
1464 intr = ml_set_interrupts_enabled(FALSE);
1465 xsetbv(0, AVX512_XMASK);
1466 current_cpu_datap()->cpu_xstate = AVX512;
1467 (void)ml_set_interrupts_enabled(intr);
cc8bc92a 1468
cc8bc92a 1469 fpnoextflt();
5ba3f43e
A
1470 return;
1471 }
cc8bc92a
A
1472
1473 if (pcb->xstate != AVX512) {
1474 do_avx512_alloc = TRUE;
1475 }
ea3f0419 1476
cc8bc92a
A
1477 simple_unlock(&pcb->lock);
1478
1479 if (do_avx512_alloc == TRUE) {
1480 ifps512 = fp_state_alloc(AVX512);
1481 }
1482
0a7de745 1483 simple_lock(&pcb->lock, LCK_GRP_NULL);
5ba3f43e 1484
ea3f0419 1485 intr = ml_set_interrupts_enabled(FALSE);
5ba3f43e 1486
ea3f0419
A
1487 clear_ts();
1488 fp_save(thread);
1489 clear_fpu();
1490
1491 xsetbv(0, AVX512_XMASK);
1492 current_cpu_datap()->cpu_xstate = AVX512;
1493 (void)ml_set_interrupts_enabled(intr);
5ba3f43e 1494
5ba3f43e
A
1495 assert(ifps->fp.fp_valid);
1496
1497 /* Allocate an AVX512 savearea and copy AVX state into it */
cc8bc92a 1498 if (pcb->xstate != AVX512) {
cb323159 1499 __nochk_bcopy(ifps, ifps512, fp_state_size[AVX]);
cc8bc92a
A
1500 pcb->ifps = ifps512;
1501 pcb->xstate = AVX512;
1502 ifps512 = NULL;
1503 } else {
1504 ifps = NULL;
1505 }
1506 /* The PCB lock is redundant in some scenarios given the higher level
1507 * thread mutex, but its pre-emption disablement is relied upon here
1508 */
5ba3f43e 1509 simple_unlock(&pcb->lock);
cc8bc92a
A
1510
1511 if (ifps) {
1512 fp_state_free(ifps, AVX);
1513 }
1514 if (ifps512) {
1515 fp_state_free(ifps, AVX512);
1516 }
5ba3f43e
A
1517}
1518
1519/*
1520 * Upgrade the calling thread to AVX512.
1521 */
1522boolean_t
1523fpu_thread_promote_avx512(thread_t thread)
1524{
0a7de745 1525 task_t task = current_task();
5ba3f43e 1526
0a7de745 1527 if (thread != current_thread()) {
5ba3f43e 1528 return FALSE;
0a7de745
A
1529 }
1530 if (!ml_fpu_avx512_enabled()) {
5ba3f43e 1531 return FALSE;
0a7de745 1532 }
5ba3f43e
A
1533
1534 fpu_savearea_promote_avx512(thread);
1535
1536 /* Racy but the task's xstate is only a hint */
1537 task->xstate = AVX512;
1538
1539 return TRUE;
1540}
1541
1542
1543/*
1544 * Called from user_trap() when an invalid opcode fault is taken.
1545 * If the user is attempting an AVX512 instruction on a machine
1546 * that supports this, we switch the calling thread to use
1547 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
ea3f0419
A
1548 * return to user_trap() with a 0 return value.
1549 * Otherwise, simply return a nonzero value.
5ba3f43e 1550 */
ea3f0419 1551
94ff46dc
A
1552#define MAX_X86_INSN_LENGTH (15)
1553int
5ba3f43e
A
1554fpUDflt(user_addr_t rip)
1555{
0a7de745
A
1556 uint8_t instruction_prefix;
1557 boolean_t is_AVX512_instruction = FALSE;
1558 user_addr_t original_rip = rip;
5ba3f43e 1559 do {
cc8bc92a
A
1560 /* TODO: as an optimisation, copy up to the lesser of the
1561 * next page boundary or maximal prefix length in one pass
1562 * rather than issue multiple copyins
1563 */
1564 if (copyin(rip, (char *) &instruction_prefix, 1)) {
94ff46dc 1565 return 1;
cc8bc92a 1566 }
5ba3f43e 1567 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
0a7de745 1568 rip, instruction_prefix);
cc8bc92a
A
1569 /* TODO: determine more specifically which prefixes
1570 * are sane possibilities for AVX512 insns
1571 */
5ba3f43e 1572 switch (instruction_prefix) {
0a7de745
A
1573 case 0x2E: /* CS segment override */
1574 case 0x36: /* SS segment override */
1575 case 0x3E: /* DS segment override */
1576 case 0x26: /* ES segment override */
1577 case 0x64: /* FS segment override */
1578 case 0x65: /* GS segment override */
1579 case 0x66: /* Operand-size override */
1580 case 0x67: /* address-size override */
5ba3f43e
A
1581 /* Skip optional prefixes */
1582 rip++;
cc8bc92a 1583 if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
94ff46dc 1584 return 1;
cc8bc92a 1585 }
5ba3f43e 1586 break;
0a7de745
A
1587 case 0x62: /* EVEX */
1588 case 0xC5: /* VEX 2-byte */
1589 case 0xC4: /* VEX 3-byte */
5ba3f43e
A
1590 is_AVX512_instruction = TRUE;
1591 break;
0a7de745 1592 default:
94ff46dc 1593 return 1;
5ba3f43e
A
1594 }
1595 } while (!is_AVX512_instruction);
1596
1597 /* Here if we detect attempted execution of an AVX512 instruction */
1598
1599 /*
cc8bc92a 1600 * Fail if this machine doesn't support AVX512
5ba3f43e 1601 */
0a7de745 1602 if (fpu_capability != AVX512) {
94ff46dc 1603 return 1;
0a7de745 1604 }
5ba3f43e
A
1605
1606 assert(xgetbv(XCR0) == AVX_XMASK);
1607
1608 DBG("fpUDflt() switching xstate to AVX512\n");
1609 (void) fpu_thread_promote_avx512(current_thread());
1610
94ff46dc 1611 return 0;
5ba3f43e 1612}
5ba3f43e 1613
0c530ab8 1614void
0a7de745
A
1615fp_setvalid(boolean_t value)
1616{
1617 thread_t thr_act = current_thread();
6d2010ae 1618 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0c530ab8
A
1619
1620 if (ifps) {
0a7de745 1621 ifps->fp_valid = value;
0c530ab8 1622
060df5ea
A
1623 if (value == TRUE) {
1624 boolean_t istate = ml_set_interrupts_enabled(FALSE);
0a7de745 1625 clear_fpu();
060df5ea
A
1626 ml_set_interrupts_enabled(istate);
1627 }
0c530ab8 1628 }
1c79356b 1629}
060df5ea 1630
316670eb 1631boolean_t
0a7de745
A
1632ml_fpu_avx_enabled(void)
1633{
1634 return fpu_capability >= AVX;
5ba3f43e
A
1635}
1636
5ba3f43e 1637boolean_t
0a7de745
A
1638ml_fpu_avx512_enabled(void)
1639{
1640 return fpu_capability == AVX512;
5ba3f43e 1641}
5ba3f43e
A
1642
1643static xstate_t
1644task_xstate(task_t task)
1645{
0a7de745 1646 if (task == TASK_NULL) {
5ba3f43e 1647 return fpu_default;
0a7de745 1648 } else {
5ba3f43e 1649 return task->xstate;
0a7de745 1650 }
5ba3f43e
A
1651}
1652
1653static xstate_t
1654thread_xstate(thread_t thread)
1655{
1656 xstate_t xs = THREAD_TO_PCB(thread)->xstate;
0a7de745 1657 if (xs == UNDEFINED) {
5ba3f43e 1658 return task_xstate(thread->task);
0a7de745 1659 } else {
5ba3f43e 1660 return xs;
0a7de745 1661 }
5ba3f43e
A
1662}
1663
1664xstate_t
1665current_xstate(void)
1666{
1667 return thread_xstate(current_thread());
1668}
1669
1670/*
1671 * Called when exec'ing between bitnesses.
1672 * If valid FPU state exists, adjust the layout.
1673 */
1674void
1675fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1676{
1677 struct x86_fx_thread_state *ifps = thread->machine.ifps;
d26ffc64 1678 mp_disable_preemption();
5ba3f43e
A
1679
1680 if (ifps && ifps->fp_valid) {
1681 if (thread_xstate(thread) == FP) {
1682 ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1683 } else {
1684 ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1685 }
1686 }
d26ffc64
A
1687 mp_enable_preemption();
1688}
1689
0a7de745
A
/*
 * Count the set bits in a region of memory.
 *
 *   ins  address of the first byte of the region
 *   sz   length of the region in bytes; values <= 0 yield 0
 *
 * The region is consumed in 16-byte steps (two 64-bit loads), then
 * 4-byte steps, then single bytes.  Returns the total population count.
 */
static inline uint32_t
fpsimd_pop(uintptr_t ins, int sz)
{
	uint32_t rv = 0;

	while (sz >= 16) {
		const uint64_t *lo64 = (const uint64_t *) ins;
		const uint64_t *hi64 = (const uint64_t *) (ins + 8);

		rv += (uint32_t)__builtin_popcountll(*lo64);
		rv += (uint32_t)__builtin_popcountll(*hi64);
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		const uint32_t *ins32 = (const uint32_t *) ins;

		rv += (uint32_t)__builtin_popcount(*ins32);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		/*
		 * Read the tail as unsigned bytes.  Through a plain (signed)
		 * char a byte >= 0x80 would sign-extend to int before the
		 * popcount and contribute 24 spurious one-bits.
		 */
		const uint8_t *ins8 = (const uint8_t *) ins;

		rv += (uint32_t)__builtin_popcount(*ins8);
		sz--;
		ins++;
	}
	return rv;
}
1722
0a7de745
A
1723uint32_t
1724thread_fpsimd_hash(thread_t ft)
1725{
1726 if (fpsimd_fault_popc == 0) {
d26ffc64 1727 return 0;
0a7de745 1728 }
d26ffc64
A
1729
1730 uint32_t prv = 0;
1731 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1732 struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;
1733
1734 if (pifps) {
1735 if (pifps->fp_valid) {
1736 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1737 sizeof(pifps->fx_XMM_reg));
1738 } else {
1739 uintptr_t cr0 = get_cr0();
1740 clear_ts();
1741 fp_save(ft);
1742 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1743 sizeof(pifps->fx_XMM_reg));
1744 pifps->fp_valid = FALSE;
1745 if (cr0 & CR0_TS) {
1746 set_cr0(cr0);
1747 }
1748 }
1749 }
1750 ml_set_interrupts_enabled(istate);
1751 return prv;
060df5ea 1752}