]> git.saurik.com Git - apple/xnu.git/blame - osfmk/i386/fpu.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / i386 / fpu.c
CommitLineData
1c79356b 1/*
f427ee49 2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
0a7de745 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745 17 *
2d21ac55
A
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
0a7de745 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
0a7de745 31/*
1c79356b
A
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
0a7de745 35 *
1c79356b
A
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
0a7de745 41 *
1c79356b
A
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
0a7de745 45 *
1c79356b 46 * Carnegie Mellon requests users of this software to return to
0a7de745 47 *
1c79356b
A
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
0a7de745 52 *
1c79356b
A
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
1c79356b
A
56
57#include <mach/exception_types.h>
58#include <mach/i386/thread_status.h>
59#include <mach/i386/fp_reg.h>
60
61#include <kern/mach_param.h>
91447636 62#include <kern/processor.h>
1c79356b
A
63#include <kern/thread.h>
64#include <kern/zalloc.h>
65#include <kern/misc_protos.h>
66#include <kern/spl.h>
67#include <kern/assert.h>
68
060df5ea
A
69#include <libkern/OSAtomic.h>
70
0c530ab8 71#include <architecture/i386/pio.h>
55e303ae 72#include <i386/cpuid.h>
b0d623f7 73#include <i386/fpu.h>
0c530ab8 74#include <i386/proc_reg.h>
b0d623f7
A
75#include <i386/misc_protos.h>
76#include <i386/thread.h>
77#include <i386/trap.h>
1c79356b 78
0a7de745
A
79xstate_t fpu_capability = UNDEFINED; /* extended state capability */
80xstate_t fpu_default = UNDEFINED; /* default extended state */
1c79356b 81
/* Non-zero when addr is a multiple of size; size must be a power of two. */
#define ALIGNED(addr, size)     (((uintptr_t)(addr)&((size)-1))==0)

/*
 * Assert that save-area component p is aligned to a bytes (a must be a
 * power of two). The failure message reports the alignment that was
 * actually requested rather than assuming 8.
 */
#define VERIFY_SAVEAREA_ALIGNED(p, a) \
	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
	    "FP save area component @ 0x%lx not %lu-byte aligned", \
	    ((uintptr_t)(p)), ((unsigned long)(a)))
1c79356b
A
86
87/* Forward */
88
0a7de745
A
89extern void fpinit(void);
90extern void fp_save(
91 thread_t thr_act);
92extern void fp_load(
93 thread_t thr_act);
1c79356b 94
5ba3f43e
A
95static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
96static xstate_t thread_xstate(thread_t);
0c530ab8 97
0a7de745
A
98x86_ext_thread_state_t initial_fp_state __attribute((aligned(64)));
99x86_ext_thread_state_t default_avx512_state __attribute((aligned(64)));
100x86_ext_thread_state_t default_avx_state __attribute((aligned(64)));
101x86_ext_thread_state_t default_fx_state __attribute((aligned(64)));
0c530ab8
A
102
103/* Global MXCSR capability bitmask */
104static unsigned int mxcsr_capability_mask;
105
/*
 * Statement-like wrappers for individual x87 instructions.
 *
 * None of them carries a trailing semicolon, so each can be used as an
 * ordinary statement (including as the body of an if/else). All are
 * volatile because they read or change FPU state that is not expressed
 * through their operands.
 */

/* Reset the x87 unit to its power-on defaults. */
#define fninit() \
	__asm__ volatile("fninit")

/* control: pointer to the unsigned short that receives the control word. */
#define fnstcw(control) \
	__asm__ volatile("fnstcw %0" : "=m" (*(unsigned short *)(control)))

/* control: an lvalue (not a pointer) holding the control word to load. */
#define fldcw(control) \
	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )

/* Clear pending x87 exception flags. */
#define fnclex() \
	__asm__ volatile("fnclex")

/* state: pointer to the image that FNSAVE writes. */
#define fnsave(state) \
	__asm__ volatile("fnsave %0" : "=m" (*(state)))

/* state: the image itself (an lvalue, not a pointer) that FRSTOR reads. */
#define frstor(state) \
	__asm__ volatile("frstor %0" : : "m" (state))

/* Wait for pending x87 exceptions to be delivered. */
#define fwait() \
	__asm__ volatile("fwait")
060df5ea 126
0a7de745
A
/* Execute FXRSTOR on the save image that a points to. */
static inline void
fxrstor(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxrstor %[image]" : : [image] "m" (*a));
}
132
0a7de745
A
/* Execute FXSAVE, storing into the save image that a points to. */
static inline void
fxsave(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxsave %[image]" : [image] "=m" (*a));
}
138
0a7de745
A
/* Execute FXRSTOR64 on the save image that a points to. */
static inline void
fxrstor64(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxrstor64 %[image]" : : [image] "m" (*a));
}
144
0a7de745
A
/* Execute FXSAVE64, storing into the save image that a points to. */
static inline void
fxsave64(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxsave64 %[image]" : [image] "=m" (*a));
}
150
0a7de745 151#define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX || (x) == AVX512)
5ba3f43e 152
f427ee49 153SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
5ba3f43e
A
154 [FP] = NULL,
155 [AVX] = NULL,
5ba3f43e 156 [AVX512] = NULL
5ba3f43e 157};
f427ee49 158static const uint32_t fp_state_size[] = {
5ba3f43e
A
159 [FP] = sizeof(struct x86_fx_thread_state),
160 [AVX] = sizeof(struct x86_avx_thread_state),
5ba3f43e 161 [AVX512] = sizeof(struct x86_avx512_thread_state)
5ba3f43e
A
162};
163
f427ee49 164static const char *const xstate_name[] = {
5ba3f43e
A
165 [UNDEFINED] = "UNDEFINED",
166 [FP] = "FP",
167 [AVX] = "AVX",
5ba3f43e 168 [AVX512] = "AVX512"
5ba3f43e 169};
060df5ea 170
5ba3f43e
A
/* Capability predicates derived from the global fpu_capability. */
#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
173/*
174 * On-demand AVX512 support
175 * ------------------------
176 * On machines with AVX512 support, by default, threads are created with
177 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
178 * capabilities are advertised in the commpage and via sysctl. If a thread
179 * opts to use AVX512 instructions, the first will result in a #UD exception.
180 * Faulting AVX512 instructions are recognizable by their unique prefix.
181 * This exception results in the thread being promoted to use an AVX512-sized
182 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
183 * instruction is re-driven and the thread can proceed to perform AVX512
184 * operations.
185 *
186 * In addition to AVX512 instructions causing promotion, the thread_set_state()
187 * primitive with an AVX512 state flavor results in promotion.
188 *
189 * AVX512 promotion of the first thread in a task causes the default xstate
190 * of the task to be promoted so that any subsequently created or subsequently
191 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
192 * a promoted xstate.
193 *
194 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
195 * and a second pool of larger AVX512-sized (2688 byte) areas.
196 *
197 * Note the initial state value is an AVX512 object but that the AVX initial
198 * value is a subset of it.
199 */
0a7de745 200static uint32_t cpuid_reevaluated = 0;
060df5ea
A
201
202static void fpu_store_registers(void *, boolean_t);
203static void fpu_load_registers(void *);
204
5ba3f43e 205static const uint32_t xstate_xmask[] = {
0a7de745
A
206 [FP] = FP_XMASK,
207 [AVX] = AVX_XMASK,
208 [AVX512] = AVX512_XMASK
5ba3f43e 209};
060df5ea 210
0a7de745
A
/*
 * Execute XSAVE into the area at a.
 *
 * rfbm is passed in EAX as the low 32 bits of the requested-feature bitmap;
 * EDX (the high 32 bits) is always 0.
 *
 * Callers pass AVX/AVX512-sized areas cast to the smaller FX type, so the
 * instruction can write beyond sizeof(*a). The "memory" clobber tells the
 * compiler about those extra stores so it cannot cache or reorder accesses
 * to the rest of the area across the instruction.
 */
static inline void
xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xsave %0" : "=m" (*a) : "a" (rfbm), "d" (0) : "memory");
}
216
0a7de745
A
/*
 * Execute XSAVE64 into the area at a.
 *
 * rfbm is passed in EAX as the low 32 bits of the requested-feature bitmap;
 * EDX (the high 32 bits) is always 0.
 *
 * Callers pass AVX/AVX512-sized areas cast to the smaller FX type, so the
 * instruction can write beyond sizeof(*a). The "memory" clobber tells the
 * compiler about those extra stores so it cannot cache or reorder accesses
 * to the rest of the area across the instruction.
 */
static inline void
xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xsave64 %0" : "=m" (*a) : "a" (rfbm), "d" (0) : "memory");
}
222
0a7de745
A
/*
 * Execute XRSTOR from the area at a.
 *
 * rfbm is passed in EAX as the low 32 bits of the requested-feature bitmap;
 * EDX (the high 32 bits) is always 0.
 *
 * Callers pass AVX/AVX512-sized areas cast to the smaller FX type, so the
 * instruction can read beyond sizeof(*a). The "memory" clobber forces the
 * compiler to complete earlier stores to the whole area before the
 * instruction executes.
 */
static inline void
xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xrstor %0" : : "m" (*a), "a" (rfbm), "d" (0) : "memory");
}
228
0a7de745
A
/*
 * Execute XRSTOR64 from the area at a.
 *
 * rfbm is passed in EAX as the low 32 bits of the requested-feature bitmap;
 * EDX (the high 32 bits) is always 0.
 *
 * Callers pass AVX/AVX512-sized areas cast to the smaller FX type, so the
 * instruction can read beyond sizeof(*a). The "memory" clobber forces the
 * compiler to complete earlier stores to the whole area before the
 * instruction executes.
 */
static inline void
xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xrstor64 %0" : : "m" (*a), "a" (rfbm), "d" (0) : "memory");
}
234
0a7de745
A
235__unused static inline void
236vzeroupper(void)
237{
238 __asm__ __volatile__ ("vzeroupper" ::);
5ba3f43e 239}
5ba3f43e 240
0a7de745 241static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */
5ba3f43e 242
5ba3f43e
A
243
244/*
245 * Furthermore, make compile-time asserts that no padding creeps into structures
246 * for which we're doing this.
247 */
0a7de745
A
248#define ASSERT_PACKED(t, m1, m2, n, mt) \
249extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \
5ba3f43e
A
250 [(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]
251
252ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
253
254ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
255
256ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
257ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
258ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
259
260ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
261ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
262ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
263ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
264
265#if defined(DEBUG_AVX512)
266
0a7de745 267#define DBG(x...) kprintf("DBG: " x)
5ba3f43e
A
268
269typedef struct { uint8_t byte[8]; } opmask_t;
270typedef struct { uint8_t byte[16]; } xmm_t;
271typedef struct { uint8_t byte[32]; } ymm_t;
272typedef struct { uint8_t byte[64]; } zmm_t;
273
274static void
275DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
276{
0a7de745 277 int i, j;
5ba3f43e
A
278 xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg;
279 xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
280 ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
281 zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM;
282 opmask_t *k = (opmask_t *) &sp->x_Opmask;
283
284 kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
285 kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
286 kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
287 kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
288
289 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
290 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
291
292 /* Print all ZMM registers */
293 for (i = 0; i < 16; i++) {
294 kprintf("zmm%d:\t0x", i);
0a7de745 295 for (j = 0; j < 16; j++) {
5ba3f43e 296 kprintf("%02x", xmm[i].byte[j]);
0a7de745
A
297 }
298 for (j = 0; j < 16; j++) {
5ba3f43e 299 kprintf("%02x", ymmh[i].byte[j]);
0a7de745
A
300 }
301 for (j = 0; j < 32; j++) {
5ba3f43e 302 kprintf("%02x", zmmh[i].byte[j]);
0a7de745 303 }
5ba3f43e
A
304 kprintf("\n");
305 }
306 for (i = 0; i < 16; i++) {
0a7de745
A
307 kprintf("zmm%d:\t0x", 16 + i);
308 for (j = 0; j < 64; j++) {
5ba3f43e 309 kprintf("%02x", zmm[i].byte[j]);
0a7de745 310 }
5ba3f43e
A
311 kprintf("\n");
312 }
313 for (i = 0; i < 8; i++) {
314 kprintf("k%d:\t0x", i);
0a7de745 315 for (j = 0; j < 8; j++) {
5ba3f43e 316 kprintf("%02x", k[i].byte[j]);
0a7de745 317 }
5ba3f43e
A
318 kprintf("\n");
319 }
320
321 kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
322 kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
323}
324#else
0a7de745 325#define DBG(x...)
5ba3f43e
A
326static void
327DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
328{
329 return;
330}
331#endif /* DEBUG_AVX512 */
332
#if     DEBUG
/* Return the x87 status word via FNSTSW (DEBUG builds only). */
static inline unsigned short
fnstsw(void)
{
	unsigned short sw;

	__asm__ volatile ("fnstsw %[sw]" : [sw] "=ma" (sw));
	return sw;
}
#endif
060df5ea 342
0c530ab8 343/*
060df5ea 344 * Configure the initial FPU state presented to new threads.
0c530ab8
A
345 * Determine the MXCSR capability mask, which allows us to mask off any
346 * potentially unsafe "reserved" bits before restoring the FPU context.
347 * *Not* per-cpu, assumes symmetry.
348 */
060df5ea 349
0c530ab8 350static void
5ba3f43e 351configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
0c530ab8 352{
060df5ea
A
353 /* XSAVE requires a 64 byte aligned store */
354 assert(ALIGNED(fps, 64));
0c530ab8 355 /* Clear, to prepare for the diagnostic FXSAVE */
060df5ea
A
356 bzero(fps, sizeof(*fps));
357
358 fpinit();
359 fpu_store_registers(fps, FALSE);
360
5ba3f43e 361 mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;
0c530ab8
A
362
363 /* Set default mask value if necessary */
0a7de745 364 if (mxcsr_capability_mask == 0) {
0c530ab8 365 mxcsr_capability_mask = 0xffbf;
0a7de745
A
366 }
367
060df5ea 368 /* Clear vector register store */
0a7de745 369 bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
5ba3f43e 370 bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
5ba3f43e
A
371 if (fpu_ZMM_capable) {
372 bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
0a7de745
A
373 bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
374 bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
5ba3f43e 375 }
0c530ab8 376
5ba3f43e
A
377 fps->fx.fp_valid = TRUE;
378 fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
060df5ea 379 fpu_load_registers(fps);
0c530ab8 380
d26ffc64
A
381 if (fpu_ZMM_capable) {
382 xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
383 }
384 if (fpu_YMM_capable) {
385 xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
386 } else {
387 fxsave64((struct x86_fx_thread_state *)&default_fx_state);
388 }
389
060df5ea 390 /* Poison values to trap unsafe usage */
5ba3f43e
A
391 fps->fx.fp_valid = 0xFFFFFFFF;
392 fps->fx.fp_save_layout = FP_UNUSED;
0c530ab8 393
060df5ea
A
394 /* Re-enable FPU/SSE DNA exceptions */
395 set_ts();
0c530ab8
A
396}
397
d26ffc64 398int fpsimd_fault_popc = 0;
1c79356b
A
399/*
400 * Look for FPU and initialize it.
401 * Called on each CPU.
402 */
403void
404init_fpu(void)
405{
0a7de745
A
406#if DEBUG
407 unsigned short status;
408 unsigned short control;
060df5ea 409#endif
1c79356b
A
410 /*
411 * Check for FPU by initializing it,
412 * then trying to read the correct bit patterns from
413 * the control and status registers.
414 */
0a7de745 415 set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE); /* allow use of FPU */
1c79356b 416 fninit();
0a7de745 417#if DEBUG
1c79356b
A
418 status = fnstsw();
419 fnstcw(&control);
0a7de745 420
060df5ea
A
421 assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
422#endif
423 /* Advertise SSE support */
424 if (cpuid_features() & CPUID_FEATURE_FXSR) {
060df5ea
A
425 set_cr4(get_cr4() | CR4_OSFXS);
426 /* And allow SIMD exceptions if present */
427 if (cpuid_features() & CPUID_FEATURE_SSE) {
428 set_cr4(get_cr4() | CR4_OSXMM);
429 }
0a7de745 430 } else {
060df5ea 431 panic("fpu is not FP_FXSR");
0a7de745 432 }
55e303ae 433
5ba3f43e
A
434 fpu_capability = fpu_default = FP;
435
d26ffc64
A
436 PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
437
5ba3f43e
A
438 static boolean_t is_avx512_enabled = TRUE;
439 if (cpu_number() == master_cpu) {
440 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
441 PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
442 kprintf("AVX512 supported %s\n",
0a7de745 443 is_avx512_enabled ? "and enabled" : "but disabled");
5ba3f43e
A
444 }
445 }
0a7de745 446
060df5ea
A
447 /* Configure the XSAVE context mechanism if the processor supports
448 * AVX/YMM registers
449 */
450 if (cpuid_features() & CPUID_FEATURE_XSAVE) {
5ba3f43e 451 cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
5ba3f43e
A
452 if (is_avx512_enabled &&
453 (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) {
454 assert(xs0p->extended_state[eax] & XFEM_SSE);
455 assert(xs0p->extended_state[eax] & XFEM_YMM);
456 fpu_capability = AVX512;
457 /* XSAVE container size for all features */
458 set_cr4(get_cr4() | CR4_OSXSAVE);
459 xsetbv(0, AVX512_XMASK);
460 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
0a7de745 461 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
5ba3f43e 462 cpuid_set_info();
0a7de745 463 }
5ba3f43e
A
464 /* Verify that now selected state can be accommodated */
465 assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]);
466 /*
467 * AVX set until AVX512 is used.
468 * See comment above about on-demand AVX512 support.
469 */
470 xsetbv(0, AVX_XMASK);
471 fpu_default = AVX;
ea3f0419 472 } else if (xs0p->extended_state[eax] & XFEM_YMM) {
5ba3f43e
A
473 assert(xs0p->extended_state[eax] & XFEM_SSE);
474 fpu_capability = AVX;
475 fpu_default = AVX;
060df5ea 476 /* XSAVE container size for all features */
060df5ea 477 set_cr4(get_cr4() | CR4_OSXSAVE);
5ba3f43e 478 xsetbv(0, AVX_XMASK);
060df5ea 479 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
0a7de745 480 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
060df5ea 481 cpuid_set_info();
0a7de745 482 }
3e170ce0 483 /* Verify that now selected state can be accommodated */
5ba3f43e 484 assert(xs0p->extended_state[ebx] == fp_state_size[AVX]);
060df5ea
A
485 }
486 }
5ba3f43e 487
0a7de745 488 if (cpu_number() == master_cpu) {
5ba3f43e 489 kprintf("fpu_state: %s, state_size: %d\n",
0a7de745
A
490 xstate_name[fpu_capability],
491 fp_state_size[fpu_capability]);
492 }
060df5ea
A
493
494 fpinit();
d26ffc64 495 current_cpu_datap()->cpu_xstate = fpu_default;
060df5ea
A
496
497 /*
498 * Trap wait instructions. Turn off FPU for now.
499 */
500 set_cr0(get_cr0() | CR0_TS | CR0_MP);
501}
502
503/*
5ba3f43e 504 * Allocate and initialize FP state for specified xstate.
060df5ea
A
505 * Don't load state.
506 */
507static void *
5ba3f43e 508fp_state_alloc(xstate_t xs)
060df5ea 509{
5ba3f43e 510 assert(ifps_zone[xs] != NULL);
f427ee49 511 return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
060df5ea
A
512}
513
514static inline void
5ba3f43e 515fp_state_free(void *ifps, xstate_t xs)
060df5ea 516{
5ba3f43e
A
517 assert(ifps_zone[xs] != NULL);
518 zfree(ifps_zone[xs], ifps);
060df5ea
A
519}
520
0a7de745
A
/* Public wrapper that does nothing but call set_ts(). */
void
clear_fpu(void)
{
	set_ts();
}
526
eb6b6ca3
A
527static boolean_t
528fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
529{
530 VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
531 assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
532
533 for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
534 if (ptr[count] != 0) {
535 return FALSE;
536 }
537 }
538 return TRUE;
539}
060df5ea 540
0a7de745
A
541static void
542fpu_load_registers(void *fstate)
543{
060df5ea
A
544 struct x86_fx_thread_state *ifps = fstate;
545 fp_save_layout_t layout = ifps->fp_save_layout;
546
0a7de745
A
547 assert(current_task() == NULL || \
548 (thread_is_64bit_addr(current_thread()) ? \
549 (layout == FXSAVE64 || layout == XSAVE64) : \
550 (layout == FXSAVE32 || layout == XSAVE32)));
060df5ea
A
551 assert(ALIGNED(ifps, 64));
552 assert(ml_get_interrupts_enabled() == FALSE);
553
0a7de745 554#if DEBUG
060df5ea
A
555 if (layout == XSAVE32 || layout == XSAVE64) {
556 struct x86_avx_thread_state *iavx = fstate;
557 unsigned i;
558 /* Verify reserved bits in the XSAVE header*/
0a7de745 559 if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
5ba3f43e 560 panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
0a7de745
A
561 }
562 for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
563 if (iavx->_xh.xhrsvd[i]) {
060df5ea 564 panic("Reserved bit set");
0a7de745
A
565 }
566 }
060df5ea 567 }
5ba3f43e 568 if (fpu_YMM_capable) {
0a7de745 569 if (layout != XSAVE32 && layout != XSAVE64) {
060df5ea 570 panic("Inappropriate layout: %u\n", layout);
0a7de745 571 }
060df5ea 572 }
0a7de745 573#endif /* DEBUG */
060df5ea 574
5ba3f43e 575 switch (layout) {
0a7de745 576 case FXSAVE64:
5ba3f43e
A
577 fxrstor64(ifps);
578 break;
0a7de745 579 case FXSAVE32:
060df5ea 580 fxrstor(ifps);
5ba3f43e 581 break;
0a7de745 582 case XSAVE64:
5ba3f43e
A
583 xrstor64(ifps, xstate_xmask[current_xstate()]);
584 break;
0a7de745 585 case XSAVE32:
5ba3f43e
A
586 xrstor(ifps, xstate_xmask[current_xstate()]);
587 break;
0a7de745 588 default:
5ba3f43e
A
589 panic("fpu_load_registers() bad layout: %d\n", layout);
590 }
060df5ea
A
591}
592
0a7de745
A
593static void
594fpu_store_registers(void *fstate, boolean_t is64)
595{
060df5ea
A
596 struct x86_fx_thread_state *ifps = fstate;
597 assert(ALIGNED(ifps, 64));
5ba3f43e
A
598 xstate_t xs = current_xstate();
599 switch (xs) {
0a7de745 600 case FP:
5ba3f43e
A
601 if (is64) {
602 fxsave64(fstate);
603 ifps->fp_save_layout = FXSAVE64;
604 } else {
605 fxsave(fstate);
606 ifps->fp_save_layout = FXSAVE32;
607 }
608 break;
0a7de745 609 case AVX:
0a7de745 610 case AVX512:
5ba3f43e
A
611 if (is64) {
612 xsave64(ifps, xstate_xmask[xs]);
613 ifps->fp_save_layout = XSAVE64;
614 } else {
615 xsave(ifps, xstate_xmask[xs]);
616 ifps->fp_save_layout = XSAVE32;
617 }
618 break;
0a7de745 619 default:
5ba3f43e 620 panic("fpu_store_registers() bad xstate: %d\n", xs);
060df5ea 621 }
1c79356b
A
622}
623
624/*
625 * Initialize FP handling.
626 */
060df5ea 627
1c79356b
A
628void
629fpu_module_init(void)
630{
0a7de745 631 if (!IS_VALID_XSTATE(fpu_default)) {
5ba3f43e 632 panic("fpu_module_init: invalid extended state %u\n",
0a7de745
A
633 fpu_default);
634 }
060df5ea 635
060df5ea
A
636 /* To maintain the required alignment, disable
637 * zone debugging for this zone as that appends
638 * 16 bytes to each element.
639 */
f427ee49
A
640 ifps_zone[fpu_default] = zone_create("x86 fpsave state",
641 fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
5ba3f43e 642
5ba3f43e
A
643 /*
644 * If AVX512 is supported, create a separate savearea zone.
5ba3f43e
A
645 */
646 if (fpu_capability == AVX512) {
f427ee49
A
647 ifps_zone[AVX512] = zone_create("x86 avx512 save state",
648 fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
5ba3f43e 649 }
5ba3f43e 650
060df5ea
A
651 /* Determine MXCSR reserved bits and configure initial FPU state*/
652 configure_mxcsr_capability_mask(&initial_fp_state);
653}
654
655/*
5ba3f43e
A
656 * Context switch fpu state.
657 * Always save the old thread's FPU context but don't load the new one; allow that to fault in.
658 * Switch to the new task's xstate.
060df5ea 659 */
d26ffc64 660
060df5ea 661void
5ba3f43e 662fpu_switch_context(thread_t old, thread_t new)
060df5ea 663{
0a7de745 664 struct x86_fx_thread_state *ifps;
d26ffc64
A
665 cpu_data_t *cdp = current_cpu_datap();
666 xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;
060df5ea
A
667
668 assert(ml_get_interrupts_enabled() == FALSE);
5ba3f43e 669 ifps = (old)->machine.ifps;
0a7de745 670#if DEBUG
060df5ea
A
671 if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
672 panic("ifps->fp_valid: %u\n", ifps->fp_valid);
673 }
674#endif
675 if (ifps != 0 && (ifps->fp_valid == FALSE)) {
676 /* Clear CR0.TS in preparation for the FP context save. In
677 * theory, this shouldn't be necessary since a live FPU should
678 * indicate that TS is clear. However, various routines
679 * (such as sendsig & sigreturn) manipulate TS directly.
680 */
681 clear_ts();
682 /* registers are in FPU - save to memory */
d9a64523
A
683 boolean_t is64 = (thread_is_64bit_addr(old) &&
684 is_saved_state64(old->machine.iss));
685
686 fpu_store_registers(ifps, is64);
060df5ea 687 ifps->fp_valid = TRUE;
d26ffc64
A
688
689 if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
690 xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
691 } else if (fpu_YMM_capable) {
692 xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
693 } else {
694 fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
695 }
060df5ea 696 }
d26ffc64
A
697
698 assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
0a7de745 699 if (new_xstate != (xstate_t) cdp->cpu_xstate) {
5ba3f43e 700 DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
0a7de745 701 old, new, xstate_name[new_xstate]);
5ba3f43e 702 xsetbv(0, xstate_xmask[new_xstate]);
d26ffc64 703 cdp->cpu_xstate = new_xstate;
5ba3f43e 704 }
060df5ea 705 set_ts();
1c79356b
A
706}
707
060df5ea 708
1c79356b
A
709/*
710 * Free a FPU save area.
711 * Called only when thread terminating - no locking necessary.
712 */
713void
5ba3f43e 714fpu_free(thread_t thread, void *fps)
1c79356b 715{
0a7de745
A
716 pcb_t pcb = THREAD_TO_PCB(thread);
717
5ba3f43e
A
718 fp_state_free(fps, pcb->xstate);
719 pcb->xstate = UNDEFINED;
1c79356b
A
720}
721
55e303ae 722/*
eb6b6ca3
A
723 * Set the floating-point state for a thread based on the FXSave formatted data.
724 * This is basically the same as fpu_set_state except it uses the expanded data
725 * structure.
726 * If the thread is not the current thread, it is not running (held). Locking
727 * needed against concurrent fpu_set_state or fpu_get_state.
728 *
729 * While translating between XNU FP state structures and the CPU-native XSAVE area,
730 * if we detect state components that are all zeroes, we clear the corresponding
731 * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
732 * be initialized to a "clean" state. That's most important when clearing the YMM
733 * bit, since an initialized "upper clean" state results in a massive performance
734 * improvement due to elimination of false dependencies between the XMMs and the
735 * upper bits of the YMMs.
55e303ae
A
736 */
737kern_return_t
738fpu_set_fxstate(
0a7de745
A
739 thread_t thr_act,
740 thread_state_t tstate,
060df5ea 741 thread_flavor_t f)
55e303ae 742{
0a7de745
A
743 struct x86_fx_thread_state *ifps;
744 struct x86_fx_thread_state *new_ifps;
745 x86_float_state64_t *state;
746 pcb_t pcb;
747 boolean_t old_valid, fresh_state = FALSE;
ea3f0419 748 xstate_t thr_xstate;
fe8ab488 749
0a7de745 750 if (fpu_capability == UNDEFINED) {
fe8ab488 751 return KERN_FAILURE;
0a7de745 752 }
0c530ab8 753
bd504ef0 754 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
0a7de745 755 fpu_capability < AVX) {
fe8ab488 756 return KERN_FAILURE;
0a7de745 757 }
bd504ef0 758
ea3f0419
A
759 assert(thr_act != THREAD_NULL);
760
761 thr_xstate = thread_xstate(thr_act);
762
5ba3f43e 763 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
ea3f0419 764 thr_xstate == AVX) {
0a7de745 765 if (!fpu_thread_promote_avx512(thr_act)) {
5ba3f43e 766 return KERN_FAILURE;
ea3f0419
A
767 } else {
768 /* Reload thr_xstate after successful promotion */
769 thr_xstate = thread_xstate(thr_act);
0a7de745
A
770 }
771 }
5ba3f43e 772
0c530ab8 773 state = (x86_float_state64_t *)tstate;
55e303ae 774
6d2010ae 775 pcb = THREAD_TO_PCB(thr_act);
55e303ae 776
0c530ab8 777 if (state == NULL) {
fe8ab488
A
778 /*
779 * new FPU state is 'invalid'.
780 * Deallocate the fp state if it exists.
781 */
0a7de745 782 simple_lock(&pcb->lock, LCK_GRP_NULL);
0c530ab8
A
783
784 ifps = pcb->ifps;
785 pcb->ifps = 0;
4452a7af 786
fe8ab488 787 simple_unlock(&pcb->lock);
0c530ab8 788
fe8ab488 789 if (ifps != 0) {
ea3f0419 790 fp_state_free(ifps, thr_xstate);
fe8ab488 791 }
0c530ab8 792 } else {
fe8ab488
A
793 /*
794 * Valid incoming state. Allocate the fp state if there is none.
795 */
796 new_ifps = 0;
0a7de745
A
797Retry:
798 simple_lock(&pcb->lock, LCK_GRP_NULL);
0c530ab8
A
799
800 ifps = pcb->ifps;
fe8ab488
A
801 if (ifps == 0) {
802 if (new_ifps == 0) {
803 simple_unlock(&pcb->lock);
ea3f0419 804 new_ifps = fp_state_alloc(thr_xstate);
fe8ab488
A
805 goto Retry;
806 }
807 ifps = new_ifps;
808 new_ifps = 0;
809 pcb->ifps = ifps;
ea3f0419 810 pcb->xstate = thr_xstate;
fe8ab488
A
811 fresh_state = TRUE;
812 }
813
814 /*
815 * now copy over the new data.
816 */
817
818 old_valid = ifps->fp_valid;
819
0a7de745 820#if DEBUG || DEVELOPMENT
fe8ab488
A
821 if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
822 panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
55e303ae 823 }
060df5ea 824#endif
fe8ab488
A
825 /*
826 * Clear any reserved bits in the MXCSR to prevent a GPF
827 * when issuing an FXRSTOR.
828 */
7ddcb079 829
fe8ab488 830 state->fpu_mxcsr &= mxcsr_capability_mask;
060df5ea 831
cb323159 832 __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
060df5ea 833
ea3f0419 834 switch (thr_xstate) {
0a7de745
A
835 case UNDEFINED_FULL:
836 case FP_FULL:
837 case AVX_FULL:
838 case AVX512_FULL:
ea3f0419 839 panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
0a7de745
A
840 break;
841
842 case UNDEFINED:
5ba3f43e
A
843 panic("fpu_set_fxstate() UNDEFINED xstate");
844 break;
0a7de745 845 case FP:
d9a64523 846 ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
5ba3f43e 847 break;
0a7de745 848 case AVX: {
fe8ab488 849 struct x86_avx_thread_state *iavx = (void *) ifps;
5ba3f43e 850 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
7ddcb079 851
d9a64523 852 iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
7ddcb079 853
5ba3f43e
A
854 /* Sanitize XSAVE header */
855 bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
856 iavx->_xh.xstate_bv = AVX_XMASK;
857 iavx->_xh.xcomp_bv = 0;
858
eb6b6ca3
A
859 /*
860 * See the block comment at the top of the function for a description of why we're clearing
861 * xstate_bv bits.
862 */
5ba3f43e 863 if (f == x86_AVX_STATE32) {
cb323159 864 __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
eb6b6ca3
A
865 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
866 iavx->_xh.xstate_bv &= ~XFEM_YMM;
867 }
5ba3f43e 868 } else if (f == x86_AVX_STATE64) {
cb323159 869 __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
eb6b6ca3
A
870 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
871 iavx->_xh.xstate_bv &= ~XFEM_YMM;
872 }
5ba3f43e
A
873 } else {
874 iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
fe8ab488 875 }
5ba3f43e 876 break;
0a7de745 877 }
0a7de745 878 case AVX512: {
5ba3f43e
A
879 struct x86_avx512_thread_state *iavx = (void *) ifps;
880 union {
881 thread_state_t ts;
882 x86_avx512_state32_t *s32;
883 x86_avx512_state64_t *s64;
884 } xs = { .ts = tstate };
885
d9a64523 886 iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
fe8ab488 887
fe8ab488
A
888 /* Sanitize XSAVE header */
889 bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
5ba3f43e
A
890 iavx->_xh.xstate_bv = AVX512_XMASK;
891 iavx->_xh.xcomp_bv = 0;
892
eb6b6ca3
A
893 /*
894 * See the block comment at the top of the function for a description of why we're clearing
895 * xstate_bv bits.
896 */
5ba3f43e 897 switch (f) {
0a7de745 898 case x86_AVX512_STATE32:
cb323159
A
899 __nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
900 __nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
eb6b6ca3
A
901 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
902 iavx->_xh.xstate_bv &= ~XFEM_ZMM;
903 }
cb323159 904 __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
eb6b6ca3
A
905 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
906 iavx->_xh.xstate_bv &= ~XFEM_YMM;
907 }
908
5ba3f43e
A
909 DBG_AVX512_STATE(iavx);
910 break;
0a7de745 911 case x86_AVX_STATE32:
cb323159 912 __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
eb6b6ca3
A
913 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
914 iavx->_xh.xstate_bv &= ~XFEM_YMM;
915 }
5ba3f43e 916 break;
0a7de745 917 case x86_AVX512_STATE64:
cb323159
A
918 __nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
919 __nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
920 __nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
eb6b6ca3
A
921 /*
922 * Note that it is valid to have XFEM_ZMM set but XFEM_YMM cleared. In that case,
923 * the upper bits of the YMMs would be cleared and would result in a clean-upper
924 * state, allowing SSE instruction to avoid false dependencies.
925 */
926 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
927 fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
928 iavx->_xh.xstate_bv &= ~XFEM_ZMM;
929 }
930
cb323159 931 __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
eb6b6ca3
A
932 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
933 iavx->_xh.xstate_bv &= ~XFEM_YMM;
934 }
5ba3f43e
A
935 DBG_AVX512_STATE(iavx);
936 break;
0a7de745 937 case x86_AVX_STATE64:
cb323159 938 __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
eb6b6ca3
A
939 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
940 iavx->_xh.xstate_bv &= ~XFEM_YMM;
941 }
5ba3f43e
A
942 break;
943 }
944 break;
0a7de745 945 }
7ddcb079 946 }
5ba3f43e 947
fe8ab488 948 ifps->fp_valid = old_valid;
7ddcb079 949
fe8ab488
A
950 if (old_valid == FALSE) {
951 boolean_t istate = ml_set_interrupts_enabled(FALSE);
952 ifps->fp_valid = TRUE;
953 /* If altering the current thread's state, disable FPU */
0a7de745 954 if (thr_act == current_thread()) {
fe8ab488 955 set_ts();
0a7de745 956 }
fe8ab488
A
957
958 ml_set_interrupts_enabled(istate);
959 }
960
961 simple_unlock(&pcb->lock);
962
0a7de745 963 if (new_ifps != 0) {
ea3f0419 964 fp_state_free(new_ifps, thr_xstate);
0a7de745 965 }
0c530ab8 966 }
55e303ae
A
967 return KERN_SUCCESS;
968}
969
970/*
971 * Get the floating-point state for a thread.
972 * If the thread is not the current thread, it is
973 * not running (held). Locking needed against
974 * concurrent fpu_set_state or fpu_get_state.
975 */
976kern_return_t
977fpu_get_fxstate(
0a7de745
A
978 thread_t thr_act,
979 thread_state_t tstate,
060df5ea 980 thread_flavor_t f)
55e303ae 981{
0a7de745
A
982 struct x86_fx_thread_state *ifps;
983 x86_float_state64_t *state;
984 kern_return_t ret = KERN_FAILURE;
985 pcb_t pcb;
ea3f0419 986 xstate_t thr_xstate = thread_xstate(thr_act);
55e303ae 987
0a7de745 988 if (fpu_capability == UNDEFINED) {
2d21ac55 989 return KERN_FAILURE;
0a7de745 990 }
0c530ab8 991
bd504ef0 992 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
0a7de745 993 fpu_capability < AVX) {
bd504ef0 994 return KERN_FAILURE;
0a7de745 995 }
bd504ef0 996
5ba3f43e 997 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
ea3f0419 998 thr_xstate != AVX512) {
5ba3f43e 999 return KERN_FAILURE;
0a7de745 1000 }
5ba3f43e 1001
0c530ab8 1002 state = (x86_float_state64_t *)tstate;
55e303ae 1003
91447636 1004 assert(thr_act != THREAD_NULL);
6d2010ae 1005 pcb = THREAD_TO_PCB(thr_act);
55e303ae 1006
0a7de745 1007 simple_lock(&pcb->lock, LCK_GRP_NULL);
0c530ab8
A
1008
1009 ifps = pcb->ifps;
55e303ae 1010 if (ifps == 0) {
2d21ac55 1011 /*
0c530ab8
A
1012 * No valid floating-point state.
1013 */
060df5ea 1014
cb323159 1015 __nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
5ba3f43e 1016 fp_state_size[FP]);
0c530ab8
A
1017
1018 simple_unlock(&pcb->lock);
6601e61a 1019
0c530ab8
A
1020 return KERN_SUCCESS;
1021 }
1022 /*
1023 * Make sure we`ve got the latest fp state info
1024 * If the live fpu state belongs to our target
1025 */
2d21ac55 1026 if (thr_act == current_thread()) {
0a7de745 1027 boolean_t intr;
8f6c56a5 1028
0c530ab8 1029 intr = ml_set_interrupts_enabled(FALSE);
89b3af67 1030
0c530ab8
A
1031 clear_ts();
1032 fp_save(thr_act);
1033 clear_fpu();
6601e61a 1034
0c530ab8 1035 (void)ml_set_interrupts_enabled(intr);
6601e61a 1036 }
0c530ab8 1037 if (ifps->fp_valid) {
cb323159 1038 __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
ea3f0419 1039 switch (thr_xstate) {
0a7de745
A
1040 case UNDEFINED_FULL:
1041 case FP_FULL:
1042 case AVX_FULL:
1043 case AVX512_FULL:
ea3f0419 1044 panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
0a7de745
A
1045 break;
1046
1047 case UNDEFINED:
5ba3f43e
A
1048 panic("fpu_get_fxstate() UNDEFINED xstate");
1049 break;
0a7de745
A
1050 case FP:
1051 break; /* already done */
1052 case AVX: {
7ddcb079 1053 struct x86_avx_thread_state *iavx = (void *) ifps;
5ba3f43e
A
1054 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
1055 if (f == x86_AVX_STATE32) {
cb323159 1056 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
5ba3f43e 1057 } else if (f == x86_AVX_STATE64) {
cb323159 1058 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
5ba3f43e
A
1059 }
1060 break;
0a7de745 1061 }
0a7de745 1062 case AVX512: {
5ba3f43e
A
1063 struct x86_avx512_thread_state *iavx = (void *) ifps;
1064 union {
1065 thread_state_t ts;
1066 x86_avx512_state32_t *s32;
1067 x86_avx512_state64_t *s64;
1068 } xs = { .ts = tstate };
1069 switch (f) {
0a7de745 1070 case x86_AVX512_STATE32:
cb323159
A
1071 __nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1072 __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
1073 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
5ba3f43e
A
1074 DBG_AVX512_STATE(iavx);
1075 break;
0a7de745 1076 case x86_AVX_STATE32:
cb323159 1077 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
5ba3f43e 1078 break;
0a7de745 1079 case x86_AVX512_STATE64:
cb323159
A
1080 __nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1081 __nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
1082 __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
1083 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
5ba3f43e
A
1084 DBG_AVX512_STATE(iavx);
1085 break;
0a7de745 1086 case x86_AVX_STATE64:
cb323159 1087 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
5ba3f43e 1088 break;
7ddcb079 1089 }
5ba3f43e 1090 break;
0a7de745 1091 }
7ddcb079
A
1092 }
1093
0c530ab8 1094 ret = KERN_SUCCESS;
6601e61a 1095 }
0c530ab8 1096 simple_unlock(&pcb->lock);
21362eb3 1097
0c530ab8 1098 return ret;
6601e61a 1099}
21362eb3 1100
0c530ab8 1101
2d21ac55 1102
6601e61a 1103/*
0c530ab8
A
1104 * the child thread is 'stopped' with the thread
1105 * mutex held and is currently not known by anyone
1106 * so no way for fpu state to get manipulated by an
1107 * outside agency -> no need for pcb lock
6601e61a 1108 */
0c530ab8
A
1109
1110void
1111fpu_dup_fxstate(
0a7de745
A
1112 thread_t parent,
1113 thread_t child)
6601e61a 1114{
060df5ea 1115 struct x86_fx_thread_state *new_ifps = NULL;
0a7de745
A
1116 boolean_t intr;
1117 pcb_t ppcb;
1118 xstate_t xstate = thread_xstate(parent);
21362eb3 1119
6d2010ae 1120 ppcb = THREAD_TO_PCB(parent);
21362eb3 1121
0a7de745
A
1122 if (ppcb->ifps == NULL) {
1123 return;
1124 }
4452a7af 1125
0a7de745
A
1126 if (child->machine.ifps) {
1127 panic("fpu_dup_fxstate: child's ifps non-null");
1128 }
4452a7af 1129
5ba3f43e 1130 new_ifps = fp_state_alloc(xstate);
5d5c5d0d 1131
0a7de745 1132 simple_lock(&ppcb->lock, LCK_GRP_NULL);
6601e61a 1133
0c530ab8 1134 if (ppcb->ifps != NULL) {
060df5ea 1135 struct x86_fx_thread_state *ifps = ppcb->ifps;
0a7de745 1136 /*
0c530ab8
A
1137 * Make sure we`ve got the latest fp state info
1138 */
39037602
A
1139 if (current_thread() == parent) {
1140 intr = ml_set_interrupts_enabled(FALSE);
1141 assert(current_thread() == parent);
1142 clear_ts();
1143 fp_save(parent);
1144 clear_fpu();
1145
1146 (void)ml_set_interrupts_enabled(intr);
1147 }
6601e61a 1148
060df5ea 1149 if (ifps->fp_valid) {
6d2010ae 1150 child->machine.ifps = new_ifps;
5ba3f43e 1151 child->machine.xstate = xstate;
cb323159 1152 __nochk_bcopy((char *)(ppcb->ifps),
0a7de745
A
1153 (char *)(child->machine.ifps),
1154 fp_state_size[xstate]);
0c530ab8 1155
2d21ac55
A
1156 /* Mark the new fp saved state as non-live. */
1157 /* Temporarily disabled: radar 4647827
1158 * new_ifps->fp_valid = TRUE;
1159 */
060df5ea 1160
0c530ab8
A
1161 /*
1162 * Clear any reserved bits in the MXCSR to prevent a GPF
1163 * when issuing an FXRSTOR.
1164 */
060df5ea 1165 new_ifps->fx_MXCSR &= mxcsr_capability_mask;
0c530ab8
A
1166 new_ifps = NULL;
1167 }
6601e61a 1168 }
0c530ab8 1169 simple_unlock(&ppcb->lock);
89b3af67 1170
0a7de745
A
1171 if (new_ifps != NULL) {
1172 fp_state_free(new_ifps, xstate);
1173 }
6601e61a 1174}
4452a7af 1175
1c79356b
A
1176/*
1177 * Initialize FPU.
d26ffc64
A
1178 * FNINIT programs the x87 control word to 0x37f, which matches
1179 * the desired default for macOS.
1c79356b 1180 */
060df5ea 1181
1c79356b 1182void
0a7de745
A
1183fpinit(void)
1184{
d26ffc64 1185 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1c79356b
A
1186 clear_ts();
1187 fninit();
d26ffc64
A
1188#if DEBUG
1189 /* We skip this power-on-default verification sequence on
1190 * non-DEBUG, as dirtying the x87 control word may slow down
1191 * xsave/xrstor and affect energy use.
1192 */
0a7de745 1193 unsigned short control, control2;
1c79356b 1194 fnstcw(&control);
d26ffc64 1195 control2 = control;
0a7de745
A
1196 control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
1197 control |= (FPC_PC_64 | /* Set precision */
1198 FPC_RC_RN | /* round-to-nearest */
1199 FPC_ZE | /* Suppress zero-divide */
1200 FPC_OE | /* and overflow */
1201 FPC_UE | /* underflow */
1202 FPC_IE | /* Allow NaNQs and +-INF */
1203 FPC_DE | /* Allow denorms as operands */
1204 FPC_PE); /* No trap for precision loss */
d26ffc64 1205 assert(control == control2);
1c79356b 1206 fldcw(control);
d26ffc64 1207#endif
0c530ab8 1208 /* Initialize SSE/SSE2 */
060df5ea 1209 __builtin_ia32_ldmxcsr(0x1f80);
d26ffc64
A
1210 if (fpu_YMM_capable) {
1211 vzeroall();
1212 } else {
1213 xmmzeroall();
1214 }
1215 ml_set_interrupts_enabled(istate);
b0d623f7 1216}
1c79356b
A
1217
1218/*
1219 * Coprocessor not present.
1220 */
1221
3e170ce0
A
1222uint64_t x86_isr_fp_simd_use;
1223
1c79356b
A
1224void
1225fpnoextflt(void)
1226{
0a7de745
A
1227 boolean_t intr;
1228 thread_t thr_act;
1229 pcb_t pcb;
060df5ea 1230 struct x86_fx_thread_state *ifps = 0;
0a7de745 1231 xstate_t xstate = current_xstate();
2d21ac55
A
1232
1233 thr_act = current_thread();
6d2010ae 1234 pcb = THREAD_TO_PCB(thr_act);
2d21ac55 1235
060df5ea 1236 if (pcb->ifps == 0 && !get_interrupt_level()) {
0a7de745 1237 ifps = fp_state_alloc(xstate);
cb323159 1238 __nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
5ba3f43e 1239 fp_state_size[xstate]);
d9a64523 1240 if (!thread_is_64bit_addr(thr_act)) {
5ba3f43e 1241 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
0a7de745 1242 } else {
5ba3f43e 1243 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
0a7de745 1244 }
060df5ea
A
1245 ifps->fp_valid = TRUE;
1246 }
0c530ab8
A
1247 intr = ml_set_interrupts_enabled(FALSE);
1248
0a7de745 1249 clear_ts(); /* Enable FPU use */
0c530ab8 1250
6d2010ae 1251 if (__improbable(get_interrupt_level())) {
3e170ce0
A
1252 /* Track number of #DNA traps at interrupt context,
1253 * which is likely suboptimal. Racy, but good enough.
1254 */
1255 x86_isr_fp_simd_use++;
0c530ab8 1256 /*
3e170ce0
A
1257 * Save current FP/SIMD context if valid
1258 * Initialize live FP/SIMD registers
0c530ab8 1259 */
3e170ce0
A
1260 if (pcb->ifps) {
1261 fp_save(thr_act);
1262 }
0c530ab8
A
1263 fpinit();
1264 } else {
0a7de745
A
1265 if (pcb->ifps == 0) {
1266 pcb->ifps = ifps;
1267 pcb->xstate = xstate;
2d21ac55
A
1268 ifps = 0;
1269 }
0c530ab8
A
1270 /*
1271 * Load this thread`s state into coprocessor live context.
1272 */
2d21ac55 1273 fp_load(thr_act);
0c530ab8 1274 }
0c530ab8 1275 (void)ml_set_interrupts_enabled(intr);
2d21ac55 1276
0a7de745
A
1277 if (ifps) {
1278 fp_state_free(ifps, xstate);
1279 }
1c79356b
A
1280}
1281
1282/*
1283 * FPU overran end of segment.
1284 * Re-initialize FPU. Floating point state is not valid.
1285 */
1286
1287void
1288fpextovrflt(void)
1289{
0a7de745
A
1290 thread_t thr_act = current_thread();
1291 pcb_t pcb;
060df5ea 1292 struct x86_fx_thread_state *ifps;
0a7de745
A
1293 boolean_t intr;
1294 xstate_t xstate = current_xstate();
0c530ab8
A
1295
1296 intr = ml_set_interrupts_enabled(FALSE);
1297
0a7de745 1298 if (get_interrupt_level()) {
94ff46dc 1299 panic("FPU segment overrun exception at interrupt context\n");
0a7de745
A
1300 }
1301 if (current_task() == kernel_task) {
0c530ab8 1302 panic("FPU segment overrun exception in kernel thread context\n");
0a7de745 1303 }
1c79356b 1304
1c79356b
A
1305 /*
1306 * This is a non-recoverable error.
1307 * Invalidate the thread`s FPU state.
1308 */
6d2010ae 1309 pcb = THREAD_TO_PCB(thr_act);
0a7de745 1310 simple_lock(&pcb->lock, LCK_GRP_NULL);
0c530ab8
A
1311 ifps = pcb->ifps;
1312 pcb->ifps = 0;
1c79356b
A
1313 simple_unlock(&pcb->lock);
1314
1315 /*
1316 * Re-initialize the FPU.
1317 */
1318 clear_ts();
1319 fninit();
1320
1321 /*
1322 * And disable access.
1323 */
1324 clear_fpu();
1325
0c530ab8
A
1326 (void)ml_set_interrupts_enabled(intr);
1327
0a7de745
A
1328 if (ifps) {
1329 fp_state_free(ifps, xstate);
1330 }
1c79356b
A
1331}
1332
cc8bc92a
A
1333extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
1334
1c79356b
A
1335/*
1336 * FPU error. Called by AST.
1337 */
1338
1339void
1340fpexterrflt(void)
1341{
0a7de745 1342 thread_t thr_act = current_thread();
6d2010ae 1343 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0a7de745 1344 boolean_t intr;
0c530ab8
A
1345
1346 intr = ml_set_interrupts_enabled(FALSE);
1347
0a7de745 1348 if (get_interrupt_level()) {
0c530ab8 1349 panic("FPU error exception at interrupt context\n");
0a7de745
A
1350 }
1351 if (current_task() == kernel_task) {
0c530ab8 1352 panic("FPU error exception in kernel thread context\n");
0a7de745 1353 }
1c79356b 1354
1c79356b
A
1355 /*
1356 * Save the FPU state and turn off the FPU.
1357 */
1358 fp_save(thr_act);
1c79356b 1359
0c530ab8
A
1360 (void)ml_set_interrupts_enabled(intr);
1361
cc8bc92a
A
1362 const uint32_t mask = ifps->fx_control &
1363 (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE);
1364 const uint32_t xcpt = ~mask & (ifps->fx_status &
1365 (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
1366 fpxlog(EXC_I386_EXTERR, ifps->fx_status, ifps->fx_control, xcpt);
1c79356b
A
1367}
1368
1369/*
1370 * Save FPU state.
1371 *
1372 * Locking not needed:
1373 * . if called from fpu_get_state, pcb already locked.
1374 * . if called from fpnoextflt or fp_intr, we are single-cpu
1375 * . otherwise, thread is running.
0c530ab8 1376 * N.B.: Must be called with interrupts disabled
1c79356b 1377 */
0c530ab8 1378
1c79356b
A
1379void
1380fp_save(
0a7de745 1381 thread_t thr_act)
1c79356b 1382{
6d2010ae 1383 pcb_t pcb = THREAD_TO_PCB(thr_act);
060df5ea 1384 struct x86_fx_thread_state *ifps = pcb->ifps;
0c530ab8 1385
060df5ea 1386 assert(ifps != 0);
1c79356b 1387 if (ifps != 0 && !ifps->fp_valid) {
0c530ab8
A
1388 assert((get_cr0() & CR0_TS) == 0);
1389 /* registers are in FPU */
1390 ifps->fp_valid = TRUE;
d9a64523 1391 fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1c79356b
A
1392 }
1393}
1394
1395/*
1396 * Restore FPU state from PCB.
1397 *
1398 * Locking not needed; always called on the current thread.
1399 */
1400
1401void
1402fp_load(
0a7de745 1403 thread_t thr_act)
1c79356b 1404{
6d2010ae 1405 pcb_t pcb = THREAD_TO_PCB(thr_act);
060df5ea 1406 struct x86_fx_thread_state *ifps = pcb->ifps;
0c530ab8 1407
060df5ea 1408 assert(ifps);
0a7de745 1409#if DEBUG
39236c6e
A
1410 if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1411 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u\n",
0a7de745 1412 ifps->fp_valid, ifps->fp_save_layout);
39236c6e
A
1413 }
1414#endif
060df5ea
A
1415
1416 if (ifps->fp_valid == FALSE) {
0c530ab8 1417 fpinit();
1c79356b 1418 } else {
060df5ea 1419 fpu_load_registers(ifps);
1c79356b 1420 }
0a7de745 1421 ifps->fp_valid = FALSE; /* in FPU */
1c79356b
A
1422}
1423
1c79356b 1424/*
0c530ab8
A
1425 * SSE arithmetic exception handling code.
1426 * Basically the same as the x87 exception handler with a different subtype
1c79356b
A
1427 */
1428
1429void
0c530ab8 1430fpSSEexterrflt(void)
1c79356b 1431{
0a7de745 1432 thread_t thr_act = current_thread();
6d2010ae 1433 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0a7de745 1434 boolean_t intr;
4452a7af 1435
0c530ab8
A
1436 intr = ml_set_interrupts_enabled(FALSE);
1437
0a7de745 1438 if (get_interrupt_level()) {
0c530ab8 1439 panic("SSE exception at interrupt context\n");
0a7de745
A
1440 }
1441 if (current_task() == kernel_task) {
0c530ab8 1442 panic("SSE exception in kernel thread context\n");
0a7de745 1443 }
1c79356b
A
1444
1445 /*
0c530ab8 1446 * Save the FPU state and turn off the FPU.
1c79356b 1447 */
1c79356b 1448 fp_save(thr_act);
1c79356b 1449
0c530ab8 1450 (void)ml_set_interrupts_enabled(intr);
1c79356b 1451 /*
0c530ab8
A
1452 * Raise FPU exception.
1453 * Locking not needed on pcb->ifps,
1454 * since thread is running.
1c79356b 1455 */
cc8bc92a 1456 const uint32_t mask = (ifps->fx_MXCSR >> 7) &
0a7de745 1457 (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE);
cc8bc92a 1458 const uint32_t xcpt = ~mask & (ifps->fx_MXCSR &
0a7de745 1459 (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
cc8bc92a 1460 fpxlog(EXC_I386_SSEEXTERR, ifps->fx_MXCSR, ifps->fx_MXCSR, xcpt);
0c530ab8
A
1461}
1462
5ba3f43e 1463
5ba3f43e
A
1464/*
1465 * If a thread is using an AVX-sized savearea:
1466 * - allocate a new AVX512-sized area,
1467 * - copy the 256-bit state into the 512-bit area,
1468 * - deallocate the smaller area
ea3f0419 1469 * ASSUMES: thread is the current thread.
5ba3f43e
A
1470 */
1471static void
1472fpu_savearea_promote_avx512(thread_t thread)
1473{
0a7de745
A
1474 struct x86_avx_thread_state *ifps = NULL;
1475 struct x86_avx512_thread_state *ifps512 = NULL;
1476 pcb_t pcb = THREAD_TO_PCB(thread);
1477 boolean_t do_avx512_alloc = FALSE;
ea3f0419 1478 boolean_t intr;
5ba3f43e 1479
ea3f0419
A
1480 assert(thread == current_thread());
1481
1482 DBG("fpu_savearea_promote_avx512(%p)\n", thread);
cc8bc92a 1483
0a7de745 1484 simple_lock(&pcb->lock, LCK_GRP_NULL);
cc8bc92a 1485
5ba3f43e
A
1486 ifps = pcb->ifps;
1487 if (ifps == NULL) {
cc8bc92a 1488 pcb->xstate = AVX512;
5ba3f43e 1489 simple_unlock(&pcb->lock);
ea3f0419
A
1490 /*
1491 * Now that the PCB xstate has been promoted, set XCR0 so
1492 * that we don't re-trip #UD on the next AVX-512 instruction.
1493 *
1494 * Since this branch is taken when the first FP instruction
1495 * attempted by this thread is an AVX-512 instruction, we
1496 * call fpnoextflt() to allocate an appropriately-sized
1497 * AVX-512 save-area, thereby avoiding the overhead of another
1498 * fault that would be triggered immediately on return.
1499 */
1500 intr = ml_set_interrupts_enabled(FALSE);
1501 xsetbv(0, AVX512_XMASK);
1502 current_cpu_datap()->cpu_xstate = AVX512;
1503 (void)ml_set_interrupts_enabled(intr);
cc8bc92a 1504
cc8bc92a 1505 fpnoextflt();
5ba3f43e
A
1506 return;
1507 }
cc8bc92a
A
1508
1509 if (pcb->xstate != AVX512) {
1510 do_avx512_alloc = TRUE;
1511 }
ea3f0419 1512
cc8bc92a
A
1513 simple_unlock(&pcb->lock);
1514
1515 if (do_avx512_alloc == TRUE) {
1516 ifps512 = fp_state_alloc(AVX512);
1517 }
1518
0a7de745 1519 simple_lock(&pcb->lock, LCK_GRP_NULL);
5ba3f43e 1520
ea3f0419 1521 intr = ml_set_interrupts_enabled(FALSE);
5ba3f43e 1522
ea3f0419
A
1523 clear_ts();
1524 fp_save(thread);
1525 clear_fpu();
1526
1527 xsetbv(0, AVX512_XMASK);
1528 current_cpu_datap()->cpu_xstate = AVX512;
1529 (void)ml_set_interrupts_enabled(intr);
5ba3f43e 1530
5ba3f43e
A
1531 assert(ifps->fp.fp_valid);
1532
1533 /* Allocate an AVX512 savearea and copy AVX state into it */
cc8bc92a 1534 if (pcb->xstate != AVX512) {
cb323159 1535 __nochk_bcopy(ifps, ifps512, fp_state_size[AVX]);
cc8bc92a
A
1536 pcb->ifps = ifps512;
1537 pcb->xstate = AVX512;
1538 ifps512 = NULL;
1539 } else {
1540 ifps = NULL;
1541 }
1542 /* The PCB lock is redundant in some scenarios given the higher level
1543 * thread mutex, but its pre-emption disablement is relied upon here
1544 */
5ba3f43e 1545 simple_unlock(&pcb->lock);
cc8bc92a
A
1546
1547 if (ifps) {
1548 fp_state_free(ifps, AVX);
1549 }
1550 if (ifps512) {
1551 fp_state_free(ifps, AVX512);
1552 }
5ba3f43e
A
1553}
1554
1555/*
1556 * Upgrade the calling thread to AVX512.
1557 */
1558boolean_t
1559fpu_thread_promote_avx512(thread_t thread)
1560{
0a7de745 1561 task_t task = current_task();
5ba3f43e 1562
0a7de745 1563 if (thread != current_thread()) {
5ba3f43e 1564 return FALSE;
0a7de745
A
1565 }
1566 if (!ml_fpu_avx512_enabled()) {
5ba3f43e 1567 return FALSE;
0a7de745 1568 }
5ba3f43e
A
1569
1570 fpu_savearea_promote_avx512(thread);
1571
1572 /* Racy but the task's xstate is only a hint */
1573 task->xstate = AVX512;
1574
1575 return TRUE;
1576}
1577
1578
1579/*
1580 * Called from user_trap() when an invalid opcode fault is taken.
1581 * If the user is attempting an AVX512 instruction on a machine
1582 * that supports this, we switch the calling thread to use
1583 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
ea3f0419
A
1584 * return to user_trap() with a 0 return value.
1585 * Otherwise, simply return a nonzero value.
5ba3f43e 1586 */
ea3f0419 1587
94ff46dc
A
1588#define MAX_X86_INSN_LENGTH (15)
1589int
5ba3f43e
A
1590fpUDflt(user_addr_t rip)
1591{
0a7de745
A
1592 uint8_t instruction_prefix;
1593 boolean_t is_AVX512_instruction = FALSE;
1594 user_addr_t original_rip = rip;
5ba3f43e 1595 do {
cc8bc92a
A
1596 /* TODO: as an optimisation, copy up to the lesser of the
1597 * next page boundary or maximal prefix length in one pass
1598 * rather than issue multiple copyins
1599 */
1600 if (copyin(rip, (char *) &instruction_prefix, 1)) {
94ff46dc 1601 return 1;
cc8bc92a 1602 }
5ba3f43e 1603 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
0a7de745 1604 rip, instruction_prefix);
cc8bc92a
A
1605 /* TODO: determine more specifically which prefixes
1606 * are sane possibilities for AVX512 insns
1607 */
5ba3f43e 1608 switch (instruction_prefix) {
0a7de745
A
1609 case 0x2E: /* CS segment override */
1610 case 0x36: /* SS segment override */
1611 case 0x3E: /* DS segment override */
1612 case 0x26: /* ES segment override */
1613 case 0x64: /* FS segment override */
1614 case 0x65: /* GS segment override */
1615 case 0x66: /* Operand-size override */
1616 case 0x67: /* address-size override */
5ba3f43e
A
1617 /* Skip optional prefixes */
1618 rip++;
cc8bc92a 1619 if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
94ff46dc 1620 return 1;
cc8bc92a 1621 }
5ba3f43e 1622 break;
0a7de745
A
1623 case 0x62: /* EVEX */
1624 case 0xC5: /* VEX 2-byte */
1625 case 0xC4: /* VEX 3-byte */
5ba3f43e
A
1626 is_AVX512_instruction = TRUE;
1627 break;
0a7de745 1628 default:
94ff46dc 1629 return 1;
5ba3f43e
A
1630 }
1631 } while (!is_AVX512_instruction);
1632
1633 /* Here if we detect attempted execution of an AVX512 instruction */
1634
1635 /*
cc8bc92a 1636 * Fail if this machine doesn't support AVX512
5ba3f43e 1637 */
0a7de745 1638 if (fpu_capability != AVX512) {
94ff46dc 1639 return 1;
0a7de745 1640 }
5ba3f43e
A
1641
1642 assert(xgetbv(XCR0) == AVX_XMASK);
1643
1644 DBG("fpUDflt() switching xstate to AVX512\n");
1645 (void) fpu_thread_promote_avx512(current_thread());
1646
94ff46dc 1647 return 0;
5ba3f43e 1648}
5ba3f43e 1649
0c530ab8 1650void
0a7de745
A
1651fp_setvalid(boolean_t value)
1652{
1653 thread_t thr_act = current_thread();
6d2010ae 1654 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0c530ab8
A
1655
1656 if (ifps) {
0a7de745 1657 ifps->fp_valid = value;
0c530ab8 1658
060df5ea
A
1659 if (value == TRUE) {
1660 boolean_t istate = ml_set_interrupts_enabled(FALSE);
0a7de745 1661 clear_fpu();
060df5ea
A
1662 ml_set_interrupts_enabled(istate);
1663 }
0c530ab8 1664 }
1c79356b 1665}
060df5ea 1666
316670eb 1667boolean_t
0a7de745
A
1668ml_fpu_avx_enabled(void)
1669{
1670 return fpu_capability >= AVX;
5ba3f43e
A
1671}
1672
5ba3f43e 1673boolean_t
0a7de745
A
1674ml_fpu_avx512_enabled(void)
1675{
1676 return fpu_capability == AVX512;
5ba3f43e 1677}
5ba3f43e
A
1678
1679static xstate_t
1680task_xstate(task_t task)
1681{
0a7de745 1682 if (task == TASK_NULL) {
5ba3f43e 1683 return fpu_default;
0a7de745 1684 } else {
5ba3f43e 1685 return task->xstate;
0a7de745 1686 }
5ba3f43e
A
1687}
1688
1689static xstate_t
1690thread_xstate(thread_t thread)
1691{
1692 xstate_t xs = THREAD_TO_PCB(thread)->xstate;
0a7de745 1693 if (xs == UNDEFINED) {
5ba3f43e 1694 return task_xstate(thread->task);
0a7de745 1695 } else {
5ba3f43e 1696 return xs;
0a7de745 1697 }
5ba3f43e
A
1698}
1699
1700xstate_t
1701current_xstate(void)
1702{
1703 return thread_xstate(current_thread());
1704}
1705
1706/*
1707 * Called when exec'ing between bitnesses.
1708 * If valid FPU state exists, adjust the layout.
1709 */
1710void
1711fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1712{
1713 struct x86_fx_thread_state *ifps = thread->machine.ifps;
d26ffc64 1714 mp_disable_preemption();
5ba3f43e
A
1715
1716 if (ifps && ifps->fp_valid) {
1717 if (thread_xstate(thread) == FP) {
1718 ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1719 } else {
1720 ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1721 }
1722 }
d26ffc64
A
1723 mp_enable_preemption();
1724}
1725
0a7de745
A
/*
 * Count the set bits in the `sz` bytes starting at address `ins`.
 *
 * The buffer is consumed 16 bytes at a time (as two 64-bit words), then
 * 4 bytes at a time, then byte by byte. Returns 0 when sz <= 0.
 *
 * NOTE(review): the 64-bit and 32-bit loads go through plain pointer
 * casts, so `ins` is presumably expected to be suitably aligned - confirm
 * against callers.
 */
static inline uint32_t
fpsimd_pop(uintptr_t ins, int sz)
{
	uint32_t rv = 0;

	while (sz >= 16) {
		const uint64_t *lo64 = (const uint64_t *) ins;
		const uint64_t *hi64 = (const uint64_t *) (ins + 8);

		rv += (uint32_t)__builtin_popcountll(*lo64);
		rv += (uint32_t)__builtin_popcountll(*hi64);
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		const uint32_t *ins32 = (const uint32_t *) ins;

		rv += (uint32_t)__builtin_popcount(*ins32);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		/*
		 * Read the tail as unsigned bytes. Through a plain (signed)
		 * char, a byte >= 0x80 would sign-extend before the popcount
		 * and contribute 24 extra bits.
		 */
		const uint8_t *ins8 = (const uint8_t *) ins;

		rv += (uint32_t)__builtin_popcount((unsigned int)*ins8);
		sz--;
		ins++;
	}
	return rv;
}
1758
0a7de745
A
1759uint32_t
1760thread_fpsimd_hash(thread_t ft)
1761{
1762 if (fpsimd_fault_popc == 0) {
d26ffc64 1763 return 0;
0a7de745 1764 }
d26ffc64
A
1765
1766 uint32_t prv = 0;
1767 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1768 struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;
1769
1770 if (pifps) {
1771 if (pifps->fp_valid) {
1772 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1773 sizeof(pifps->fx_XMM_reg));
1774 } else {
1775 uintptr_t cr0 = get_cr0();
1776 clear_ts();
1777 fp_save(ft);
1778 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1779 sizeof(pifps->fx_XMM_reg));
1780 pifps->fp_valid = FALSE;
1781 if (cr0 & CR0_TS) {
1782 set_cr0(cr0);
1783 }
1784 }
1785 }
1786 ml_set_interrupts_enabled(istate);
1787 return prv;
060df5ea 1788}