]> git.saurik.com Git - apple/xnu.git/blame - osfmk/i386/fpu.c
xnu-4903.241.1.tar.gz
[apple/xnu.git] / osfmk / i386 / fpu.c
CommitLineData
1c79356b 1/*
d26ffc64 2 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
1c79356b
A
56
57#include <mach/exception_types.h>
58#include <mach/i386/thread_status.h>
59#include <mach/i386/fp_reg.h>
60
61#include <kern/mach_param.h>
91447636 62#include <kern/processor.h>
1c79356b
A
63#include <kern/thread.h>
64#include <kern/zalloc.h>
65#include <kern/misc_protos.h>
66#include <kern/spl.h>
67#include <kern/assert.h>
68
060df5ea
A
69#include <libkern/OSAtomic.h>
70
0c530ab8 71#include <architecture/i386/pio.h>
55e303ae 72#include <i386/cpuid.h>
b0d623f7 73#include <i386/fpu.h>
0c530ab8 74#include <i386/proc_reg.h>
b0d623f7
A
75#include <i386/misc_protos.h>
76#include <i386/thread.h>
77#include <i386/trap.h>
1c79356b 78
5ba3f43e
A
/*
 * Extended-state levels for this machine. Both start out UNDEFINED and are
 * assigned by init_fpu().
 */
xstate_t	fpu_capability = UNDEFINED;	/* extended state capability */
xstate_t	fpu_default = UNDEFINED;	/* default extended state */

/*
 * True when addr is a multiple of size. The test is a bit mask, so size
 * must be a power of two.
 */
#define ALIGNED(addr,size)	(((uintptr_t)(addr)&((size)-1))==0)
1c79356b
A
83
84/* Forward */
85
/* Defined outside this part of the file. */
extern void		fpinit(void);
extern void		fp_save(
				thread_t	thr_act);
extern void		fp_load(
				thread_t	thr_act);

/* File-local helpers used before their definitions. */
static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
static xstate_t thread_xstate(thread_t);
0c530ab8 94
/*
 * 64-byte aligned extended-state images. fpu_module_init() hands
 * initial_fp_state to configure_mxcsr_capability_mask(), which also captures
 * the default_* images with XSAVE/FXSAVE.
 */
x86_ext_thread_state_t	initial_fp_state __attribute((aligned(64)));
x86_ext_thread_state_t	default_avx512_state __attribute((aligned(64)));
x86_ext_thread_state_t	default_avx_state __attribute((aligned(64)));
x86_ext_thread_state_t	default_fx_state __attribute((aligned(64)));

/* Global MXCSR capability bitmask */
static unsigned int mxcsr_capability_mask;
102
060df5ea
A
/*
 * Thin wrappers around individual x87 instructions.
 * Every macro expands to a single asm statement WITHOUT a trailing
 * semicolon, so each use is written as an ordinary statement: fninit();
 */
#define fninit() \
	__asm__ volatile("fninit")

/* control: pointer to the unsigned short that receives the control word */
#define fnstcw(control) \
	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))

/* control: an lvalue (not a pointer) holding the control word to load */
#define fldcw(control) \
	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )

#define fnclex() \
	__asm__ volatile("fnclex")

/* state: pointer to the save area; the macro dereferences it */
#define fnsave(state) \
	__asm__ volatile("fnsave %0" : "=m" (*state))

/* state: the save-area object itself (an lvalue); it is NOT dereferenced */
#define frstor(state) \
	__asm__ volatile("frstor %0" : : "m" (state))

/*
 * The expansion used to end in ';', which turned "fwait();" into two
 * statements and made "if (x) fwait(); else ..." a syntax error.
 */
#define fwait() \
	__asm__("fwait")
123
5ba3f43e
A
/* Issue FXRSTOR with *a as its memory operand. */
static inline void
fxrstor(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxrstor %0" : : "m" (*a));
}
127
/* Issue FXSAVE with *a as its (output) memory operand. */
static inline void
fxsave(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxsave %0" : "=m" (*a));
}
131
/* Issue FXRSTOR64 with *a as its memory operand. */
static inline void
fxrstor64(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxrstor64 %0" : : "m" (*a));
}
135
/* Issue FXSAVE64 with *a as its (output) memory operand. */
static inline void
fxsave64(struct x86_fx_thread_state *a)
{
	__asm__ volatile ("fxsave64 %0" : "=m" (*a));
}
139
/*
 * True when x names a usable extended state: FP, AVX, or (unless
 * RC_HIDE_XNU_J137 is defined) AVX512. UNDEFINED is never valid.
 * Note that x is evaluated more than once.
 */
#if !defined(RC_HIDE_XNU_J137)
#define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX || (x) == AVX512)
#else
#define IS_VALID_XSTATE(x)	((x) == FP || (x) == AVX)
#endif
145
/* Save-area zones indexed by xstate_t; populated by fpu_module_init(). */
zone_t		ifps_zone[] = {
	[FP]     = NULL,
	[AVX]    = NULL,
#if !defined(RC_HIDE_XNU_J137)
	[AVX512] = NULL
#endif
};
/* Save-area size in bytes for each xstate_t. */
static uint32_t	fp_state_size[] = {
	[FP]     = sizeof(struct x86_fx_thread_state),
	[AVX]    = sizeof(struct x86_avx_thread_state),
#if !defined(RC_HIDE_XNU_J137)
	[AVX512] = sizeof(struct x86_avx512_thread_state)
#endif
};

/* Printable name for each xstate_t, used in diagnostics. */
static const char *xstate_name[] = {
	[UNDEFINED] = "UNDEFINED",
	[FP] = "FP",
	[AVX] = "AVX",
#if !defined(RC_HIDE_XNU_J137)
	[AVX512] = "AVX512"
#endif
};
060df5ea 169
5ba3f43e
A
#if !defined(RC_HIDE_XNU_J137)
#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
/*
 * On-demand AVX512 support
 * ------------------------
 * On machines with AVX512 support, by default, threads are created with
 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
 * capabilities are advertised in the commpage and via sysctl. If a thread
 * opts to use AVX512 instructions, the first will result in a #UD exception.
 * Faulting AVX512 instructions are recognizable by their unique prefix.
 * This exception results in the thread being promoted to use an AVX512-sized
 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
 * instruction is re-driven and the thread can proceed to perform AVX512
 * operations.
 *
 * In addition to AVX512 instructions causing promotion, the thread_set_state()
 * primitive with an AVX512 state flavor results in promotion.
 *
 * AVX512 promotion of the first thread in a task causes the default xstate
 * of the task to be promoted so that any subsequently created or subsequently
 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
 * a promoted xstate.
 *
 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
 * and a second pool of larger AVX512-sized (2688 byte) areas.
 *
 * Note the initial state value is an AVX512 object but that the AVX initial
 * value is a subset of it.
 */
#else
/* NOTE: fpu_ZMM_capable is not defined in this configuration. */
#define fpu_YMM_capable (fpu_capability == AVX)
#endif
060df5ea
A
/*
 * Flipped from 0 to 1 with OSCompareAndSwap() in init_fpu() so that
 * cpuid_set_info() is re-run only once after OSXSAVE is enabled.
 */
static uint32_t	cpuid_reevaluated = 0;

/* Register save/restore primitives, defined later in this file. */
static void fpu_store_registers(void *, boolean_t);
static void fpu_load_registers(void *);
207
5ba3f43e 208#if !defined(RC_HIDE_XNU_J137)
5ba3f43e
A
209static const uint32_t xstate_xmask[] = {
210 [FP] = FP_XMASK,
211 [AVX] = AVX_XMASK,
212 [AVX512] = AVX512_XMASK
213};
214#else
215static const uint32_t xstate_xmask[] = {
216 [FP] = FP_XMASK,
217 [AVX] = AVX_XMASK,
218};
219#endif
060df5ea 220
5ba3f43e
A
/*
 * Issue XSAVE on the area starting at *a.
 * rfbm is passed in EAX with EDX zeroed.
 *
 * The "=m" operand only describes sizeof(*a) bytes, while callers in this
 * file hand in larger AVX/AVX512 save areas. The "memory" clobber tells the
 * compiler that bytes beyond *a may be written as well.
 */
static inline void xsave(struct x86_fx_thread_state *a, uint32_t rfbm) {
	__asm__ __volatile__("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0) : "memory");
}
224
/*
 * Issue XSAVE64 on the area starting at *a.
 * rfbm is passed in EAX with EDX zeroed.
 *
 * The "=m" operand only describes sizeof(*a) bytes, while callers in this
 * file hand in larger AVX/AVX512 save areas. The "memory" clobber tells the
 * compiler that bytes beyond *a may be written as well.
 */
static inline void xsave64(struct x86_fx_thread_state *a, uint32_t rfbm) {
	__asm__ __volatile__("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0) : "memory");
}
228
/*
 * Issue XRSTOR on the area starting at *a.
 * rfbm is passed in EAX with EDX zeroed.
 *
 * The "m" operand only describes sizeof(*a) bytes, while callers in this
 * file hand in larger AVX/AVX512 save areas. The "memory" clobber keeps the
 * compiler from moving stores to the rest of the area past this instruction.
 */
static inline void xrstor(struct x86_fx_thread_state *a, uint32_t rfbm) {
	__asm__ __volatile__("xrstor %0" ::  "m" (*a), "a"(rfbm), "d"(0) : "memory");
}
232
/*
 * Issue XRSTOR64 on the area starting at *a.
 * rfbm is passed in EAX with EDX zeroed.
 *
 * The "m" operand only describes sizeof(*a) bytes, while callers in this
 * file hand in larger AVX/AVX512 save areas. The "memory" clobber keeps the
 * compiler from moving stores to the rest of the area past this instruction.
 */
static inline void xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm) {
	__asm__ __volatile__("xrstor64 %0" ::  "m" (*a), "a"(rfbm), "d"(0) : "memory");
}
236
5ba3f43e 237#if !defined(RC_HIDE_XNU_J137)
d26ffc64 238__unused static inline void vzeroupper(void) {
5ba3f43e
A
239 __asm__ __volatile__("vzeroupper" ::);
240}
5ba3f43e
A
241
/* Promote a thread to AVX512 xstate; fpu_set_fxstate() treats FALSE as failure. */
static boolean_t fpu_thread_promote_avx512(thread_t); 	/* Forward */
243
/*
 * Define a wrapper for bcopy to defeat destination size checks.
 * This is needed to treat repeated objects such as
 *	_STRUCT_XMM_REG		fpu_ymmh0;
 *	...
 *	_STRUCT_XMM_REG		fpu_ymmh7;
 * as an array and to copy like so:
 *	bcopy_nochk(src,&dst->fpu_ymmh0,8*sizeof(_STRUCT_XMM_REG));
 * without the compiler throwing a __builtin__memmove_chk error.
 *
 * _src:  first byte to copy from; never written, hence const
 * _dst:  first byte to copy to
 * _len:  number of bytes to copy
 */
static inline void bcopy_nochk(const void *_src, void *_dst, size_t _len) {
	bcopy(_src, _dst, _len);
}
257
/*
 * Furthermore, make compile-time asserts that no padding creeps into structures
 * for which we're doing this.
 *
 * ASSERT_PACKED(t, m1, m2, n, mt) declares an array whose size is -1, and so
 * fails to compile, unless members m1..m2 of type t span exactly n objects
 * of type mt. t must be a single identifier because it is pasted into the
 * name of the declared array. n is parenthesized so that an expression
 * argument such as "1 << 2" keeps its meaning.
 */
#define ASSERT_PACKED(t, m1, m2, n, mt)			\
extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2	\
	[(offsetof(t,m2) - offsetof(t,m1) == ((n) - 1)*sizeof(mt)) ? 1 : -1]
265
/* 32-bit AVX flavor: 8 upper-YMM halves */
ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);

/* 64-bit AVX flavor: 16 upper-YMM halves */
ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);

/* 32-bit AVX512 flavor: opmask, upper-YMM and upper-ZMM runs */
ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);

/* 64-bit AVX512 flavor: as above, plus the zmm16..zmm31 run */
ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
278
279#if defined(DEBUG_AVX512)
280
281#define DBG(x...) kprintf("DBG: " x)
282
283typedef struct { uint8_t byte[8]; } opmask_t;
284typedef struct { uint8_t byte[16]; } xmm_t;
285typedef struct { uint8_t byte[32]; } ymm_t;
286typedef struct { uint8_t byte[64]; } zmm_t;
287
288static void
289DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
290{
291 int i, j;
292 xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg;
293 xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
294 ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
295 zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM;
296 opmask_t *k = (opmask_t *) &sp->x_Opmask;
297
298 kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
299 kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
300 kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
301 kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
302
303 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
304 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
305
306 /* Print all ZMM registers */
307 for (i = 0; i < 16; i++) {
308 kprintf("zmm%d:\t0x", i);
309 for (j = 0; j < 16; j++)
310 kprintf("%02x", xmm[i].byte[j]);
311 for (j = 0; j < 16; j++)
312 kprintf("%02x", ymmh[i].byte[j]);
313 for (j = 0; j < 32; j++)
314 kprintf("%02x", zmmh[i].byte[j]);
315 kprintf("\n");
316 }
317 for (i = 0; i < 16; i++) {
318 kprintf("zmm%d:\t0x", 16+i);
319 for (j = 0; j < 64; j++)
320 kprintf("%02x", zmm[i].byte[j]);
321 kprintf("\n");
322 }
323 for (i = 0; i < 8; i++) {
324 kprintf("k%d:\t0x", i);
325 for (j = 0; j < 8; j++)
326 kprintf("%02x", k[i].byte[j]);
327 kprintf("\n");
328 }
329
330 kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
331 kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
332}
333#else
334#define DBG(x...)
335static void
336DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
337{
338 return;
339}
340#endif /* DEBUG_AVX512 */
341
342#endif
060df5ea 343
fe8ab488 344#if DEBUG
060df5ea
A
/* Read the x87 status word with FNSTSW and return it. */
static inline unsigned short
fnstsw(void)
{
	unsigned short sw;

	__asm__ volatile("fnstsw %0" : "=ma" (sw));
	return sw;
}
fe8ab488 352#endif
060df5ea 353
0c530ab8 354/*
060df5ea 355 * Configure the initial FPU state presented to new threads.
0c530ab8
A
356 * Determine the MXCSR capability mask, which allows us to mask off any
357 * potentially unsafe "reserved" bits before restoring the FPU context.
358 * *Not* per-cpu, assumes symmetry.
359 */
060df5ea 360
0c530ab8 361static void
5ba3f43e 362configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
0c530ab8 363{
060df5ea
A
364 /* XSAVE requires a 64 byte aligned store */
365 assert(ALIGNED(fps, 64));
0c530ab8 366 /* Clear, to prepare for the diagnostic FXSAVE */
060df5ea
A
367 bzero(fps, sizeof(*fps));
368
369 fpinit();
370 fpu_store_registers(fps, FALSE);
371
5ba3f43e 372 mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;
0c530ab8
A
373
374 /* Set default mask value if necessary */
375 if (mxcsr_capability_mask == 0)
376 mxcsr_capability_mask = 0xffbf;
377
060df5ea 378 /* Clear vector register store */
5ba3f43e
A
379 bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
380 bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
381#if !defined(RC_HIDE_XNU_J137)
382 if (fpu_ZMM_capable) {
383 bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
384 bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
385 bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
386 }
387#endif
0c530ab8 388
5ba3f43e
A
389 fps->fx.fp_valid = TRUE;
390 fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
060df5ea 391 fpu_load_registers(fps);
0c530ab8 392
d26ffc64
A
393 if (fpu_ZMM_capable) {
394 xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
395 }
396 if (fpu_YMM_capable) {
397 xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
398 } else {
399 fxsave64((struct x86_fx_thread_state *)&default_fx_state);
400 }
401
060df5ea 402 /* Poison values to trap unsafe usage */
5ba3f43e
A
403 fps->fx.fp_valid = 0xFFFFFFFF;
404 fps->fx.fp_save_layout = FP_UNUSED;
0c530ab8 405
060df5ea
A
406 /* Re-enable FPU/SSE DNA exceptions */
407 set_ts();
0c530ab8
A
408}
409
/* Overridable through the "fpsimd_fault_popc" boot-arg, parsed in init_fpu(). */
int fpsimd_fault_popc = 0;
1c79356b
A
411/*
412 * Look for FPU and initialize it.
413 * Called on each CPU.
414 */
415void
416init_fpu(void)
417{
060df5ea
A
418#if DEBUG
419 unsigned short status;
420 unsigned short control;
421#endif
1c79356b
A
422 /*
423 * Check for FPU by initializing it,
424 * then trying to read the correct bit patterns from
425 * the control and status registers.
426 */
91447636 427 set_cr0((get_cr0() & ~(CR0_EM|CR0_TS)) | CR0_NE); /* allow use of FPU */
1c79356b 428 fninit();
060df5ea 429#if DEBUG
1c79356b
A
430 status = fnstsw();
431 fnstcw(&control);
060df5ea
A
432
433 assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
434#endif
435 /* Advertise SSE support */
436 if (cpuid_features() & CPUID_FEATURE_FXSR) {
060df5ea
A
437 set_cr4(get_cr4() | CR4_OSFXS);
438 /* And allow SIMD exceptions if present */
439 if (cpuid_features() & CPUID_FEATURE_SSE) {
440 set_cr4(get_cr4() | CR4_OSXMM);
441 }
060df5ea
A
442 } else
443 panic("fpu is not FP_FXSR");
55e303ae 444
5ba3f43e
A
445 fpu_capability = fpu_default = FP;
446
d26ffc64
A
447 PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
448
5ba3f43e
A
449#if !defined(RC_HIDE_XNU_J137)
450 static boolean_t is_avx512_enabled = TRUE;
451 if (cpu_number() == master_cpu) {
452 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
453 PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
454 kprintf("AVX512 supported %s\n",
455 is_avx512_enabled ? "and enabled" : "but disabled");
456 }
457 }
458#endif
459
060df5ea
A
460 /* Configure the XSAVE context mechanism if the processor supports
461 * AVX/YMM registers
462 */
463 if (cpuid_features() & CPUID_FEATURE_XSAVE) {
5ba3f43e
A
464 cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
465#if !defined(RC_HIDE_XNU_J137)
466 if (is_avx512_enabled &&
467 (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) {
468 assert(xs0p->extended_state[eax] & XFEM_SSE);
469 assert(xs0p->extended_state[eax] & XFEM_YMM);
470 fpu_capability = AVX512;
471 /* XSAVE container size for all features */
472 set_cr4(get_cr4() | CR4_OSXSAVE);
473 xsetbv(0, AVX512_XMASK);
474 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
475 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated))
476 cpuid_set_info();
477 /* Verify that now selected state can be accommodated */
478 assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]);
479 /*
480 * AVX set until AVX512 is used.
481 * See comment above about on-demand AVX512 support.
482 */
483 xsetbv(0, AVX_XMASK);
484 fpu_default = AVX;
485 } else
486#endif
487 if (xs0p->extended_state[eax] & XFEM_YMM) {
488 assert(xs0p->extended_state[eax] & XFEM_SSE);
489 fpu_capability = AVX;
490 fpu_default = AVX;
060df5ea 491 /* XSAVE container size for all features */
060df5ea 492 set_cr4(get_cr4() | CR4_OSXSAVE);
5ba3f43e 493 xsetbv(0, AVX_XMASK);
060df5ea
A
494 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
495 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated))
496 cpuid_set_info();
3e170ce0 497 /* Verify that now selected state can be accommodated */
5ba3f43e 498 assert(xs0p->extended_state[ebx] == fp_state_size[AVX]);
060df5ea
A
499 }
500 }
5ba3f43e
A
501
502 if (cpu_number() == master_cpu)
503 kprintf("fpu_state: %s, state_size: %d\n",
504 xstate_name[fpu_capability],
505 fp_state_size[fpu_capability]);
060df5ea
A
506
507 fpinit();
d26ffc64 508 current_cpu_datap()->cpu_xstate = fpu_default;
060df5ea
A
509
510 /*
511 * Trap wait instructions. Turn off FPU for now.
512 */
513 set_cr0(get_cr0() | CR0_TS | CR0_MP);
514}
515
516/*
5ba3f43e 517 * Allocate and initialize FP state for specified xstate.
060df5ea
A
518 * Don't load state.
519 */
520static void *
5ba3f43e 521fp_state_alloc(xstate_t xs)
060df5ea 522{
5ba3f43e
A
523 struct x86_fx_thread_state *ifps;
524
525 assert(ifps_zone[xs] != NULL);
526 ifps = zalloc(ifps_zone[xs]);
0c530ab8 527
060df5ea
A
528#if DEBUG
529 if (!(ALIGNED(ifps,64))) {
5ba3f43e
A
530 panic("fp_state_alloc: %p, %u, %p, %u",
531 ifps, (unsigned) ifps_zone[xs]->elem_size,
532 (void *) ifps_zone[xs]->free_elements,
533 (unsigned) ifps_zone[xs]->alloc_size);
1c79356b 534 }
060df5ea 535#endif
5ba3f43e
A
536 bzero(ifps, fp_state_size[xs]);
537
060df5ea
A
538 return ifps;
539}
540
541static inline void
5ba3f43e 542fp_state_free(void *ifps, xstate_t xs)
060df5ea 543{
5ba3f43e
A
544 assert(ifps_zone[xs] != NULL);
545 zfree(ifps_zone[xs], ifps);
060df5ea
A
546}
547
/* Public wrapper: just calls set_ts(). */
void
clear_fpu(void)
{
	set_ts();
}
552
553
554static void fpu_load_registers(void *fstate) {
555 struct x86_fx_thread_state *ifps = fstate;
556 fp_save_layout_t layout = ifps->fp_save_layout;
557
5ba3f43e 558 assert(current_task() == NULL || \
d9a64523 559 (thread_is_64bit_addr(current_thread()) ? \
5ba3f43e
A
560 (layout == FXSAVE64 || layout == XSAVE64) : \
561 (layout == FXSAVE32 || layout == XSAVE32)));
060df5ea
A
562 assert(ALIGNED(ifps, 64));
563 assert(ml_get_interrupts_enabled() == FALSE);
564
565#if DEBUG
566 if (layout == XSAVE32 || layout == XSAVE64) {
567 struct x86_avx_thread_state *iavx = fstate;
568 unsigned i;
569 /* Verify reserved bits in the XSAVE header*/
5ba3f43e
A
570 if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()])
571 panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
060df5ea
A
572 for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++)
573 if (iavx->_xh.xhrsvd[i])
574 panic("Reserved bit set");
575 }
5ba3f43e 576 if (fpu_YMM_capable) {
060df5ea
A
577 if (layout != XSAVE32 && layout != XSAVE64)
578 panic("Inappropriate layout: %u\n", layout);
579 }
580#endif /* DEBUG */
581
5ba3f43e
A
582 switch (layout) {
583 case FXSAVE64:
584 fxrstor64(ifps);
585 break;
586 case FXSAVE32:
060df5ea 587 fxrstor(ifps);
5ba3f43e
A
588 break;
589 case XSAVE64:
590 xrstor64(ifps, xstate_xmask[current_xstate()]);
591 break;
592 case XSAVE32:
593 xrstor(ifps, xstate_xmask[current_xstate()]);
594 break;
595 default:
596 panic("fpu_load_registers() bad layout: %d\n", layout);
597 }
060df5ea
A
598}
599
600static void fpu_store_registers(void *fstate, boolean_t is64) {
601 struct x86_fx_thread_state *ifps = fstate;
602 assert(ALIGNED(ifps, 64));
5ba3f43e
A
603 xstate_t xs = current_xstate();
604 switch (xs) {
605 case FP:
606 if (is64) {
607 fxsave64(fstate);
608 ifps->fp_save_layout = FXSAVE64;
609 } else {
610 fxsave(fstate);
611 ifps->fp_save_layout = FXSAVE32;
612 }
613 break;
614 case AVX:
615#if !defined(RC_HIDE_XNU_J137)
616 case AVX512:
617#endif
618 if (is64) {
619 xsave64(ifps, xstate_xmask[xs]);
620 ifps->fp_save_layout = XSAVE64;
621 } else {
622 xsave(ifps, xstate_xmask[xs]);
623 ifps->fp_save_layout = XSAVE32;
624 }
625 break;
626 default:
627 panic("fpu_store_registers() bad xstate: %d\n", xs);
060df5ea 628 }
1c79356b
A
629}
630
631/*
632 * Initialize FP handling.
633 */
060df5ea 634
1c79356b
A
635void
636fpu_module_init(void)
637{
5ba3f43e
A
638 if (!IS_VALID_XSTATE(fpu_default))
639 panic("fpu_module_init: invalid extended state %u\n",
640 fpu_default);
060df5ea 641
5ba3f43e 642 /* We explicitly choose an allocation size of 13 pages = 64 * 832
060df5ea
A
643 * to eliminate waste for the 832 byte sized
644 * AVX XSAVE register save area.
645 */
5ba3f43e
A
646 ifps_zone[fpu_default] = zinit(fp_state_size[fpu_default],
647 thread_max * fp_state_size[fpu_default],
648 64 * fp_state_size[fpu_default],
649 "x86 fpsave state");
060df5ea 650
060df5ea
A
651 /* To maintain the required alignment, disable
652 * zone debugging for this zone as that appends
653 * 16 bytes to each element.
654 */
5ba3f43e
A
655 zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE);
656
657#if !defined(RC_HIDE_XNU_J137)
658 /*
659 * If AVX512 is supported, create a separate savearea zone.
660 * with allocation size: 19 pages = 32 * 2668
661 */
662 if (fpu_capability == AVX512) {
663 ifps_zone[AVX512] = zinit(fp_state_size[AVX512],
664 thread_max * fp_state_size[AVX512],
665 32 * fp_state_size[AVX512],
666 "x86 avx512 save state");
667 zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE);
668 }
669#endif
670
060df5ea
A
671 /* Determine MXCSR reserved bits and configure initial FPU state*/
672 configure_mxcsr_capability_mask(&initial_fp_state);
673}
674
675/*
5ba3f43e
A
676 * Context switch fpu state.
677 * Always save old thread`s FPU context but don't load new .. allow that to fault-in.
678 * Switch to the new task's xstate.
060df5ea 679 */
d26ffc64 680
060df5ea 681void
5ba3f43e 682fpu_switch_context(thread_t old, thread_t new)
060df5ea 683{
5ba3f43e 684 struct x86_fx_thread_state *ifps;
d26ffc64
A
685 cpu_data_t *cdp = current_cpu_datap();
686 xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;
060df5ea
A
687
688 assert(ml_get_interrupts_enabled() == FALSE);
5ba3f43e 689 ifps = (old)->machine.ifps;
060df5ea
A
690#if DEBUG
691 if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
692 panic("ifps->fp_valid: %u\n", ifps->fp_valid);
693 }
694#endif
695 if (ifps != 0 && (ifps->fp_valid == FALSE)) {
696 /* Clear CR0.TS in preparation for the FP context save. In
697 * theory, this shouldn't be necessary since a live FPU should
698 * indicate that TS is clear. However, various routines
699 * (such as sendsig & sigreturn) manipulate TS directly.
700 */
701 clear_ts();
702 /* registers are in FPU - save to memory */
d9a64523
A
703 boolean_t is64 = (thread_is_64bit_addr(old) &&
704 is_saved_state64(old->machine.iss));
705
706 fpu_store_registers(ifps, is64);
060df5ea 707 ifps->fp_valid = TRUE;
d26ffc64
A
708
709 if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
710 xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
711 } else if (fpu_YMM_capable) {
712 xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
713 } else {
714 fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
715 }
060df5ea 716 }
d26ffc64
A
717
718 assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
719 if (new_xstate != cdp->cpu_xstate) {
5ba3f43e
A
720 DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
721 old, new, xstate_name[new_xstate]);
722 xsetbv(0, xstate_xmask[new_xstate]);
d26ffc64 723 cdp->cpu_xstate = new_xstate;
5ba3f43e 724 }
060df5ea 725 set_ts();
1c79356b
A
726}
727
060df5ea 728
1c79356b
A
729/*
730 * Free a FPU save area.
731 * Called only when thread terminating - no locking necessary.
732 */
733void
5ba3f43e 734fpu_free(thread_t thread, void *fps)
1c79356b 735{
5ba3f43e
A
736 pcb_t pcb = THREAD_TO_PCB(thread);
737
738 fp_state_free(fps, pcb->xstate);
739 pcb->xstate = UNDEFINED;
1c79356b
A
740}
741
55e303ae
A
742/*
743 * Set the floating-point state for a thread based
744 * on the FXSave formatted data. This is basically
745 * the same as fpu_set_state except it uses the
746 * expanded data structure.
747 * If the thread is not the current thread, it is
748 * not running (held). Locking needed against
749 * concurrent fpu_set_state or fpu_get_state.
750 */
751kern_return_t
752fpu_set_fxstate(
060df5ea
A
753 thread_t thr_act,
754 thread_state_t tstate,
755 thread_flavor_t f)
55e303ae 756{
5ba3f43e
A
757 struct x86_fx_thread_state *ifps;
758 struct x86_fx_thread_state *new_ifps;
759 x86_float_state64_t *state;
760 pcb_t pcb;
761 boolean_t old_valid, fresh_state = FALSE;
fe8ab488 762
5ba3f43e 763 if (fpu_capability == UNDEFINED)
fe8ab488 764 return KERN_FAILURE;
0c530ab8 765
bd504ef0 766 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
5ba3f43e 767 fpu_capability < AVX)
fe8ab488 768 return KERN_FAILURE;
bd504ef0 769
5ba3f43e
A
770#if !defined(RC_HIDE_XNU_J137)
771 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
772 thread_xstate(thr_act) == AVX)
773 if (!fpu_thread_promote_avx512(thr_act))
774 return KERN_FAILURE;
775#endif
776
0c530ab8 777 state = (x86_float_state64_t *)tstate;
55e303ae 778
91447636 779 assert(thr_act != THREAD_NULL);
6d2010ae 780 pcb = THREAD_TO_PCB(thr_act);
55e303ae 781
0c530ab8 782 if (state == NULL) {
fe8ab488
A
783 /*
784 * new FPU state is 'invalid'.
785 * Deallocate the fp state if it exists.
786 */
787 simple_lock(&pcb->lock);
0c530ab8
A
788
789 ifps = pcb->ifps;
790 pcb->ifps = 0;
4452a7af 791
fe8ab488 792 simple_unlock(&pcb->lock);
0c530ab8 793
fe8ab488 794 if (ifps != 0) {
5ba3f43e 795 fp_state_free(ifps, thread_xstate(thr_act));
fe8ab488 796 }
0c530ab8 797 } else {
fe8ab488
A
798 /*
799 * Valid incoming state. Allocate the fp state if there is none.
800 */
801 new_ifps = 0;
802 Retry:
803 simple_lock(&pcb->lock);
0c530ab8
A
804
805 ifps = pcb->ifps;
fe8ab488
A
806 if (ifps == 0) {
807 if (new_ifps == 0) {
808 simple_unlock(&pcb->lock);
5ba3f43e 809 new_ifps = fp_state_alloc(thread_xstate(thr_act));
fe8ab488
A
810 goto Retry;
811 }
812 ifps = new_ifps;
813 new_ifps = 0;
814 pcb->ifps = ifps;
5ba3f43e 815 pcb->xstate = thread_xstate(thr_act);
fe8ab488
A
816 fresh_state = TRUE;
817 }
818
819 /*
820 * now copy over the new data.
821 */
822
823 old_valid = ifps->fp_valid;
824
825#if DEBUG || DEVELOPMENT
826 if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
827 panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
55e303ae 828 }
060df5ea 829#endif
fe8ab488
A
830 /*
831 * Clear any reserved bits in the MXCSR to prevent a GPF
832 * when issuing an FXRSTOR.
833 */
7ddcb079 834
fe8ab488 835 state->fpu_mxcsr &= mxcsr_capability_mask;
060df5ea 836
5ba3f43e 837 bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
060df5ea 838
5ba3f43e
A
839 switch (thread_xstate(thr_act)) {
840 case UNDEFINED:
841 panic("fpu_set_fxstate() UNDEFINED xstate");
842 break;
843 case FP:
d9a64523 844 ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
5ba3f43e
A
845 break;
846 case AVX: {
fe8ab488 847 struct x86_avx_thread_state *iavx = (void *) ifps;
5ba3f43e 848 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
7ddcb079 849
d9a64523 850 iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
7ddcb079 851
5ba3f43e
A
852 /* Sanitize XSAVE header */
853 bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
854 iavx->_xh.xstate_bv = AVX_XMASK;
855 iavx->_xh.xcomp_bv = 0;
856
857 if (f == x86_AVX_STATE32) {
858 bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
859 } else if (f == x86_AVX_STATE64) {
860 bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
861 } else {
862 iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
fe8ab488 863 }
5ba3f43e
A
864 break;
865 }
866#if !defined(RC_HIDE_XNU_J137)
867 case AVX512: {
868 struct x86_avx512_thread_state *iavx = (void *) ifps;
869 union {
870 thread_state_t ts;
871 x86_avx512_state32_t *s32;
872 x86_avx512_state64_t *s64;
873 } xs = { .ts = tstate };
874
d9a64523 875 iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
fe8ab488 876
fe8ab488
A
877 /* Sanitize XSAVE header */
878 bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
5ba3f43e
A
879 iavx->_xh.xstate_bv = AVX512_XMASK;
880 iavx->_xh.xcomp_bv = 0;
881
882 switch (f) {
883 case x86_AVX512_STATE32:
884 bcopy_nochk(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
885 bcopy_nochk(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
886 bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
887 DBG_AVX512_STATE(iavx);
888 break;
889 case x86_AVX_STATE32:
890 bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
891 break;
892 case x86_AVX512_STATE64:
893 bcopy_nochk(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
894 bcopy_nochk(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
895 bcopy_nochk(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
896 bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
897 DBG_AVX512_STATE(iavx);
898 break;
899 case x86_AVX_STATE64:
900 bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
901 break;
902 }
903 break;
904 }
905#endif
7ddcb079 906 }
5ba3f43e 907
fe8ab488 908 ifps->fp_valid = old_valid;
7ddcb079 909
fe8ab488
A
910 if (old_valid == FALSE) {
911 boolean_t istate = ml_set_interrupts_enabled(FALSE);
912 ifps->fp_valid = TRUE;
913 /* If altering the current thread's state, disable FPU */
914 if (thr_act == current_thread())
915 set_ts();
916
917 ml_set_interrupts_enabled(istate);
918 }
919
920 simple_unlock(&pcb->lock);
921
922 if (new_ifps != 0)
5ba3f43e 923 fp_state_free(new_ifps, thread_xstate(thr_act));
0c530ab8 924 }
55e303ae
A
925 return KERN_SUCCESS;
926}
927
928/*
929 * Get the floating-point state for a thread.
930 * If the thread is not the current thread, it is
931 * not running (held). Locking needed against
932 * concurrent fpu_set_state or fpu_get_state.
933 */
934kern_return_t
935fpu_get_fxstate(
060df5ea
A
936 thread_t thr_act,
937 thread_state_t tstate,
938 thread_flavor_t f)
55e303ae 939{
060df5ea 940 struct x86_fx_thread_state *ifps;
5ba3f43e
A
941 x86_float_state64_t *state;
942 kern_return_t ret = KERN_FAILURE;
943 pcb_t pcb;
55e303ae 944
5ba3f43e 945 if (fpu_capability == UNDEFINED)
2d21ac55 946 return KERN_FAILURE;
0c530ab8 947
bd504ef0 948 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
5ba3f43e 949 fpu_capability < AVX)
bd504ef0
A
950 return KERN_FAILURE;
951
5ba3f43e
A
952#if !defined(RC_HIDE_XNU_J137)
953 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
954 thread_xstate(thr_act) != AVX512)
955 return KERN_FAILURE;
956#endif
957
0c530ab8 958 state = (x86_float_state64_t *)tstate;
55e303ae 959
91447636 960 assert(thr_act != THREAD_NULL);
6d2010ae 961 pcb = THREAD_TO_PCB(thr_act);
55e303ae
A
962
963 simple_lock(&pcb->lock);
0c530ab8
A
964
965 ifps = pcb->ifps;
55e303ae 966 if (ifps == 0) {
2d21ac55 967 /*
0c530ab8
A
968 * No valid floating-point state.
969 */
060df5ea
A
970
971 bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
5ba3f43e 972 fp_state_size[FP]);
0c530ab8
A
973
974 simple_unlock(&pcb->lock);
6601e61a 975
0c530ab8
A
976 return KERN_SUCCESS;
977 }
978 /*
979 * Make sure we`ve got the latest fp state info
980 * If the live fpu state belongs to our target
981 */
2d21ac55
A
982 if (thr_act == current_thread()) {
983 boolean_t intr;
8f6c56a5 984
0c530ab8 985 intr = ml_set_interrupts_enabled(FALSE);
89b3af67 986
0c530ab8
A
987 clear_ts();
988 fp_save(thr_act);
989 clear_fpu();
6601e61a 990
0c530ab8 991 (void)ml_set_interrupts_enabled(intr);
6601e61a 992 }
0c530ab8 993 if (ifps->fp_valid) {
5ba3f43e
A
994 bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
995 switch (thread_xstate(thr_act)) {
996 case UNDEFINED:
997 panic("fpu_get_fxstate() UNDEFINED xstate");
998 break;
999 case FP:
1000 break; /* already done */
1001 case AVX: {
7ddcb079 1002 struct x86_avx_thread_state *iavx = (void *) ifps;
5ba3f43e
A
1003 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
1004 if (f == x86_AVX_STATE32) {
1005 bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1006 } else if (f == x86_AVX_STATE64) {
1007 bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1008 }
1009 break;
1010 }
1011#if !defined(RC_HIDE_XNU_J137)
1012 case AVX512: {
1013 struct x86_avx512_thread_state *iavx = (void *) ifps;
1014 union {
1015 thread_state_t ts;
1016 x86_avx512_state32_t *s32;
1017 x86_avx512_state64_t *s64;
1018 } xs = { .ts = tstate };
1019 switch (f) {
1020 case x86_AVX512_STATE32:
1021 bcopy_nochk(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1022 bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
1023 bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1024 DBG_AVX512_STATE(iavx);
1025 break;
1026 case x86_AVX_STATE32:
1027 bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1028 break;
1029 case x86_AVX512_STATE64:
1030 bcopy_nochk(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1031 bcopy_nochk(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
1032 bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
1033 bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1034 DBG_AVX512_STATE(iavx);
1035 break;
1036 case x86_AVX_STATE64:
1037 bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1038 break;
7ddcb079 1039 }
5ba3f43e
A
1040 break;
1041 }
1042#endif
7ddcb079
A
1043 }
1044
0c530ab8 1045 ret = KERN_SUCCESS;
6601e61a 1046 }
0c530ab8 1047 simple_unlock(&pcb->lock);
21362eb3 1048
0c530ab8 1049 return ret;
6601e61a 1050}
21362eb3 1051
0c530ab8 1052
2d21ac55 1053
6601e61a 1054/*
0c530ab8
A
1055 * the child thread is 'stopped' with the thread
1056 * mutex held and is currently not known by anyone
1057 * so no way for fpu state to get manipulated by an
1058 * outside agency -> no need for pcb lock
6601e61a 1059 */
0c530ab8
A
1060
1061void
1062fpu_dup_fxstate(
1063 thread_t parent,
1064 thread_t child)
6601e61a 1065{
060df5ea
A
1066 struct x86_fx_thread_state *new_ifps = NULL;
1067 boolean_t intr;
0c530ab8 1068 pcb_t ppcb;
5ba3f43e 1069 xstate_t xstate = thread_xstate(parent);
21362eb3 1070
6d2010ae 1071 ppcb = THREAD_TO_PCB(parent);
21362eb3 1072
0c530ab8
A
1073 if (ppcb->ifps == NULL)
1074 return;
4452a7af 1075
6d2010ae 1076 if (child->machine.ifps)
0c530ab8 1077 panic("fpu_dup_fxstate: child's ifps non-null");
4452a7af 1078
5ba3f43e 1079 new_ifps = fp_state_alloc(xstate);
5d5c5d0d 1080
0c530ab8 1081 simple_lock(&ppcb->lock);
6601e61a 1082
0c530ab8 1083 if (ppcb->ifps != NULL) {
060df5ea 1084 struct x86_fx_thread_state *ifps = ppcb->ifps;
0c530ab8
A
1085 /*
1086 * Make sure we`ve got the latest fp state info
1087 */
39037602
A
1088 if (current_thread() == parent) {
1089 intr = ml_set_interrupts_enabled(FALSE);
1090 assert(current_thread() == parent);
1091 clear_ts();
1092 fp_save(parent);
1093 clear_fpu();
1094
1095 (void)ml_set_interrupts_enabled(intr);
1096 }
6601e61a 1097
060df5ea 1098 if (ifps->fp_valid) {
6d2010ae 1099 child->machine.ifps = new_ifps;
5ba3f43e 1100 child->machine.xstate = xstate;
060df5ea 1101 bcopy((char *)(ppcb->ifps),
5ba3f43e
A
1102 (char *)(child->machine.ifps),
1103 fp_state_size[xstate]);
0c530ab8 1104
2d21ac55
A
1105 /* Mark the new fp saved state as non-live. */
1106 /* Temporarily disabled: radar 4647827
1107 * new_ifps->fp_valid = TRUE;
1108 */
060df5ea 1109
0c530ab8
A
1110 /*
1111 * Clear any reserved bits in the MXCSR to prevent a GPF
1112 * when issuing an FXRSTOR.
1113 */
060df5ea 1114 new_ifps->fx_MXCSR &= mxcsr_capability_mask;
0c530ab8
A
1115 new_ifps = NULL;
1116 }
6601e61a 1117 }
0c530ab8 1118 simple_unlock(&ppcb->lock);
89b3af67 1119
0c530ab8 1120 if (new_ifps != NULL)
5ba3f43e 1121 fp_state_free(new_ifps, xstate);
6601e61a 1122}
4452a7af 1123
1c79356b
A
1124/*
1125 * Initialize FPU.
d26ffc64
A
1126 * FNINIT programs the x87 control word to 0x37f, which matches
1127 * the desired default for macOS.
1c79356b 1128 */
060df5ea 1129
1c79356b 1130void
d26ffc64
A
1131fpinit(void) {
1132 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1c79356b
A
1133 clear_ts();
1134 fninit();
d26ffc64
A
1135#if DEBUG
1136 /* We skip this power-on-default verification sequence on
1137 * non-DEBUG, as dirtying the x87 control word may slow down
1138 * xsave/xrstor and affect energy use.
1139 */
1140 unsigned short control, control2;
1c79356b 1141 fnstcw(&control);
d26ffc64 1142 control2 = control;
1c79356b 1143 control &= ~(FPC_PC|FPC_RC); /* Clear precision & rounding control */
0c530ab8 1144 control |= (FPC_PC_64 | /* Set precision */
1c79356b
A
1145 FPC_RC_RN | /* round-to-nearest */
1146 FPC_ZE | /* Suppress zero-divide */
1147 FPC_OE | /* and overflow */
1148 FPC_UE | /* underflow */
1149 FPC_IE | /* Allow NaNQs and +-INF */
1150 FPC_DE | /* Allow denorms as operands */
1151 FPC_PE); /* No trap for precision loss */
d26ffc64 1152 assert(control == control2);
1c79356b 1153 fldcw(control);
d26ffc64 1154#endif
0c530ab8 1155 /* Initialize SSE/SSE2 */
060df5ea 1156 __builtin_ia32_ldmxcsr(0x1f80);
d26ffc64
A
1157 if (fpu_YMM_capable) {
1158 vzeroall();
1159 } else {
1160 xmmzeroall();
1161 }
1162 ml_set_interrupts_enabled(istate);
b0d623f7 1163}
1c79356b
A
1164
1165/*
1166 * Coprocessor not present.
1167 */
1168
3e170ce0
A
1169uint64_t x86_isr_fp_simd_use;
1170
1c79356b
A
1171void
1172fpnoextflt(void)
1173{
0c530ab8 1174 boolean_t intr;
2d21ac55
A
1175 thread_t thr_act;
1176 pcb_t pcb;
060df5ea 1177 struct x86_fx_thread_state *ifps = 0;
5ba3f43e 1178 xstate_t xstate = current_xstate();
2d21ac55
A
1179
1180 thr_act = current_thread();
6d2010ae 1181 pcb = THREAD_TO_PCB(thr_act);
2d21ac55 1182
060df5ea 1183 if (pcb->ifps == 0 && !get_interrupt_level()) {
5ba3f43e 1184 ifps = fp_state_alloc(xstate);
060df5ea 1185 bcopy((char *)&initial_fp_state, (char *)ifps,
5ba3f43e 1186 fp_state_size[xstate]);
d9a64523 1187 if (!thread_is_64bit_addr(thr_act)) {
5ba3f43e 1188 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
060df5ea
A
1189 }
1190 else
5ba3f43e 1191 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
060df5ea
A
1192 ifps->fp_valid = TRUE;
1193 }
0c530ab8
A
1194 intr = ml_set_interrupts_enabled(FALSE);
1195
1196 clear_ts(); /* Enable FPU use */
1197
6d2010ae 1198 if (__improbable(get_interrupt_level())) {
3e170ce0
A
1199 /* Track number of #DNA traps at interrupt context,
1200 * which is likely suboptimal. Racy, but good enough.
1201 */
1202 x86_isr_fp_simd_use++;
0c530ab8 1203 /*
3e170ce0
A
1204 * Save current FP/SIMD context if valid
1205 * Initialize live FP/SIMD registers
0c530ab8 1206 */
3e170ce0
A
1207 if (pcb->ifps) {
1208 fp_save(thr_act);
1209 }
0c530ab8
A
1210 fpinit();
1211 } else {
2d21ac55
A
1212 if (pcb->ifps == 0) {
1213 pcb->ifps = ifps;
5ba3f43e 1214 pcb->xstate = xstate;
2d21ac55
A
1215 ifps = 0;
1216 }
0c530ab8
A
1217 /*
1218 * Load this thread`s state into coprocessor live context.
1219 */
2d21ac55 1220 fp_load(thr_act);
0c530ab8 1221 }
0c530ab8 1222 (void)ml_set_interrupts_enabled(intr);
2d21ac55
A
1223
1224 if (ifps)
5ba3f43e 1225 fp_state_free(ifps, xstate);
1c79356b
A
1226}
1227
1228/*
1229 * FPU overran end of segment.
1230 * Re-initialize FPU. Floating point state is not valid.
1231 */
1232
1233void
1234fpextovrflt(void)
1235{
0c530ab8
A
1236 thread_t thr_act = current_thread();
1237 pcb_t pcb;
060df5ea 1238 struct x86_fx_thread_state *ifps;
0c530ab8 1239 boolean_t intr;
5ba3f43e 1240 xstate_t xstate = current_xstate();
0c530ab8
A
1241
1242 intr = ml_set_interrupts_enabled(FALSE);
1243
1244 if (get_interrupt_level())
2d21ac55 1245 panic("FPU segment overrun exception at interrupt context\n");
0c530ab8
A
1246 if (current_task() == kernel_task)
1247 panic("FPU segment overrun exception in kernel thread context\n");
1c79356b 1248
1c79356b
A
1249 /*
1250 * This is a non-recoverable error.
1251 * Invalidate the thread`s FPU state.
1252 */
6d2010ae 1253 pcb = THREAD_TO_PCB(thr_act);
1c79356b 1254 simple_lock(&pcb->lock);
0c530ab8
A
1255 ifps = pcb->ifps;
1256 pcb->ifps = 0;
1c79356b
A
1257 simple_unlock(&pcb->lock);
1258
1259 /*
1260 * Re-initialize the FPU.
1261 */
1262 clear_ts();
1263 fninit();
1264
1265 /*
1266 * And disable access.
1267 */
1268 clear_fpu();
1269
0c530ab8
A
1270 (void)ml_set_interrupts_enabled(intr);
1271
1c79356b 1272 if (ifps)
5ba3f43e 1273 fp_state_free(ifps, xstate);
1c79356b
A
1274
1275 /*
1276 * Raise exception.
1277 */
1278 i386_exception(EXC_BAD_ACCESS, VM_PROT_READ|VM_PROT_EXECUTE, 0);
1279 /*NOTREACHED*/
1280}
1281
cc8bc92a
A
1282extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
1283
1c79356b
A
1284/*
1285 * FPU error. Called by AST.
1286 */
1287
1288void
1289fpexterrflt(void)
1290{
0c530ab8 1291 thread_t thr_act = current_thread();
6d2010ae 1292 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0c530ab8
A
1293 boolean_t intr;
1294
1295 intr = ml_set_interrupts_enabled(FALSE);
1296
1297 if (get_interrupt_level())
1298 panic("FPU error exception at interrupt context\n");
1299 if (current_task() == kernel_task)
1300 panic("FPU error exception in kernel thread context\n");
1c79356b 1301
1c79356b
A
1302 /*
1303 * Save the FPU state and turn off the FPU.
1304 */
1305 fp_save(thr_act);
1c79356b 1306
0c530ab8
A
1307 (void)ml_set_interrupts_enabled(intr);
1308
cc8bc92a
A
1309 const uint32_t mask = ifps->fx_control &
1310 (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE);
1311 const uint32_t xcpt = ~mask & (ifps->fx_status &
1312 (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
1313 fpxlog(EXC_I386_EXTERR, ifps->fx_status, ifps->fx_control, xcpt);
1c79356b
A
1314 /*
1315 * Raise FPU exception.
0c530ab8 1316 * Locking not needed on pcb->ifps,
1c79356b
A
1317 * since thread is running.
1318 */
1319 i386_exception(EXC_ARITHMETIC,
1320 EXC_I386_EXTERR,
060df5ea 1321 ifps->fx_status);
0c530ab8 1322
1c79356b
A
1323 /*NOTREACHED*/
1324}
1325
1326/*
1327 * Save FPU state.
1328 *
1329 * Locking not needed:
1330 * . if called from fpu_get_state, pcb already locked.
1331 * . if called from fpnoextflt or fp_intr, we are single-cpu
1332 * . otherwise, thread is running.
0c530ab8 1333 * N.B.: Must be called with interrupts disabled
1c79356b 1334 */
0c530ab8 1335
1c79356b
A
1336void
1337fp_save(
91447636 1338 thread_t thr_act)
1c79356b 1339{
6d2010ae 1340 pcb_t pcb = THREAD_TO_PCB(thr_act);
060df5ea 1341 struct x86_fx_thread_state *ifps = pcb->ifps;
0c530ab8 1342
060df5ea 1343 assert(ifps != 0);
1c79356b 1344 if (ifps != 0 && !ifps->fp_valid) {
0c530ab8
A
1345 assert((get_cr0() & CR0_TS) == 0);
1346 /* registers are in FPU */
1347 ifps->fp_valid = TRUE;
d9a64523 1348 fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1c79356b
A
1349 }
1350}
1351
1352/*
1353 * Restore FPU state from PCB.
1354 *
1355 * Locking not needed; always called on the current thread.
1356 */
1357
1358void
1359fp_load(
91447636 1360 thread_t thr_act)
1c79356b 1361{
6d2010ae 1362 pcb_t pcb = THREAD_TO_PCB(thr_act);
060df5ea 1363 struct x86_fx_thread_state *ifps = pcb->ifps;
0c530ab8 1364
060df5ea 1365 assert(ifps);
39236c6e
A
1366#if DEBUG
1367 if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1368 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u\n",
1369 ifps->fp_valid, ifps->fp_save_layout);
1370 }
1371#endif
060df5ea
A
1372
1373 if (ifps->fp_valid == FALSE) {
0c530ab8 1374 fpinit();
1c79356b 1375 } else {
060df5ea 1376 fpu_load_registers(ifps);
1c79356b
A
1377 }
1378 ifps->fp_valid = FALSE; /* in FPU */
1379}
1380
1c79356b 1381/*
0c530ab8
A
1382 * SSE arithmetic exception handling code.
1383 * Basically the same as the x87 exception handler with a different subtype
1c79356b
A
1384 */
1385
1386void
0c530ab8 1387fpSSEexterrflt(void)
1c79356b 1388{
0c530ab8 1389 thread_t thr_act = current_thread();
6d2010ae 1390 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0c530ab8 1391 boolean_t intr;
4452a7af 1392
0c530ab8
A
1393 intr = ml_set_interrupts_enabled(FALSE);
1394
1395 if (get_interrupt_level())
1396 panic("SSE exception at interrupt context\n");
1397 if (current_task() == kernel_task)
1398 panic("SSE exception in kernel thread context\n");
1c79356b
A
1399
1400 /*
0c530ab8 1401 * Save the FPU state and turn off the FPU.
1c79356b 1402 */
1c79356b 1403 fp_save(thr_act);
1c79356b 1404
0c530ab8 1405 (void)ml_set_interrupts_enabled(intr);
1c79356b 1406 /*
0c530ab8
A
1407 * Raise FPU exception.
1408 * Locking not needed on pcb->ifps,
1409 * since thread is running.
1c79356b 1410 */
cc8bc92a
A
1411 const uint32_t mask = (ifps->fx_MXCSR >> 7) &
1412 (FPC_IM | FPC_DM | FPC_ZM | FPC_OM | FPC_UE | FPC_PE);
1413 const uint32_t xcpt = ~mask & (ifps->fx_MXCSR &
1414 (FPS_IE | FPS_DE | FPS_ZE | FPS_OE | FPS_UE | FPS_PE));
1415 fpxlog(EXC_I386_SSEEXTERR, ifps->fx_MXCSR, ifps->fx_MXCSR, xcpt);
fe8ab488 1416
0c530ab8
A
1417 i386_exception(EXC_ARITHMETIC,
1418 EXC_I386_SSEEXTERR,
060df5ea 1419 ifps->fx_MXCSR);
0c530ab8
A
1420 /*NOTREACHED*/
1421}
1422
5ba3f43e
A
1423
1424#if !defined(RC_HIDE_XNU_J137)
1425/*
1426 * If a thread is using an AVX-sized savearea:
1427 * - allocate a new AVX512-sized area,
1428 * - copy the 256-bit state into the 512-bit area,
1429 * - deallocate the smaller area
1430 */
1431static void
1432fpu_savearea_promote_avx512(thread_t thread)
1433{
cc8bc92a
A
1434 struct x86_avx_thread_state *ifps = NULL;
1435 struct x86_avx512_thread_state *ifps512 = NULL;
5ba3f43e 1436 pcb_t pcb = THREAD_TO_PCB(thread);
cc8bc92a 1437 boolean_t do_avx512_alloc = FALSE;
5ba3f43e
A
1438
1439 DBG("fpu_upgrade_savearea(%p)\n", thread);
cc8bc92a 1440
5ba3f43e 1441 simple_lock(&pcb->lock);
cc8bc92a 1442
5ba3f43e
A
1443 ifps = pcb->ifps;
1444 if (ifps == NULL) {
cc8bc92a 1445 pcb->xstate = AVX512;
5ba3f43e 1446 simple_unlock(&pcb->lock);
cc8bc92a
A
1447 if (thread != current_thread()) {
1448 /* nothing to be done */
1449
1450 return;
1451 }
1452 fpnoextflt();
5ba3f43e
A
1453 return;
1454 }
cc8bc92a
A
1455
1456 if (pcb->xstate != AVX512) {
1457 do_avx512_alloc = TRUE;
1458 }
1459 simple_unlock(&pcb->lock);
1460
1461 if (do_avx512_alloc == TRUE) {
1462 ifps512 = fp_state_alloc(AVX512);
1463 }
1464
1465 simple_lock(&pcb->lock);
5ba3f43e
A
1466 if (thread == current_thread()) {
1467 boolean_t intr;
1468
1469 intr = ml_set_interrupts_enabled(FALSE);
1470
1471 clear_ts();
1472 fp_save(thread);
1473 clear_fpu();
1474
1475 xsetbv(0, AVX512_XMASK);
d26ffc64 1476 current_cpu_datap()->cpu_xstate = AVX512;
5ba3f43e
A
1477 (void)ml_set_interrupts_enabled(intr);
1478 }
1479 assert(ifps->fp.fp_valid);
1480
1481 /* Allocate an AVX512 savearea and copy AVX state into it */
cc8bc92a
A
1482 if (pcb->xstate != AVX512) {
1483 bcopy(ifps, ifps512, fp_state_size[AVX]);
1484 pcb->ifps = ifps512;
1485 pcb->xstate = AVX512;
1486 ifps512 = NULL;
1487 } else {
1488 ifps = NULL;
1489 }
1490 /* The PCB lock is redundant in some scenarios given the higher level
1491 * thread mutex, but its pre-emption disablement is relied upon here
1492 */
5ba3f43e 1493 simple_unlock(&pcb->lock);
cc8bc92a
A
1494
1495 if (ifps) {
1496 fp_state_free(ifps, AVX);
1497 }
1498 if (ifps512) {
1499 fp_state_free(ifps, AVX512);
1500 }
5ba3f43e
A
1501}
1502
1503/*
1504 * Upgrade the calling thread to AVX512.
1505 */
1506boolean_t
1507fpu_thread_promote_avx512(thread_t thread)
1508{
1509 task_t task = current_task();
1510
1511 if (thread != current_thread())
1512 return FALSE;
1513 if (!ml_fpu_avx512_enabled())
1514 return FALSE;
1515
1516 fpu_savearea_promote_avx512(thread);
1517
1518 /* Racy but the task's xstate is only a hint */
1519 task->xstate = AVX512;
1520
1521 return TRUE;
1522}
1523
1524
1525/*
1526 * Called from user_trap() when an invalid opcode fault is taken.
1527 * If the user is attempting an AVX512 instruction on a machine
1528 * that supports this, we switch the calling thread to use
1529 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
1530 * return directly via thread_exception_return().
1531 * Otherwise simply return.
1532 */
cc8bc92a 1533#define MAX_X86_INSN_LENGTH (16)
5ba3f43e
A
1534void
1535fpUDflt(user_addr_t rip)
1536{
1537 uint8_t instruction_prefix;
1538 boolean_t is_AVX512_instruction = FALSE;
cc8bc92a 1539 user_addr_t original_rip = rip;
5ba3f43e 1540 do {
cc8bc92a
A
1541 /* TODO: as an optimisation, copy up to the lesser of the
1542 * next page boundary or maximal prefix length in one pass
1543 * rather than issue multiple copyins
1544 */
1545 if (copyin(rip, (char *) &instruction_prefix, 1)) {
5ba3f43e 1546 return;
cc8bc92a 1547 }
5ba3f43e
A
1548 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1549 rip, instruction_prefix);
cc8bc92a
A
1550 /* TODO: determine more specifically which prefixes
1551 * are sane possibilities for AVX512 insns
1552 */
5ba3f43e
A
1553 switch (instruction_prefix) {
1554 case 0x2E: /* CS segment override */
1555 case 0x36: /* SS segment override */
1556 case 0x3E: /* DS segment override */
1557 case 0x26: /* ES segment override */
1558 case 0x64: /* FS segment override */
1559 case 0x65: /* GS segment override */
cc8bc92a 1560 case 0x66: /* Operand-size override */
5ba3f43e
A
1561 case 0x67: /* address-size override */
1562 /* Skip optional prefixes */
1563 rip++;
cc8bc92a
A
1564 if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
1565 return;
1566 }
5ba3f43e
A
1567 break;
1568 case 0x62: /* EVEX */
1569 case 0xC5: /* VEX 2-byte */
1570 case 0xC4: /* VEX 3-byte */
1571 is_AVX512_instruction = TRUE;
1572 break;
1573 default:
1574 return;
1575 }
1576 } while (!is_AVX512_instruction);
1577
1578 /* Here if we detect attempted execution of an AVX512 instruction */
1579
1580 /*
cc8bc92a 1581 * Fail if this machine doesn't support AVX512
5ba3f43e 1582 */
cc8bc92a 1583 if (fpu_capability != AVX512)
5ba3f43e
A
1584 return;
1585
1586 assert(xgetbv(XCR0) == AVX_XMASK);
1587
1588 DBG("fpUDflt() switching xstate to AVX512\n");
1589 (void) fpu_thread_promote_avx512(current_thread());
1590
1591 thread_exception_return();
1592 /* NOT REACHED */
1593}
1594#endif /* !defined(RC_HIDE_XNU_J137) */
1595
0c530ab8
A
1596void
1597fp_setvalid(boolean_t value) {
1598 thread_t thr_act = current_thread();
6d2010ae 1599 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
0c530ab8
A
1600
1601 if (ifps) {
1602 ifps->fp_valid = value;
1603
060df5ea
A
1604 if (value == TRUE) {
1605 boolean_t istate = ml_set_interrupts_enabled(FALSE);
0c530ab8 1606 clear_fpu();
060df5ea
A
1607 ml_set_interrupts_enabled(istate);
1608 }
0c530ab8 1609 }
1c79356b 1610}
060df5ea 1611
316670eb 1612boolean_t
060df5ea 1613ml_fpu_avx_enabled(void) {
5ba3f43e
A
1614 return (fpu_capability >= AVX);
1615}
1616
1617#if !defined(RC_HIDE_XNU_J137)
1618boolean_t
1619ml_fpu_avx512_enabled(void) {
1620 return (fpu_capability == AVX512);
1621}
1622#endif
1623
1624static xstate_t
1625task_xstate(task_t task)
1626{
1627 if (task == TASK_NULL)
1628 return fpu_default;
1629 else
1630 return task->xstate;
1631}
1632
1633static xstate_t
1634thread_xstate(thread_t thread)
1635{
1636 xstate_t xs = THREAD_TO_PCB(thread)->xstate;
1637 if (xs == UNDEFINED)
1638 return task_xstate(thread->task);
1639 else
1640 return xs;
1641}
1642
1643xstate_t
1644current_xstate(void)
1645{
1646 return thread_xstate(current_thread());
1647}
1648
1649/*
1650 * Called when exec'ing between bitnesses.
1651 * If valid FPU state exists, adjust the layout.
1652 */
1653void
1654fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1655{
1656 struct x86_fx_thread_state *ifps = thread->machine.ifps;
d26ffc64 1657 mp_disable_preemption();
5ba3f43e
A
1658
1659 if (ifps && ifps->fp_valid) {
1660 if (thread_xstate(thread) == FP) {
1661 ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1662 } else {
1663 ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1664 }
1665 }
d26ffc64
A
1666 mp_enable_preemption();
1667}
1668
/*
 * Count the set bits in the `sz'-byte region starting at address `ins'.
 *
 * The region is consumed in 16-byte chunks (two 64-bit popcounts),
 * then 4-byte words, then single bytes. Returns 0 when sz <= 0.
 *
 * The trailing bytes are read as unsigned values. Reading them through
 * plain `char' sign-extends bytes >= 0x80 where char is signed, which
 * would add 24 spurious one-bits per such byte to the count.
 */
static inline uint32_t fpsimd_pop(uintptr_t ins, int sz) {
	uint32_t rv = 0;

	while (sz >= 16) {
		const uint64_t *ins64 = (const uint64_t *) ins;

		rv += (uint32_t) __builtin_popcountll(ins64[0]);
		rv += (uint32_t) __builtin_popcountll(ins64[1]);
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		const uint32_t *ins32 = (const uint32_t *) ins;

		rv += (uint32_t) __builtin_popcount(*ins32);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		const uint8_t *ins8 = (const uint8_t *) ins;

		rv += (uint32_t) __builtin_popcount(*ins8);
		sz--;
		ins++;
	}
	return rv;
}
1699
1700uint32_t thread_fpsimd_hash(thread_t ft) {
1701 if (fpsimd_fault_popc == 0)
1702 return 0;
1703
1704 uint32_t prv = 0;
1705 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1706 struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;
1707
1708 if (pifps) {
1709 if (pifps->fp_valid) {
1710 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1711 sizeof(pifps->fx_XMM_reg));
1712 } else {
1713 uintptr_t cr0 = get_cr0();
1714 clear_ts();
1715 fp_save(ft);
1716 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1717 sizeof(pifps->fx_XMM_reg));
1718 pifps->fp_valid = FALSE;
1719 if (cr0 & CR0_TS) {
1720 set_cr0(cr0);
1721 }
1722 }
1723 }
1724 ml_set_interrupts_enabled(istate);
1725 return prv;
060df5ea 1726}