git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2013 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	#include <mach/mach_types.h>
	29	#include <kern/assert.h>
	30	#include <kern/clock.h>
	31	#include <kern/coalition.h>
	32	#include <kern/debug.h>
	33	#include <kern/startup.h>
	34	#include <kern/host.h>
	35	#include <kern/kern_types.h>
	36	#include <kern/machine.h>
	37	#include <kern/simple_lock.h>
	38	#include <kern/misc_protos.h>
	39	#include <kern/sched.h>
	40	#include <kern/sched_prim.h>
	41	#include <kern/sfi.h>
	42	#include <kern/timer_call.h>
	43	#include <kern/waitq.h>
	44	#include <kern/ledger.h>
	45	#include <kern/policy_internal.h>
	46
	47	#include <machine/atomic.h>
	48
	49	#include <pexpert/pexpert.h>
	50
	51	#include <libkern/kernel_mach_header.h>
	52
	53	#include <sys/kdebug.h>
	54
	55	#if CONFIG_SCHED_SFI
	56
	57	#define SFI_DEBUG 0
	58
	59	#if SFI_DEBUG
	60	#define dprintf(...) kprintf(__VA_ARGS__)
	61	#else
	62	#define dprintf(...) do { } while(0)
	63	#endif
	64
	65	/*
	66	* SFI (Selective Forced Idle) operates by enabling a global
	67	* timer on the SFI window interval. When it fires, all processors
	68	* running a thread that should be SFI-ed are sent an AST.
	69	* As threads become runnable while in their "off phase", they
	70	* are placed on a deferred ready queue. When a per-class
	71	* "on timer" fires, the ready threads for that class are
	72	* re-enqueued for running. As an optimization to avoid spurious
	73	* wakeups, the timer may be lazily programmed.
	74	*/
	75
	76	/*
	77	* The "sfi_lock" simple lock guards access to static configuration
	78	* parameters (as specified by userspace), dynamic state changes
	79	* (as updated by the timer event routine), and timer data structures.
	80	* Since it can be taken with interrupts disabled in some cases, all
	81	* uses should be taken with interrupts disabled at splsched(). The
	82	* "sfi_lock" also guards the "sfi_wait_class" field of thread_t, and
	83	* must only be accessed with it held.
	84	*
	85	* When an "on timer" fires, we must deterministically be able to drain
	86	* the wait queue, since if any threads are added to the queue afterwards,
	87	* they may never get woken out of SFI wait. So sfi_lock must be
	88	* taken before the wait queue's own spinlock.
	89	*
	90	* The wait queue will take the thread's scheduling lock. We may also take
	91	* the thread_lock directly to update the "sfi_class" field and determine
	92	* if the thread should block in the wait queue, but the lock will be
	93	* released before doing so.
	94	*
	95	* The pset lock may also be taken, but not while any other locks are held.
	96	*
	97	* The task and thread mutex may also be held while reevaluating sfi state.
	98	*
	99	 *   splsched ---> sfi_lock ---> waitq ---> thread_lock
	100	 *        \  \              \__ thread_lock (*)
	101	 *         \  \__ pset_lock
	102	 *          \
	103	 *           \__ thread_lock
	104	*/
	105
		/* Global SFI state. Per the locking notes above, sfi_lock guards all of it. */
	106	decl_simple_lock_data(static, sfi_lock);
	107	static timer_call_data_t sfi_timer_call_entry; /* global "off" timer; callback is sfi_timer_global_off() */
	108	volatile boolean_t sfi_is_enabled; /* TRUE while the global off timer is being (re)armed */
	109
	110	boolean_t sfi_window_is_set; /* set by sfi_set_window(), cleared by sfi_window_cancel() */
	111	uint64_t sfi_window_usecs; /* configured window length, in microseconds */
	112	uint64_t sfi_window_interval; /* same window, converted to absolute-time units */
	113	uint64_t sfi_next_off_deadline; /* absolute-time deadline recorded for the next global off timer */
	114
		/* Boot-time description of one SFI class, consumed by sfi_class_register(). */
	115	typedef struct {
	116	sfi_class_id_t class_id; /* index into sfi_classes[] */
	117	thread_continue_t class_continuation; /* continuation copied into the class state */
	118	const char * class_name; /* "SFI_CLASS_<clsid>" */
	119	const char * class_ledger_name; /* "SFI_CLASS_<ledger_name>"; may be shared by several classes */
	120	} sfi_class_registration_t;
	121
	122	/*
	123	* To add a new SFI class:
	124	*
	125	* 1) Raise MAX_SFI_CLASS_ID in mach/sfi_class.h
	126	* 2) Add a #define for it to mach/sfi_class.h. It need not be inserted in order of restrictiveness.
	127	* 3) Add a call to SFI_CLASS_REGISTER below
	128	* 4) Augment sfi_thread_classify to categorize threads as early as possible for as restrictive as possible.
	129	* 5) Modify thermald to use the SFI class
	130	*/
	131
		/* Forward declarations needed by the SFI_CLASS_REGISTER() expansion below. */
	132	static inline void _sfi_wait_cleanup(void);
	133
	134	static void sfi_class_register(sfi_class_registration_t *);
	135
		/*
		 * SFI_CLASS_REGISTER(clsid, ledger_name) expands to three things for class
		 * SFI_CLASS_<clsid>:
		 *  - a noreturn continuation SFI_<clsid>_THREAD_IS_WAITING() that calls
		 *    _sfi_wait_cleanup() and then thread_exception_return();
		 *  - a static_assert that the class ID is below MAX_SFI_CLASS_ID;
		 *  - a sfi_class_registration_t record handed to sfi_class_register() through
		 *    STARTUP_ARG(TUNABLES, STARTUP_RANK_MIDDLE, ...).
		 */
	136	#define SFI_CLASS_REGISTER(clsid, ledger_name) \
	137	\
	138	static void __attribute__((noinline, noreturn)) \
	139	SFI_ ## clsid ## _THREAD_IS_WAITING(void *arg __unused, wait_result_t wret __unused) \
	140	{ \
	141	_sfi_wait_cleanup(); \
	142	thread_exception_return(); \
	143	} \
	144	\
	145	static_assert(SFI_CLASS_ ## clsid < MAX_SFI_CLASS_ID, "Invalid ID"); \
	146	\
	147	static __startup_data sfi_class_registration_t \
	148	SFI_ ## clsid ## _registration = { \
	149	.class_id = SFI_CLASS_ ## clsid, \
	150	.class_continuation = SFI_ ## clsid ## _THREAD_IS_WAITING, \
	151	.class_name = "SFI_CLASS_" # clsid, \
	152	.class_ledger_name = "SFI_CLASS_" # ledger_name, \
	153	}; \
	154	STARTUP_ARG(TUNABLES, STARTUP_RANK_MIDDLE, \
	155	sfi_class_register, &SFI_ ## clsid ## _registration)
	156
	157	/*
		 * SFI_CLASS_UNSPECIFIED is not included here.
		 *
		 * The second argument selects the ledger name. Several classes pass the same
		 * ledger name (for example the FOCAL/NONFOCAL pairs);
		 * sfi_get_ledger_alias_for_class() maps such a class to the lowest-numbered
		 * class that carries the same ledger name.
		 */
	158	SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE);
	159	SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG);
	160	SFI_CLASS_REGISTER(APP_NAP, APP_NAP);
	161	SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED);
	162	SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED);
	163	SFI_CLASS_REGISTER(UTILITY, UTILITY);
	164	SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT);
	165	SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT);
	166	SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY);
	167	SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY);
	168	SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED);
	169	SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED);
	170	SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE);
	171	SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE);
	172	SFI_CLASS_REGISTER(KERNEL, OPTED_OUT);
	173	SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT);
	174
		/* Run-time state of one SFI class; one entry per class ID in sfi_classes[]. */
	175	struct sfi_class_state {
	176	uint64_t off_time_usecs; /* configured off-time in microseconds; 0 after sfi_class_offtime_cancel() */
	177	uint64_t off_time_interval; /* same off-time in absolute-time units */
	178
	179	timer_call_data_t on_timer; /* per-class "on" timer; callback is sfi_timer_per_class_on() */
	180	uint64_t on_timer_deadline; /* deadline last programmed for on_timer; ~0ULL once cancelled */
	181	boolean_t on_timer_programmed; /* TRUE from the off-timer arming on_timer until it fires or is cancelled */
	182
	183	boolean_t class_sfi_is_enabled; /* TRUE while an off-time is configured for this class */
	184	volatile boolean_t class_in_on_phase; /* FALSE during the class's off phase; also read without sfi_lock */
	185
	186	struct waitq waitq; /* threads in ready state */
	187	thread_continue_t continuation; /* copied from the registration; NULL means the class is unregistered */
	188
	189	const char * class_name; /* copied from the registration */
	190	const char * class_ledger_name; /* copied from the registration */
	191	};
	192
	193	/* Static configuration is performed by sfi_class_register(); sfi_init() fills in the rest */
	194	struct sfi_class_state sfi_classes[MAX_SFI_CLASS_ID];
	195
	196	int sfi_enabled_class_count; // protected by sfi_lock and used atomically
	197
		/* Timer callouts: the global "off" timer and the per-class "on" timer. */
	198	static void sfi_timer_global_off(
	199	timer_call_param_t param0,
	200	timer_call_param_t param1);
	201
	202	static void sfi_timer_per_class_on(
	203	timer_call_param_t param0,
	204	timer_call_param_t param1);
	205
	206	/* Called early in boot, when kernel is single-threaded */
	207	__startup_func
	208	static void
	209	sfi_class_register(sfi_class_registration_t *reg)
	210	{
	211	sfi_class_id_t class_id = reg->class_id;
	212
	213	if (class_id >= MAX_SFI_CLASS_ID) {
	214	panic("Invalid SFI class 0x%x", class_id);
	215	}
	216	if (sfi_classes[class_id].continuation != NULL) {
	217	panic("Duplicate SFI registration for class 0x%x", class_id);
	218	}
	219	sfi_classes[class_id].class_sfi_is_enabled = FALSE;
	220	sfi_classes[class_id].class_in_on_phase = TRUE;
	221	sfi_classes[class_id].continuation = reg->class_continuation;
	222	sfi_classes[class_id].class_name = reg->class_name;
	223	sfi_classes[class_id].class_ledger_name = reg->class_ledger_name;
	224	}
	225
	226	void
	227	sfi_init(void)
	228	{
	229	sfi_class_id_t i;
	230	kern_return_t kret;
	231
	232	simple_lock_init(&sfi_lock, 0);
	233	timer_call_setup(&sfi_timer_call_entry, sfi_timer_global_off, NULL);
	234	sfi_window_is_set = FALSE;
	235	os_atomic_init(&sfi_enabled_class_count, 0);
	236	sfi_is_enabled = FALSE;
	237
	238	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
	239	/* If the class was set up in sfi_early_init(), initialize remaining fields */
	240	if (sfi_classes[i].continuation) {
	241	timer_call_setup(&sfi_classes[i].on_timer, sfi_timer_per_class_on, (void *)(uintptr_t)i);
	242	sfi_classes[i].on_timer_programmed = FALSE;
	243
	244	kret = waitq_init(&sfi_classes[i].waitq, SYNC_POLICY_FIFO \| SYNC_POLICY_DISABLE_IRQ);
	245	assert(kret == KERN_SUCCESS);
	246	} else {
	247	/* The only allowed gap is for SFI_CLASS_UNSPECIFIED */
	248	if (i != SFI_CLASS_UNSPECIFIED) {
	249	panic("Gap in registered SFI classes");
	250	}
	251	}
	252	}
	253	}
	254
	255	/* Can be called before sfi_init() by task initialization, but after sfi_early_init() */
	256	sfi_class_id_t
	257	sfi_get_ledger_alias_for_class(sfi_class_id_t class_id)
	258	{
	259	sfi_class_id_t i;
	260	const char *ledger_name = NULL;
	261
	262	ledger_name = sfi_classes[class_id].class_ledger_name;
	263
	264	/* Find the first class in the registration table with this ledger name */
	265	if (ledger_name) {
	266	for (i = SFI_CLASS_UNSPECIFIED + 1; i < class_id; i++) {
	267	if (0 == strcmp(sfi_classes[i].class_ledger_name, ledger_name)) {
	268	dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, i);
	269	return i;
	270	}
	271	}
	272
	273	/* This class is the primary one for the ledger, so there is no alias */
	274	dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, SFI_CLASS_UNSPECIFIED);
	275	return SFI_CLASS_UNSPECIFIED;
	276	}
	277
	278	/* We are permissive on SFI class lookup failures. In sfi_init(), we assert more */
	279	return SFI_CLASS_UNSPECIFIED;
	280	}
	281
	282	int
	283	sfi_ledger_entry_add(ledger_template_t template, sfi_class_id_t class_id)
	284	{
	285	const char *ledger_name = NULL;
	286
	287	ledger_name = sfi_classes[class_id].class_ledger_name;
	288
	289	dprintf("sfi_ledger_entry_add(%p, 0x%x) -> %s\n", template, class_id, ledger_name);
	290	return ledger_entry_add(template, ledger_name, "sfi", "MATUs");
	291	}
	292
	293	static void
	294	sfi_timer_global_off(
	295	timer_call_param_t param0 __unused,
	296	timer_call_param_t param1 __unused)
	297	{
	298	uint64_t now = mach_absolute_time();
	299	sfi_class_id_t i;
	300	processor_set_t pset, nset;
	301	processor_t processor;
	302	uint32_t needs_cause_ast_mask = 0x0;
	303	spl_t s;
	304
	305	s = splsched();
	306
	307	simple_lock(&sfi_lock, LCK_GRP_NULL);
	308	if (!sfi_is_enabled) {
	309	/* If SFI has been disabled, let all "on" timers drain naturally */
	310	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) \| DBG_FUNC_NONE, 1, 0, 0, 0, 0);
	311
	312	simple_unlock(&sfi_lock);
	313	splx(s);
	314	return;
	315	}
	316
	317	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) \| DBG_FUNC_START, 0, 0, 0, 0, 0);
	318
	319	/* First set all configured classes into the off state, and program their "on" timer */
	320	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
	321	if (sfi_classes[i].class_sfi_is_enabled) {
	322	uint64_t on_timer_deadline;
	323
	324	sfi_classes[i].class_in_on_phase = FALSE;
	325	sfi_classes[i].on_timer_programmed = TRUE;
	326
	327	/* Push out on-timer */
	328	on_timer_deadline = now + sfi_classes[i].off_time_interval;
	329	sfi_classes[i].on_timer_deadline = on_timer_deadline;
	330
	331	timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
	332	} else {
	333	/* If this class no longer needs SFI, make sure the timer is cancelled */
	334	sfi_classes[i].class_in_on_phase = TRUE;
	335	if (sfi_classes[i].on_timer_programmed) {
	336	sfi_classes[i].on_timer_programmed = FALSE;
	337	sfi_classes[i].on_timer_deadline = ~0ULL;
	338	timer_call_cancel(&sfi_classes[i].on_timer);
	339	}
	340	}
	341	}
	342	simple_unlock(&sfi_lock);
	343
	344	/* Iterate over processors, call cause_ast_check() on ones running a thread that should be in an off phase */
	345	processor = processor_list;
	346	pset = processor->processor_set;
	347
	348	pset_lock(pset);
	349
	350	do {
	351	nset = processor->processor_set;
	352	if (nset != pset) {
	353	pset_unlock(pset);
	354	pset = nset;
	355	pset_lock(pset);
	356	}
	357
	358	/* "processor" and its pset are locked */
	359	if (processor->state == PROCESSOR_RUNNING) {
	360	if (AST_NONE != sfi_processor_needs_ast(processor)) {
	361	needs_cause_ast_mask \|= (1U << processor->cpu_id);
	362	}
	363	}
	364	} while ((processor = processor->processor_list) != NULL);
	365
	366	pset_unlock(pset);
	367
	368	for (int cpuid = lsb_first(needs_cause_ast_mask); cpuid >= 0; cpuid = lsb_next(needs_cause_ast_mask, cpuid)) {
	369	processor = processor_array[cpuid];
	370	if (processor == current_processor()) {
	371	ast_on(AST_SFI);
	372	} else {
	373	cause_ast_check(processor);
	374	}
	375	}
	376
	377	/* Re-arm timer if still enabled */
	378	simple_lock(&sfi_lock, LCK_GRP_NULL);
	379	if (sfi_is_enabled) {
	380	clock_deadline_for_periodic_event(sfi_window_interval,
	381	now,
	382	&sfi_next_off_deadline);
	383	timer_call_enter1(&sfi_timer_call_entry,
	384	NULL,
	385	sfi_next_off_deadline,
	386	TIMER_CALL_SYS_CRITICAL);
	387	}
	388
	389	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	390
	391	simple_unlock(&sfi_lock);
	392
	393	splx(s);
	394	}
	395
	396	static void
	397	sfi_timer_per_class_on(
	398	timer_call_param_t param0,
	399	timer_call_param_t param1 __unused)
	400	{
	401	sfi_class_id_t sfi_class_id = (sfi_class_id_t)(uintptr_t)param0;
	402	struct sfi_class_state *sfi_class = &sfi_classes[sfi_class_id];
	403	kern_return_t kret;
	404	spl_t s;
	405
	406	s = splsched();
	407
	408	simple_lock(&sfi_lock, LCK_GRP_NULL);
	409
	410	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) \| DBG_FUNC_START, sfi_class_id, 0, 0, 0, 0);
	411
	412	/*
	413	* Any threads that may have accumulated in the ready queue for this class should get re-enqueued.
	414	* Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
	415	* no new threads to be put on this wait queue until the global "off timer" has fired.
	416	*/
	417
	418	sfi_class->class_in_on_phase = TRUE;
	419	sfi_class->on_timer_programmed = FALSE;
	420
	421	kret = waitq_wakeup64_all(&sfi_class->waitq,
	422	CAST_EVENT64_T(sfi_class_id),
	423	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	424	assert(kret == KERN_SUCCESS \|\| kret == KERN_NOT_WAITING);
	425
	426	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	427
	428	simple_unlock(&sfi_lock);
	429
	430	splx(s);
	431	}
	432
	433
	434	kern_return_t
	435	sfi_set_window(uint64_t window_usecs)
	436	{
	437	uint64_t interval, deadline;
	438	uint64_t now = mach_absolute_time();
	439	sfi_class_id_t i;
	440	spl_t s;
	441	uint64_t largest_class_off_interval = 0;
	442
	443	if (window_usecs < MIN_SFI_WINDOW_USEC) {
	444	window_usecs = MIN_SFI_WINDOW_USEC;
	445	}
	446
	447	if (window_usecs > UINT32_MAX) {
	448	return KERN_INVALID_ARGUMENT;
	449	}
	450
	451	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_WINDOW), window_usecs, 0, 0, 0, 0);
	452
	453	clock_interval_to_absolutetime_interval((uint32_t)window_usecs, NSEC_PER_USEC, &interval);
	454	deadline = now + interval;
	455
	456	s = splsched();
	457
	458	simple_lock(&sfi_lock, LCK_GRP_NULL);
	459
	460	/* Check that we are not bringing in the SFI window smaller than any class */
	461	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
	462	if (sfi_classes[i].class_sfi_is_enabled) {
	463	largest_class_off_interval = MAX(largest_class_off_interval, sfi_classes[i].off_time_interval);
	464	}
	465	}
	466
	467	/*
	468	* Off window must be strictly greater than all enabled classes,
	469	* otherwise threads would build up on ready queue and never be able to run.
	470	*/
	471	if (interval <= largest_class_off_interval) {
	472	simple_unlock(&sfi_lock);
	473	splx(s);
	474	return KERN_INVALID_ARGUMENT;
	475	}
	476
	477	/*
	478	* If the new "off" deadline is further out than the current programmed timer,
	479	* just let the current one expire (and the new cadence will be established thereafter).
	480	* If the new "off" deadline is nearer than the current one, bring it in, so we
	481	* can start the new behavior sooner. Note that this may cause the "off" timer to
	482	* fire before some of the class "on" timers have fired.
	483	*/
	484	sfi_window_usecs = window_usecs;
	485	sfi_window_interval = interval;
	486	sfi_window_is_set = TRUE;
	487
	488	if (os_atomic_load(&sfi_enabled_class_count, relaxed) == 0) {
	489	/* Can't program timer yet */
	490	} else if (!sfi_is_enabled) {
	491	sfi_is_enabled = TRUE;
	492	sfi_next_off_deadline = deadline;
	493	timer_call_enter1(&sfi_timer_call_entry,
	494	NULL,
	495	sfi_next_off_deadline,
	496	TIMER_CALL_SYS_CRITICAL);
	497	} else if (deadline >= sfi_next_off_deadline) {
	498	sfi_next_off_deadline = deadline;
	499	} else {
	500	sfi_next_off_deadline = deadline;
	501	timer_call_enter1(&sfi_timer_call_entry,
	502	NULL,
	503	sfi_next_off_deadline,
	504	TIMER_CALL_SYS_CRITICAL);
	505	}
	506
	507	simple_unlock(&sfi_lock);
	508	splx(s);
	509
	510	return KERN_SUCCESS;
	511	}
	512
	513	kern_return_t
	514	sfi_window_cancel(void)
	515	{
	516	spl_t s;
	517
	518	s = splsched();
	519
	520	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_WINDOW), 0, 0, 0, 0, 0);
	521
	522	/* Disable globals so that global "off-timer" is not re-armed */
	523	simple_lock(&sfi_lock, LCK_GRP_NULL);
	524	sfi_window_is_set = FALSE;
	525	sfi_window_usecs = 0;
	526	sfi_window_interval = 0;
	527	sfi_next_off_deadline = 0;
	528	sfi_is_enabled = FALSE;
	529	simple_unlock(&sfi_lock);
	530
	531	splx(s);
	532
	533	return KERN_SUCCESS;
	534	}
	535
	536	/* Defers SFI off and per-class on timers (if live) by the specified interval
	537	* in Mach Absolute Time Units. Currently invoked to align with the global
	538	* forced idle mechanism. Making some simplifying assumptions, the iterative GFI
	539	* induced SFI on+off deferrals form a geometric series that converges to yield
	540	* an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
	541	* alignment and congruency of the SFI/GFI periods can distort this to some extent.
	542	*/
	543
	544	kern_return_t
	545	sfi_defer(uint64_t sfi_defer_matus)
	546	{
	547	spl_t s;
	548	kern_return_t kr = KERN_FAILURE;
	549	s = splsched();
	550
	551	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0);
	552
	553	simple_lock(&sfi_lock, LCK_GRP_NULL);
	554	if (!sfi_is_enabled) {
	555	goto sfi_defer_done;
	556	}
	557
	558	assert(sfi_next_off_deadline != 0);
	559
	560	sfi_next_off_deadline += sfi_defer_matus;
	561	timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);
	562
	563	int i;
	564	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
	565	if (sfi_classes[i].class_sfi_is_enabled) {
	566	if (sfi_classes[i].on_timer_programmed) {
	567	uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
	568	sfi_classes[i].on_timer_deadline = new_on_deadline;
	569	timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
	570	}
	571	}
	572	}
	573
	574	kr = KERN_SUCCESS;
	575	sfi_defer_done:
	576	simple_unlock(&sfi_lock);
	577
	578	splx(s);
	579
	580	return kr;
	581	}
	582
	583
	584	kern_return_t
	585	sfi_get_window(uint64_t *window_usecs)
	586	{
	587	spl_t s;
	588	uint64_t off_window_us;
	589
	590	s = splsched();
	591	simple_lock(&sfi_lock, LCK_GRP_NULL);
	592
	593	off_window_us = sfi_window_usecs;
	594
	595	simple_unlock(&sfi_lock);
	596	splx(s);
	597
	598	*window_usecs = off_window_us;
	599
	600	return KERN_SUCCESS;
	601	}
	602
	603
	604	kern_return_t
	605	sfi_set_class_offtime(sfi_class_id_t class_id, uint64_t offtime_usecs)
	606	{
	607	uint64_t interval;
	608	spl_t s;
	609	uint64_t off_window_interval;
	610
	611	if (offtime_usecs < MIN_SFI_WINDOW_USEC) {
	612	offtime_usecs = MIN_SFI_WINDOW_USEC;
	613	}
	614
	615	if (class_id == SFI_CLASS_UNSPECIFIED \|\| class_id >= MAX_SFI_CLASS_ID) {
	616	return KERN_INVALID_ARGUMENT;
	617	}
	618
	619	if (offtime_usecs > UINT32_MAX) {
	620	return KERN_INVALID_ARGUMENT;
	621	}
	622
	623	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_CLASS_OFFTIME), offtime_usecs, class_id, 0, 0, 0);
	624
	625	clock_interval_to_absolutetime_interval((uint32_t)offtime_usecs, NSEC_PER_USEC, &interval);
	626
	627	s = splsched();
	628
	629	simple_lock(&sfi_lock, LCK_GRP_NULL);
	630	off_window_interval = sfi_window_interval;
	631
	632	/* Check that we are not bringing in class off-time larger than the SFI window */
	633	if (off_window_interval && (interval >= off_window_interval)) {
	634	simple_unlock(&sfi_lock);
	635	splx(s);
	636	return KERN_INVALID_ARGUMENT;
	637	}
	638
	639	/* We never re-program the per-class on-timer, but rather just let it expire naturally */
	640	if (!sfi_classes[class_id].class_sfi_is_enabled) {
	641	os_atomic_inc(&sfi_enabled_class_count, relaxed);
	642	}
	643	sfi_classes[class_id].off_time_usecs = offtime_usecs;
	644	sfi_classes[class_id].off_time_interval = interval;
	645	sfi_classes[class_id].class_sfi_is_enabled = TRUE;
	646
	647	if (sfi_window_is_set && !sfi_is_enabled) {
	648	/* start global off timer */
	649	sfi_is_enabled = TRUE;
	650	sfi_next_off_deadline = mach_absolute_time() + sfi_window_interval;
	651	timer_call_enter1(&sfi_timer_call_entry,
	652	NULL,
	653	sfi_next_off_deadline,
	654	TIMER_CALL_SYS_CRITICAL);
	655	}
	656
	657	simple_unlock(&sfi_lock);
	658
	659	splx(s);
	660
	661	return KERN_SUCCESS;
	662	}
	663
	664	kern_return_t
	665	sfi_class_offtime_cancel(sfi_class_id_t class_id)
	666	{
	667	spl_t s;
	668
	669	if (class_id == SFI_CLASS_UNSPECIFIED \|\| class_id >= MAX_SFI_CLASS_ID) {
	670	return KERN_INVALID_ARGUMENT;
	671	}
	672
	673	s = splsched();
	674
	675	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_CLASS_OFFTIME), class_id, 0, 0, 0, 0);
	676
	677	simple_lock(&sfi_lock, LCK_GRP_NULL);
	678
	679	/* We never re-program the per-class on-timer, but rather just let it expire naturally */
	680	if (sfi_classes[class_id].class_sfi_is_enabled) {
	681	os_atomic_dec(&sfi_enabled_class_count, relaxed);
	682	}
	683	sfi_classes[class_id].off_time_usecs = 0;
	684	sfi_classes[class_id].off_time_interval = 0;
	685	sfi_classes[class_id].class_sfi_is_enabled = FALSE;
	686
	687	if (os_atomic_load(&sfi_enabled_class_count, relaxed) == 0) {
	688	sfi_is_enabled = FALSE;
	689	}
	690
	691	simple_unlock(&sfi_lock);
	692
	693	splx(s);
	694
	695	return KERN_SUCCESS;
	696	}
	697
	698	kern_return_t
	699	sfi_get_class_offtime(sfi_class_id_t class_id, uint64_t *offtime_usecs)
	700	{
	701	uint64_t off_time_us;
	702	spl_t s;
	703
	704	if (class_id == SFI_CLASS_UNSPECIFIED \|\| class_id >= MAX_SFI_CLASS_ID) {
	705	return 0;
	706	}
	707
	708	s = splsched();
	709
	710	simple_lock(&sfi_lock, LCK_GRP_NULL);
	711	off_time_us = sfi_classes[class_id].off_time_usecs;
	712	simple_unlock(&sfi_lock);
	713
	714	splx(s);
	715
	716	*offtime_usecs = off_time_us;
	717
	718	return KERN_SUCCESS;
	719	}
	720
	721	/*
	722	* sfi_thread_classify and sfi_processor_active_thread_classify perform the critical
	723	* role of quickly categorizing a thread into its SFI class so that an AST_SFI can be
	724	* set. As the thread is unwinding to userspace, sfi_ast() performs full locking
	725	* and determines whether the thread should enter an SFI wait state. Because of
	726	* the inherent races between the time the AST is set and when it is evaluated,
	727	* thread classification can be inaccurate (but should always be safe). This is
	728	* especially the case for sfi_processor_active_thread_classify, which must
	729	* classify the active thread on a remote processor without taking the thread lock.
	730	* When in doubt, classification should err on the side of not classifying a
	731	* thread at all, and wait for the thread itself to either hit a quantum expiration
	732	* or block inside the kernel.
	733	*/
	734
	735	/*
	736	* Thread must be locked. Ultimately, the real decision to enter
	737	* SFI wait happens at the AST boundary.
	738	*/
	739	sfi_class_id_t
	740	sfi_thread_classify(thread_t thread)
	741	{
	742	task_t task = thread->task;
	743	boolean_t is_kernel_thread = (task == kernel_task);
	744	sched_mode_t thmode = thread->sched_mode;
	745	boolean_t focal = FALSE;
	746
	747	/* kernel threads never reach the user AST boundary, and are in a separate world for SFI */
	748	if (is_kernel_thread) {
	749	return SFI_CLASS_KERNEL;
	750	}
	751
	752	/* no need to re-classify threads unless there is at least one enabled SFI class */
	753	if (os_atomic_load(&sfi_enabled_class_count, relaxed) == 0) {
	754	return SFI_CLASS_OPTED_OUT;
	755	}
	756
	757	int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE);
	758	int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS);
	759	int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED);
	760
	761	int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
	762	int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG);
	763
	764	if (thread_qos == THREAD_QOS_MAINTENANCE) {
	765	return SFI_CLASS_MAINTENANCE;
	766	}
	767
	768	if (thread_bg \|\| thread_qos == THREAD_QOS_BACKGROUND) {
	769	return SFI_CLASS_DARWIN_BG;
	770	}
	771
	772	if (latency_qos != 0) {
	773	int latency_qos_wtf = latency_qos - 1;
	774
	775	if ((latency_qos_wtf >= 4) && (latency_qos_wtf <= 5)) {
	776	return SFI_CLASS_APP_NAP;
	777	}
	778	}
	779
	780	/*
	781	* Realtime and fixed priority threads express their duty cycle constraints
	782	* via other mechanisms, and are opted out of (most) forms of SFI
	783	*/
	784	if (thmode == TH_MODE_REALTIME \|\| thmode == TH_MODE_FIXED \|\| task_role == TASK_GRAPHICS_SERVER) {
	785	return SFI_CLASS_OPTED_OUT;
	786	}
	787
	788	/*
	789	* Threads with unspecified, legacy, or user-initiated QOS class can be individually managed.
	790	*/
	791	switch (task_role) {
	792	case TASK_CONTROL_APPLICATION:
	793	case TASK_FOREGROUND_APPLICATION:
	794	focal = TRUE;
	795	break;
	796	case TASK_BACKGROUND_APPLICATION:
	797	case TASK_DEFAULT_APPLICATION:
	798	case TASK_UNSPECIFIED:
	799	/* Focal if the task is in a coalition with a FG/focal app */
	800	if (task_coalition_focal_count(thread->task) > 0) {
	801	focal = TRUE;
	802	}
	803	break;
	804	case TASK_THROTTLE_APPLICATION:
	805	case TASK_DARWINBG_APPLICATION:
	806	case TASK_NONUI_APPLICATION:
	807	/* Definitely not focal */
	808	default:
	809	break;
	810	}
	811
	812	if (managed_task) {
	813	switch (thread_qos) {
	814	case THREAD_QOS_UNSPECIFIED:
	815	case THREAD_QOS_LEGACY:
	816	case THREAD_QOS_USER_INITIATED:
	817	if (focal) {
	818	return SFI_CLASS_MANAGED_FOCAL;
	819	} else {
	820	return SFI_CLASS_MANAGED_NONFOCAL;
	821	}
	822	default:
	823	break;
	824	}
	825	}
	826
	827	if (thread_qos == THREAD_QOS_UTILITY) {
	828	return SFI_CLASS_UTILITY;
	829	}
	830
	831	/*
	832	* Classify threads in non-managed tasks
	833	*/
	834	if (focal) {
	835	switch (thread_qos) {
	836	case THREAD_QOS_USER_INTERACTIVE:
	837	return SFI_CLASS_USER_INTERACTIVE_FOCAL;
	838	case THREAD_QOS_USER_INITIATED:
	839	return SFI_CLASS_USER_INITIATED_FOCAL;
	840	case THREAD_QOS_LEGACY:
	841	return SFI_CLASS_LEGACY_FOCAL;
	842	default:
	843	return SFI_CLASS_DEFAULT_FOCAL;
	844	}
	845	} else {
	846	switch (thread_qos) {
	847	case THREAD_QOS_USER_INTERACTIVE:
	848	return SFI_CLASS_USER_INTERACTIVE_NONFOCAL;
	849	case THREAD_QOS_USER_INITIATED:
	850	return SFI_CLASS_USER_INITIATED_NONFOCAL;
	851	case THREAD_QOS_LEGACY:
	852	return SFI_CLASS_LEGACY_NONFOCAL;
	853	default:
	854	return SFI_CLASS_DEFAULT_NONFOCAL;
	855	}
	856	}
	857	}
	858
	859	/*
	860	* pset must be locked.
	861	*/
	862	sfi_class_id_t
	863	sfi_processor_active_thread_classify(processor_t processor)
	864	{
	865	return processor->current_sfi_class;
	866	}
	867
	868	/*
	869	* thread must be locked. This is inherently racy, with the intent that
	870	* at the AST boundary, it will be fully evaluated whether we need to
	871	* perform an AST wait
	872	*/
	873	ast_t
	874	sfi_thread_needs_ast(thread_t thread, sfi_class_id_t *out_class)
	875	{
	876	sfi_class_id_t class_id;
	877
	878	class_id = sfi_thread_classify(thread);
	879
	880	if (out_class) {
	881	*out_class = class_id;
	882	}
	883
	884	/* No lock taken, so a stale value may be used. */
	885	if (!sfi_classes[class_id].class_in_on_phase) {
	886	return AST_SFI;
	887	} else {
	888	return AST_NONE;
	889	}
	890	}
	891
	892	/*
	893	* pset must be locked. We take the SFI class for
	894	* the currently running thread which is cached on
	895	* the processor_t, and assume it is accurate. In the
	896	* worst case, the processor will get an IPI and be asked
	897	* to evaluate if the current running thread at that
	898	* later point in time should be in an SFI wait.
	899	*/
	900	ast_t
	901	sfi_processor_needs_ast(processor_t processor)
	902	{
	903	sfi_class_id_t class_id;
	904
	905	class_id = sfi_processor_active_thread_classify(processor);
	906
	907	/* No lock taken, so a stale value may be used. */
	908	if (!sfi_classes[class_id].class_in_on_phase) {
	909	return AST_SFI;
	910	} else {
	911	return AST_NONE;
	912	}
	913	}
	914
	915	static inline void
	916	_sfi_wait_cleanup(void)
	917	{
	918	thread_t self = current_thread();
	919
	920	spl_t s = splsched();
	921	simple_lock(&sfi_lock, LCK_GRP_NULL);
	922
	923	sfi_class_id_t current_sfi_wait_class = self->sfi_wait_class;
	924
	925	assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) &&
	926	(current_sfi_wait_class < MAX_SFI_CLASS_ID));
	927
	928	self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;
	929
	930	simple_unlock(&sfi_lock);
	931	splx(s);
	932
	933	/*
	934	* It's possible for the thread to be woken up due to the SFI period
	935	* ending before it finishes blocking. In that case,
	936	* wait_sfi_begin_time won't be set.
	937	*
	938	* Derive the time sacrificed to SFI by looking at when this thread was
	939	* awoken by the on-timer, to avoid counting the time this thread spent
	940	* waiting to get scheduled.
	941	*
	942	* Note that last_made_runnable_time could be reset if this thread
	943	* gets preempted before we read the value. To fix that, we'd need to
	944	* track wait time in a thread timer, sample the timer before blocking,
	945	* pass the value through thread->parameter, and subtract that.
	946	*/
	947
	948	if (self->wait_sfi_begin_time != 0) {
	949	uint64_t made_runnable = os_atomic_load(&self->last_made_runnable_time, relaxed);
	950	int64_t sfi_wait_time = made_runnable - self->wait_sfi_begin_time;
	951	assert(sfi_wait_time >= 0);
	952
	953	ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class],
	954	sfi_wait_time);
	955
	956	self->wait_sfi_begin_time = 0;
	957	}
	958	}
	959
	960	/*
	961	* Called at AST context to fully evaluate if the current thread
	962	* (which is obviously running) should instead block in an SFI wait.
	963	* We must take the sfi_lock to check whether we are in the "off" period
	964	* for the class, and if so, block.
	965	*/
	966	void
	967	sfi_ast(thread_t thread)
	968	{
	969	sfi_class_id_t class_id;
	970	spl_t s;
	971	struct sfi_class_state *sfi_class;
	972	wait_result_t waitret;
	973	boolean_t did_wait = FALSE;
	974	thread_continue_t continuation;
	975
	976	s = splsched();
	977
	978	simple_lock(&sfi_lock, LCK_GRP_NULL);
	979
	980	if (!sfi_is_enabled) {
	981	/*
	982	* SFI is not enabled, or has recently been disabled.
	983	* There is no point putting this thread on a deferred ready
	984	* queue, even if it were classified as needing it, since
	985	* SFI will truly be off at the next global off timer
	986	*/
	987	simple_unlock(&sfi_lock);
	988	splx(s);
	989
	990	return;
	991	}
	992
	993	thread_lock(thread);
	994	thread->sfi_class = class_id = sfi_thread_classify(thread);
	995	thread_unlock(thread);
	996
	997	/*
	998	* Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
	999	* are committed to transitioning to whatever state is indicated by "->class_in_on_phase".
	1000	* If another thread tries to call sfi_reevaluate() after this point, it will take the
	1001	* sfi_lock and see the thread in this wait state. If another thread calls
	1002	* sfi_reevaluate() before this point, it would see a runnable thread and at most
	1003	* attempt to send an AST to this processor, but we would have the most accurate
	1004	* classification.
	1005	*/
	1006
	1007	sfi_class = &sfi_classes[class_id];
	1008	if (!sfi_class->class_in_on_phase) {
	1009	/* Need to block thread in wait queue */
	1010	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER),
	1011	thread_tid(thread), class_id, 0, 0, 0);
	1012
	1013	waitret = waitq_assert_wait64(&sfi_class->waitq,
	1014	CAST_EVENT64_T(class_id),
	1015	THREAD_INTERRUPTIBLE \| THREAD_WAIT_NOREPORT, 0);
	1016	if (waitret == THREAD_WAITING) {
	1017	thread->sfi_wait_class = class_id;
	1018	did_wait = TRUE;
	1019	continuation = sfi_class->continuation;
	1020	} else {
	1021	/* thread may be exiting already, all other errors are unexpected */
	1022	assert(waitret == THREAD_INTERRUPTED);
	1023	}
	1024	}
	1025	simple_unlock(&sfi_lock);
	1026
	1027	splx(s);
	1028
	1029	if (did_wait) {
	1030	assert(thread->wait_sfi_begin_time == 0);
	1031
	1032	thread_block_reason(continuation, NULL, AST_SFI);
	1033	}
	1034	}
	1035
	1036	/* Thread must be unlocked */
	1037	void
	1038	sfi_reevaluate(thread_t thread)
	1039	{
	1040	kern_return_t kret;
	1041	spl_t s;
	1042	sfi_class_id_t class_id, current_class_id;
	1043	ast_t sfi_ast;
	1044
	1045	s = splsched();
	1046
	1047	simple_lock(&sfi_lock, LCK_GRP_NULL);
	1048
	1049	thread_lock(thread);
	1050	sfi_ast = sfi_thread_needs_ast(thread, &class_id);
	1051	thread->sfi_class = class_id;
	1052
	1053	/*
	1054	* This routine chiefly exists to boost threads out of an SFI wait
	1055	* if their classification changes before the "on" timer fires.
	1056	*
	1057	* If we calculate that a thread is in a different ->sfi_wait_class
	1058	* than we think it should be (including no-SFI-wait), we need to
	1059	* correct that:
	1060	*
	1061	* If the thread is in SFI wait and should not be (or should be waiting
	1062	* on a different class' "on" timer), we wake it up. If needed, the
	1063	* thread may immediately block again in the different SFI wait state.
	1064	*
	1065	* If the thread is not in an SFI wait state and it should be, we need
	1066	* to get that thread's attention, possibly by sending an AST to another
	1067	* processor.
	1068	*/
	1069
	1070	if ((current_class_id = thread->sfi_wait_class) != SFI_CLASS_UNSPECIFIED) {
	1071	thread_unlock(thread); /* not needed anymore */
	1072
	1073	assert(current_class_id < MAX_SFI_CLASS_ID);
	1074
	1075	if ((sfi_ast == AST_NONE) \|\| (class_id != current_class_id)) {
	1076	struct sfi_class_state *sfi_class = &sfi_classes[current_class_id];
	1077
	1078	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_WAIT_CANCELED), thread_tid(thread), current_class_id, class_id, 0, 0);
	1079
	1080	kret = waitq_wakeup64_thread(&sfi_class->waitq,
	1081	CAST_EVENT64_T(current_class_id),
	1082	thread,
	1083	THREAD_AWAKENED);
	1084	assert(kret == KERN_SUCCESS \|\| kret == KERN_NOT_WAITING);
	1085	}
	1086	} else {
	1087	/*
	1088	* Thread's current SFI wait class is not set, and because we
	1089	* have the sfi_lock, it won't get set.
	1090	*/
	1091
	1092	if ((thread->state & (TH_RUN \| TH_IDLE)) == TH_RUN) {
	1093	if (sfi_ast != AST_NONE) {
	1094	if (thread == current_thread()) {
	1095	ast_on(sfi_ast);
	1096	} else {
	1097	processor_t processor = thread->last_processor;
	1098
	1099	if (processor != PROCESSOR_NULL &&
	1100	processor->state == PROCESSOR_RUNNING &&
	1101	processor->active_thread == thread) {
	1102	cause_ast_check(processor);
	1103	} else {
	1104	/*
	1105	* Runnable thread that's not on a CPU currently. When a processor
	1106	* does context switch to it, the AST will get set based on whether
	1107	* the thread is in its "off time".
	1108	*/
	1109	}
	1110	}
	1111	}
	1112	}
	1113
	1114	thread_unlock(thread);
	1115	}
	1116
	1117	simple_unlock(&sfi_lock);
	1118	splx(s);
	1119	}
	1120
	1121	#else /* !CONFIG_SCHED_SFI */
	1122
	1123	kern_return_t
	1124	sfi_set_window(uint64_t window_usecs __unused)
	1125	{
	1126	return KERN_NOT_SUPPORTED;
	1127	}
	1128
	1129	kern_return_t
	1130	sfi_window_cancel(void)
	1131	{
	1132	return KERN_NOT_SUPPORTED;
	1133	}
	1134
	1135
	1136	kern_return_t
	1137	sfi_get_window(uint64_t *window_usecs __unused)
	1138	{
	1139	return KERN_NOT_SUPPORTED;
	1140	}
	1141
	1142
	1143	kern_return_t
	1144	sfi_set_class_offtime(sfi_class_id_t class_id __unused, uint64_t offtime_usecs __unused)
	1145	{
	1146	return KERN_NOT_SUPPORTED;
	1147	}
	1148
	1149	kern_return_t
	1150	sfi_class_offtime_cancel(sfi_class_id_t class_id __unused)
	1151	{
	1152	return KERN_NOT_SUPPORTED;
	1153	}
	1154
	1155	kern_return_t
	1156	sfi_get_class_offtime(sfi_class_id_t class_id __unused, uint64_t *offtime_usecs __unused)
	1157	{
	1158	return KERN_NOT_SUPPORTED;
	1159	}
	1160
	1161	void
	1162	sfi_reevaluate(thread_t thread __unused)
	1163	{
	1164	return;
	1165	}
	1166
	1167	sfi_class_id_t
	1168	sfi_thread_classify(thread_t thread)
	1169	{
	1170	task_t task = thread->task;
	1171	boolean_t is_kernel_thread = (task == kernel_task);
	1172
	1173	if (is_kernel_thread) {
	1174	return SFI_CLASS_KERNEL;
	1175	}
	1176
	1177	return SFI_CLASS_OPTED_OUT;
	1178	}
	1179
	1180	#endif /* !CONFIG_SCHED_SFI */