1/*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31#include <machine/machine_routines.h>
32#include <machine/sched_param.h>
33#include <machine/machine_cpu.h>
34#include <kern/kern_types.h>
35#include <kern/debug.h>
36#include <kern/machine.h>
37#include <kern/misc_protos.h>
38#include <kern/processor.h>
39#include <kern/queue.h>
40#include <kern/sched.h>
41#include <kern/sched_prim.h>
42#include <kern/task.h>
43#include <kern/thread.h>
44#include <kern/sched_clutch.h>
45#include <machine/atomic.h>
47#include <sys/kdebug.h>
48
49#if __AMP__
50#include <kern/sched_amp_common.h>
51#endif /* __AMP__ */
52
53#if CONFIG_SCHED_CLUTCH
54
55/* Forward declarations of static routines */
56
57/* Root level hierarchy management */
58static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t);
60static void sched_clutch_root_pri_update(sched_clutch_root_t);
61static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t);
62static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
63static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
64
65/* Root bucket level hierarchy management */
66static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
67static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
68static int sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t);
69
70/* Clutch bucket level hierarchy management */
71static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
72static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
73static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
74static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
75static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
76
77static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t);
78static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t);
79static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
80static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
81
82/* Clutch timeshare properties updates */
83static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
84static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
85static void sched_clutch_bucket_cpu_adjust(sched_clutch_bucket_t);
86static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t);
87static boolean_t sched_thread_sched_pri_promoted(thread_t);
88/* Clutch membership management */
89static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
90static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t);
91static thread_t sched_clutch_thread_highest(sched_clutch_root_t);
92
93/* Clutch properties updates */
94static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
95static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
96static int sched_clutch_root_priority(sched_clutch_root_t);
97
98#if __AMP__
99/* System based routines */
100static bool sched_clutch_pset_available(processor_set_t);
101#endif /* __AMP__ */
102
103/* Helper debugging routines */
104static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
105
106
107
108/*
109 * Global priority queue comparator routine for root buckets. The
110 * routine implements the priority queue as a minimum deadline queue
111 * to achieve EDF scheduling.
112 */
113priority_queue_compare_fn_t sched_clutch_root_bucket_compare;
114
115
116/*
117 * Special markers for buckets that have invalid WCELs/quantums etc.
118 */
119#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
120#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
121
122/*
123 * Root level bucket WCELs
124 *
125 * The root level bucket selection algorithm is an Earliest Deadline
126 * First (EDF) algorithm where the deadline for buckets are defined
127 * by the worst-case-execution-latency and the make runnable timestamp
128 * for the bucket.
129 *
130 */
131static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
132 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
133 0, /* FG */
134 37500, /* IN (37.5ms) */
135 75000, /* DF (75ms) */
136 150000, /* UT (150ms) */
137 250000 /* BG (250ms) */
138};
139static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
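
/*
 * Worked example (illustrative timestamps, editor's note): if the UT root
 * bucket becomes runnable at time T and the DF root bucket at T + 100ms,
 * their deadlines are T + 150ms and T + 175ms respectively, so plain EDF
 * would pick UT first even though DF is higher in the natural priority
 * order. The warp mechanism below bounds how long such an inversion lasts.
 */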
140
141/*
142 * Root level bucket warp
143 *
144 * Each root level bucket has a warp value associated with it as well.
145 * The warp value allows the root bucket to effectively warp ahead of
146 * lower priority buckets for a limited time even if it has a later
147 * deadline. The warping behavior provides extra (but limited)
148 * opportunity for high priority buckets to remain responsive.
149 */
150
151/* Special warp deadline value to indicate that the bucket has not used any warp yet */
152#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
153
154/* Warp window durations for various tiers */
155static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
156 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
157 8000, /* FG (8ms)*/
158 4000, /* IN (4ms) */
159 2000, /* DF (2ms) */
160 1000, /* UT (1ms) */
161 0 /* BG (0ms) */
162};
163static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
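
/*
 * Worked example (illustrative timestamps, editor's note): suppose the UT
 * root bucket has deadline T + 150ms and the IN root bucket, made runnable
 * later, has deadline T + 157.5ms. Plain EDF would pick UT; since IN is
 * higher in the natural priority order and has warp available, it is chosen
 * instead and may warp ahead for up to its 4ms warp window, after which
 * selection falls back to the earliest-deadline bucket.
 */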
164
165/*
166 * Thread level quantum
167 *
168 * The algorithm defines quantums for threads at various buckets. This
169 * (combined with the root level bucket quantums) restricts how much
170 * the lower priority levels can preempt the higher priority threads.
171 */
172static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
173 10000, /* FIXPRI (10ms) */
174 10000, /* FG (10ms) */
175 8000, /* IN (8ms) */
176 6000, /* DF (6ms) */
177 4000, /* UT (4ms) */
178 2000 /* BG (2ms) */
179};
180static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
181
182enum sched_clutch_state {
183 SCHED_CLUTCH_STATE_EMPTY = 0,
184 SCHED_CLUTCH_STATE_RUNNABLE,
185};
186
187/*
188 * sched_clutch_us_to_abstime()
189 *
190 * Initializer for converting all durations in usec to abstime
191 */
192static void
193sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
194{
195 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
196 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
197 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
198 } else {
199 clock_interval_to_absolutetime_interval(us_vals[i],
200 NSEC_PER_USEC, &abstime_vals[i]);
201 }
202 }
203}
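
/*
 * Illustrative usage (editor's sketch; the actual call sites live in the
 * scheduler init/timebase setup code, which is not part of this excerpt):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */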
204
205#if DEVELOPMENT || DEBUG
206
207/*
208 * sched_clutch_hierarchy_locked_assert()
209 *
210 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
211 * for the hierarchy depends on where the hierarchy is hooked. The current
212 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
213 * using the pset lock.
214 */
215static inline void
216sched_clutch_hierarchy_locked_assert(
217 sched_clutch_root_t root_clutch)
218{
219 pset_assert_locked(root_clutch->scr_pset);
220}
221
222#else /* DEVELOPMENT || DEBUG */
223
224static inline void
225sched_clutch_hierarchy_locked_assert(
226 __unused sched_clutch_root_t root_clutch)
227{
228}
229
230#endif /* DEVELOPMENT || DEBUG */
231
232/*
233 * sched_clutch_thr_count_inc()
234 *
235 * Increment thread count at a hierarchy level with overflow checks.
236 */
237static void
238sched_clutch_thr_count_inc(
239 uint16_t *thr_count)
240{
241 if (__improbable(os_inc_overflow(thr_count))) {
242 panic("sched_clutch thread count overflowed!");
243 }
244}
245
246/*
247 * sched_clutch_thr_count_dec()
248 *
249 * Decrement thread count at a hierarchy level with underflow checks.
250 */
251static void
252sched_clutch_thr_count_dec(
253 uint16_t *thr_count)
254{
255 if (__improbable(os_dec_overflow(thr_count))) {
256 panic("sched_clutch thread count underflowed!");
257 }
258}
259
260#if __AMP__
261
262/*
263 * sched_clutch_pset_available()
264 *
265 * Routine to determine if a pset is available for scheduling.
266 */
267static bool
268sched_clutch_pset_available(processor_set_t pset)
269{
270 /* Check if cluster has none of the CPUs available */
271 if (pset->online_processor_count == 0) {
272 return false;
273 }
274
275 /* Check if the cluster is not recommended by CLPC */
276 if (!pset_is_recommended(pset)) {
277 return false;
278 }
279
280 return true;
281}
282
283#endif /* __AMP__ */
284
285/*
286 * sched_clutch_root_init()
287 *
288 * Routine to initialize the scheduler hierarchy root.
289 */
290static void
291sched_clutch_root_init(
292 sched_clutch_root_t root_clutch,
293 processor_set_t pset)
294{
295 root_clutch->scr_thr_count = 0;
296 root_clutch->scr_priority = NOPRI;
297 root_clutch->scr_urgency = 0;
298 root_clutch->scr_pset = pset;
299
300 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
301 queue_init(&root_clutch->scr_clutch_buckets);
302
303 /* Initialize the queue which maintains all runnable foreign clutch buckets */
304 queue_init(&root_clutch->scr_foreign_buckets);
305
306 /* Initialize the bitmap and priority queue of runnable root buckets */
307 sched_clutch_root_bucket_compare = priority_heap_make_comparator(a, b, struct sched_clutch_root_bucket, scrb_pqlink, {
308 return (a->scrb_deadline < b->scrb_deadline) ? 1 : ((a->scrb_deadline == b->scrb_deadline) ? 0 : -1);
309 });
310 priority_queue_init(&root_clutch->scr_root_buckets, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MIN_HEAP);
311 bitmap_zero(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX);
312 bitmap_zero(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
313
314 /* Initialize all the root buckets */
315 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
316 sched_clutch_root_bucket_init(&root_clutch->scr_buckets[i], i);
317 }
318}
319
320/*
321 * sched_clutch_root_bucket_init()
322 *
323 * Routine to initialize root buckets.
324 */
325static void
326sched_clutch_root_bucket_init(
327 sched_clutch_root_bucket_t root_bucket,
328 sched_bucket_t bucket)
329{
330 root_bucket->scrb_bucket = bucket;
331 priority_queue_init(&root_bucket->scrb_clutch_buckets, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
332 priority_queue_entry_init(&root_bucket->scrb_pqlink);
333 root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64;
334 root_bucket->scrb_warped_deadline = 0;
335 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
336}
337
338/*
339 * sched_clutch_root_bucket_pri_compare()
340 *
341 * Routine to compare root buckets based on the highest runnable clutch
342 * bucket priorities in the root buckets.
343 */
344static int
345sched_clutch_root_bucket_pri_compare(
346 sched_clutch_root_bucket_t a,
347 sched_clutch_root_bucket_t b)
348{
349 sched_clutch_bucket_t a_highest = sched_clutch_root_bucket_highest_clutch_bucket(a);
350 sched_clutch_bucket_t b_highest = sched_clutch_root_bucket_highest_clutch_bucket(b);
351 return (a_highest->scb_priority > b_highest->scb_priority) ?
352 1 : ((a_highest->scb_priority == b_highest->scb_priority) ? 0 : -1);
353}
354
355/*
356 * sched_clutch_root_select_aboveui()
357 *
358 * Special case scheduling for Above UI bucket.
359 *
360 * AboveUI threads are typically system critical threads that need low latency
361 * which is why they are handled specially.
362 *
363 * Since the priority ranges for AboveUI and FG Timeshare buckets overlap, it is
364 * important to maintain some native priority order between those buckets. The policy
365 * implemented here is to compare the highest clutch buckets of both buckets; if the
366 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
367 * deadline-based scheduling, which should pick up the timeshare buckets.
368 *
369 * The implementation allows extremely low latency CPU access for Above UI threads
370 * while supporting the use case of high priority timeshare threads contending with
371 * lower priority fixed priority threads.
372 */
373static boolean_t
374sched_clutch_root_select_aboveui(
375 sched_clutch_root_t root_clutch)
376{
377 if (bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_FIXPRI)) {
378 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
379 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_buckets[TH_BUCKET_SHARE_FG];
380
381 if (!bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
382 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
383 return true;
384 }
385 if (sched_clutch_root_bucket_pri_compare(root_bucket_aboveui, root_bucket_sharefg) >= 0) {
386 /* If the aboveUI bucket has a higher native clutch bucket priority, schedule it */
387 return true;
388 }
389 }
390 return false;
391}
392
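/*
 * Worked example (hypothetical priorities, editor's note): if the highest
 * AboveUI clutch bucket sits at priority 55 and the highest timeshare FG
 * clutch bucket at priority 60, the pri_compare above returns -1 and
 * selection falls through to the EDF path; at 60 or above, the AboveUI
 * bucket would be scheduled immediately (ties favor AboveUI).
 */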
393
394/*
395 * sched_clutch_root_highest_root_bucket()
396 *
397 * Main routine to find the highest runnable root level bucket.
398 * This routine is called from performance sensitive contexts; so it is
399 * crucial to keep this O(1).
400 *
401 */
402static sched_clutch_root_bucket_t
403sched_clutch_root_highest_root_bucket(
404 sched_clutch_root_t root_clutch,
405 uint64_t timestamp)
406{
407 sched_clutch_hierarchy_locked_assert(root_clutch);
408 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
409 return NULL;
410 }
411
412 if (sched_clutch_root_select_aboveui(root_clutch)) {
413 return &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
414 }
415
416 /*
417 * Above UI bucket is not runnable or has a low priority clutch bucket; use the earliest deadline model
418 * to schedule threads. The idea is that as the timeshare buckets use CPU, they will drop their
419 * interactivity score and allow low priority AboveUI clutch buckets to be scheduled.
420 */
421
422 /* Find the earliest deadline bucket */
423 sched_clutch_root_bucket_t edf_bucket = priority_queue_min(&root_clutch->scr_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
424
425 sched_clutch_root_bucket_t warp_bucket = NULL;
426 int warp_bucket_index = -1;
427evaluate_warp_buckets:
428 /* Check if any higher runnable buckets have warp available */
429 warp_bucket_index = bitmap_lsb_first(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
430
431 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
432 /* No higher buckets have warp available; choose the edf bucket and replenish its warp */
433 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
434 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
435 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
436 bitmap_set(root_clutch->scr_warp_available, edf_bucket->scrb_bucket);
437 return edf_bucket;
438 }
439
440 /*
441 * Looks like there is a root bucket which is higher in the natural priority
442 * order than edf_bucket and might have some warp remaining.
443 */
444 warp_bucket = &root_clutch->scr_buckets[warp_bucket_index];
445 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
446 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
447 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
448 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
449 return warp_bucket;
450 }
451 if (warp_bucket->scrb_warped_deadline > timestamp) {
452 /* Root bucket already has a warp window open with some warp remaining */
453 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
454 return warp_bucket;
455 }
456
457	/* For this bucket, the warp window was opened sometime in the past but has now
458	 * expired. Mark the bucket as not available for warp anymore and re-run the
459 * warp bucket selection logic.
460 */
461 warp_bucket->scrb_warp_remaining = 0;
462 bitmap_clear(root_clutch->scr_warp_available, warp_bucket->scrb_bucket);
463 goto evaluate_warp_buckets;
464}
465
466/*
467 * sched_clutch_root_bucket_deadline_calculate()
468 *
469 * Calculate the deadline for the bucket based on its WCEL
470 */
471static uint64_t
472sched_clutch_root_bucket_deadline_calculate(
473 sched_clutch_root_bucket_t root_bucket,
474 uint64_t timestamp)
475{
476 /* For fixpri AboveUI bucket always return it as the earliest deadline */
477 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
478 return 0;
479 }
480
481 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
482 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
483}
484
485/*
486 * sched_clutch_root_bucket_deadline_update()
487 *
488 * Routine to update the deadline of the root bucket when it is selected.
489 * Updating the deadline also moves the root_bucket in the EDF priority
490 * queue.
491 */
492static void
493sched_clutch_root_bucket_deadline_update(
494 sched_clutch_root_bucket_t root_bucket,
495 sched_clutch_root_t root_clutch,
496 uint64_t timestamp)
497{
498 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
499 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
500 return;
501 }
502 uint64_t old_deadline = root_bucket->scrb_deadline;
503 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
504 assert(old_deadline <= new_deadline);
505 if (old_deadline != new_deadline) {
506 root_bucket->scrb_deadline = new_deadline;
507 /* Since the priority queue is a min-heap, use the decrease routine even though the deadline has a larger value now */
508 priority_queue_entry_decrease(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
509 }
510}
511
512/*
513 * sched_clutch_root_bucket_runnable()
514 *
515 * Routine to insert a newly runnable root bucket into the hierarchy.
516 * Also updates the deadline and warp parameters as necessary.
517 */
518static void
519sched_clutch_root_bucket_runnable(
520 sched_clutch_root_bucket_t root_bucket,
521 sched_clutch_root_t root_clutch,
522 uint64_t timestamp)
523{
524 /* Mark the root bucket as runnable */
525 bitmap_set(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
526 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
527 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, 0, 0, 0);
528
529 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
530 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
531 return;
532 }
533
534 root_bucket->scrb_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
535 priority_queue_insert(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
536
537 if (root_bucket->scrb_warp_remaining) {
538	/* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
539 bitmap_set(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
540 }
541}
542
543/*
544 * sched_clutch_root_bucket_empty()
545 *
546 * Routine to remove an empty root bucket from the hierarchy.
547 * Also updates the deadline and warp parameters as necessary.
548 */
549static void
550sched_clutch_root_bucket_empty(
551 sched_clutch_root_bucket_t root_bucket,
552 sched_clutch_root_t root_clutch,
553 uint64_t timestamp)
554{
555 bitmap_clear(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
556 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
557 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0, 0);
558
559 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
560 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
561 return;
562 }
563
564 priority_queue_remove(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, sched_clutch_root_bucket_compare);
565
566 bitmap_clear(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
567 if (root_bucket->scrb_warped_deadline > timestamp) {
568 /*
569 * For root buckets that were using the warp, check if the warp
570 * deadline is in the future. If yes, remove the wall time the
571 * warp was active and update the warp remaining. This allows
572 * the root bucket to use the remaining warp the next time it
573 * becomes runnable.
574 */
575 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
576 } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
577 /*
578 * If the root bucket's warped deadline is in the past, it has used up
579 * all the warp it was assigned. Empty out its warp remaining.
580 */
581 root_bucket->scrb_warp_remaining = 0;
582 }
583}
584
585/*
586 * sched_clutch_root_pri_update()
587 *
588 * The root level priority is used for thread selection and preemption
589 * logic.
590 */
591static void
592sched_clutch_root_pri_update(
593 sched_clutch_root_t root_clutch)
594{
595 sched_clutch_hierarchy_locked_assert(root_clutch);
596 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
597 /* No runnable root buckets */
598 root_clutch->scr_priority = NOPRI;
599 assert(root_clutch->scr_urgency == 0);
600 return;
601 }
602 sched_clutch_root_bucket_t root_bucket = NULL;
603 /* Special case for AboveUI (uses same logic as thread selection) */
604 if (sched_clutch_root_select_aboveui(root_clutch)) {
605 root_bucket = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
606 } else {
607 /*
608 * AboveUI bucket is not runnable or has a low clutch bucket priority,
609 * select the next runnable root bucket in natural priority order. This logic
610 * is slightly different from thread selection, because thread selection
611 * considers deadlines, warps etc. to decide the most optimal bucket at a
612 * given timestamp. Since the priority value is used for preemption decisions
613 * only, it needs to be based on the highest runnable thread available in
614 * the timeshare domain.
615 */
616 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
617 assert(root_bucket_index != -1);
618 root_bucket = &root_clutch->scr_buckets[root_bucket_index];
619 }
620 /* For the selected root bucket, find the highest priority clutch bucket */
621 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
622 root_clutch->scr_priority = priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
623}
624
625/*
626 * sched_clutch_root_urgency_inc()
627 *
628 * Routine to increment the urgency at the root level based on the thread
629 * priority that is being inserted into the hierarchy. The root urgency
630 * counter is updated based on the urgency of threads in any of the
631 * clutch buckets which are part of the hierarchy.
632 *
633 * Always called with the pset lock held.
634 */
635static void
636sched_clutch_root_urgency_inc(
637 sched_clutch_root_t root_clutch,
638 thread_t thread)
639{
640 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
641 root_clutch->scr_urgency++;
642 }
643}
644
645/*
646 * sched_clutch_root_urgency_dec()
647 *
648 * Routine to decrement the urgency at the root level based on the thread
649 * priority that is being removed from the hierarchy. The root urgency
650 * counter is updated based on the urgency of threads in any of the
651 * clutch buckets which are part of the hierarchy.
652 *
653 * Always called with the pset lock held.
654 */
655static void
656sched_clutch_root_urgency_dec(
657 sched_clutch_root_t root_clutch,
658 thread_t thread)
659{
660 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
661 root_clutch->scr_urgency--;
662 }
663}
664
665/*
666 * Clutch bucket level scheduling
667 *
668 * The second level of scheduling is the clutch bucket level scheduling
669 * which tries to schedule thread groups within root_buckets. Each
670 * clutch represents a thread group and a clutch_bucket represents
671 * threads at a particular sched_bucket within that thread group. The
672 * goal of this level of scheduling is to allow interactive thread
673 * groups low latency access to the CPU. It also provides slight
674 * scheduling preference for App and unrestricted thread groups.
675 *
676 * The clutch bucket scheduling algorithm measures an interactivity
677 * score for all clutch buckets. The interactivity score is based
678 * on the ratio of the CPU used and the voluntary blocking of threads
679 * within the clutch bucket. The algorithm is very close to the ULE
680 * scheduler on FreeBSD in terms of calculations. The interactivity
681 * score provides an interactivity boost in the range of
682 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
683 * thread groups to win over CPU spinners.
684 */
685
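/*
 * Editor's sketch (not part of the kernel build): the interactivity score
 * arithmetic described above, lifted into a standalone function so the two
 * branches are easy to see. The helper name and the prev_score parameter
 * are hypothetical; INTERACTIVE_PRI mirrors the default value of
 * sched_clutch_bucket_interactive_pri below.
 *
 *	#include <stdint.h>
 *
 *	#define INTERACTIVE_PRI 8
 *
 *	static uint8_t
 *	example_interactivity_score(uint64_t cpu_used, uint64_t cpu_blocked, uint8_t prev_score)
 *	{
 *		if ((cpu_used == 0) && (cpu_blocked == 0)) {
 *			// No usage samples yet (e.g. heavily contended system); keep the previous score
 *			return prev_score;
 *		}
 *		if (cpu_blocked > cpu_used) {
 *			// Mostly blocked => interactive; score is at least INTERACTIVE_PRI
 *			return (uint8_t)(INTERACTIVE_PRI + ((INTERACTIVE_PRI * (cpu_blocked - cpu_used)) / cpu_blocked));
 *		}
 *		// Mostly on-CPU => non-interactive; score is at most INTERACTIVE_PRI
 *		return (uint8_t)((INTERACTIVE_PRI * cpu_blocked) / cpu_used);
 *	}
 */
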
686/* Priority boost range for interactivity */
687#define SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT (8)
688uint8_t sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
689
690/* Window used to scale the cpu usage and blocked values (currently 500ms); it is the threshold on used + blocked time */
691uint64_t sched_clutch_bucket_adjust_threshold = 0;
692#define SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS (500000)
693
694/* The ratio to scale the cpu/blocked time per window */
695#define SCHED_CLUTCH_BUCKET_ADJUST_RATIO (10)
696
697/* Rate at which the interactivity score is recalculated; this keeps the score smooth even for extremely bursty behavior */
698uint64_t sched_clutch_bucket_interactivity_delta = 0;
699#define SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT (25000)
700
701/*
702 * In order to allow App thread groups some preference over daemon thread
703 * groups, the App clutch_buckets get an 8 point boost. The boost value should
704 * be chosen such that badly behaved apps still end up penalized relative to
705 * well behaved interactive daemon clutch_buckets.
706 */
707#define SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT (8)
708uint8_t sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
709
710/* Initial value for voluntary blocking time for the clutch_bucket */
711#define SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID (uint32_t)(~0)
712
713/*
714 * sched_clutch_bucket_init()
715 *
716 * Initializer for clutch buckets.
717 */
718static void
719sched_clutch_bucket_init(
720 sched_clutch_bucket_t clutch_bucket,
721 sched_clutch_t clutch,
722 sched_bucket_t bucket)
723{
724 bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));
725
726 clutch_bucket->scb_bucket = bucket;
727 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
728 clutch_bucket->scb_priority = 0;
729 /*
730 * All thread groups should be initialized to be interactive; this allows the newly launched
731 * thread groups to fairly compete with already running thread groups.
732 */
733 clutch_bucket->scb_interactivity_score = (sched_clutch_bucket_interactive_pri * 2);
734 clutch_bucket->scb_foreign = false;
735
736 os_atomic_store(&clutch_bucket->scb_timeshare_tick, 0, relaxed);
737 os_atomic_store(&clutch_bucket->scb_pri_shift, INT8_MAX, relaxed);
738
739 clutch_bucket->scb_interactivity_ts = 0;
740 clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID;
741 priority_queue_entry_init(&clutch_bucket->scb_pqlink);
742 clutch_bucket->scb_clutch = clutch;
743 clutch_bucket->scb_root = NULL;
744 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
745 run_queue_init(&clutch_bucket->scb_runq);
746}
747
748/*
749 * sched_clutch_init_with_thread_group()
750 *
751 * Initialize the sched_clutch when the thread group is being created
752 */
753void
754sched_clutch_init_with_thread_group(
755 sched_clutch_t clutch,
756 struct thread_group *tg)
757{
758 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
759
760 /* Initialize all the clutch buckets */
761 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
762 sched_clutch_bucket_init(&(clutch->sc_clutch_buckets[i]), clutch, i);
763 }
764
765 /* Grouping specific fields */
766 clutch->sc_tg = tg;
767 os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
768}
769
770/*
771 * sched_clutch_destroy()
772 *
773 * Destructor for clutch; called from thread group release code.
774 */
775void
776sched_clutch_destroy(
777 __unused sched_clutch_t clutch)
778{
779 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
780}
781
782#if __AMP__
783
784/*
785 * sched_clutch_bucket_foreign()
786 *
787 * Identifies if the clutch bucket is foreign to (i.e. not recommended for) this
788 * hierarchy. This can happen when the recommended hierarchy/pset is not
789 * currently available for scheduling.
790 */
791static boolean_t
792sched_clutch_bucket_foreign(sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket)
793{
794 assert(clutch_bucket->scb_thr_count > 0);
795 if (!sched_clutch_pset_available(root_clutch->scr_pset)) {
796 /* Even though the pset was not available for scheduling, threads
797 * are being put in its runq (this might be due to the other pset
798 * being turned off and this being the master processor pset).
799 * Mark the clutch bucket as foreign so that when the other
800 * pset becomes available, it moves the clutch bucket accordingly.
801 */
802 return true;
803 }
804 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
805 pset_cluster_type_t pset_type = recommended_pset_type(thread);
806 return pset_type != root_clutch->scr_pset->pset_cluster_type;
807}
808
809#endif /* __AMP__ */
810
811/*
812 * sched_clutch_bucket_hierarchy_insert()
813 *
814 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
815 */
816static void
817sched_clutch_bucket_hierarchy_insert(
818 sched_clutch_root_t root_clutch,
819 sched_clutch_bucket_t clutch_bucket,
820 sched_bucket_t bucket,
821 uint64_t timestamp)
822{
823 sched_clutch_hierarchy_locked_assert(root_clutch);
824 if (bucket > TH_BUCKET_FIXPRI) {
825 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
826 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
827 }
828#if __AMP__
829 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
830 if (sched_clutch_bucket_foreign(root_clutch, clutch_bucket)) {
831 clutch_bucket->scb_foreign = true;
832 enqueue_tail(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
833 }
834#endif /* __AMP__ */
835 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
836
837 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
838 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
839 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
840 }
841
842 /* Insert the clutch bucket into the root bucket priority queue */
843 priority_queue_insert(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
844 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
845 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
846 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0);
847}
848
849/*
850 * sched_clutch_bucket_hierarchy_remove()
851 *
852 * Routine to remove an empty clutch bucket from the root hierarchy.
853 */
854static void
855sched_clutch_bucket_hierarchy_remove(
856 sched_clutch_root_t root_clutch,
857 sched_clutch_bucket_t clutch_bucket,
858 sched_bucket_t bucket,
859 uint64_t timestamp)
860{
861 sched_clutch_hierarchy_locked_assert(root_clutch);
862 if (bucket > TH_BUCKET_FIXPRI) {
863 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
864 remqueue(&clutch_bucket->scb_listlink);
865 }
866#if __AMP__
867 if (clutch_bucket->scb_foreign) {
868 clutch_bucket->scb_foreign = false;
869 remqueue(&clutch_bucket->scb_foreignlink);
870 }
871#endif /* __AMP__ */
872
873 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
874
875 /* Remove the clutch bucket from the root bucket priority queue */
876 priority_queue_remove(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
877 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
878 clutch_bucket->scb_blocked_ts = timestamp;
879 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
880 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0);
881
882 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
883 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
884 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
885 }
886}
887
888/*
889 * sched_clutch_bucket_base_pri()
890 *
891 * Calculates the "base" priority of the clutch bucket. The base
892 * priority of the clutch bucket is the max of the highest base_pri
893 * and the highest sched_pri among its threads, plus any grouping
894 * specific (App/Daemon...) boost applicable to the
895 * clutch_bucket.
896 */
897static uint8_t
898sched_clutch_bucket_base_pri(
899 sched_clutch_bucket_t clutch_bucket)
900{
901 uint8_t clutch_boost = 0;
902 assert(clutch_bucket->scb_runq.count != 0);
903
904 sched_clutch_t clutch = clutch_bucket->scb_clutch;
905
906 /*
907 * Since the clutch bucket can contain threads that are members of the group due
908 * to the sched_pri being promoted or due to their base pri, the base priority of
909 * the entire clutch bucket should be based on the highest thread (promoted or base)
910 * in the clutch bucket.
911 */
912 uint8_t max_pri = priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq) ? 0 : priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
913
914 /*
915 * For all AboveUI clutch buckets and clutch buckets for thread groups that
916 * have not been specified as SCHED_CLUTCH_TG_PRI_LOW, give a priority boost
917 */
918 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
919 (os_atomic_load(&clutch->sc_tg_priority, relaxed) != SCHED_CLUTCH_TG_PRI_LOW)) {
920 clutch_boost = sched_clutch_bucket_pri_boost;
921 }
922 return max_pri + clutch_boost;
923}
924
925/*
926 * sched_clutch_bucket_interactivity_score_calculate()
927 *
928 * Routine to calculate the interactivity score for the clutch bucket. The
929 * interactivity score is based on the ratio of CPU used by all threads in
930 * the bucket and the blocked time of the bucket as a whole.
931 */
932static uint8_t
933sched_clutch_bucket_interactivity_score_calculate(
934 sched_clutch_bucket_t clutch_bucket,
935 uint64_t timestamp)
936{
937 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
938 /*
939 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
940 * priorities, make sure all AboveUI buckets are marked interactive.
941 */
942 assert(clutch_bucket->scb_interactivity_score == (2 * sched_clutch_bucket_interactive_pri));
943 return clutch_bucket->scb_interactivity_score;
944 }
945
946 if (clutch_bucket->scb_interactivity_ts == 0) {
947 /*
948 * This indicates a newly initialized clutch bucket; return the default interactivity score
949 * and update timestamp.
950 */
951 clutch_bucket->scb_interactivity_ts = timestamp;
952 return clutch_bucket->scb_interactivity_score;
953 }
954
955 if (timestamp < (clutch_bucket->scb_interactivity_ts + sched_clutch_bucket_interactivity_delta)) {
956 return clutch_bucket->scb_interactivity_score;
957 }
958
959 /* Check if the clutch bucket accounting needs to be scaled */
960 sched_clutch_bucket_cpu_adjust(clutch_bucket);
961 clutch_bucket->scb_interactivity_ts = timestamp;
962
963 sched_clutch_bucket_cpu_data_t scb_cpu_data;
964 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
965 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
966 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
967
968 /*
969 * In extremely CPU contended cases, it is possible that the clutch bucket has been runnable
970 * for a long time but none of its threads have been picked up for execution. In that case, both
971 * the CPU used and blocked would be 0.
972 */
973 if ((cpu_blocked == 0) && (cpu_used == 0)) {
974 return clutch_bucket->scb_interactivity_score;
975 }
976
977 /*
978 * For all timeshare buckets, calculate the interactivity score of the bucket
979 * and add it to the base priority
980 */
981 uint8_t interactive_score = 0;
982 if (cpu_blocked > cpu_used) {
983 /* Interactive clutch_bucket case */
984 interactive_score = sched_clutch_bucket_interactive_pri +
985 ((sched_clutch_bucket_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
986 } else {
987 /* Non-interactive clutch_bucket case */
988 interactive_score = ((sched_clutch_bucket_interactive_pri * cpu_blocked) / cpu_used);
989 }
990 clutch_bucket->scb_interactivity_score = interactive_score;
991 return interactive_score;
992}
993
994/*
995 * sched_clutch_bucket_pri_calculate()
996 *
997 * The priority calculation algorithm for the clutch_bucket is a slight
998 * modification on the ULE interactivity score. It uses the base priority
999 * of the clutch bucket and applies an interactivity score boost to the
1000 * highly responsive clutch buckets.
1001 */
1002
1003static uint8_t
1004sched_clutch_bucket_pri_calculate(
1005 sched_clutch_bucket_t clutch_bucket,
1006 uint64_t timestamp)
1007{
1008 /* For empty clutch buckets, return priority 0 */
1009 if (clutch_bucket->scb_thr_count == 0) {
1010 return 0;
1011 }
1012
1013 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1014 uint8_t interactive_score = sched_clutch_bucket_interactivity_score_calculate(clutch_bucket, timestamp);
1015
1016 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1017 uint8_t pri = base_pri + interactive_score;
1018 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1019 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1020 return pri;
1021}
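
/*
 * Worked example (purely illustrative numbers, editor's note): if the highest
 * thread in the clutch bucket has base_pri 31 and the thread group is not
 * SCHED_CLUTCH_TG_PRI_LOW, the base priority is 31 + 8 = 39; with
 * cpu_blocked = 300ms and cpu_used = 100ms the interactivity score is
 * 8 + (8 * 200) / 300 = 13, giving a clutch bucket priority of 52.
 */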
1022
1023/*
1024 * sched_clutch_root_bucket_highest_clutch_bucket()
1025 *
1026 * Routine to find the highest priority clutch bucket
1027 * within the root bucket.
1028 */
1029static sched_clutch_bucket_t
1030sched_clutch_root_bucket_highest_clutch_bucket(
1031 sched_clutch_root_bucket_t root_bucket)
1032{
1033 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
1034 return NULL;
1035 }
1036 return priority_queue_max(&root_bucket->scrb_clutch_buckets, struct sched_clutch_bucket, scb_pqlink);
1037}
1038
1039/*
1040 * sched_clutch_bucket_runnable()
1041 *
1042 * Perform all operations needed when a new clutch bucket becomes runnable.
1043 * It involves inserting the clutch_bucket into the hierarchy and updating the
1044 * root priority appropriately.
1045 */
1046static boolean_t
1047sched_clutch_bucket_runnable(
1048 sched_clutch_bucket_t clutch_bucket,
1049 sched_clutch_root_t root_clutch,
1050 uint64_t timestamp)
1051{
1052 sched_clutch_hierarchy_locked_assert(root_clutch);
1053 sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp);
1054 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1055 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
1056 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1057 sched_clutch_bucket_timeshare_update(clutch_bucket);
1058 int16_t root_old_pri = root_clutch->scr_priority;
1059 sched_clutch_root_pri_update(root_clutch);
1060 return root_clutch->scr_priority > root_old_pri;
1061}
1062
1063/*
1064 * sched_clutch_bucket_update()
1065 *
1066 * Update the clutch_bucket's position in the hierarchy based on whether
1067 * the newly runnable thread changes its priority. Also update the root
1068 * priority accordingly.
1069 */
1070static boolean_t
1071sched_clutch_bucket_update(
1072 sched_clutch_bucket_t clutch_bucket,
1073 sched_clutch_root_t root_clutch,
1074 uint64_t timestamp)
1075{
1076 sched_clutch_hierarchy_locked_assert(root_clutch);
1077 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1078 if (new_pri == clutch_bucket->scb_priority) {
1079 return false;
1080 }
1081 struct priority_queue *bucket_prioq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1082
1083 if (new_pri < clutch_bucket->scb_priority) {
1084 clutch_bucket->scb_priority = new_pri;
1085 priority_queue_entry_decrease(bucket_prioq, &clutch_bucket->scb_pqlink,
1086 clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1087 } else {
1088 clutch_bucket->scb_priority = new_pri;
1089 priority_queue_entry_increase(bucket_prioq, &clutch_bucket->scb_pqlink,
1090 clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1091 }
1092
1093 int16_t root_old_pri = root_clutch->scr_priority;
1094 sched_clutch_root_pri_update(root_clutch);
1095 return root_clutch->scr_priority > root_old_pri;
1096}
1097
1098/*
1099 * sched_clutch_bucket_empty()
1100 *
1101 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1102 * It involves removing the clutch bucket from the hierarchy and updating the root
1103 * priority appropriately.
1104 */
1105static void
1106sched_clutch_bucket_empty(
1107 sched_clutch_bucket_t clutch_bucket,
1108 sched_clutch_root_t root_clutch,
1109 uint64_t timestamp)
1110{
1111 sched_clutch_hierarchy_locked_assert(root_clutch);
1112 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
1113 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1114 sched_clutch_root_pri_update(root_clutch);
1115}
1116
1117/*
1118 * sched_clutch_cpu_usage_update()
1119 *
1120 * Routine to update CPU usage of the thread in the hierarchy.
1121 */
1122void
1123sched_clutch_cpu_usage_update(
1124 thread_t thread,
1125 uint64_t delta)
1126{
1127 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1128 return;
1129 }
1130 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1131 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1132 sched_clutch_bucket_cpu_usage_update(clutch_bucket, delta);
1133}
1134
1135/*
1136 * sched_clutch_bucket_cpu_usage_update()
1137 *
1138 * Routine to update the CPU usage of the clutch_bucket.
1139 */
1140static void
1141sched_clutch_bucket_cpu_usage_update(
1142 sched_clutch_bucket_t clutch_bucket,
1143 uint64_t delta)
1144{
1145 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
1146 /* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1147 return;
1148 }
1149
1150 /*
1151 * The CPU usage should not overflow the clutch_cpu_data_t type. Since the usage is used to
1152 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1153 */
1154 delta = MIN(delta, CLUTCH_CPU_DATA_MAX);
1155 os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1156}
1157
1158/*
1159 * sched_clutch_bucket_cpu_blocked_update()
1160 *
1161 * Routine to update CPU blocked time for clutch_bucket.
1162 */
1163static void
1164sched_clutch_bucket_cpu_blocked_update(
1165 sched_clutch_bucket_t clutch_bucket,
1166 uint64_t timestamp)
1167{
1168 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
1169 (clutch_bucket->scb_blocked_ts == SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID)) {
1170 /* For Above UI bucket and a newly initialized clutch bucket, nothing to do here */
1171 return;
1172 }
1173
1174 uint64_t blocked_time = timestamp - clutch_bucket->scb_blocked_ts;
1175 if (blocked_time > sched_clutch_bucket_adjust_threshold) {
1176 blocked_time = sched_clutch_bucket_adjust_threshold;
1177 }
1178
1179 /*
1180 * The CPU blocked time should not overflow the clutch_cpu_data_t type. Since the blocked time is used to
1181 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1182 */
1183 blocked_time = MIN(blocked_time, CLUTCH_CPU_DATA_MAX);
1184 clutch_cpu_data_t __assert_only cpu_blocked_orig = os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
1185 /* The blocked time is scaled every so often, so it should never overflow */
1186 assert(blocked_time <= (CLUTCH_CPU_DATA_MAX - cpu_blocked_orig));
1187}
1188
1189/*
1190 * sched_clutch_bucket_cpu_adjust()
1191 *
1192 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1193 * than sched_clutch_bucket_adjust_threshold. Allows the values to remain
1194 * manageable and maintain the same ratio, while still allowing changes in
1195 * clutch bucket behavior to be reflected in the interactivity score in a
1196 * reasonable amount of time.
1197 */
1198static void
1199sched_clutch_bucket_cpu_adjust(
1200 sched_clutch_bucket_t clutch_bucket)
1201{
1202 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1203 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1204 do {
1205 old_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
1206 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1207 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1208 if ((cpu_used + cpu_blocked) < sched_clutch_bucket_adjust_threshold) {
1209 return;
1210 }
1211
1212 /*
1213 * The accumulation of CPU used and blocked is past the threshold; scale it
1214 * down to lose old history.
1215 */
1216 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1217 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1218 } while (!os_atomic_cmpxchg(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed));
1219}
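
/*
 * Worked example (illustrative values, editor's note): with cpu_used worth
 * 350ms and cpu_blocked worth 200ms, the sum (550ms) exceeds the 500ms
 * adjust threshold, so both are divided by SCHED_CLUTCH_BUCKET_ADJUST_RATIO
 * down to 35ms and 20ms. The used:blocked ratio (and hence the interactivity
 * score) is preserved, but new activity now shifts that ratio much faster.
 */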
1220
1221/*
1222 * Thread level scheduling algorithm
1223 *
1224 * The thread level scheduling algorithm uses the mach timeshare
1225 * decay based algorithm to achieve sharing between threads within the
1226 * same clutch bucket. The load/priority shifts etc. are all maintained
1227 * at the clutch bucket level and used for decay calculation of the
1228 * threads. The load sampling is still driven off the scheduler tick
1229 * for runnable clutch buckets (it does not use the new higher frequency
1230 * EWMA based load calculation). The idea is that the contention and load
1231 * within clutch_buckets should be limited enough to not see heavy decay
1232 * and timeshare effectively.
1233 */
1234
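/*
 * Editor's sketch (hedged; not part of the kernel build): how the per
 * clutch-bucket scb_pri_shift computed below is intended to feed the
 * standard Mach timeshare decay. The actual thread priority computation
 * lives in the core timeshare/priority code, not in this file; the helper
 * below is hypothetical and only illustrates the shape of the calculation.
 *
 *	#include <stdint.h>
 *
 *	static int
 *	example_timeshare_pri(int base_pri, uint32_t sched_usage, uint32_t pri_shift)
 *	{
 *		if (pri_shift >= 32) {
 *			// An INT8_MAX pri_shift means CPU usage does not depress the priority
 *			return base_pri;
 *		}
 *		// A higher bucket load yields a smaller pri_shift, so the same CPU
 *		// usage depresses the effective priority further below base_pri.
 *		int depressed = base_pri - (int)(sched_usage >> pri_shift);
 *		return (depressed < 0) ? 0 : depressed;
 *	}
 */
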
1235/*
1236 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1237 *
1238 * Increment the run count for the clutch bucket associated with the
1239 * thread.
1240 */
1241uint32_t
1242sched_clutch_thread_run_bucket_incr(
1243 thread_t thread,
1244 sched_bucket_t bucket)
1245{
1246 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1247 return 0;
1248 }
1249 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1250 return sched_clutch_run_bucket_incr(clutch, bucket);
1251}
1252
1253static uint32_t
1254sched_clutch_run_bucket_incr(
1255 sched_clutch_t clutch,
1256 sched_bucket_t bucket)
1257{
1258 assert(bucket != TH_BUCKET_RUN);
1259 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1260 uint32_t result = os_atomic_inc(&(clutch_bucket->scb_run_count), relaxed);
1261 return result;
1262}
1263
1264/*
1265 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1266 *
1267 * Decrement the run count for the clutch bucket associated with the
1268 * thread.
1269 */
1270uint32_t
1271sched_clutch_thread_run_bucket_decr(
1272 thread_t thread,
1273 sched_bucket_t bucket)
1274{
1275 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1276 return 0;
1277 }
1278 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1279 return sched_clutch_run_bucket_decr(clutch, bucket);
1280}
1281
1282static uint32_t
1283sched_clutch_run_bucket_decr(
1284 sched_clutch_t clutch,
1285 sched_bucket_t bucket)
1286{
1287 assert(bucket != TH_BUCKET_RUN);
1288 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1289 uint32_t result = os_atomic_dec(&(clutch_bucket->scb_run_count), relaxed);
1290 return result;
1291}
1292
1293/*
1294 * sched_clutch_bucket_timeshare_update()
1295 *
1296 * Routine to update the load and priority shift for the clutch_bucket every
1297 * sched_tick. For runnable clutch_buckets, the sched tick handling code
1298 * iterates the clutch buckets and calls this routine. For all others, the
1299 * clutch_bucket maintains a "last updated schedtick" parameter. As threads
1300 * become runnable in the clutch bucket, if this value is outdated, the load
1301 * and shifts are updated.
1302 *
1303 * Possible optimization:
1304 * - The current algorithm samples the load every sched tick (125ms).
1305 * This is prone to spikes in runnable counts; if that turns out to be
1306 * a problem, a simple solution would be to do the EWMA trick to sample
1307 * load at every load_tick (30ms) and use the averaged value for the pri
1308 * shift calculation.
1309 */
1310static void
1311sched_clutch_bucket_timeshare_update(
1312 sched_clutch_bucket_t clutch_bucket)
1313{
1314 if (clutch_bucket->scb_bucket < TH_BUCKET_SHARE_FG) {
1315 return;
1316 }
1317
1318 /*
1319 * Update the timeshare parameters for the clutch bucket if they have not been updated
1320 * in this tick.
1321 */
1322 uint32_t bucket_sched_ts = os_atomic_load(&clutch_bucket->scb_timeshare_tick, relaxed);
1323 uint32_t current_sched_ts = sched_tick;
1324 if (bucket_sched_ts != current_sched_ts) {
1325 os_atomic_store(&clutch_bucket->scb_timeshare_tick, current_sched_ts, relaxed);
1326 uint32_t bucket_load = (os_atomic_load(&clutch_bucket->scb_run_count, relaxed) / processor_avail_count);
1327 bucket_load = MIN(bucket_load, NRQS - 1);
1328 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_load];
1329 os_atomic_store(&clutch_bucket->scb_pri_shift, pri_shift, relaxed);
1330 }
1331}
1332
1333/*
1334 * sched_clutch_thread_clutch_update()
1335 *
1336 * Routine called when the thread changes its thread group. The current
1337 * implementation relies on the fact that the thread group is changed only
1338 * from the context of the thread itself. Due to this fact, the thread
1339 * group change causes only counter updates in the old & new clutch
1340 * buckets and no hierarchy changes. The routine also attributes the CPU
1341 * used so far to the old clutch.
1342 */
1343void
1344sched_clutch_thread_clutch_update(
1345 thread_t thread,
1346 sched_clutch_t old_clutch,
1347 sched_clutch_t new_clutch)
1348{
1349 uint32_t cpu_delta;
1350 assert(current_thread() == thread);
1351
1352 if (old_clutch) {
1353 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1354 /*
1355 * Calculate the CPU used by this thread in the old bucket and
1356 * add it to the old clutch bucket. This uses the same CPU usage
1357 * logic as update_priority etc.
1358 */
1359 thread_timer_delta(thread, cpu_delta);
1360 if (thread->pri_shift < INT8_MAX) {
1361 thread->sched_usage += cpu_delta;
1362 }
1363 thread->cpu_delta += cpu_delta;
1364 sched_clutch_bucket_cpu_usage_update(&(old_clutch->sc_clutch_buckets[thread->th_sched_bucket]), cpu_delta);
1365 }
1366
1367 if (new_clutch) {
1368 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1369 }
1370}
1371
1372/* Thread Insertion/Removal/Selection routines */
1373
1374/*
1375 * sched_clutch_thread_insert()
1376 *
1377 * Routine to insert a thread into the sched clutch hierarchy.
1378 * Update the counts at all levels of the hierarchy and insert the nodes
1379 * as they become runnable. Always called with the pset lock held.
1380 */
1381static boolean_t
1382sched_clutch_thread_insert(
1383 sched_clutch_root_t root_clutch,
1384 thread_t thread,
1385 integer_t options)
1386{
1387 boolean_t result = FALSE;
1388
1389 sched_clutch_hierarchy_locked_assert(root_clutch);
1390 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1391 assert(thread->thread_group == clutch->sc_tg);
1392
1393 uint64_t current_timestamp = mach_absolute_time();
1394 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1395 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
1396
1397 /* Insert thread into the clutch_bucket runq using sched_pri */
1398 run_queue_enqueue(&clutch_bucket->scb_runq, thread, options);
1399 /* Increment the urgency counter for the root if necessary */
1400 sched_clutch_root_urgency_inc(root_clutch, thread);
1401
1402 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
1403 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1404 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri,
1405 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1406 os_atomic_inc(&clutch->sc_thr_count, relaxed);
1407 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1408 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0);
1409
1410 /* Enqueue the clutch into the hierarchy (if needed) and update properties */
1411 if (clutch_bucket->scb_thr_count == 0) {
1412 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1413 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1414 /* Insert the newly runnable clutch bucket into the hierarchy */
1415 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp);
1416 } else {
1417 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1418 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1419 /* Update the position of the clutch bucket in the hierarchy */
1420 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
1421 }
1422 return result;
1423}
1424
1425/*
1426 * sched_clutch_thread_remove()
1427 *
1428 * Routine to remove a thread from the sched clutch hierarchy.
1429 * Update the counts at all levels of the hierarchy and remove the nodes
1430 * as they become empty. Always called with the pset lock held.
1431 */
1432static void
1433sched_clutch_thread_remove(
1434 sched_clutch_root_t root_clutch,
1435 thread_t thread,
1436 uint64_t current_timestamp)
1437{
1438 sched_clutch_hierarchy_locked_assert(root_clutch);
1439 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1440 assert(thread->thread_group == clutch->sc_tg);
1441 assert(thread->runq != PROCESSOR_NULL);
1442
1443 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1444 assert(clutch_bucket->scb_root == root_clutch);
1445
1446 /* Decrement the urgency counter for the root if necessary */
1447 sched_clutch_root_urgency_dec(root_clutch, thread);
1448 /* Remove thread from the clutch_bucket */
1449 run_queue_remove(&clutch_bucket->scb_runq, thread);
1450
1451 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1452 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1453 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1454 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_EMPTY, 0);
1455
1456 /* Update counts at various levels of the hierarchy */
1457 os_atomic_dec(&clutch->sc_thr_count, relaxed);
1458 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1459 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
1460
1461 /* Remove the clutch bucket from the hierarchy (if needed) and update its properties */
1462 if (clutch_bucket->scb_thr_count == 0) {
1463 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp);
1464 } else {
1465 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
1466 }
1467}
1468
1469/*
1470 * sched_clutch_thread_highest()
1471 *
1472 * Routine to find and remove the highest priority thread
1473 * from the sched clutch hierarchy. The algorithm looks at the
1474 * hierarchy for the most eligible runnable thread and calls
1475 * sched_clutch_thread_remove(). Always called with the
1476 * pset lock held.
1477 */
1478static thread_t
1479sched_clutch_thread_highest(
1480 sched_clutch_root_t root_clutch)
1481{
1482 sched_clutch_hierarchy_locked_assert(root_clutch);
1483 uint64_t current_timestamp = mach_absolute_time();
1484
1485 /* Select the highest priority root bucket */
1486 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp);
1487 if (root_bucket == NULL) {
1488 return THREAD_NULL;
1489 }
1490 /* Since a thread is being picked from this root bucket, update its deadline */
1491 sched_clutch_root_bucket_deadline_update(root_bucket, root_clutch, current_timestamp);
1492
1493 /* Find the highest priority clutch bucket in this root bucket */
1494 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
1495 assert(clutch_bucket != NULL);
1496
1497 /* Find the highest priority runnable thread in this clutch bucket */
1498 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
1499 assert(thread != NULL);
1500
1501 /* Remove and return the thread from the hierarchy */
1502 sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
1503 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
1504 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
1505 return thread;
1506}
1507
1508
1509/* High level global accessor routines */
1510
1511/*
1512 * sched_clutch_root_urgency()
1513 *
1514 * Routine to get the urgency of the highest runnable
1515 * thread in the hierarchy.
1516 */
1517static uint32_t
1518sched_clutch_root_urgency(
1519 sched_clutch_root_t root_clutch)
1520{
1521 return root_clutch->scr_urgency;
1522}
1523
1524/*
1525 * sched_clutch_root_count_sum()
1526 *
1527 * The count_sum mechanism is used for scheduler runq
1528 * statistics calculation. It is only useful for debugging
1529 * purposes; since it requires a mach_absolute_time() call on
1530 * other scheduler implementations, it is better to avoid
1531 * populating this until absolutely necessary.
1532 */
1533static uint32_t
1534sched_clutch_root_count_sum(
1535 __unused sched_clutch_root_t root_clutch)
1536{
1537 return 0;
1538}
1539
1540/*
1541 * sched_clutch_root_priority()
1542 *
1543 * Routine to get the priority of the highest runnable
1544 * thread in the hierarchy.
1545 */
1546static int
1547sched_clutch_root_priority(
1548 sched_clutch_root_t root_clutch)
1549{
1550 return root_clutch->scr_priority;
1551}
1552
1553/*
1554 * sched_clutch_root_count()
1555 *
1556 * Returns total number of runnable threads in the hierarchy.
1557 */
1558uint32_t
1559sched_clutch_root_count(
1560 sched_clutch_root_t root_clutch)
1561{
1562 return root_clutch->scr_thr_count;
1563}
1564
1565/*
1566 * sched_clutch_thread_pri_shift()
1567 *
1568 * Routine to get the priority shift value for a thread.
1569 * Since the timesharing is done at the clutch_bucket level,
1570 * this routine gets the clutch_bucket and retrieves the
1571 * values from there.
1572 */
1573uint32_t
1574sched_clutch_thread_pri_shift(
1575 thread_t thread,
1576 sched_bucket_t bucket)
1577{
1578 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1579 return UINT8_MAX;
1580 }
1581 assert(bucket != TH_BUCKET_RUN);
1582 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1583 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1584 return os_atomic_load(&clutch_bucket->scb_pri_shift, relaxed);
1585}
1586
1587#pragma mark -- Clutch Scheduler Algorithm
1588
1589static void
1590sched_clutch_init(void);
1591
1592static void
1593sched_clutch_timebase_init(void);
1594
1595static thread_t
1596sched_clutch_steal_thread(processor_set_t pset);
1597
1598static void
1599sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
1600
1601static boolean_t
1602sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
1603 sched_options_t options);
1604
1605static boolean_t
1606sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
1607
1608static ast_t
1609sched_clutch_processor_csw_check(processor_t processor);
1610
1611static boolean_t
1612sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
1613
1614static int
1615sched_clutch_runq_count(processor_t processor);
1616
1617static boolean_t
1618sched_clutch_processor_queue_empty(processor_t processor);
1619
1620static uint64_t
1621sched_clutch_runq_stats_count_sum(processor_t processor);
1622
1623static int
1624sched_clutch_processor_bound_count(processor_t processor);
1625
1626static void
1627sched_clutch_pset_init(processor_set_t pset);
1628
1629static void
1630sched_clutch_processor_init(processor_t processor);
1631
1632static thread_t
1633sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
1634
1635static void
1636sched_clutch_processor_queue_shutdown(processor_t processor);
1637
1638static sched_mode_t
1639sched_clutch_initial_thread_sched_mode(task_t parent_task);
1640
1641static uint32_t
1642sched_clutch_initial_quantum_size(thread_t thread);
1643
1644static bool
1645sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
1646
1647static uint32_t
1648sched_clutch_run_incr(thread_t thread);
1649
1650static uint32_t
1651sched_clutch_run_decr(thread_t thread);
1652
1653static void
1654sched_clutch_update_thread_bucket(thread_t thread);
1655
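/*
 * sched_clutch_dispatch
 *
 * Dispatch table that plugs the clutch scheduler into the generic scheduler
 * infrastructure. Thread selection, enqueue/remove and preemption checks are
 * implemented by the clutch specific routines in this file; priority decay
 * and maintenance reuse the standard timeshare implementations.
 */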
1656const struct sched_dispatch_table sched_clutch_dispatch = {
1657 .sched_name = "clutch",
1658 .init = sched_clutch_init,
1659 .timebase_init = sched_clutch_timebase_init,
1660 .processor_init = sched_clutch_processor_init,
1661 .pset_init = sched_clutch_pset_init,
1662 .maintenance_continuation = sched_timeshare_maintenance_continue,
1663 .choose_thread = sched_clutch_choose_thread,
1664 .steal_thread_enabled = sched_steal_thread_enabled,
1665 .steal_thread = sched_clutch_steal_thread,
1666 .compute_timeshare_priority = sched_compute_timeshare_priority,
1667 .choose_processor = choose_processor,
1668 .processor_enqueue = sched_clutch_processor_enqueue,
1669 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
1670 .processor_queue_remove = sched_clutch_processor_queue_remove,
1671 .processor_queue_empty = sched_clutch_processor_queue_empty,
1672 .priority_is_urgent = priority_is_urgent,
1673 .processor_csw_check = sched_clutch_processor_csw_check,
1674 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
1675 .initial_quantum_size = sched_clutch_initial_quantum_size,
1676 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
1677 .can_update_priority = can_update_priority,
1678 .update_priority = update_priority,
1679 .lightweight_update_priority = lightweight_update_priority,
1680 .quantum_expire = sched_default_quantum_expire,
1681 .processor_runq_count = sched_clutch_runq_count,
1682 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
1683 .processor_bound_count = sched_clutch_processor_bound_count,
1684 .thread_update_scan = sched_clutch_thread_update_scan,
1685 .multiple_psets_enabled = TRUE,
1686 .sched_groups_enabled = FALSE,
1687 .avoid_processor_enabled = TRUE,
1688 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
1689 .processor_balance = sched_SMT_balance,
1690
1691 .rt_runq = sched_rtglobal_runq,
1692 .rt_init = sched_rtglobal_init,
1693 .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
1694 .rt_runq_scan = sched_rtglobal_runq_scan,
1695 .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
1696
1697 .qos_max_parallelism = sched_qos_max_parallelism,
1698 .check_spill = sched_check_spill,
1699 .ipi_policy = sched_ipi_policy,
1700 .thread_should_yield = sched_thread_should_yield,
1701 .run_count_incr = sched_clutch_run_incr,
1702 .run_count_decr = sched_clutch_run_decr,
1703 .update_thread_bucket = sched_clutch_update_thread_bucket,
1704 .pset_made_schedulable = sched_pset_made_schedulable,
1705};
1706
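/*
 * Accessors for the two runnable-thread containers used by the clutch
 * scheduler: bound threads live on a traditional run queue embedded in the
 * processor, while all other runnable threads live in the clutch hierarchy
 * rooted at the processor set.
 */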
1707__attribute__((always_inline))
1708static inline run_queue_t
1709sched_clutch_bound_runq(processor_t processor)
1710{
1711 return &processor->runq;
1712}
1713
1714__attribute__((always_inline))
1715static inline sched_clutch_root_t
1716sched_clutch_processor_root_clutch(processor_t processor)
1717{
1718 return &processor->processor_set->pset_clutch_root;
1719}
1720
1721__attribute__((always_inline))
1722static inline run_queue_t
1723sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
1724{
1725 assert(thread->bound_processor == processor);
1726 return sched_clutch_bound_runq(processor);
1727}
1728
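/*
 * sched_clutch_initial_quantum_size()
 *
 * The quantum for a thread is selected based on its scheduler bucket; the
 * per-bucket values are converted to abstime in sched_clutch_timebase_init().
 * Roughly, for a background thread:
 *
 *	thread->th_sched_bucket == TH_BUCKET_SHARE_BG
 *	    => quantum = (uint32_t)sched_clutch_thread_quantum[TH_BUCKET_SHARE_BG]
 */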
1729static uint32_t
1730sched_clutch_initial_quantum_size(thread_t thread)
1731{
1732 if (thread == THREAD_NULL) {
1733 return std_quantum;
1734 }
1735 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
1736 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
1737}
1738
1739static sched_mode_t
1740sched_clutch_initial_thread_sched_mode(task_t parent_task)
1741{
1742 if (parent_task == kernel_task) {
1743 return TH_MODE_FIXED;
1744 } else {
1745 return TH_MODE_TIMESHARE;
1746 }
1747}
1748
1749static void
1750sched_clutch_processor_init(processor_t processor)
1751{
1752 run_queue_init(&processor->runq);
1753}
1754
1755static void
1756sched_clutch_pset_init(processor_set_t pset)
1757{
1758 sched_clutch_root_init(&pset->pset_clutch_root, pset);
1759}
1760
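/*
 * sched_clutch_init()
 *
 * Initializes the clutch scheduler tunables. The clutch bucket interactivity
 * priority and priority boost can be overridden via the
 * "sched_clutch_bucket_interactive_pri" and "sched_clutch_bucket_pri_boost"
 * boot-args; otherwise the compile-time defaults are used. The remaining
 * initialization is shared with the timeshare scheduler.
 */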
1761static void
1762sched_clutch_init(void)
1763{
1764 if (!PE_parse_boot_argn("sched_clutch_bucket_interactive_pri", &sched_clutch_bucket_interactive_pri, sizeof(sched_clutch_bucket_interactive_pri))) {
1765 sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
1766 }
1767 if (!PE_parse_boot_argn("sched_clutch_bucket_pri_boost", &sched_clutch_bucket_pri_boost, sizeof(sched_clutch_bucket_pri_boost))) {
1768 sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
1769 }
1770 sched_timeshare_init();
1771}
1772
1773static void
1774sched_clutch_timebase_init(void)
1775{
1776 sched_timeshare_timebase_init();
1777 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
1778 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
1779 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
1780 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS,
1781 NSEC_PER_USEC, &sched_clutch_bucket_adjust_threshold);
1782
1783 uint32_t interactivity_delta = 0;
1784 if (!PE_parse_boot_argn("sched_clutch_bucket_interactivity_delta_usecs", &interactivity_delta, sizeof(interactivity_delta))) {
1785 interactivity_delta = SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT;
1786 }
1787 clock_interval_to_absolutetime_interval(interactivity_delta, NSEC_PER_USEC, &sched_clutch_bucket_interactivity_delta);
1788}
1789
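/*
 * sched_clutch_choose_thread()
 *
 * Chooses between the processor bound runq and the pset-wide clutch
 * hierarchy. If neither holds a thread at or above the specified priority,
 * THREAD_NULL is returned; when both are populated, the bound runq wins
 * priority ties against the clutch root.
 */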
1790static thread_t
1791sched_clutch_choose_thread(
1792 processor_t processor,
1793 int priority,
1794 __unused ast_t reason)
1795{
1796 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1797 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
1798 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1799 boolean_t choose_from_boundq = false;
1800
1801 if (bound_runq->highq < priority &&
1802 clutch_pri < priority) {
1803 return THREAD_NULL;
1804 }
1805
1806 if (bound_runq->count && clutch_count) {
1807 if (bound_runq->highq >= clutch_pri) {
1808 choose_from_boundq = true;
1809 }
1810 } else if (bound_runq->count) {
1811 choose_from_boundq = true;
1812 } else if (clutch_count) {
1813 choose_from_boundq = false;
1814 } else {
1815 return THREAD_NULL;
1816 }
1817
1818 thread_t thread = THREAD_NULL;
1819 if (choose_from_boundq == false) {
1820 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1821 thread = sched_clutch_thread_highest(pset_clutch_root);
1822 } else {
1823 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
1824 }
1825 return thread;
1826}
1827
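/*
 * sched_clutch_processor_enqueue()
 *
 * Threads eligible for the clutch hierarchy (per SCHED_CLUTCH_THREAD_ELIGIBLE())
 * are inserted into the pset-wide hierarchy; processor bound threads fall back
 * to the traditional per-processor run queue.
 */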
1828static boolean_t
1829sched_clutch_processor_enqueue(
1830 processor_t processor,
1831 thread_t thread,
1832 sched_options_t options)
1833{
1834 boolean_t result;
1835
1836 thread->runq = processor;
1837 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1838 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1839 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
1840 } else {
1841 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
1842 result = run_queue_enqueue(rq, thread, options);
1843 }
1844 return result;
1845}
1846
1847static boolean_t
1848sched_clutch_processor_queue_empty(processor_t processor)
1849{
1850 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
1851 sched_clutch_bound_runq(processor)->count == 0;
1852}
1853
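/*
 * sched_clutch_processor_csw_check()
 *
 * Preemption check for the processor. A running thread that should avoid this
 * processor is preempted immediately; otherwise the higher of the clutch root
 * priority and the bound runq priority is compared against the running thread,
 * and the resulting AST is marked urgent if either the clutch root or the
 * bound runq has a non-zero urgency count.
 */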
1854static ast_t
1855sched_clutch_processor_csw_check(processor_t processor)
1856{
1857 boolean_t has_higher;
1858 int pri;
1859
1860 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
1861 return AST_PREEMPT | AST_URGENT;
1862 }
1863
1864 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1865 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1866
1867 assert(processor->active_thread != NULL);
1868
1869 pri = MAX(clutch_pri, bound_runq->highq);
1870
1871 if (processor->first_timeslice) {
1872 has_higher = (pri > processor->current_pri);
1873 } else {
1874 has_higher = (pri >= processor->current_pri);
1875 }
1876
1877 if (has_higher) {
1878 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
1879 return AST_PREEMPT | AST_URGENT;
1880 }
1881
1882 if (bound_runq->urgency > 0) {
1883 return AST_PREEMPT | AST_URGENT;
1884 }
1885
1886 return AST_PREEMPT;
1887 }
1888
1889 return AST_NONE;
1890}
1891
1892static boolean_t
1893sched_clutch_processor_queue_has_priority(processor_t processor,
1894 int priority,
1895 boolean_t gte)
1896{
1897 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1898
1899 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
1900
1901 if (gte) {
1902 return qpri >= priority;
1903 } else {
1904 return qpri > priority;
1905 }
1906}
1907
1908static int
1909sched_clutch_runq_count(processor_t processor)
1910{
1911 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
1912}
1913
1914static uint64_t
1915sched_clutch_runq_stats_count_sum(processor_t processor)
1916{
1917 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
1918
1919 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
1920 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
1921 } else {
1922 return bound_sum;
1923 }
1924}

1925static int
1926sched_clutch_processor_bound_count(processor_t processor)
1927{
1928 return sched_clutch_bound_runq(processor)->count;
1929}
1930
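/*
 * sched_clutch_processor_queue_shutdown()
 *
 * Invoked when a processor goes offline. If this was the last active processor
 * in the pset, the clutch hierarchy is drained into a local queue with the
 * pset lock held and the threads are re-enqueued via thread_setrun() so that
 * they migrate to a different pset.
 */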
1931static void
1932sched_clutch_processor_queue_shutdown(processor_t processor)
1933{
1934 processor_set_t pset = processor->processor_set;
1935 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1936 thread_t thread;
1937 queue_head_t tqueue;
1938
1939 /* We only need to migrate threads if this is the last active processor in the pset */
1940 if (pset->online_processor_count > 0) {
1941 pset_unlock(pset);
1942 return;
1943 }
1944
1945 queue_init(&tqueue);
1946 while (sched_clutch_root_count(pset_clutch_root) > 0) {
1947 thread = sched_clutch_thread_highest(pset_clutch_root);
1948 enqueue_tail(&tqueue, &thread->runq_links);
1949 }
1950
1951 pset_unlock(pset);
1952
1953 qe_foreach_element_safe(thread, &tqueue, runq_links) {
1954 remqueue(&thread->runq_links);
1955
1956 thread_lock(thread);
1957
1958 thread_setrun(thread, SCHED_TAILQ);
1959
1960 thread_unlock(thread);
1961 }
1962}
1963
1964static boolean_t
1965sched_clutch_processor_queue_remove(
1966 processor_t processor,
1967 thread_t thread)
1968{
1969 run_queue_t rq;
1970 processor_set_t pset = processor->processor_set;
1971
1972 pset_lock(pset);
1973
1974 if (processor == thread->runq) {
1975 /*
1976 * Thread is on a run queue and we have a lock on
1977 * that run queue.
1978 */
1979 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1980 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1981 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time());
1982 } else {
1983 rq = sched_clutch_thread_bound_runq(processor, thread);
1984 run_queue_remove(rq, thread);
1985 }
1986 } else {
1987 /*
1988 * The thread left the run queue before we could
1989 * lock the run queue.
1990 */
1991 assert(thread->runq == PROCESSOR_NULL);
1992 processor = PROCESSOR_NULL;
1993 }
1994
1995 pset_unlock(pset);
1996
1997 return processor != PROCESSOR_NULL;
1998}
1999
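/*
 * sched_clutch_steal_thread()
 *
 * Walks the psets starting with the caller's pset and returns the highest
 * priority thread from the first non-empty clutch hierarchy found, dropping
 * and re-acquiring the pset locks as it iterates.
 */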
2000static thread_t
2001sched_clutch_steal_thread(processor_set_t pset)
2002{
2003 processor_set_t nset, cset = pset;
2004 thread_t thread;
2005
2006 do {
2007 sched_clutch_root_t pset_clutch_root = &cset->pset_clutch_root;
2008 if (sched_clutch_root_count(pset_clutch_root) > 0) {
2009 thread = sched_clutch_thread_highest(pset_clutch_root);
2010 pset_unlock(cset);
2011 return thread;
2012 }
2013
2014 nset = next_pset(cset);
2015
2016 if (nset != pset) {
2017 pset_unlock(cset);
2018
2019 cset = nset;
2020 pset_lock(cset);
2021 }
2022 } while (nset != pset);
2023
2024 pset_unlock(cset);
2025
2026 return THREAD_NULL;
2027}
2028
2029static void
2030sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
2031{
2032 boolean_t restart_needed = FALSE;
2033 processor_t processor = processor_list;
2034 processor_set_t pset;
2035 thread_t thread;
2036 spl_t s;
2037
2038 /*
2039 * We update the threads associated with each processor (bound and idle threads)
2040 * and then update the threads in each pset runqueue.
2041 */
2042
2043 do {
2044 do {
2045 pset = processor->processor_set;
2046
2047 s = splsched();
2048 pset_lock(pset);
2049
2050 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
2051
2052 pset_unlock(pset);
2053 splx(s);
2054
2055 if (restart_needed) {
2056 break;
2057 }
2058
2059 thread = processor->idle_thread;
2060 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
2061 if (thread_update_add_thread(thread) == FALSE) {
2062 restart_needed = TRUE;
2063 break;
2064 }
2065 }
2066 } while ((processor = processor->processor_list) != NULL);
2067
2068 /* Ok, we now have a collection of candidates -- fix them. */
2069 thread_update_process_threads();
2070 } while (restart_needed);
2071
2072 pset = &pset0;
2073
2074 do {
2075 do {
2076 s = splsched();
2077 pset_lock(pset);
2078
2079 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
2080 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
2081 sched_clutch_bucket_t clutch_bucket;
2082 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
2083 sched_clutch_bucket_timeshare_update(clutch_bucket);
2084 restart_needed = runq_scan(&clutch_bucket->scb_runq, scan_context);
2085 if (restart_needed) {
2086 break;
2087 }
2088 }
2089 }
2090
2091 pset_unlock(pset);
2092 splx(s);
2093 if (restart_needed) {
2094 break;
2095 }
2096 } while ((pset = pset->pset_list) != NULL);
2097
2098 /* Ok, we now have a collection of candidates -- fix them. */
2099 thread_update_process_threads();
2100 } while (restart_needed);
2101}
2102
2103extern int sched_allow_rt_smt;
2104
2105/* Return true if this thread should not continue running on this processor */
2106static bool
2107sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
2108{
2109 if (processor->processor_primary != processor) {
2110 /*
2111 * This is a secondary SMT processor. If the primary is running
2112 * a realtime thread, only allow realtime threads on the secondary.
2113 */
2114 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
2115 return true;
2116 }
2117 }
2118
2119 return false;
2120}
2121
2122/*
2123 * For the clutch scheduler, the run counts are maintained in the clutch
2124 * buckets (i.e. the thread group scheduling structure).
2125 */
2126static uint32_t
2127sched_clutch_run_incr(thread_t thread)
2128{
2129 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2130 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2131 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
2132 return new_count;
2133}
2134
2135static uint32_t
2136sched_clutch_run_decr(thread_t thread)
2137{
2138 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
2139 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2140 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
2141 return new_count;
2142}
2143
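/*
 * sched_convert_pri_to_bucket()
 *
 * Maps a priority to a timeshare bucket using the standard priority band
 * boundaries:
 *
 *	pri >  BASEPRI_USER_INITIATED	=> TH_BUCKET_SHARE_FG
 *	pri >  BASEPRI_DEFAULT		=> TH_BUCKET_SHARE_IN
 *	pri >  BASEPRI_UTILITY		=> TH_BUCKET_SHARE_DF
 *	pri >  MAXPRI_THROTTLE		=> TH_BUCKET_SHARE_UT
 *	otherwise			=> TH_BUCKET_SHARE_BG
 *
 * High priority fixed and realtime threads are mapped directly to
 * TH_BUCKET_FIXPRI by the caller (sched_clutch_update_thread_bucket()).
 */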
2144static sched_bucket_t
2145sched_convert_pri_to_bucket(uint8_t priority)
2146{
2147 sched_bucket_t bucket = TH_BUCKET_RUN;
2148
2149 if (priority > BASEPRI_USER_INITIATED) {
2150 bucket = TH_BUCKET_SHARE_FG;
2151 } else if (priority > BASEPRI_DEFAULT) {
2152 bucket = TH_BUCKET_SHARE_IN;
2153 } else if (priority > BASEPRI_UTILITY) {
2154 bucket = TH_BUCKET_SHARE_DF;
2155 } else if (priority > MAXPRI_THROTTLE) {
2156 bucket = TH_BUCKET_SHARE_UT;
2157 } else {
2158 bucket = TH_BUCKET_SHARE_BG;
2159 }
2160 return bucket;
2161}
2162
2163/*
2164 * For threads that have changed sched_pri without changing the
2165 * base_pri for any reason other than decay, use the sched_pri
2166 * as the bucketizing priority instead of base_pri. All such
2167 * changes are typically due to boosts or demotions applied by kernel
2168 * locking primitives.
2169 */
2170static boolean_t
2171sched_thread_sched_pri_promoted(thread_t thread)
2172{
2173 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
2174 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
2175 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
2176 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
2177 (thread->kern_promotion_schedpri != 0);
2178}
2179
2180/*
2181 * Routine to update the scheduling bucket for the thread.
2182 *
2183 * In the clutch scheduler implementation, the thread's bucket
2184 * is based on sched_pri if it was promoted due to a kernel
2185 * primitive; otherwise it is based on the thread's base_pri. This
2186 * enhancement allows promoted threads to reach a higher priority
2187 * bucket and potentially get selected sooner for scheduling.
2188 *
2189 * Also, the clutch scheduler does not honor fixed priority below
2190 * FG priority. It simply puts those threads in the corresponding
2191 * timeshare bucket. This is done because it is
2192 * extremely hard to define the scheduling properties of such threads
2193 * and they typically lead to performance issues.
2194 */
2195
2196static void
2197sched_clutch_update_thread_bucket(thread_t thread)
2198{
2199 sched_bucket_t old_bucket = thread->th_sched_bucket;
2200 sched_bucket_t new_bucket = TH_BUCKET_RUN;
2201 assert(thread->runq == PROCESSOR_NULL);
2202
2203 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
2204
2205 switch (thread->sched_mode) {
2206 case TH_MODE_FIXED:
2207 if (pri >= BASEPRI_FOREGROUND) {
2208 new_bucket = TH_BUCKET_FIXPRI;
2209 } else {
2210 new_bucket = sched_convert_pri_to_bucket(pri);
2211 }
2212 break;
2213
2214 case TH_MODE_REALTIME:
2215 new_bucket = TH_BUCKET_FIXPRI;
2216 break;
2217
2218 case TH_MODE_TIMESHARE:
2219 new_bucket = sched_convert_pri_to_bucket(pri);
2220 break;
2221
2222 default:
2223 panic("unexpected mode: %d", thread->sched_mode);
2224 break;
2225 }
2226
2227 if (old_bucket == new_bucket) {
2228 return;
2229 }
2230
2231 thread->th_sched_bucket = new_bucket;
2232 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
2233
2234 /*
2235 * Since this is called after the thread has been removed from the runq,
2236 * only the run counts need to be updated. The re-insert into the runq
2237 * would put the thread into the correct new bucket's runq.
2238 */
2239 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
2240 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
2241 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
2242 }
2243}
2244
2245#if __AMP__
2246
2247/* Implementation of the AMP version of the clutch scheduler */
2248
2249static thread_t
2250sched_clutch_amp_steal_thread(processor_set_t pset);
2251
2252static ast_t
2253sched_clutch_amp_processor_csw_check(processor_t processor);
2254
2255static boolean_t
2256sched_clutch_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2257
2258static boolean_t
2259sched_clutch_amp_processor_queue_empty(processor_t processor);
2260
2261static thread_t
2262sched_clutch_amp_choose_thread(processor_t processor, int priority, ast_t reason);
2263
2264static void
2265sched_clutch_amp_processor_queue_shutdown(processor_t processor);
2266
2267static processor_t
2268sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
2269
2270static bool
2271sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread);
2272
2273static bool
2274sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread);
2275
2276static void
2277sched_clutch_migrate_foreign_buckets(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
2278
2279static void
2280sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
2281
2282const struct sched_dispatch_table sched_clutch_amp_dispatch = {
2283 .sched_name = "clutch_amp",
2284 .init = sched_amp_init,
2285 .timebase_init = sched_clutch_timebase_init,
2286 .processor_init = sched_clutch_processor_init,
2287 .pset_init = sched_clutch_pset_init,
2288 .maintenance_continuation = sched_timeshare_maintenance_continue,
2289 .choose_thread = sched_clutch_amp_choose_thread,
2290 .steal_thread_enabled = sched_amp_steal_thread_enabled,
2291 .steal_thread = sched_clutch_amp_steal_thread,
2292 .compute_timeshare_priority = sched_compute_timeshare_priority,
2293 .choose_processor = sched_clutch_amp_choose_processor,
2294 .processor_enqueue = sched_clutch_processor_enqueue,
2295 .processor_queue_shutdown = sched_clutch_amp_processor_queue_shutdown,
2296 .processor_queue_remove = sched_clutch_processor_queue_remove,
2297 .processor_queue_empty = sched_clutch_amp_processor_queue_empty,
2298 .priority_is_urgent = priority_is_urgent,
2299 .processor_csw_check = sched_clutch_amp_processor_csw_check,
2300 .processor_queue_has_priority = sched_clutch_amp_processor_queue_has_priority,
2301 .initial_quantum_size = sched_clutch_initial_quantum_size,
2302 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2303 .can_update_priority = can_update_priority,
2304 .update_priority = update_priority,
2305 .lightweight_update_priority = lightweight_update_priority,
2306 .quantum_expire = sched_default_quantum_expire,
2307 .processor_runq_count = sched_clutch_runq_count,
2308 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2309 .processor_bound_count = sched_clutch_processor_bound_count,
2310 .thread_update_scan = sched_clutch_thread_update_scan,
2311 .multiple_psets_enabled = TRUE,
2312 .sched_groups_enabled = FALSE,
2313 .avoid_processor_enabled = TRUE,
2314 .thread_avoid_processor = sched_clutch_amp_thread_avoid_processor,
2315 .processor_balance = sched_amp_balance,
2316
2317 .rt_runq = sched_amp_rt_runq,
2318 .rt_init = sched_amp_rt_init,
2319 .rt_queue_shutdown = sched_amp_rt_queue_shutdown,
2320 .rt_runq_scan = sched_amp_rt_runq_scan,
2321 .rt_runq_count_sum = sched_amp_rt_runq_count_sum,
2322
2323 .qos_max_parallelism = sched_amp_qos_max_parallelism,
2324 .check_spill = sched_amp_check_spill,
2325 .ipi_policy = sched_amp_ipi_policy,
2326 .thread_should_yield = sched_clutch_amp_thread_should_yield,
2327 .run_count_incr = sched_clutch_run_incr,
2328 .run_count_decr = sched_clutch_run_decr,
2329 .update_thread_bucket = sched_clutch_update_thread_bucket,
2330 .pset_made_schedulable = sched_clutch_migrate_foreign_buckets,
2331 .thread_group_recommendation_change = sched_clutch_amp_thread_group_recommendation_change,
2332};
2333
2334extern processor_set_t ecore_set;
2335extern processor_set_t pcore_set;
2336
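/*
 * sched_clutch_amp_choose_thread()
 *
 * AMP variant of thread selection. In addition to the bound runq and the local
 * clutch hierarchy, an E-core processor with a pending spill also considers the
 * priority of the P-core hierarchy; returning THREAD_NULL in that case lets
 * thread_select() pull the spilled thread via sched_clutch_amp_steal_thread().
 */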
2337static thread_t
2338sched_clutch_amp_choose_thread(
2339 processor_t processor,
2340 int priority,
2341 __unused ast_t reason)
2342{
2343 processor_set_t pset = processor->processor_set;
2344 bool spill_pending = false;
2345 int spill_pri = -1;
2346
2347 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2348 spill_pending = true;
2349 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2350 }
2351
2352 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2353 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2354 boolean_t choose_from_boundq = false;
2355
2356 if ((bound_runq->highq < priority) &&
2357 (clutch_pri < priority) &&
2358 (spill_pri < priority)) {
2359 return THREAD_NULL;
2360 }
2361
2362 if ((spill_pri > bound_runq->highq) &&
2363 (spill_pri > clutch_pri)) {
2364 /*
2365 * There is a higher priority thread on the P-core runq,
2366 * so returning THREAD_NULL here will cause thread_select()
2367 * to call sched_clutch_amp_steal_thread() to try to get it.
2368 */
2369 return THREAD_NULL;
2370 }
2371
2372 if (bound_runq->highq >= clutch_pri) {
2373 choose_from_boundq = true;
2374 }
2375
2376 thread_t thread = THREAD_NULL;
2377 if (choose_from_boundq == false) {
2378 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2379 thread = sched_clutch_thread_highest(pset_clutch_root);
2380 } else {
2381 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2382 }
2383 return thread;
2384}
2385
2386static boolean_t
2387sched_clutch_amp_processor_queue_empty(processor_t processor)
2388{
2389 processor_set_t pset = processor->processor_set;
2390 bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
2391
2392 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
2393 (sched_clutch_bound_runq(processor)->count == 0) &&
2394 !spill_pending;
2395}
2396
2397static bool
2398sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread)
2399{
2400 if (!sched_clutch_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
2401 return true;
2402 }
2403
2404 if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) {
2405 return sched_clutch_root_count(&pcore_set->pset_clutch_root) > 0;
2406 }
2407
2408 return false;
2409}
2410
2411static ast_t
2412sched_clutch_amp_processor_csw_check(processor_t processor)
2413{
2414 boolean_t has_higher;
2415 int pri;
2416
2417 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2418 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2419
2420 assert(processor->active_thread != NULL);
2421
2422 processor_set_t pset = processor->processor_set;
2423 bool spill_pending = false;
2424 int spill_pri = -1;
2425 int spill_urgency = 0;
2426
2427 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2428 spill_pending = true;
2429 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2430 spill_urgency = sched_clutch_root_urgency(&pcore_set->pset_clutch_root);
2431 }
2432
2433 pri = MAX(clutch_pri, bound_runq->highq);
2434 if (spill_pending) {
2435 pri = MAX(pri, spill_pri);
2436 }
2437
2438 if (processor->first_timeslice) {
2439 has_higher = (pri > processor->current_pri);
2440 } else {
2441 has_higher = (pri >= processor->current_pri);
2442 }
2443
2444 if (has_higher) {
2445 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2446 return AST_PREEMPT | AST_URGENT;
2447 }
2448
2449 if (bound_runq->urgency > 0) {
2450 return AST_PREEMPT | AST_URGENT;
2451 }
2452
2453 if (spill_urgency > 0) {
2454 return AST_PREEMPT | AST_URGENT;
2455 }
2456
2457 return AST_PREEMPT;
2458 }
2459
2460 return AST_NONE;
2461}
2462
2463static boolean_t
2464sched_clutch_amp_processor_queue_has_priority(processor_t processor,
2465 int priority,
2466 boolean_t gte)
2467{
2468 bool spill_pending = false;
2469 int spill_pri = -1;
2470 processor_set_t pset = processor->processor_set;
2471
2472 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2473 spill_pending = true;
2474 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2475 }
2476 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2477
2478 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2479 if (spill_pending) {
2480 qpri = MAX(qpri, spill_pri);
2481 }
2482
2483 if (gte) {
2484 return qpri >= priority;
2485 } else {
2486 return qpri > priority;
2487 }
2488}
2489
2490/*
2491 * sched_clutch_hierarchy_thread_pset()
2492 *
2493 * Routine to determine where a thread should be enqueued: based on its
2494 * recommendation if this is the first runnable thread in the clutch_bucket,
2495 * or on its clutch bucket's existing hierarchy membership otherwise.
2496 */
2497static processor_set_t
2498sched_clutch_hierarchy_thread_pset(thread_t thread)
2499{
2500 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread) == false) {
2501 return (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_set : ecore_set;
2502 }
2503
2504 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2505 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
2506 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2507 if (scb_root) {
2508 /* Clutch bucket is already runnable, return the pset hierarchy it is part of */
2509 return scb_root->scr_pset;
2510 }
2511 return (recommended_pset_type(thread) == PSET_AMP_E) ? ecore_set : pcore_set;
2512}
2513
2514/*
2515 * sched_clutch_thread_pset_recommended()
2516 *
2517 * Routine to determine if the thread should be placed on the provided pset.
2518 * The routine first makes sure the cluster is available for scheduling. If
2519 * it is available, it looks at the thread's recommendation. Called
2520 * with the pset lock held.
2521 */
2522static bool
2523sched_clutch_thread_pset_recommended(thread_t thread, processor_set_t pset)
2524{
2525 if (!sched_clutch_pset_available(pset)) {
2526 return false;
2527 }
2528
2529 /* At this point, all clusters should be available and recommended */
2530 if (sched_clutch_hierarchy_thread_pset(thread) != pset) {
2531 return false;
2532 }
2533
2534 return true;
2535}
2536
2537
2538static void
2539sched_clutch_amp_processor_queue_shutdown(processor_t processor)
2540{
2541 processor_set_t pset = processor->processor_set;
2542 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2543 thread_t thread;
2544 queue_head_t tqueue;
2545
2546 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
2547 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
2548 pset_unlock(pset);
2549 return;
2550 }
2551
2552 queue_init(&tqueue);
2553 while (sched_clutch_root_count(pset_clutch_root) > 0) {
2554 thread = sched_clutch_thread_highest(pset_clutch_root);
2555 enqueue_tail(&tqueue, &thread->runq_links);
2556 }
2557 pset_unlock(pset);
2558
2559 qe_foreach_element_safe(thread, &tqueue, runq_links) {
2560 remqueue(&thread->runq_links);
2561 thread_lock(thread);
2562 thread_setrun(thread, SCHED_TAILQ);
2563 thread_unlock(thread);
2564 }
2565}
2566
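/*
 * sched_clutch_amp_steal_thread()
 *
 * Allows E-cores to pull work from the P-core hierarchy when the P-cluster
 * load average is at or above the steal threshold (which accounts for any
 * pending spill), the P-cluster has more runnable threads than cores currently
 * dispatching, and no recommended P-cores are idle. P-cores never steal from
 * E-cores.
 */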
2567static thread_t
2568sched_clutch_amp_steal_thread(processor_set_t pset)
2569{
2570 thread_t thread = THREAD_NULL;
2571 processor_set_t nset = pset;
2572
2573 if (pcore_set->online_processor_count == 0) {
2574 /* Nothing to steal from */
2575 goto out;
2576 }
2577
2578 if (pset->pset_cluster_type == PSET_AMP_P) {
2579 /* P cores don't steal from E cores */
2580 goto out;
2581 }
2582
2583 processor_t processor = current_processor();
2584 assert(pset == processor->processor_set);
2585
2586 bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
2587 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
2588
2589 nset = pcore_set;
2590
2591 assert(nset != pset);
2592
2593 if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) {
2594 pset_unlock(pset);
2595
2596 pset = nset;
2597
2598 pset_lock(pset);
2599
2600 /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */
2601 if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) &&
2602 ((int)sched_clutch_root_count(&pset->pset_clutch_root) > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) &&
2603 (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) {
2604 thread = sched_clutch_thread_highest(&pset->pset_clutch_root);
2605 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0);
2606 sched_update_pset_load_average(pset);
2607 }
2608 }
2609
2610out:
2611 pset_unlock(pset);
2612 return thread;
2613}
2614
2615/* Return true if this thread should not continue running on this processor */
2616static bool
2617sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread)
2618{
2619 if (processor->processor_set->pset_cluster_type == PSET_AMP_E) {
2620 if (sched_clutch_thread_pset_recommended(thread, pcore_set)) {
2621 return true;
2622 }
2623 } else if (processor->processor_set->pset_cluster_type == PSET_AMP_P) {
2624 if (!sched_clutch_thread_pset_recommended(thread, pcore_set)) {
2625 return true;
2626 }
2627 }
2628
2629 return false;
2630}
2631
2632static processor_t
2633sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
2634{
2635 /* Bound threads don't call this function */
2636 assert(thread->bound_processor == PROCESSOR_NULL);
2637
2638 processor_set_t nset;
2639 processor_t chosen_processor = PROCESSOR_NULL;
2640
2641select_pset:
2642 nset = (pset == ecore_set) ? pcore_set : ecore_set;
2643 if (!sched_clutch_pset_available(pset)) {
2644 /* If the current pset is not available for scheduling, just use the other pset */
2645 pset_unlock(pset);
2646 pset_lock(nset);
2647 goto select_processor;
2648 }
2649
2650 /* Check if the thread is recommended to run on this pset */
2651 if (sched_clutch_thread_pset_recommended(thread, pset)) {
2652 nset = pset;
2653 goto select_processor;
2654 } else {
2655 /* pset not recommended; try the other pset */
2656 pset_unlock(pset);
2657 pset_lock(nset);
2658 pset = nset;
2659 goto select_pset;
2660 }
2661
2662select_processor:
2663 if (!sched_clutch_pset_available(nset)) {
2664 /*
2665 * It looks like both psets are not available due to some
2666 * reason. In that case, just use the master processor's
2667 * pset for scheduling.
2668 */
2669 if (master_processor->processor_set != nset) {
2670 pset_unlock(nset);
2671 nset = master_processor->processor_set;
2672 pset_lock(nset);
2673 }
2674 }
2675 chosen_processor = choose_processor(nset, processor, thread);
2676 assert(chosen_processor->processor_set == nset);
2677 return chosen_processor;
2678}
2679
2680/*
2681 * AMP Clutch Scheduler Thread Migration
2682 *
2683 * For the AMP version of the clutch scheduler the thread is always scheduled via its
2684 * thread group. So it is important to make sure that the thread group is part of the
2685 * correct processor set hierarchy. In order to do that, the clutch scheduler moves
2686 * all eligible clutch buckets to the correct hierarchy when the recommendation of a
2687 * thread group is changed by CLPC.
2688 */
2689
2690/*
2691 * sched_clutch_recommended_pset()
2692 *
2693 * Routine to decide which hierarchy the thread group should be in based on the
2694 * recommendation and other thread group and system properties. This routine is
2695 * used to determine if thread group migration is necessary and should mimic the
2696 * logic in sched_clutch_thread_pset_recommended() & recommended_pset_type().
2697 */
2698static processor_set_t
2699sched_clutch_recommended_pset(sched_clutch_t sched_clutch, cluster_type_t recommendation)
2700{
2701 if (!sched_clutch_pset_available(pcore_set)) {
2702 return ecore_set;
2703 }
2704
2705 if (!sched_clutch_pset_available(ecore_set)) {
2706 return pcore_set;
2707 }
2708
2709 /*
2710 * If all clusters are available and recommended, use the recommendation
2711 * to decide which cluster to use.
2712 */
2713 pset_cluster_type_t type = thread_group_pset_recommendation(sched_clutch->sc_tg, recommendation);
2714 return (type == PSET_AMP_E) ? ecore_set : pcore_set;
2715}
2716
2717static void
2718sched_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
2719{
2720 uint16_t thread_count = clutch_bucket->scb_thr_count;
2721 thread_t thread;
2722 uint64_t current_timestamp = mach_approximate_time();
2723 while (thread_count > 0) {
2724 thread = run_queue_peek(&clutch_bucket->scb_runq);
2725 sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
2726 enqueue_tail(clutch_threads, &thread->runq_links);
2727 thread_count--;
2728 }
2729
2730 /*
2731 * This operation should have drained the clutch bucket and pulled it out of the
2732 * hierarchy.
2733 */
2734 assert(clutch_bucket->scb_thr_count == 0);
2735 assert(clutch_bucket->scb_root == NULL);
2736}
2737
2738/*
2739 * sched_clutch_migrate_thread_group()
2740 *
2741 * Routine to implement the migration of threads when the thread group
2742 * recommendation is updated. The migration works using a 2-phase
2743 * algorithm.
2744 *
2745 * Phase 1: With the source pset (the pset opposite to the one returned by
2746 * sched_clutch_recommended_pset()) locked, drain all the runnable threads into
2747 * a local queue and update the TG recommendation.
2748 *
2749 * Phase 2: Call thread_setrun() on all the drained threads. Since the TG recommendation
2750 * has been updated, these should all end up in the right hierarchy.
2751 */
2752static void
2753sched_clutch_migrate_thread_group(sched_clutch_t sched_clutch, cluster_type_t new_recommendation)
2754{
2755 thread_t thread;
2756
2757 /* If the thread group is empty, just update the recommendation */
2758 if (os_atomic_load(&sched_clutch->sc_thr_count, relaxed) == 0) {
2759 thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
2760 return;
2761 }
2762
2763 processor_set_t dst_pset = sched_clutch_recommended_pset(sched_clutch, new_recommendation);
2764 processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
2765
2766 queue_head_t clutch_threads;
2767 queue_init(&clutch_threads);
2768
2769 /* Interrupts need to be disabled to make sure threads won't become runnable during the
2770 * migration and attempt to grab the pset/thread locks.
2771 */
2772 spl_t s = splsched();
2773
2774 pset_lock(src_pset);
2775 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
2776 sched_clutch_bucket_t clutch_bucket = &(sched_clutch->sc_clutch_buckets[bucket]);
2777 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2778 if ((scb_root == NULL) || (scb_root->scr_pset == dst_pset)) {
2779 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
2780 continue;
2781 }
2782 assert(scb_root->scr_pset == src_pset);
2783 /* Now remove all the threads from the runq so that thread->runq is set correctly */
2784 sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
2785 }
2786
2787 /*
2788 * Now that all the clutch buckets have been drained, update the TG recommendation.
2789 * This operation needs to be done with the pset lock held to make sure that anyone
2790 * coming in before the migration started would get the original pset as the root
2791 * of this sched_clutch and attempt to hold the src_pset lock. Once the TG changes,
2792 * all threads that are becoming runnable would find the clutch bucket empty and
2793 * the TG recommendation would coax them to enqueue it in the new recommended
2794 * hierarchy. This effectively synchronizes with other threads calling
2795 * thread_setrun() and trying to decide which pset the thread/clutch_bucket
2796 * belongs in.
2797 */
2798 thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
2799 pset_unlock(src_pset);
2800
2801 /* Now setrun all the threads in the local queue */
2802 qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
2803 remqueue(&thread->runq_links);
2804 thread_lock(thread);
2805 thread_setrun(thread, SCHED_TAILQ);
2806 thread_unlock(thread);
2807 }
2808
2809 splx(s);
2810}
2811
2812static void
2813sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation)
2814{
2815 /*
2816 * For the clutch scheduler, the change in recommendation moves the thread group
2817 * to the right hierarchy. sched_clutch_migrate_thread_group() is also responsible
2818 * for updating the recommendation of the thread group.
2819 */
2820 sched_clutch_migrate_thread_group(&tg->tg_sched_clutch, new_recommendation);
2821
2822 if (new_recommendation != CLUSTER_TYPE_P) {
2823 return;
2824 }
2825
2826 sched_amp_bounce_thread_group_from_ecores(ecore_set, tg);
2827}
2828
2829/*
2830 * sched_clutch_migrate_foreign_buckets()
2831 *
2832 * Routine to migrate all the clutch buckets which are not in their recommended
2833 * pset hierarchy now that a new pset has become schedulable. The algorithm is
2834 * similar to sched_clutch_migrate_thread_group().
2835 *
2836 * Invoked with the newly recommended pset lock held and interrupts disabled.
2837 */
2838static void
2839sched_clutch_migrate_foreign_buckets(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
2840{
2841 thread_t thread;
2842 processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
2843
2844 if (!sched_clutch_pset_available(dst_pset)) {
2845 /*
2846 * It is possible that some state about the pset changed,
2847 * but it is still not available for scheduling. Nothing to
2848 * do here in that case.
2849 */
2850 if (drop_lock) {
2851 pset_unlock(dst_pset);
2852 }
2853 return;
2854 }
2855 pset_unlock(dst_pset);
2856
2857 queue_head_t clutch_threads;
2858 queue_init(&clutch_threads);
2859 sched_clutch_root_t src_root = &src_pset->pset_clutch_root;
2860
2861 pset_lock(src_pset);
2862 queue_t clutch_bucket_list = &src_pset->pset_clutch_root.scr_foreign_buckets;
2863
2864 if (sched_clutch_root_count(src_root) == 0) {
2865 /* No threads present in this hierarchy */
2866 pset_unlock(src_pset);
2867 goto migration_complete;
2868 }
2869
2870 sched_clutch_bucket_t clutch_bucket;
2871 qe_foreach_element_safe(clutch_bucket, clutch_bucket_list, scb_foreignlink) {
2872 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2873 assert(scb_root->scr_pset == src_pset);
2874 /* Now remove all the threads from the runq so that thread->runq is set correctly */
2875 sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
2876 assert(clutch_bucket->scb_foreign == false);
2877 }
2878 pset_unlock(src_pset);
2879
2880 /* Now setrun all the threads in the local queue */
2881 qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
2882 remqueue(&thread->runq_links);
2883 thread_lock(thread);
2884 thread_setrun(thread, SCHED_TAILQ);
2885 thread_unlock(thread);
2886 }
2887
2888migration_complete:
2889 if (!drop_lock) {
2890 pset_lock(dst_pset);
2891 }
2892}
2893
2894#endif /* __AMP__ */
2895
2896#endif /* CONFIG_SCHED_CLUTCH */