/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */

#if CONFIG_SCHED_CLUTCH

/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
});

static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */

/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
	SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
	/* Round robin clutch bucket on thread removal */
	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
	/* Insert clutch bucket at head (for thread preemption) */
	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
	/* Insert clutch bucket at tail (default) */
	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
});

/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
static boolean_t sched_thread_sched_pri_promoted(thread_t);

#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];

/*
 * Special markers for buckets that have invalid WCELs/quantums etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)

/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for a bucket is defined
 * by its worst-case-execution-latency and the make-runnable timestamp
 * of the bucket.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	0,                            /* FG */
	37500,                        /* IN (37.5ms) */
	75000,                        /* DF (75ms) */
	150000,                       /* UT (150ms) */
	250000                        /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
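
/*
 * Illustrative example (not from the original source): if the FG and UT
 * buckets both become runnable at time t, their deadlines are t + 0 and
 * t + 150ms respectively, so FG is selected first. Because FG's deadline
 * is refreshed each time it is selected while UT's stays put, UT's
 * deadline eventually becomes the earliest, bounding how long it can be
 * starved by higher buckets (modulo the warp mechanism below).
 */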

/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	8000,                         /* FG (8ms) */
	4000,                         /* IN (4ms) */
	2000,                         /* DF (2ms) */
	1000,                         /* UT (1ms) */
	0                             /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
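
/*
 * Sketch of the warp mechanism (illustrative, numbers from the table
 * above): if UT currently holds the earliest deadline but FG is runnable
 * with warp remaining, FG is selected anyway and a warp window of up to
 * 8ms is opened. Once the window closes, FG can no longer warp until its
 * warp is replenished the next time it wins the EDF selection naturally.
 */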

/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * the lower priority levels can delay the higher priority threads.
 */

#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000, /* FIXPRI (10ms) */
	10000, /* FG (10ms) */
	10000, /* IN (10ms) */
	10000, /* DF (10ms) */
	4000,  /* UT (4ms) */
	2000   /* BG (2ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000, /* FIXPRI (10ms) */
	10000, /* FG (10ms) */
	8000,  /* IN (8ms) */
	6000,  /* DF (6ms) */
	4000,  /* UT (4ms) */
	2000   /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
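
/*
 * For example (illustrative): on a non-macOS device a BG thread gets a
 * 2ms quantum, so once the BG root bucket is selected (e.g. via EDF
 * deadline expiry) the thread can occupy the CPU for at most 2ms before
 * the hierarchy is re-evaluated and a higher bucket can take over.
 */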

/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
		} else {
			clock_interval_to_absolutetime_interval(us_vals[i],
			    NSEC_PER_USEC, &abstime_vals[i]);
		}
	}
}
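
/*
 * Presumably invoked once during scheduler initialization (the init call
 * site is not part of this excerpt) to populate each abstime table from
 * its usec twin, e.g.:
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */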

/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
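
/*
 * Layout sketch (not in the original source): c occupies bits [15:0],
 * b bits [31:16] and a bits [47:32] of the packed value, so three 16-bit
 * thread counts fit into a single uint64_t kdebug argument, e.g.:
 *
 *	uint64_t packed = SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_count, clutch_count, bucket_count);
 *
 * (the argument names here are hypothetical)
 */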

#if DEVELOPMENT || DEBUG

/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
	sched_clutch_root_t root_clutch)
{
	pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
	__unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */

/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
	uint16_t *thr_count)
{
	if (__improbable(os_inc_overflow(thr_count))) {
		panic("sched_clutch thread count overflowed!");
	}
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
	uint16_t *thr_count)
{
	if (__improbable(os_dec_overflow(thr_count))) {
		panic("sched_clutch thread count underflowed!");
	}
}

/*
 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to keep the load as a global counter
 * as well.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];

/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
	sched_clutch_root_t root_clutch,
	processor_set_t pset)
{
	root_clutch->scr_thr_count = 0;
	root_clutch->scr_priority = NOPRI;
	root_clutch->scr_urgency = 0;
	root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
	root_clutch->scr_cluster_id = pset->pset_cluster_id;
#else /* CONFIG_SCHED_EDGE */
	root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
	queue_init(&root_clutch->scr_clutch_buckets);

	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
	priority_queue_init(&root_clutch->scr_foreign_buckets);
	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
	priority_queue_init(&root_clutch->scr_bound_root_buckets);

	/* Initialize the bitmap and priority queue of runnable root buckets */
	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

	/* Initialize all the root buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
	}
}

/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round robin'ed at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */
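
/*
 * For instance (illustrative): a clutch bucket whose thread was preempted
 * is re-enqueued with SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ so it keeps its
 * place at its priority level, while SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR
 * rotates it behind same-priority peers after a thread is selected from it.
 */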

/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	clutch_buckets_rq->scbrq_highq = NOPRI;
	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
		clutch_buckets_rq->scbrq_bitmap[i] = 0;
	}
	clutch_buckets_rq->scbrq_count = 0;
	for (int i = 0; i < NRQS; i++) {
		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
	}
}

/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns whether a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	return clutch_buckets_rq->scbrq_count == 0;
}

/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	if (clutch_buckets_rq->scbrq_count > 0) {
		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
	} else {
		return NULL;
	}
}

/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_options_t options)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
		}
	} else {
		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
		} else {
			/*
			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
			 */
			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		}
	}
	clutch_buckets_rq->scbrq_count++;
}

/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
	assert(clutch_buckets_rq->scbrq_count > 0);
	clutch_buckets_rq->scbrq_count--;
	if (circle_queue_empty(queue)) {
		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
	}
}

static void
sched_clutch_bucket_runq_rotate(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
	circle_queue_rotate_head_forward(queue);
}

/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
	sched_clutch_root_bucket_t root_bucket,
	sched_bucket_t bucket,
	bool bound_root_bucket)
{
	root_bucket->scrb_bucket = bucket;
	if (bound_root_bucket) {
		/* For bound root buckets, initialize the bound thread runq. */
		root_bucket->scrb_bound = true;
		run_queue_init(&root_bucket->scrb_bound_thread_runq);
	} else {
		/*
		 * The unbounded root buckets contain a runq of runnable clutch buckets
		 * which then hold the runnable threads.
		 */
		root_bucket->scrb_bound = false;
		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
	}
	priority_queue_entry_init(&root_bucket->scrb_pqlink);
	root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
	root_bucket->scrb_warped_deadline = 0;
	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
	root_bucket->scrb_starvation_avoidance = false;
	root_bucket->scrb_starvation_ts = 0;
}

/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency,
 * which is why they are handled specially.
 *
 * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */

/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_unbound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
		sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
		if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
			/* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
			return true;
		}
		sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
		if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
			return true;
		}
	}
	return false;
}

/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_bound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
	if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
		return false;
	}
	return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
}

/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts, so it is
 * crucial to keep this O(1). The type parameter determines if
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting the next thread for execution on the current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	sched_clutch_highest_root_bucket_type_t type)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int highest_runnable_bucket = -1;
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	} else {
		int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
	}

	if (highest_runnable_bucket == -1) {
		return NULL;
	}

	/* Above UI root bucket selection (see comment above for more details on this special case handling) */
	bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		if (unbound_aboveui) {
			return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		}
		/* Fall through to selecting a timeshare root bucket */
	} else {
		bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
		sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];

		if (unbound_aboveui && bound_aboveui) {
			/*
			 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
			 * highest runnable priority in both buckets.
			 */
			int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
			sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
			int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
			return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
		}
		if (unbound_aboveui) {
			return unbound_aboveui_root_bucket;
		}
		if (bound_aboveui) {
			return bound_aboveui_root_bucket;
		}
		/* Fall through to selecting a timeshare root bucket */
	}

	/*
	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
	 * earliest deadline model to schedule threads. The idea is that as the timeshare
	 * buckets use CPU, they will drop their interactivity score/sched priority and
	 * allow the low priority AboveUI buckets to be scheduled.
	 */

	/* Find the earliest deadline bucket */
	sched_clutch_root_bucket_t edf_bucket = NULL;
	sched_clutch_root_bucket_t warp_bucket = NULL;
	int warp_bucket_index = -1;

evaluate_root_buckets:
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
	} else {
		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		if (bound_bucket && unbound_bucket) {
			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
		} else {
			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
		}
	}
	/*
	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
	 * buckets of the same type (i.e. bound/unbound). The reason for doing that is that warping only makes sense
	 * between root buckets of the same type, since it is effectively a scheduling advantage over a lower
	 * QoS root bucket.
	 */
	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
	warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

	if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
		/* No higher buckets have warp left; best choice is the EDF based bucket */
		if (edf_bucket->scrb_starvation_avoidance) {
			/*
			 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
			 * starvation avoidance window is still open and return this bucket if it is.
			 *
			 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
			 * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
			 * avoidance across all CPUs.
			 */
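			/*
			 * Worked example (illustrative numbers): with a 10ms thread
			 * quantum and a 4-CPU cluster, the window below comes out to
			 * 10ms / 4 = 2.5ms of starvation avoidance before the bucket's
			 * deadline is refreshed and EDF is re-evaluated.
			 */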
			uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / root_clutch->scr_pset->online_processor_count;
			if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
				return edf_bucket;
			} else {
				/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
				edf_bucket->scrb_starvation_avoidance = false;
				edf_bucket->scrb_starvation_ts = 0;
				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			}
			goto evaluate_root_buckets;
		}

		/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
		if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
			/* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
			edf_bucket->scrb_starvation_avoidance = true;
			edf_bucket->scrb_starvation_ts = timestamp;
		} else {
			/* EDF bucket is being selected in the natural order; update deadline and reset warp */
			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
			edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
			if (edf_bucket->scrb_bound) {
				bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
			} else {
				bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
			}
		}
		return edf_bucket;
	}

	/*
	 * Looks like there is a root bucket which is higher in the natural priority
	 * order than edf_bucket and might have some warp remaining.
	 */
	warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
		warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}
	if (warp_bucket->scrb_warped_deadline > timestamp) {
		/* Root bucket already has a warp window open with some warp remaining */
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}

	/*
	 * For this bucket, the warp window was opened sometime in the past but has now
	 * expired. Mark the bucket as not available for warp anymore and re-run the
	 * warp bucket selection logic.
	 */
	warp_bucket->scrb_warp_remaining = 0;
	if (warp_bucket->scrb_bound) {
		bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
	} else {
		bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
	}
	goto evaluate_root_buckets;
}

/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
	sched_clutch_root_bucket_t root_bucket,
	uint64_t timestamp)
{
	/* For the fixpri AboveUI bucket, always return it as the earliest deadline */
	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
		return 0;
	}

	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}

/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority
 * queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* The algorithm never uses the deadlines for scheduling the TH_BUCKET_FIXPRI bucket */
		return;
	}

	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	if (__improbable(old_deadline > new_deadline)) {
		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
	}
	if (old_deadline != new_deadline) {
		root_bucket->scrb_pqlink.deadline = new_deadline;
		struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
		priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
	}
}

/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	/* Mark the root bucket as runnable */
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	if (root_bucket->scrb_starvation_avoidance == false) {
		/*
		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
		 * that and fix it up.
		 */
		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	}
	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
	if (root_bucket->scrb_warp_remaining) {
		/* Since the bucket has some warp remaining and is now runnable, mark it as available for warp */
		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
	}
}

/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_warped_deadline > timestamp) {
		/*
		 * For root buckets that were using the warp, check if the warp
		 * deadline is in the future. If yes, remove the wall time the
		 * warp was active and update the warp remaining. This allows
		 * the root bucket to use the remaining warp the next time it
		 * becomes runnable.
		 */
		root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
	} else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/*
		 * If the root bucket's warped deadline is in the past, it has used up
		 * all the warp it was assigned. Empty out its warp remaining.
		 */
		root_bucket->scrb_warp_remaining = 0;
	}
}

static int
sched_clutch_global_bucket_load_get(
	sched_bucket_t bucket)
{
	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
}

/*
 * sched_clutch_root_pri_update()
 *
 * The root level priority is used for thread selection and preemption
 * logic.
 *
 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
 * used for priority calculation, the logic is slightly different from thread
 * selection, because thread selection considers deadlines, warps etc. to
 * decide the most optimal bucket at a given timestamp. Since the priority
 * value is used for preemption decisions only, it needs to be based on the
 * highest runnable thread available in the timeshare domain. This logic can
 * be made more sophisticated if there are cases of unnecessary preemption
 * being seen in workloads.
 */
static void
sched_clutch_root_pri_update(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int16_t root_bound_pri = NOPRI;
	int16_t root_unbound_pri = NOPRI;

	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_unbound;
	}
	sched_clutch_root_bucket_t root_bucket_bound = NULL;
	if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
		root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
	}
	root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;

root_pri_update_unbound:
	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_complete;
	}
	sched_clutch_root_bucket_t root_bucket_unbound = NULL;
	if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
	}
	/* For the selected root bucket, find the highest priority clutch bucket */
	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);

root_pri_update_complete:
	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
}

/*
 * sched_clutch_root_urgency_inc()
 *
 * Routine to increment the urgency at the root level based on the thread
 * priority that is being inserted into the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_inc(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency++;
	}
}

/*
 * sched_clutch_root_urgency_dec()
 *
 * Routine to decrement the urgency at the root level based on the thread
 * priority that is being removed from the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_dec(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency--;
	}
}

/*
 * Clutch bucket level scheduling
 *
 * The second level of scheduling is the clutch bucket level scheduling
 * which tries to schedule thread groups within root_buckets. Each
 * clutch represents a thread group and a clutch_bucket_group represents
 * threads at a particular sched_bucket within that thread group. The
 * clutch_bucket_group contains a clutch_bucket per cluster on the system,
 * which holds the runnable threads destined for execution on that
 * cluster.
 *
 * The goal of this level of scheduling is to allow interactive thread
 * groups low latency access to the CPU. It also provides slight
 * scheduling preference for App and unrestricted thread groups.
 *
 * The clutch bucket scheduling algorithm measures an interactivity
 * score for all clutch bucket groups. The interactivity score is based
 * on the ratio of the CPU used and the voluntary blocking of threads
 * within the clutch bucket group. The algorithm is very close to the ULE
 * scheduler on FreeBSD in terms of calculations. The interactivity
 * score provides an interactivity boost in the range of
 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
 * thread groups to win over CPU spinners.
 *
 * The interactivity score of the clutch bucket group is combined with the
 * highest base/promoted priority of threads in the clutch bucket to form
 * the overall priority of the clutch bucket.
 */
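
/*
 * Rough sketch of the scoring idea (an approximation, not the exact kernel
 * computation; see sched_clutch_bucket_group_interactivity_score_calculate()):
 *
 *	if (blocked_time >= cpu_used_time)
 *		score = pri + (pri * (blocked_time - cpu_used_time)) / blocked_time;
 *	else
 *		score = (pri * blocked_time) / cpu_used_time;
 *
 * where pri is the interactive priority boost base. Groups that mostly
 * block voluntarily score near 2 * pri, while pure CPU spinners score
 * near 0.
 */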

/* Priority boost range for interactivity */
#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;

/* Window to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used + blocked. */
uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)

/* The ratio to scale the cpu/blocked time per window */
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)

/*
 * In order to allow App thread groups some preference over daemon thread
 * groups, the App clutch_buckets get a priority boost. The boost value should
 * be chosen such that badly behaved apps are still penalized over well
 * behaved interactive daemons.
 */
static uint8_t sched_clutch_bucket_group_pri_boost[SCHED_CLUTCH_TG_PRI_MAX] = {
	[SCHED_CLUTCH_TG_PRI_LOW] = 0,
	[SCHED_CLUTCH_TG_PRI_MED] = 2,
	[SCHED_CLUTCH_TG_PRI_HIGH] = 4,
};
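
/*
 * For example (illustrative): a high priority (App) thread group gets a +4
 * boost over a low priority daemon group at the same interactivity score,
 * but a badly behaved app that burns CPU can still lose more than 4 points
 * of interactivity score and fall below a well behaved daemon.
 */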

/* Initial value for voluntary blocking time for the clutch_bucket */
#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)

/* Value indicating the clutch bucket is not pending execution */
#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))

/*
 * Thread group CPU starvation avoidance
 *
 * In heavily CPU contended scenarios, it is possible that some thread groups
 * which have a low interactivity score do not get CPU time at all. In order to
 * resolve that, the scheduler tries to age out the CPU usage of the clutch
 * bucket group when it has been pending execution for a certain time as defined
 * by the sched_clutch_bucket_group_pending_delta_us values below.
 *
 * The values chosen here are very close to the WCEL values for each sched bucket.
 * These values are multiplied by the load average of the relevant root bucket to
 * provide an estimate of the actual clutch bucket load.
 */
static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	10000,                        /* FG */
	37500,                        /* IN */
	75000,                        /* DF */
	150000,                       /* UT */
	250000,                       /* BG */
};
static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
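
/*
 * Worked example (illustrative numbers): a DF clutch bucket group pending
 * behind 4 runnable peers (load average 4) would have its CPU usage aged
 * out after roughly 75ms * 4 = 300ms of waiting, lifting its interactivity
 * score enough to get scheduled.
 */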

/*
 * sched_clutch_bucket_init()
 *
 * Initializer for clutch buckets.
 */
static void
sched_clutch_bucket_init(
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));

	clutch_bucket->scb_bucket = bucket;
	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
	clutch_bucket->scb_priority = 0;
#if CONFIG_SCHED_EDGE
	clutch_bucket->scb_foreign = false;
	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
#endif /* CONFIG_SCHED_EDGE */
	clutch_bucket->scb_group = clutch_bucket_group;
	clutch_bucket->scb_root = NULL;
	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
	priority_queue_init(&clutch_bucket->scb_thread_runq);
	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
}

/*
 * sched_clutch_bucket_group_init()
 *
 * Initializer for clutch bucket groups.
 */
static void
sched_clutch_bucket_group_init(
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_clutch_t clutch,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
	clutch_bucket_group->scbg_bucket = bucket;
	clutch_bucket_group->scbg_clutch = clutch;
	for (int i = 0; i < MAX_PSETS; i++) {
		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
	}
	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
	/*
	 * All thread groups should be initialized to be interactive; this allows the newly launched
	 * thread groups to fairly compete with already running thread groups.
	 */
	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
#if !__LP64__
	lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
#endif /* !__LP64__ */
	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
}

/*
 * sched_clutch_init_with_thread_group()
 *
 * Initialize the sched_clutch when the thread group is being created
 */
void
sched_clutch_init_with_thread_group(
	sched_clutch_t clutch,
	struct thread_group *tg)
{
	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);

	/* Initialize all the clutch buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
	}

	/* Grouping specific fields */
	clutch->sc_tg = tg;
	os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
}

/*
 * sched_clutch_destroy()
 *
 * Destructor for clutch; called from thread group release code.
 */
void
sched_clutch_destroy(
	__unused sched_clutch_t clutch)
{
	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
}
1140 | ||
f427ee49 | 1141 | #if CONFIG_SCHED_EDGE |
c6bf4f31 A |
1142 | |
1143 | /* | |
f427ee49 A |
1144 | * The current edge scheduler still relies on globals for E & P clusters. It uses these |
1145 | * globals for the following operations: | |
1146 | * - Sysctl support for configuring edges | |
1147 | * - Edge scheduler initialization | |
c6bf4f31 | 1148 | * |
f427ee49 A |
1149 | * These should be removed for multi-cluster platforms once a clear policy for the above |
1150 | * operations is defined. | |
1151 | * <Edge Multi-cluster Support Needed> | |
c6bf4f31 | 1152 | */ |
f427ee49 A |
1153 | static uint32_t ecore_cluster_id = 0; |
1154 | static uint32_t pcore_cluster_id = 1; | |
1155 | ||
1156 | /* | |
1157 | * Edge Scheduler Preferred Cluster Mechanism | |
1158 | * | |
1159 | * In order to have better control over various QoS buckets within a thread group, the Edge | |
1160 | * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These | |
1161 | * preferences are stored at the sched_clutch_bucket_group level since that represents all | |
1162 | * threads at a particular QoS level within a sched_clutch. For any lookup of preferred | |
1163 | * cluster, the logic always goes back to the preference stored at the clutch_bucket_group. | |
1164 | */ | |
1165 | ||
1166 | static uint32_t | |
1167 | sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group) | |
1168 | { | |
1169 | return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed); | |
1170 | } | |
1171 | ||
1172 | static uint32_t | |
1173 | sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket) | |
1174 | { | |
1175 | return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group); | |
1176 | } | |
1177 | ||
1178 | uint32_t | |
1179 | sched_edge_thread_preferred_cluster(thread_t thread) | |
1180 | { | |
1181 | if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) { | |
1182 | /* For threads bound to a specific cluster, return the bound cluster id */ | |
1183 | return sched_edge_thread_bound_cluster_id(thread); | |
1184 | } | |
1185 | ||
1186 | sched_clutch_t clutch = sched_clutch_for_thread(thread); | |
1187 | sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket]; | |
1188 | return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group); | |
1189 | } | |
1190 | ||
1191 | /* | |
1192 | * Edge Scheduler Foreign Bucket Support | |
1193 | * | |
1194 | * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing | |
1195 | * threads that are not native to the cluster. A clutch bucket is considered native if its | |
1196 | * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch | |
1197 | * bucket priority queue is used for rebalance operations to get threads back to their native | |
1198 | * cluster quickly. | |
1199 | * | |
1200 | * It is possible to make this policy even more aggressive by treating every cluster | |
1201 | * other than the preferred cluster as foreign, but that would mean many more thread | |
1202 | * migrations, which might have performance implications. | |
1203 | */ | |
1204 | ||
1205 | static void | |
1206 | sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch) | |
1207 | { | |
1208 | if (clutch_bucket->scb_foreign) { | |
1209 | clutch_bucket->scb_foreign = false; | |
1210 | priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink); | |
1211 | } | |
1212 | } | |
1213 | ||
1214 | static void | |
1215 | sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch) | |
1216 | { | |
1217 | if (!clutch_bucket->scb_foreign) { | |
1218 | clutch_bucket->scb_foreign = true; | |
1219 | priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0); | |
1220 | priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink); | |
1221 | } | |
1222 | } | |
1223 | ||
1224 | /* | |
1225 | * Edge Scheduler Cumulative Load Average | |
1226 | * | |
1227 | * The Edge scheduler maintains a per-QoS/scheduling bucket load average for | |
1228 | * making thread migration decisions. The per-bucket load is maintained as a | |
1229 | * cumulative count since higher scheduling buckets impact load on lower buckets | |
1230 | * for thread migration decisions. | |
1231 | * | |
1232 | */ | |
1233 | ||
1234 | static void | |
1235 | sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket) | |
1236 | { | |
1237 | switch (bucket) { | |
1238 | case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH; | |
1239 | case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH; | |
1240 | case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH; | |
1241 | case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH; | |
1242 | case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH; | |
1243 | case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break; | |
1244 | default: | |
1245 | panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()"); | |
1246 | } | |
1247 | } | |
1248 | ||
1249 | static void | |
1250 | sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket) | |
1251 | { | |
1252 | switch (bucket) { | |
1253 | case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH; | |
1254 | case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH; | |
1255 | case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH; | |
1256 | case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH; | |
1257 | case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH; | |
1258 | case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break; | |
1259 | default: | |
1260 | panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()"); | |
c6bf4f31 | 1261 | } |
c6bf4f31 A |
1262 | } |
1263 | ||
f427ee49 A |
1264 | uint16_t |
1265 | sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket) | |
1266 | { | |
1267 | return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed); | |
1268 | } | |
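/*
 * Illustrative note on the cascade above: because each case falls through to
 * the buckets below it, an increment for a thread at bucket B bumps the
 * counter of B and of every lower bucket, so scr_cumulative_run_count[B]
 * always reports the number of runnable threads at QoS level B or higher.
 * For example, with one runnable TH_BUCKET_FIXPRI thread and two
 * TH_BUCKET_SHARE_UT threads:
 *
 *   scr_cumulative_run_count[TH_BUCKET_FIXPRI]   == 1
 *   scr_cumulative_run_count[TH_BUCKET_SHARE_FG] == 1
 *   scr_cumulative_run_count[TH_BUCKET_SHARE_UT] == 3
 *   scr_cumulative_run_count[TH_BUCKET_SHARE_BG] == 3
 *
 * Migration decisions can therefore read "load at my QoS and above" with a
 * single array lookup.
 */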
1269 | ||
1270 | #endif /* CONFIG_SCHED_EDGE */ | |
cb323159 A |
1271 | |
1272 | /* | |
1273 | * sched_clutch_bucket_hierarchy_insert() | |
1274 | * | |
1275 | * Routine to insert a newly runnable clutch_bucket into the root hierarchy. | |
1276 | */ | |
1277 | static void | |
1278 | sched_clutch_bucket_hierarchy_insert( | |
1279 | sched_clutch_root_t root_clutch, | |
1280 | sched_clutch_bucket_t clutch_bucket, | |
1281 | sched_bucket_t bucket, | |
ea3f0419 A |
1282 | uint64_t timestamp, |
1283 | sched_clutch_bucket_options_t options) | |
cb323159 A |
1284 | { |
1285 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
1286 | if (bucket > TH_BUCKET_FIXPRI) { | |
1287 | /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */ | |
1288 | enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink); | |
1289 | } | |
f427ee49 | 1290 | #if CONFIG_SCHED_EDGE |
c6bf4f31 | 1291 | /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */ |
f427ee49 A |
1292 | uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket); |
1293 | if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) { | |
1294 | sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch); | |
c6bf4f31 | 1295 | } |
f427ee49 A |
1296 | #endif /* CONFIG_SCHED_EDGE */ |
1297 | sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket]; | |
cb323159 A |
1298 | |
1299 | /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */ | |
ea3f0419 | 1300 | if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) { |
cb323159 A |
1301 | sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp); |
1302 | } | |
1303 | ||
ea3f0419 A |
1304 | /* Insert the clutch bucket into the root bucket run queue with order based on options */ |
1305 | sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options); | |
cb323159 | 1306 | os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed); |
f427ee49 | 1307 | os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed); |
cb323159 A |
1308 | } |
1309 | ||
1310 | /* | |
1311 | * sched_clutch_bucket_hierarchy_remove() | |
1312 | * | |
1313 | * Routine to remove an empty clutch bucket from the root hierarchy. | |
1314 | */ | |
1315 | static void | |
1316 | sched_clutch_bucket_hierarchy_remove( | |
1317 | sched_clutch_root_t root_clutch, | |
1318 | sched_clutch_bucket_t clutch_bucket, | |
1319 | sched_bucket_t bucket, | |
ea3f0419 A |
1320 | uint64_t timestamp, |
1321 | __unused sched_clutch_bucket_options_t options) | |
cb323159 A |
1322 | { |
1323 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
1324 | if (bucket > TH_BUCKET_FIXPRI) { | |
1325 | /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */ | |
1326 | remqueue(&clutch_bucket->scb_listlink); | |
1327 | } | |
f427ee49 A |
1328 | #if CONFIG_SCHED_EDGE |
1329 | sched_clutch_bucket_mark_native(clutch_bucket, root_clutch); | |
1330 | #endif /* CONFIG_SCHED_EDGE */ | |
cb323159 | 1331 | |
f427ee49 | 1332 | sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket]; |
cb323159 A |
1333 | |
1334 | /* Remove the clutch bucket from the root bucket priority queue */ | |
ea3f0419 | 1335 | sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket); |
cb323159 | 1336 | os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed); |
cb323159 A |
1337 | |
1338 | /* If the root bucket priority queue is now empty, remove it from the root priority queue */ | |
ea3f0419 | 1339 | if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) { |
cb323159 A |
1340 | sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp); |
1341 | } | |
f427ee49 | 1342 | os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed); |
cb323159 A |
1343 | } |
1344 | ||
1345 | /* | |
1346 | * sched_clutch_bucket_base_pri() | |
1347 | * | |
1348 | * Calculates the "base" priority of the clutch bucket. The base | |
1349 | * priority of the clutch bucket is the max of the highest | |
1350 | * base_pri and the highest sched_pri among its threads, plus | |
1351 | * any grouping specific (App/Daemon...) boost applicable to the | |
1352 | * clutch_bucket. | |
1353 | */ | |
1354 | static uint8_t | |
1355 | sched_clutch_bucket_base_pri( | |
1356 | sched_clutch_bucket_t clutch_bucket) | |
1357 | { | |
1358 | uint8_t clutch_boost = 0; | |
f427ee49 | 1359 | assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false); |
cb323159 | 1360 | |
f427ee49 | 1361 | sched_clutch_t clutch = clutch_bucket->scb_group->scbg_clutch; |
cb323159 A |
1362 | |
1363 | /* | |
1364 | * Since the clutch bucket can contain threads that are members of the group due | |
1365 | * to the sched_pri being promoted or due to their base pri, the base priority of | |
1366 | * the entire clutch bucket should be based on the highest thread (promoted or base) | |
1367 | * in the clutch bucket. | |
1368 | */ | |
f427ee49 A |
1369 | uint8_t max_pri = 0; |
1370 | if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) { | |
1371 | max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq); | |
cb323159 | 1372 | } |
f427ee49 A |
1373 | |
1374 | sched_clutch_tg_priority_t tg_pri = os_atomic_load(&clutch->sc_tg_priority, relaxed); | |
1375 | clutch_boost = sched_clutch_bucket_group_pri_boost[tg_pri]; | |
cb323159 A |
1376 | return max_pri + clutch_boost; |
1377 | } | |
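/*
 * Worked example (illustrative values): if the highest runnable thread in
 * the clutch bucket has a base/promoted priority of 31 and the thread
 * group's priority maps to a boost of 8 in
 * sched_clutch_bucket_group_pri_boost[], the clutch bucket's base priority
 * is 31 + 8 = 39. The interactivity score computed below is then added on
 * top of this value by sched_clutch_bucket_pri_calculate().
 */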
1378 | ||
1379 | /* | |
f427ee49 | 1380 | * sched_clutch_interactivity_from_cpu_data() |
cb323159 | 1381 | * |
f427ee49 | 1382 | * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage |
cb323159 A |
1383 | */ |
1384 | static uint8_t | |
f427ee49 | 1385 | sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group) |
cb323159 | 1386 | { |
cb323159 | 1387 | sched_clutch_bucket_cpu_data_t scb_cpu_data; |
f427ee49 | 1388 | scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed); |
cb323159 A |
1389 | clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used; |
1390 | clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked; | |
f427ee49 | 1391 | uint8_t interactive_score = 0; |
cb323159 | 1392 | |
cb323159 | 1393 | if ((cpu_blocked == 0) && (cpu_used == 0)) { |
f427ee49 | 1394 | return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count; |
cb323159 | 1395 | } |
cb323159 A |
1396 | /* |
1397 | * For all timeshare buckets, calculate the interactivity score of the bucket | |
1398 | * and add it to the base priority | |
1399 | */ | |
cb323159 A |
1400 | if (cpu_blocked > cpu_used) { |
1401 | /* Interactive clutch_bucket case */ | |
f427ee49 A |
1402 | interactive_score = sched_clutch_bucket_group_interactive_pri + |
1403 | ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked); | |
cb323159 A |
1404 | } else { |
1405 | /* Non-interactive clutch_bucket case */ | |
f427ee49 | 1406 | interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used); |
cb323159 | 1407 | } |
cb323159 A |
1408 | return interactive_score; |
1409 | } | |
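/*
 * Worked example (assuming an illustrative value of 8 for
 * sched_clutch_bucket_group_interactive_pri, so scores range from 0 to 16):
 *
 *   cpu_blocked = 30, cpu_used = 10  =>  8 + (8 * 20) / 30 = 13  (interactive)
 *   cpu_blocked = 10, cpu_used = 40  =>  (8 * 10) / 40     =  2  (CPU bound)
 *
 * A group that blocks more than it runs lands in the upper half of the
 * range; a group that mostly burns CPU decays towards 0.
 */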
1410 | ||
1411 | /* | |
1412 | * sched_clutch_bucket_pri_calculate() | |
1413 | * | |
1414 | * The priority calculation algorithm for the clutch_bucket is a slight | |
1415 | * modification on the ULE interactivity score. It uses the base priority | |
1416 | * of the clutch bucket and applies an interactivity score boost to the | |
1417 | * highly responsive clutch buckets. | |
1418 | */ | |
cb323159 A |
1419 | static uint8_t |
1420 | sched_clutch_bucket_pri_calculate( | |
1421 | sched_clutch_bucket_t clutch_bucket, | |
1422 | uint64_t timestamp) | |
1423 | { | |
1424 | /* For empty clutch buckets, return priority 0 */ | |
1425 | if (clutch_bucket->scb_thr_count == 0) { | |
1426 | return 0; | |
1427 | } | |
1428 | ||
1429 | uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket); | |
f427ee49 | 1430 | uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp); |
cb323159 A |
1431 | |
1432 | assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX); | |
1433 | uint8_t pri = base_pri + interactive_score; | |
1434 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE, | |
f427ee49 | 1435 | thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0); |
cb323159 A |
1436 | return pri; |
1437 | } | |
1438 | ||
1439 | /* | |
1440 | * sched_clutch_root_bucket_highest_clutch_bucket() | |
1441 | * | |
1442 | * Routine to find the highest priority clutch bucket | |
1443 | * within the root bucket. | |
1444 | */ | |
1445 | static sched_clutch_bucket_t | |
1446 | sched_clutch_root_bucket_highest_clutch_bucket( | |
1447 | sched_clutch_root_bucket_t root_bucket) | |
1448 | { | |
ea3f0419 | 1449 | if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) { |
cb323159 A |
1450 | return NULL; |
1451 | } | |
ea3f0419 | 1452 | return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets); |
cb323159 A |
1453 | } |
1454 | ||
1455 | /* | |
1456 | * sched_clutch_bucket_runnable() | |
1457 | * | |
1458 | * Perform all operations needed when a new clutch bucket becomes runnable. | |
1459 | * It involves inserting the clutch_bucket into the hierarchy and updating the | |
1460 | * root priority appropriately. | |
1461 | */ | |
1462 | static boolean_t | |
1463 | sched_clutch_bucket_runnable( | |
1464 | sched_clutch_bucket_t clutch_bucket, | |
1465 | sched_clutch_root_t root_clutch, | |
ea3f0419 A |
1466 | uint64_t timestamp, |
1467 | sched_clutch_bucket_options_t options) | |
cb323159 A |
1468 | { |
1469 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
f427ee49 | 1470 | /* Since the clutch bucket became newly runnable, recalculate its priority before insertion */ |
cb323159 | 1471 | clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); |
ea3f0419 | 1472 | sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options); |
f427ee49 | 1473 | |
cb323159 | 1474 | /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */ |
f427ee49 | 1475 | sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp); |
cb323159 A |
1476 | int16_t root_old_pri = root_clutch->scr_priority; |
1477 | sched_clutch_root_pri_update(root_clutch); | |
1478 | return root_clutch->scr_priority > root_old_pri; | |
1479 | } | |
1480 | ||
1481 | /* | |
1482 | * sched_clutch_bucket_update() | |
1483 | * | |
ea3f0419 A |
1484 | * Update the clutch_bucket's position in the hierarchy. This routine is |
1485 | * called when a new thread is inserted or removed from a runnable clutch | |
1486 | * bucket. The options specify some properties about the clutch bucket | |
1487 | * insertion order into the clutch bucket runq. | |
cb323159 A |
1488 | */ |
1489 | static boolean_t | |
1490 | sched_clutch_bucket_update( | |
1491 | sched_clutch_bucket_t clutch_bucket, | |
1492 | sched_clutch_root_t root_clutch, | |
ea3f0419 A |
1493 | uint64_t timestamp, |
1494 | sched_clutch_bucket_options_t options) | |
cb323159 A |
1495 | { |
1496 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
1497 | uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); | |
f427ee49 | 1498 | sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets; |
cb323159 | 1499 | if (new_pri == clutch_bucket->scb_priority) { |
ea3f0419 A |
1500 | /* |
1501 | * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket | |
1502 | * to the end of the runq. Typically used when a thread is selected for execution | |
1503 | * from a clutch bucket. | |
1504 | */ | |
1505 | if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) { | |
1506 | sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket); | |
1507 | } | |
cb323159 A |
1508 | return false; |
1509 | } | |
ea3f0419 | 1510 | sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket); |
f427ee49 A |
1511 | #if CONFIG_SCHED_EDGE |
1512 | if (clutch_bucket->scb_foreign) { | |
1513 | priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink); | |
1514 | } | |
1515 | #endif /* CONFIG_SCHED_EDGE */ | |
ea3f0419 | 1516 | clutch_bucket->scb_priority = new_pri; |
f427ee49 A |
1517 | #if CONFIG_SCHED_EDGE |
1518 | if (clutch_bucket->scb_foreign) { | |
1519 | priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0); | |
1520 | priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink); | |
1521 | } | |
1522 | #endif /* CONFIG_SCHED_EDGE */ | |
ea3f0419 | 1523 | sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options); |
cb323159 A |
1524 | |
1525 | int16_t root_old_pri = root_clutch->scr_priority; | |
1526 | sched_clutch_root_pri_update(root_clutch); | |
1527 | return root_clutch->scr_priority > root_old_pri; | |
1528 | } | |
1529 | ||
1530 | /* | |
1531 | * sched_clutch_bucket_empty() | |
1532 | * | |
1533 | * Perform all the operations needed when a clutch_bucket is no longer runnable. | |
1534 | * It involves removing the clutch bucket from the hierarchy and updating the root | |
1535 | * priority appropriately. | |
1536 | */ | |
1537 | static void | |
1538 | sched_clutch_bucket_empty( | |
1539 | sched_clutch_bucket_t clutch_bucket, | |
1540 | sched_clutch_root_t root_clutch, | |
ea3f0419 A |
1541 | uint64_t timestamp, |
1542 | sched_clutch_bucket_options_t options) | |
cb323159 A |
1543 | { |
1544 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
ea3f0419 | 1545 | sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options); |
cb323159 A |
1546 | clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); |
1547 | sched_clutch_root_pri_update(root_clutch); | |
1548 | } | |
1549 | ||
1550 | /* | |
1551 | * sched_clutch_cpu_usage_update() | |
1552 | * | |
1553 | * Routine to update CPU usage of the thread in the hierarchy. | |
1554 | */ | |
1555 | void | |
1556 | sched_clutch_cpu_usage_update( | |
1557 | thread_t thread, | |
1558 | uint64_t delta) | |
1559 | { | |
f427ee49 | 1560 | if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) { |
cb323159 A |
1561 | return; |
1562 | } | |
f427ee49 | 1563 | |
cb323159 | 1564 | sched_clutch_t clutch = sched_clutch_for_thread(thread); |
f427ee49 A |
1565 | sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]); |
1566 | sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta); | |
cb323159 A |
1567 | } |
1568 | ||
1569 | /* | |
f427ee49 | 1570 | * sched_clutch_bucket_group_cpu_usage_update() |
cb323159 A |
1571 | * |
1572 | * Routine to update the CPU usage of the clutch_bucket. | |
1573 | */ | |
1574 | static void | |
f427ee49 A |
1575 | sched_clutch_bucket_group_cpu_usage_update( |
1576 | sched_clutch_bucket_group_t clutch_bucket_group, | |
cb323159 A |
1577 | uint64_t delta) |
1578 | { | |
f427ee49 | 1579 | if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) { |
cb323159 A |
1580 | /* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */ |
1581 | return; | |
1582 | } | |
f427ee49 A |
1583 | delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold); |
1584 | os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed); | |
cb323159 A |
1585 | } |
1586 | ||
1587 | /* | |
f427ee49 | 1588 | * sched_clutch_bucket_group_cpu_pending_adjust() |
cb323159 | 1589 | * |
f427ee49 A |
1590 | * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done |
1591 | * such that each "pending interval" provides a one-point improvement in the interactivity score. | |
cb323159 | 1592 | */ |
f427ee49 A |
1593 | static inline uint64_t |
1594 | sched_clutch_bucket_group_cpu_pending_adjust( | |
1595 | uint64_t cpu_used, | |
1596 | uint64_t cpu_blocked, | |
1597 | uint8_t pending_intervals) | |
cb323159 | 1598 | { |
f427ee49 A |
1599 | uint64_t cpu_used_adjusted = 0; |
1600 | if (cpu_blocked < cpu_used) { | |
1601 | cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used); | |
1602 | cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals)); | |
1603 | } else { | |
1604 | uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri; | |
1605 | cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor); | |
cb323159 | 1606 | } |
f427ee49 | 1607 | return cpu_used_adjusted; |
cb323159 A |
1608 | } |
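/*
 * Derivation of the branches above, with I = sched_clutch_bucket_group_interactive_pri
 * and p = pending_intervals. In the CPU-bound case (cpu_blocked < cpu_used) the
 * interactivity score is I * blocked / used; requiring the adjusted usage used'
 * to raise that score by p points gives
 *
 *   I * blocked / used' = (I * blocked / used) + p
 *   => used' = (I * blocked * used) / ((I * blocked) + (p * used))
 *
 * which is the first branch. In the interactive case the score is
 * 2I - (I * used / blocked); raising it by p points gives
 *
 *   used' = used - (p * blocked) / I
 *
 * clamped at 0, which is the second branch.
 */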
1609 | ||
1610 | /* | |
f427ee49 | 1611 | * sched_clutch_bucket_group_cpu_adjust() |
cb323159 A |
1612 | * |
1613 | * Routine to scale the cpu usage and blocked time once the sum gets bigger | |
f427ee49 | 1614 | * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain |
cb323159 A |
1615 | * manageable and maintain the same ratio while allowing changes in clutch bucket | |
1616 | * behavior to be reflected in the interactivity score in a reasonable | |
f427ee49 A |
1617 | * amount of time. Also adjusts the CPU usage based on pending_intervals |
1618 | * which allows ageout of CPU usage to avoid starvation in highly contended scenarios. | |
cb323159 A |
1619 | */ |
1620 | static void | |
f427ee49 A |
1621 | sched_clutch_bucket_group_cpu_adjust( |
1622 | sched_clutch_bucket_group_t clutch_bucket_group, | |
1623 | uint8_t pending_intervals) | |
cb323159 A |
1624 | { |
1625 | sched_clutch_bucket_cpu_data_t old_cpu_data = {}; | |
1626 | sched_clutch_bucket_cpu_data_t new_cpu_data = {}; | |
f427ee49 | 1627 | os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, { |
cb323159 A |
1628 | clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used; |
1629 | clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked; | |
cb323159 | 1630 | |
f427ee49 A |
1631 | if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) { |
1632 | /* No changes to the CPU used and blocked values */ | |
1633 | os_atomic_rmw_loop_give_up(); | |
1634 | } | |
1635 | if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) { | |
1636 | /* Only keep the recent CPU history to better indicate how this TG has been behaving */ | |
1637 | cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO; | |
1638 | cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO; | |
1639 | } | |
1640 | /* Use the shift passed in to ageout the CPU usage */ | |
1641 | cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals); | |
1642 | new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used; | |
1643 | new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked; | |
1644 | }); | |
cb323159 A |
1645 | } |
1646 | ||
1647 | /* | |
1648 | * Thread level scheduling algorithm | |
1649 | * | |
1650 | * The thread level scheduling algorithm uses the mach timeshare | |
1651 | * decay based algorithm to achieve sharing between threads within the | |
1652 | * same clutch bucket. The load/priority shifts etc. are all maintained | |
1653 | * at the clutch bucket level and used for decay calculation of the | |
1654 | * threads. The load sampling is still driven off the scheduler tick | |
1655 | * for runnable clutch buckets (it does not use the new higher frequency | |
1656 | * EWMA based load calculation). The idea is that the contention and load | |
1657 | * within clutch_buckets should be limited enough that threads avoid heavy | |
1658 | * decay and timeshare effectively. | |
1659 | */ | |
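/*
 * In concrete terms (a simplified sketch of the mach timeshare rule, not the
 * exact kernel computation): a thread's effective priority decays with its
 * CPU usage roughly as
 *
 *   sched_pri ~= base_pri - (sched_usage >> pri_shift)
 *
 * where pri_shift comes from the thread's clutch bucket group (see
 * sched_clutch_bucket_group_timeshare_update() below). A heavily loaded
 * bucket group gets a smaller pri_shift, so the same usage costs more
 * priority and threads in the group round-robin more aggressively.
 */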
1660 | ||
1661 | /* | |
1662 | * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr() | |
1663 | * | |
1664 | * Increment the run count for the clutch bucket associated with the | |
1665 | * thread. | |
1666 | */ | |
1667 | uint32_t | |
1668 | sched_clutch_thread_run_bucket_incr( | |
1669 | thread_t thread, | |
1670 | sched_bucket_t bucket) | |
1671 | { | |
1672 | if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { | |
1673 | return 0; | |
1674 | } | |
1675 | sched_clutch_t clutch = sched_clutch_for_thread(thread); | |
1676 | return sched_clutch_run_bucket_incr(clutch, bucket); | |
1677 | } | |
1678 | ||
1679 | static uint32_t | |
1680 | sched_clutch_run_bucket_incr( | |
1681 | sched_clutch_t clutch, | |
1682 | sched_bucket_t bucket) | |
1683 | { | |
1684 | assert(bucket != TH_BUCKET_RUN); | |
f427ee49 A |
1685 | sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]); |
1686 | return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group); | |
cb323159 A |
1687 | } |
1688 | ||
1689 | /* | |
1690 | * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr() | |
1691 | * | |
1692 | * Decrement the run count for the clutch bucket associated with the | |
1693 | * thread. | |
1694 | */ | |
1695 | uint32_t | |
1696 | sched_clutch_thread_run_bucket_decr( | |
1697 | thread_t thread, | |
1698 | sched_bucket_t bucket) | |
1699 | { | |
1700 | if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { | |
1701 | return 0; | |
1702 | } | |
1703 | sched_clutch_t clutch = sched_clutch_for_thread(thread); | |
1704 | return sched_clutch_run_bucket_decr(clutch, bucket); | |
1705 | } | |
1706 | ||
1707 | static uint32_t | |
1708 | sched_clutch_run_bucket_decr( | |
1709 | sched_clutch_t clutch, | |
1710 | sched_bucket_t bucket) | |
1711 | { | |
1712 | assert(bucket != TH_BUCKET_RUN); | |
f427ee49 A |
1713 | sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]); |
1714 | return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group); | |
cb323159 A |
1715 | } |
1716 | ||
1717 | /* | |
f427ee49 | 1718 | * sched_clutch_bucket_group_timeshare_update() |
cb323159 | 1719 | * |
f427ee49 A |
1720 | * Routine to update the load and priority shift for the clutch_bucket_group |
1721 | * every sched_tick. For multi-cluster platforms, each QoS level will have multiple | |
1722 | * clutch buckets with runnable threads in them. So it is important to maintain | |
1723 | * the timesharing information at the clutch_bucket_group level instead of | |
1724 | * individual clutch buckets (because the algorithm is trying to timeshare all | |
1725 | * threads at the same QoS irrespective of which hierarchy they are enqueued in). | |
1726 | * | |
1727 | * The routine is called from the sched tick handling code to make sure this value | |
1728 | * is updated at least once every sched tick. For clutch bucket groups which have | |
1729 | * not been runnable for very long, the clutch_bucket_group maintains a "last | |
1730 | * updated schedtick" parameter. As threads become runnable in the clutch bucket group, | |
1731 | * if this value is outdated, the load and shifts are updated. | |
cb323159 A |
1732 | * |
1733 | * Possible optimization: | |
1734 | * - The current algorithm samples the load every sched tick (125ms). | |
1735 | * This is prone to spikes in runnable counts; if that turns out to be | |
1736 | * a problem, a simple solution would be to do the EWMA trick to sample | |
1737 | * load at every load_tick (30ms) and use the averaged value for the pri | |
1738 | * shift calculation. | |
1739 | */ | |
1740 | static void | |
f427ee49 A |
1741 | sched_clutch_bucket_group_timeshare_update( |
1742 | sched_clutch_bucket_group_t clutch_bucket_group, | |
1743 | sched_clutch_bucket_t clutch_bucket, | |
1744 | uint64_t ctime) | |
cb323159 | 1745 | { |
f427ee49 A |
1746 | if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) { |
1747 | /* No timesharing needed for fixed priority Above UI threads */ | |
cb323159 A |
1748 | return; |
1749 | } | |
1750 | ||
1751 | /* | |
f427ee49 A |
1752 | * Update the timeshare parameters for the clutch bucket group |
1753 | * if they haven't been updated in this tick. | |
cb323159 | 1754 | */ |
f427ee49 | 1755 | uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed); |
cb323159 | 1756 | uint32_t current_sched_ts = sched_tick; |
f427ee49 A |
1757 | if (sched_ts < current_sched_ts) { |
1758 | os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed); | |
1759 | /* NCPU wide workloads should not experience decay */ | |
1760 | uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1; | |
1761 | uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count); | |
1762 | bucket_group_load = MIN(bucket_group_load, NRQS - 1); | |
1763 | uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load]; | |
1764 | /* Ensure that the pri_shift value is reasonable */ | |
1765 | pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift; | |
1766 | os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed); | |
cb323159 | 1767 | } |
f427ee49 A |
1768 | |
1769 | /* | |
1770 | * Update the clutch bucket priority; this allows clutch buckets that have been pending | |
1771 | * for a long time to get an updated interactivity score. | |
1772 | */ | |
1773 | sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE); | |
cb323159 A |
1774 | } |
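/*
 * Worked example for the load calculation above: with 8 available CPUs and
 * 8 runnable/running threads in the bucket group, the "- 1" makes
 * bucket_group_run_count == 7, so bucket_group_load == 7 / 8 == 0 and the
 * group effectively sees no decay (an NCPU-wide workload is not penalized).
 * With 33 threads on the same 8 CPUs, bucket_group_load == 32 / 8 == 4,
 * which indexes sched_load_shifts[] to produce a smaller pri_shift and
 * therefore faster priority decay for every thread in the group.
 */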
1775 | ||
1776 | /* | |
1777 | * sched_clutch_thread_clutch_update() | |
1778 | * | |
1779 | * Routine called when the thread changes its thread group. The current | |
1780 | * implementation relies on the fact that the thread group is changed only | |
1781 | * from the context of the thread itself. Due to this fact, the thread | |
1782 | * group change causes only counter updates in the old & new clutch | |
1783 | * buckets and no hierarchy changes. The routine also attributes the CPU | |
1784 | * used so far to the old clutch. | |
1785 | */ | |
1786 | void | |
1787 | sched_clutch_thread_clutch_update( | |
1788 | thread_t thread, | |
1789 | sched_clutch_t old_clutch, | |
1790 | sched_clutch_t new_clutch) | |
1791 | { | |
1792 | uint32_t cpu_delta; | |
1793 | assert(current_thread() == thread); | |
1794 | ||
1795 | if (old_clutch) { | |
1796 | sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket); | |
1797 | /* | |
1798 | * Calculate the CPU used by this thread in the old bucket and | |
1799 | * add it to the old clutch bucket. This uses the same CPU usage | |
1800 | * logic as update_priority etc. | |
1801 | */ | |
1802 | thread_timer_delta(thread, cpu_delta); | |
1803 | if (thread->pri_shift < INT8_MAX) { | |
1804 | thread->sched_usage += cpu_delta; | |
1805 | } | |
1806 | thread->cpu_delta += cpu_delta; | |
f427ee49 A |
1807 | if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) { |
1808 | sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]); | |
1809 | sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta); | |
1810 | } | |
cb323159 A |
1811 | } |
1812 | ||
1813 | if (new_clutch) { | |
1814 | sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket); | |
1815 | } | |
1816 | } | |
1817 | ||
1818 | /* Thread Insertion/Removal/Selection routines */ | |
1819 | ||
f427ee49 A |
1820 | #if CONFIG_SCHED_EDGE |
1821 | ||
1822 | /* | |
1823 | * Edge Scheduler Bound Thread Support | |
1824 | * | |
1825 | * The edge scheduler allows threads to be bound to specific clusters. The scheduler | |
1826 | * maintains a separate runq on the clutch root to hold these bound threads. These | |
1827 | * bound threads count towards the root priority and thread count, but are ignored | |
1828 | * for thread migration/steal decisions. Bound threads that are enqueued in the | |
1829 | * separate runq have the th_bound_cluster_enqueued flag set to allow easy | |
1830 | * removal. | |
1831 | * | |
1832 | * Bound Threads Timesharing | |
1833 | * The bound threads share the timesharing properties of the clutch bucket group they are | |
1834 | * part of. They contribute to the load and use priority shifts/decay values from the | |
1835 | * clutch bucket group. | |
1836 | */ | |
1837 | ||
1838 | static boolean_t | |
1839 | sched_edge_bound_thread_insert( | |
1840 | sched_clutch_root_t root_clutch, | |
1841 | thread_t thread, | |
1842 | integer_t options) | |
1843 | { | |
1844 | /* Update the clutch runnable count and priority */ | |
1845 | sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); | |
1846 | sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket]; | |
1847 | if (root_bucket->scrb_bound_thread_runq.count == 0) { | |
1848 | sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time()); | |
1849 | } | |
1850 | ||
1851 | assert((thread->th_bound_cluster_enqueued) == false); | |
1852 | run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options); | |
1853 | thread->th_bound_cluster_enqueued = true; | |
1854 | ||
1855 | int16_t root_old_pri = root_clutch->scr_priority; | |
1856 | sched_clutch_root_pri_update(root_clutch); | |
1857 | return root_clutch->scr_priority > root_old_pri; | |
1858 | } | |
1859 | ||
1860 | static void | |
1861 | sched_edge_bound_thread_remove( | |
1862 | sched_clutch_root_t root_clutch, | |
1863 | thread_t thread) | |
1864 | { | |
1865 | sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket]; | |
1866 | assert((thread->th_bound_cluster_enqueued) == true); | |
1867 | run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread); | |
1868 | thread->th_bound_cluster_enqueued = false; | |
1869 | ||
1870 | /* Update the clutch runnable count and priority */ | |
1871 | sched_clutch_thr_count_dec(&root_clutch->scr_thr_count); | |
1872 | if (root_bucket->scrb_bound_thread_runq.count == 0) { | |
1873 | sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time()); | |
1874 | } | |
1875 | sched_clutch_root_pri_update(root_clutch); | |
1876 | } | |
1877 | ||
1878 | #endif /* CONFIG_SCHED_EDGE */ | |
1879 | ||
1880 | /* | |
1881 | * sched_clutch_thread_bound_lookup() | |
1882 | * | |
1883 | * Routine to look up the highest priority runnable thread in a bound root bucket. | |
1884 | */ | |
1885 | static thread_t | |
1886 | sched_clutch_thread_bound_lookup( | |
1887 | __unused sched_clutch_root_t root_clutch, | |
1888 | sched_clutch_root_bucket_t root_bucket) | |
1889 | { | |
1890 | return run_queue_peek(&root_bucket->scrb_bound_thread_runq); | |
1891 | } | |
1892 | ||
1893 | /* | |
1894 | * Clutch Bucket Group Thread Counts and Pending time calculation | |
1895 | * | |
1896 | * The pending time on the clutch_bucket_group allows the scheduler to track if it | |
1897 | * needs to ageout the CPU usage because the clutch_bucket_group has been pending for | |
1898 | * a very long time. The pending time is set to the timestamp as soon as a thread becomes | |
1899 | * runnable. When a thread is picked up for execution from this clutch_bucket_group, the | |
1900 | * pending time is advanced to the time of thread selection. | |
1901 | * | |
1902 | * Since threads for a clutch bucket group can be added or removed from multiple CPUs | |
1903 | * simultaneously, it is important that the updates to thread counts and pending timestamps | |
1904 | * happen atomically. The implementation relies on the following aspects to make that work | |
1905 | * as expected: | |
1906 | * - The clutch scheduler would be deployed on single cluster platforms where the pset lock | |
1907 | * is held when threads are added/removed and pending timestamps are updated | |
1908 | * - The edge scheduler would have support for double wide 128 bit atomics which allow the | |
1909 | * thread count and pending timestamp to be updated atomically. | |
1910 | * | |
1911 | * Clutch bucket group interactivity timestamp and score updates also rely on the properties | |
1912 | * above to atomically update the interactivity score for a clutch bucket group. | |
1913 | */ | |
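/*
 * A minimal sketch of the packed counter/timestamp pattern used below (the
 * field layout here is an assumption for illustration, not the kernel's
 * actual definition): the count and the timestamp share one double-wide
 * value so a single atomic read-modify-write keeps them consistent without
 * a lock.
 *
 *   union counter_time {
 *       unsigned __int128 packed;
 *       struct {
 *           uint64_t count;
 *           uint64_t timestamp;
 *       };
 *   };
 *
 * Each update loads packed, computes the new count/timestamp pair and
 * publishes it with a compare-exchange on packed, retrying on contention --
 * the shape of the os_atomic_rmw_loop() calls in the CONFIG_SCHED_EDGE
 * variants below.
 */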
1914 | ||
1915 | #if CONFIG_SCHED_EDGE | |
1916 | ||
1917 | static void | |
1918 | sched_clutch_bucket_group_thr_count_inc( | |
1919 | sched_clutch_bucket_group_t clutch_bucket_group, | |
1920 | uint64_t timestamp) | |
1921 | { | |
1922 | sched_clutch_counter_time_t old_pending_data; | |
1923 | sched_clutch_counter_time_t new_pending_data; | |
1924 | os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, { | |
1925 | new_pending_data.scct_count = old_pending_data.scct_count + 1; | |
1926 | new_pending_data.scct_timestamp = old_pending_data.scct_timestamp; | |
1927 | if (old_pending_data.scct_count == 0) { | |
1928 | new_pending_data.scct_timestamp = timestamp; | |
1929 | } | |
1930 | }); | |
1931 | } | |
1932 | ||
1933 | static void | |
1934 | sched_clutch_bucket_group_thr_count_dec( | |
1935 | sched_clutch_bucket_group_t clutch_bucket_group, | |
1936 | uint64_t timestamp) | |
1937 | { | |
1938 | sched_clutch_counter_time_t old_pending_data; | |
1939 | sched_clutch_counter_time_t new_pending_data; | |
1940 | os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, { | |
1941 | new_pending_data.scct_count = old_pending_data.scct_count - 1; | |
1942 | if (new_pending_data.scct_count == 0) { | |
1943 | new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID; | |
1944 | } else { | |
1945 | new_pending_data.scct_timestamp = timestamp; | |
1946 | } | |
1947 | }); | |
1948 | } | |
1949 | ||
1950 | static uint8_t | |
1951 | sched_clutch_bucket_group_pending_ageout( | |
1952 | sched_clutch_bucket_group_t clutch_bucket_group, | |
1953 | uint64_t timestamp) | |
1954 | { | |
1955 | int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket); | |
1956 | sched_clutch_counter_time_t old_pending_data; | |
1957 | sched_clutch_counter_time_t new_pending_data; | |
1958 | uint8_t cpu_usage_shift = 0; | |
1959 | ||
1960 | os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, { | |
1961 | cpu_usage_shift = 0; | |
1962 | uint64_t old_pending_ts = old_pending_data.scct_timestamp; | |
1963 | bool old_update = (old_pending_ts >= timestamp); | |
1964 | bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID); | |
1965 | bool no_bucket_load = (bucket_load == 0); | |
1966 | if (old_update || no_pending_time || no_bucket_load) { | |
1967 | os_atomic_rmw_loop_give_up(); | |
1968 | } | |
1969 | ||
1970 | /* Calculate the time the clutch bucket group has been pending */ | |
1971 | uint64_t pending_delta = timestamp - old_pending_ts; | |
1972 | uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load; | |
1973 | if (pending_delta < interactivity_delta) { | |
1974 | os_atomic_rmw_loop_give_up(); | |
1975 | } | |
1976 | cpu_usage_shift = (pending_delta / interactivity_delta); | |
1977 | new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta); | |
1978 | new_pending_data.scct_count = old_pending_data.scct_count; | |
1979 | }); | |
1980 | return cpu_usage_shift; | |
1981 | } | |
1982 | ||
1983 | static uint8_t | |
1984 | sched_clutch_bucket_group_interactivity_score_calculate( | |
1985 | sched_clutch_bucket_group_t clutch_bucket_group, | |
1986 | uint64_t timestamp) | |
1987 | { | |
1988 | if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) { | |
1989 | /* | |
1990 | * Since the root bucket selection algorithm for Above UI looks at clutch bucket | |
1991 | * priorities, make sure all AboveUI buckets are marked interactive. | |
1992 | */ | |
1993 | assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri)); | |
1994 | return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count; | |
1995 | } | |
1996 | /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */ | |
1997 | uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp); | |
1998 | /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */ | |
1999 | sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals); | |
2000 | uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group); | |
2001 | sched_clutch_counter_time_t old_interactivity_data; | |
2002 | sched_clutch_counter_time_t new_interactivity_data; | |
2003 | ||
2004 | bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, { | |
2005 | if (old_interactivity_data.scct_timestamp >= timestamp) { | |
2006 | os_atomic_rmw_loop_give_up(); | |
2007 | } | |
2008 | new_interactivity_data.scct_timestamp = timestamp; | |
2009 | if (old_interactivity_data.scct_timestamp != 0) { | |
2010 | new_interactivity_data.scct_count = interactivity_score; | |
2011 | } | |
2012 | }); | |
2013 | if (score_updated) { | |
2014 | return (uint8_t)new_interactivity_data.scct_count; | |
2015 | } else { | |
2016 | return (uint8_t)old_interactivity_data.scct_count; | |
2017 | } | |
2018 | } | |
2019 | ||
2020 | #else /* CONFIG_SCHED_EDGE */ | |
2021 | ||
2022 | /* | |
2023 | * For the clutch scheduler, atomicity is ensured by making sure all operations | |
2024 | * are happening under the pset lock of the only cluster present on the platform. | |
2025 | */ | |
2026 | static void | |
2027 | sched_clutch_bucket_group_thr_count_inc( | |
2028 | sched_clutch_bucket_group_t clutch_bucket_group, | |
2029 | uint64_t timestamp) | |
2030 | { | |
2031 | sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root); | |
2032 | if (clutch_bucket_group->scbg_pending_data.scct_count == 0) { | |
2033 | clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp; | |
2034 | } | |
2035 | clutch_bucket_group->scbg_pending_data.scct_count++; | |
2036 | } | |
2037 | ||
2038 | static void | |
2039 | sched_clutch_bucket_group_thr_count_dec( | |
2040 | sched_clutch_bucket_group_t clutch_bucket_group, | |
2041 | uint64_t timestamp) | |
2042 | { | |
2043 | sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root); | |
2044 | clutch_bucket_group->scbg_pending_data.scct_count--; | |
2045 | if (clutch_bucket_group->scbg_pending_data.scct_count == 0) { | |
2046 | clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID; | |
2047 | } else { | |
2048 | clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp; | |
2049 | } | |
2050 | } | |
2051 | ||
2052 | static uint8_t | |
2053 | sched_clutch_bucket_group_pending_ageout( | |
2054 | sched_clutch_bucket_group_t clutch_bucket_group, | |
2055 | uint64_t timestamp) | |
2056 | { | |
2057 | sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root); | |
2058 | int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket); | |
2059 | uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp; | |
2060 | bool old_update = (old_pending_ts >= timestamp); | |
2061 | bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID); | |
2062 | bool no_bucket_load = (bucket_load == 0); | |
2063 | if (old_update || no_pending_time || no_bucket_load) { | |
2064 | return 0; | |
2065 | } | |
2066 | uint64_t pending_delta = timestamp - old_pending_ts; | |
2067 | uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load; | |
2068 | if (pending_delta < interactivity_delta) { | |
2069 | return 0; | |
2070 | } | |
2071 | uint8_t cpu_usage_shift = (pending_delta / interactivity_delta); | |
2072 | clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta); | |
2073 | return cpu_usage_shift; | |
2074 | } | |
2075 | ||
2076 | static uint8_t | |
2077 | sched_clutch_bucket_group_interactivity_score_calculate( | |
2078 | sched_clutch_bucket_group_t clutch_bucket_group, | |
2079 | uint64_t timestamp) | |
2080 | { | |
2081 | sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root); | |
2082 | if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) { | |
2083 | /* | |
2084 | * Since the root bucket selection algorithm for Above UI looks at clutch bucket | |
2085 | * priorities, make sure all AboveUI buckets are marked interactive. | |
2086 | */ | |
2087 | assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri)); | |
2088 | return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count; | |
2089 | } | |
2090 | /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */ | |
2091 | uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp); | |
2092 | /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */ | |
2093 | sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals); | |
2094 | uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group); | |
2095 | if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) { | |
2096 | clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score; | |
2097 | clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp; | |
2098 | return interactivity_score; | |
2099 | } else { | |
2100 | return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count; | |
2101 | } | |
2102 | } | |
2103 | ||
2104 | #endif /* CONFIG_SCHED_EDGE */ | |
2105 | ||
2106 | /* | |
2107 | * Clutch Bucket Group Run Count and Blocked Time Accounting | |
2108 | * | |
2109 | * The clutch bucket group maintains the number of runnable/running threads in the group. | |
2110 | * Since the blocked time of the clutch bucket group is based on this count, it is | |
2111 | * important to make sure the blocking timestamp and the run count are updated atomically. | |
2112 | * | |
2113 | * Since the run count increments happen without any pset locks held, the scheduler makes | |
2114 | * these updates atomic in the following way: | |
2115 | * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp | |
2116 | * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update | |
2117 | */ | |
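/*
 * Timeline sketch of the accounting below: the blocked timestamp is armed
 * only on the 1 -> 0 run-count transition and consumed on the next
 * 0 -> 1 transition, so blocked time accrues only while the *entire*
 * bucket group is off core:
 *
 *   t0: last runnable thread blocks   run count 1 -> 0, scct_timestamp = t0
 *   t1: a thread becomes runnable     run count 0 -> 1,
 *       scbcd_cpu_blocked += MIN(t1 - t0, sched_clutch_bucket_group_adjust_threshold)
 *
 * A group in which at least one thread is always runnable therefore
 * accumulates no blocked time and keeps a low (CPU-bound) interactivity
 * score.
 */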
2118 | ||
2119 | #if !__LP64__ | |
2120 | ||
2121 | static uint32_t | |
2122 | sched_clutch_bucket_group_run_count_inc( | |
2123 | sched_clutch_bucket_group_t clutch_bucket_group) | |
2124 | { | |
2125 | uint64_t blocked_time = 0; | |
2126 | uint64_t updated_run_count = 0; | |
2127 | ||
2128 | lck_spin_lock(&clutch_bucket_group->scbg_stats_lock); | |
2129 | if ((clutch_bucket_group->scbg_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID) && | |
2130 | (clutch_bucket_group->scbg_blocked_data.scct_count == 0)) { | |
2131 | /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */ | |
2132 | blocked_time = mach_absolute_time() - clutch_bucket_group->scbg_blocked_data.scct_timestamp; | |
2133 | clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID; | |
2134 | } | |
2135 | clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count + 1; | |
2136 | updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count; | |
2137 | lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock); | |
2138 | ||
2139 | blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold); | |
2140 | os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed); | |
2141 | return (uint32_t)updated_run_count; | |
2142 | } | |
2143 | ||
2144 | static uint32_t | |
2145 | sched_clutch_bucket_group_run_count_dec( | |
2146 | sched_clutch_bucket_group_t clutch_bucket_group) | |
2147 | { | |
2148 | uint64_t updated_run_count = 0; | |
2149 | ||
2150 | lck_spin_lock(&clutch_bucket_group->scbg_stats_lock); | |
2151 | clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count - 1; | |
2152 | if (clutch_bucket_group->scbg_blocked_data.scct_count == 0) { | |
2153 | /* Run count is transitioning from 1 to 0; start the blocked timer */ | |
2154 | clutch_bucket_group->scbg_blocked_data.scct_timestamp = mach_absolute_time(); | |
2155 | } | |
2156 | updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count; | |
2157 | lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock); | |
2158 | return (uint32_t)updated_run_count; | |
2159 | } | |
2160 | ||
2161 | #else /* !__LP64__ */ | |
2162 | ||
2163 | static uint32_t | |
2164 | sched_clutch_bucket_group_run_count_inc( | |
2165 | sched_clutch_bucket_group_t clutch_bucket_group) | |
2166 | { | |
2167 | sched_clutch_counter_time_t old_blocked_data; | |
2168 | sched_clutch_counter_time_t new_blocked_data; | |
2169 | ||
2170 | bool update_blocked_time = false; | |
2171 | os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, { | |
2172 | new_blocked_data.scct_count = old_blocked_data.scct_count + 1; | |
2173 | new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp; | |
2174 | update_blocked_time = false; | |
2175 | if (old_blocked_data.scct_count == 0) { | |
2176 | new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID; | |
2177 | update_blocked_time = true; | |
2178 | } | |
2179 | }); | |
2180 | if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) { | |
2181 | uint64_t ctime = mach_absolute_time(); | |
2182 | if (ctime > old_blocked_data.scct_timestamp) { | |
2183 | uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp; | |
2184 | blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold); | |
2185 | os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed); | |
2186 | } | |
2187 | } | |
2188 | return (uint32_t)new_blocked_data.scct_count; | |
2189 | } | |
2190 | ||
2191 | static uint32_t | |
2192 | sched_clutch_bucket_group_run_count_dec( | |
2193 | sched_clutch_bucket_group_t clutch_bucket_group) | |
2194 | { | |
2195 | sched_clutch_counter_time_t old_blocked_data; | |
2196 | sched_clutch_counter_time_t new_blocked_data; | |
2197 | ||
2198 | uint64_t ctime = mach_absolute_time(); | |
2199 | os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, { | |
2200 | new_blocked_data.scct_count = old_blocked_data.scct_count - 1; | |
2201 | new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp; | |
2202 | if (new_blocked_data.scct_count == 0) { | |
2203 | new_blocked_data.scct_timestamp = ctime; | |
2204 | } | |
2205 | }); | |
2206 | return (uint32_t)new_blocked_data.scct_count; | |
2207 | } | |
2208 | ||
2209 | #endif /* !__LP64__ */ | |
2210 | ||
cb323159 A |
2211 | /* |
2212 | * sched_clutch_thread_insert() | |
2213 | * | |
2214 | * Routine to insert a thread into the sched clutch hierarchy. | |
2215 | * Update the counts at all levels of the hierarchy and insert the nodes | |
2216 | * as they become runnable. Always called with the pset lock held. | |
2217 | */ | |
2218 | static boolean_t | |
2219 | sched_clutch_thread_insert( | |
2220 | sched_clutch_root_t root_clutch, | |
2221 | thread_t thread, | |
2222 | integer_t options) | |
2223 | { | |
2224 | boolean_t result = FALSE; | |
2225 | ||
2226 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
f427ee49 A |
2227 | #if CONFIG_SCHED_EDGE |
2228 | sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket); | |
2229 | /* | |
2230 | * Check if the thread is bound and is being enqueued in its desired bound cluster. | |
2231 | * One scenario where a bound thread might not be getting enqueued in the bound cluster | |
2232 | * hierarchy would be if the thread is "soft" bound and the bound cluster is | |
2233 | * de-recommended. In that case, the thread should be treated as an unbound | |
2234 | * thread. | |
2235 | */ | |
2236 | if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) { | |
2237 | return sched_edge_bound_thread_insert(root_clutch, thread, options); | |
2238 | } | |
2239 | #endif /* CONFIG_SCHED_EDGE */ | |
cb323159 A |
2240 | sched_clutch_t clutch = sched_clutch_for_thread(thread); |
2241 | assert(thread->thread_group == clutch->sc_tg); | |
2242 | ||
2243 | uint64_t current_timestamp = mach_absolute_time(); | |
f427ee49 A |
2244 | sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]); |
2245 | sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]); | |
2246 | assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch)); |
2247 | ||
2248 | /* |
2249 | * Thread linkage in clutch_bucket | |
2250 | * | |
2251 | * A thread has a few linkages within the clutch bucket: | |
2252 | * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket | |
2253 | * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation) | |
2254 | * - A queue linkage used for timesharing operations of threads at the scheduler tick | |
2255 | */ | |
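/*
 * Concretely, in the code below:
 *  - scb_thread_runq            holds th_clutch_runq_link
 *  - scb_clutchpri_prioq        holds th_clutch_pri_link
 *  - scb_thread_timeshare_queue holds th_clutch_timeshare_link
 */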
2256 | ||
2257 | /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */ | |
2258 | thread->th_clutch_runq_link.stamp = current_timestamp; | |
2259 | priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri, | |
2260 | (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED); | |
2261 | priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link); | |
2262 | ||
2263 | /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */ | |
2264 | priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link, | |
2265 | sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false); | |
2266 | priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link); | |
2267 | ||
2268 | /* Insert thread into timesharing queue of the clutch bucket */ | |
2269 | enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link); | |
2270 | ||
2271 | /* Increment the urgency counter for the root if necessary */ |
2272 | sched_clutch_root_urgency_inc(root_clutch, thread); | |
2273 | ||
2274 | os_atomic_inc(&clutch->sc_thr_count, relaxed); | |
2275 | sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp); | |
2276 | ||
2277 | /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */ |
2278 | sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ; | |
2279 | if (clutch_bucket->scb_thr_count == 0) { |
2280 | sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count); | |
2281 | sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); | |
2282 | result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options); | |
2283 | } else { |
2284 | sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count); | |
2285 | sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); | |
2286 | result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options); | |
2287 | } | |
2288 | |
2289 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE, | |
2290 | root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, | |
2291 | SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count)); | |
2292 | return result; |
2293 | } | |
2294 | ||
2295 | /* | |
2296 | * sched_clutch_thread_remove() | |
2297 | * | |
2298 | * Routine to remove a thread from the sched clutch hierarchy. | |
2299 | * Update the counts at all levels of the hierarchy and remove the nodes | |
2300 | * as they become empty. Always called with the pset lock held. | |
2301 | */ | |
2302 | static void | |
2303 | sched_clutch_thread_remove( | |
2304 | sched_clutch_root_t root_clutch, | |
2305 | thread_t thread, | |
2306 | uint64_t current_timestamp, |
2307 | sched_clutch_bucket_options_t options) | |
2308 | { |
2309 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
2310 | #if CONFIG_SCHED_EDGE |
2311 | sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket); | |
2312 | if (thread->th_bound_cluster_enqueued) { | |
2313 | sched_edge_bound_thread_remove(root_clutch, thread); | |
2314 | return; | |
2315 | } | |
2316 | #endif /* CONFIG_SCHED_EDGE */ | |
2317 | sched_clutch_t clutch = sched_clutch_for_thread(thread); |
2318 | assert(thread->thread_group == clutch->sc_tg); | |
2319 | assert(thread->runq != PROCESSOR_NULL); | |
2320 | ||
2321 | sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]); |
2322 | sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]); | |
2323 | assert(clutch_bucket->scb_root == root_clutch); |
2324 | ||
2325 | /* Decrement the urgency counter for the root if necessary */ | |
2326 | sched_clutch_root_urgency_dec(root_clutch, thread); | |
2327 | /* Remove thread from the clutch_bucket */ | |
2328 | priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link); |
2329 | remqueue(&thread->th_clutch_timeshare_link); | |
2330 | thread->runq = PROCESSOR_NULL; | |
2331 | ||
2332 | priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link); | |
2333 | |
2334 | /* Update counts at various levels of the hierarchy */ | |
2335 | os_atomic_dec(&clutch->sc_thr_count, relaxed); | |
2336 | sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp); | |
2337 | sched_clutch_thr_count_dec(&root_clutch->scr_thr_count); |
2338 | sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count); | |
2339 | ||
2340 | /* Remove the clutch from hierarchy (if needed) and update properties */ | |
2341 | if (clutch_bucket->scb_thr_count == 0) { | |
2342 | sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options); | |
2343 | } else { | |
2344 | sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options); | |
2345 | } | |
2346 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE, |
2347 | root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, | |
2348 | SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count)); | |
2349 | } | |
2350 | ||
2351 | /* | |
2352 | * sched_clutch_thread_unbound_lookup() | |
2353 | * | |
2354 | * Routine to find the highest priority unbound thread in the root clutch. | |
2355 | * Helps find threads easily for steal/migrate scenarios in the | |
2356 | * Edge scheduler. | |
2357 | */ | |
2358 | static thread_t | |
2359 | sched_clutch_thread_unbound_lookup( | |
2360 | sched_clutch_root_t root_clutch, | |
2361 | sched_clutch_root_bucket_t root_bucket) | |
2362 | { | |
2363 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
2364 | ||
2365 | /* Find the highest priority clutch bucket in this root bucket */ | |
2366 | sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket); | |
2367 | assert(clutch_bucket != NULL); | |
2368 | ||
2369 | /* Find the highest priority runnable thread in this clutch bucket */ | |
2370 | thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link); | |
2371 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE, | |
2372 | thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0); | |
2373 | return thread; | |
2374 | } |
2375 | ||
2376 | /* | |
2377 | * sched_clutch_thread_highest_remove() | |
2378 | * |
2379 | * Routine to find and remove the highest priority thread | |
2380 | * from the sched clutch hierarchy. The algorithm looks at the | |
2381 | * hierarchy for the most eligible runnable thread and calls | |
2382 | * sched_clutch_thread_remove(). Always called with the | |
2383 | * pset lock held. | |
2384 | */ | |
2385 | static thread_t | |
2386 | sched_clutch_thread_highest_remove( | |
2387 | sched_clutch_root_t root_clutch) |
2388 | { | |
2389 | sched_clutch_hierarchy_locked_assert(root_clutch); | |
2390 | uint64_t current_timestamp = mach_absolute_time(); | |
2391 | ||
2392 | sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL); | |
2393 | if (root_bucket == NULL) { |
2394 | return THREAD_NULL; | |
2395 | } | |
2396 | ||
2397 | thread_t highest_thread = THREAD_NULL; |
2398 | if (root_bucket->scrb_bound) { | |
2399 | highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket); | |
2400 | } else { | |
2401 | highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket); | |
2402 | } | |
2403 | sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR); | |
2404 | return highest_thread; | |
2405 | } |
2406 | ||
2407 | /* High level global accessor routines */ |
2408 | ||
2409 | /* | |
2410 | * sched_clutch_root_urgency() | |
2411 | * | |
2412 | * Routine to get the urgency of the highest runnable | |
2413 | * thread in the hierarchy. | |
2414 | */ | |
2415 | static uint32_t | |
2416 | sched_clutch_root_urgency( | |
2417 | sched_clutch_root_t root_clutch) | |
2418 | { | |
2419 | return root_clutch->scr_urgency; | |
2420 | } | |
2421 | ||
2422 | /* | |
2423 | * sched_clutch_root_count_sum() | |
2424 | * | |
2425 | * The count_sum mechanism is used for scheduler runq | |
2426 | * statistics calculation. It's only useful for debugging | |
2427 | * purposes; since maintaining it takes a mach_absolute_time() | |
2428 | * call on other scheduler implementations, it's better to avoid | |
2429 | * populating it until absolutely necessary. | |
2430 | */ | |
2431 | static uint32_t | |
2432 | sched_clutch_root_count_sum( | |
2433 | __unused sched_clutch_root_t root_clutch) | |
2434 | { | |
2435 | return 0; | |
2436 | } | |
2437 | ||
2438 | /* | |
2439 | * sched_clutch_root_priority() | |
2440 | * | |
2441 | * Routine to get the priority of the highest runnable | |
2442 | * thread in the hierarchy. | |
2443 | */ | |
2444 | static int | |
2445 | sched_clutch_root_priority( | |
2446 | sched_clutch_root_t root_clutch) | |
2447 | { | |
2448 | return root_clutch->scr_priority; | |
2449 | } | |
2450 | ||
2451 | /* | |
2452 | * sched_clutch_root_count() | |
2453 | * | |
2454 | * Returns total number of runnable threads in the hierarchy. | |
2455 | */ | |
2456 | uint32_t | |
2457 | sched_clutch_root_count( | |
2458 | sched_clutch_root_t root_clutch) | |
2459 | { | |
2460 | return root_clutch->scr_thr_count; | |
2461 | } | |
2462 | ||
2463 | #if CONFIG_SCHED_EDGE |
2464 | ||
2465 | /* | |
2466 | * sched_clutch_root_foreign_empty() | |
2467 | * | |
2468 | * Routine to check if the foreign clutch bucket priority list is empty for a cluster. | |
2469 | */ | |
2470 | static boolean_t | |
2471 | sched_clutch_root_foreign_empty( | |
2472 | sched_clutch_root_t root_clutch) | |
2473 | { | |
2474 | return priority_queue_empty(&root_clutch->scr_foreign_buckets); | |
2475 | } | |
2476 | ||
2477 | /* | |
2478 | * sched_clutch_root_highest_foreign_thread_remove() | |
2479 | * | |
2480 | * Routine to remove and return the highest priority thread from the highest priority foreign clutch bucket in a cluster. | |
2481 | * Must be called with the pset for the cluster locked. | |
2482 | */ | |
2483 | static thread_t | |
2484 | sched_clutch_root_highest_foreign_thread_remove( | |
2485 | sched_clutch_root_t root_clutch) | |
2486 | { | |
2487 | thread_t thread = THREAD_NULL; | |
2488 | if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) { | |
2489 | return thread; | |
2490 | } | |
2491 | sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink); | |
2492 | thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link); | |
2493 | sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0); | |
2494 | return thread; | |
2495 | } | |
2496 | ||
2497 | #endif /* CONFIG_SCHED_EDGE */ | |
2498 | ||
2499 | /* |
2500 | * sched_clutch_thread_pri_shift() | |
2501 | * | |
2502 | * Routine to get the priority shift value for a thread. | |
2503 | * Since the timesharing is done at the clutch_bucket level, | |
2504 | * this routine gets the clutch_bucket and retrieves the | |
2505 | * values from there. | |
2506 | */ | |
2507 | uint32_t | |
2508 | sched_clutch_thread_pri_shift( | |
2509 | thread_t thread, | |
2510 | sched_bucket_t bucket) | |
2511 | { | |
2512 | if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { | |
2513 | return INT8_MAX; | |
2514 | } |
2515 | assert(bucket != TH_BUCKET_RUN); | |
2516 | sched_clutch_t clutch = sched_clutch_for_thread(thread); | |
2517 | sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]); |
2518 | return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed); | |
2519 | } |
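/*
 * A sketch of how the returned shift is consumed (assuming the standard
 * Mach timeshare decay calculation; the field names below come from the
 * timeshare code, not this routine): a larger pri_shift decays priority
 * less for the same CPU usage, and the INT8_MAX returned for ineligible
 * threads effectively disables decay.
 *
 * sched_pri = base_pri - (sched_usage >> pri_shift);
 */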
2520 | ||
2521 | #pragma mark -- Clutch Scheduler Algorithm | |
2522 | ||
2523 | static void | |
2524 | sched_clutch_init(void); | |
2525 | ||
2526 | static thread_t |
2527 | sched_clutch_steal_thread(processor_set_t pset); | |
2528 | ||
2529 | static void | |
2530 | sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context); | |
2531 | ||
2532 | static boolean_t | |
2533 | sched_clutch_processor_enqueue(processor_t processor, thread_t thread, | |
2534 | sched_options_t options); | |
2535 | ||
2536 | static boolean_t | |
2537 | sched_clutch_processor_queue_remove(processor_t processor, thread_t thread); | |
2538 | ||
2539 | static ast_t | |
2540 | sched_clutch_processor_csw_check(processor_t processor); | |
2541 | ||
2542 | static boolean_t | |
2543 | sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte); | |
2544 | ||
2545 | static int | |
2546 | sched_clutch_runq_count(processor_t processor); | |
2547 | ||
2548 | static boolean_t | |
2549 | sched_clutch_processor_queue_empty(processor_t processor); | |
2550 | ||
2551 | static uint64_t | |
2552 | sched_clutch_runq_stats_count_sum(processor_t processor); | |
2553 | ||
2554 | static int | |
2555 | sched_clutch_processor_bound_count(processor_t processor); | |
2556 | ||
2557 | static void | |
2558 | sched_clutch_pset_init(processor_set_t pset); | |
2559 | ||
2560 | static void | |
2561 | sched_clutch_processor_init(processor_t processor); | |
2562 | ||
2563 | static thread_t | |
2564 | sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason); | |
2565 | ||
2566 | static void | |
2567 | sched_clutch_processor_queue_shutdown(processor_t processor); | |
2568 | ||
2569 | static sched_mode_t | |
2570 | sched_clutch_initial_thread_sched_mode(task_t parent_task); | |
2571 | ||
2572 | static uint32_t | |
2573 | sched_clutch_initial_quantum_size(thread_t thread); | |
2574 | ||
2575 | static bool | |
2576 | sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread); | |
2577 | ||
2578 | static uint32_t | |
2579 | sched_clutch_run_incr(thread_t thread); | |
2580 | ||
2581 | static uint32_t | |
2582 | sched_clutch_run_decr(thread_t thread); | |
2583 | ||
2584 | static void | |
2585 | sched_clutch_update_thread_bucket(thread_t thread); | |
2586 | ||
2587 | const struct sched_dispatch_table sched_clutch_dispatch = { | |
2588 | .sched_name = "clutch", | |
2589 | .init = sched_clutch_init, | |
2590 | .timebase_init = sched_timeshare_timebase_init, | |
2591 | .processor_init = sched_clutch_processor_init, |
2592 | .pset_init = sched_clutch_pset_init, | |
2593 | .maintenance_continuation = sched_timeshare_maintenance_continue, | |
2594 | .choose_thread = sched_clutch_choose_thread, | |
2595 | .steal_thread_enabled = sched_steal_thread_enabled, | |
2596 | .steal_thread = sched_clutch_steal_thread, | |
2597 | .compute_timeshare_priority = sched_compute_timeshare_priority, | |
2598 | .choose_node = sched_choose_node, | |
2599 | .choose_processor = choose_processor, |
2600 | .processor_enqueue = sched_clutch_processor_enqueue, | |
2601 | .processor_queue_shutdown = sched_clutch_processor_queue_shutdown, | |
2602 | .processor_queue_remove = sched_clutch_processor_queue_remove, | |
2603 | .processor_queue_empty = sched_clutch_processor_queue_empty, | |
2604 | .priority_is_urgent = priority_is_urgent, | |
2605 | .processor_csw_check = sched_clutch_processor_csw_check, | |
2606 | .processor_queue_has_priority = sched_clutch_processor_queue_has_priority, | |
2607 | .initial_quantum_size = sched_clutch_initial_quantum_size, | |
2608 | .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode, | |
2609 | .can_update_priority = can_update_priority, | |
2610 | .update_priority = update_priority, | |
2611 | .lightweight_update_priority = lightweight_update_priority, | |
2612 | .quantum_expire = sched_default_quantum_expire, | |
2613 | .processor_runq_count = sched_clutch_runq_count, | |
2614 | .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum, | |
2615 | .processor_bound_count = sched_clutch_processor_bound_count, | |
2616 | .thread_update_scan = sched_clutch_thread_update_scan, | |
2617 | .multiple_psets_enabled = TRUE, | |
2618 | .sched_groups_enabled = FALSE, | |
2619 | .avoid_processor_enabled = TRUE, | |
2620 | .thread_avoid_processor = sched_clutch_thread_avoid_processor, | |
2621 | .processor_balance = sched_SMT_balance, | |
2622 | ||
2623 | .rt_runq = sched_rtlocal_runq, |
2624 | .rt_init = sched_rtlocal_init, | |
2625 | .rt_queue_shutdown = sched_rtlocal_queue_shutdown, | |
2626 | .rt_runq_scan = sched_rtlocal_runq_scan, | |
2627 | .rt_runq_count_sum = sched_rtlocal_runq_count_sum, | |
2628 | |
2629 | .qos_max_parallelism = sched_qos_max_parallelism, | |
2630 | .check_spill = sched_check_spill, | |
2631 | .ipi_policy = sched_ipi_policy, | |
2632 | .thread_should_yield = sched_thread_should_yield, | |
2633 | .run_count_incr = sched_clutch_run_incr, | |
2634 | .run_count_decr = sched_clutch_run_decr, | |
2635 | .update_thread_bucket = sched_clutch_update_thread_bucket, | |
2636 | .pset_made_schedulable = sched_pset_made_schedulable, | |
2637 | }; | |
2638 | ||
2639 | __attribute__((always_inline)) | |
2640 | static inline run_queue_t | |
2641 | sched_clutch_bound_runq(processor_t processor) | |
2642 | { | |
2643 | return &processor->runq; | |
2644 | } | |
2645 | ||
2646 | __attribute__((always_inline)) | |
2647 | static inline sched_clutch_root_t | |
2648 | sched_clutch_processor_root_clutch(processor_t processor) | |
2649 | { | |
2650 | return &processor->processor_set->pset_clutch_root; | |
2651 | } | |
2652 | ||
2653 | __attribute__((always_inline)) | |
2654 | static inline run_queue_t | |
2655 | sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread) | |
2656 | { | |
2657 | assert(thread->bound_processor == processor); | |
2658 | return sched_clutch_bound_runq(processor); | |
2659 | } | |
2660 | ||
2661 | static uint32_t | |
2662 | sched_clutch_initial_quantum_size(thread_t thread) | |
2663 | { | |
2664 | if (thread == THREAD_NULL) { | |
2665 | return std_quantum; | |
2666 | } | |
2667 | assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX); | |
2668 | return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket]; | |
2669 | } | |
2670 | ||
2671 | static sched_mode_t | |
2672 | sched_clutch_initial_thread_sched_mode(task_t parent_task) | |
2673 | { | |
2674 | if (parent_task == kernel_task) { | |
2675 | return TH_MODE_FIXED; | |
2676 | } else { | |
2677 | return TH_MODE_TIMESHARE; | |
2678 | } | |
2679 | } | |
2680 | ||
2681 | static void | |
2682 | sched_clutch_processor_init(processor_t processor) | |
2683 | { | |
2684 | run_queue_init(&processor->runq); | |
2685 | } | |
2686 | ||
2687 | static void | |
2688 | sched_clutch_pset_init(processor_set_t pset) | |
2689 | { | |
2690 | sched_clutch_root_init(&pset->pset_clutch_root, pset); | |
2691 | } | |
2692 | ||
2693 | static void |
2694 | sched_clutch_tunables_init(void) | |
2695 | { | |
2696 | sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel); | |
2697 | sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp); | |
2698 | sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum); | |
2699 | clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS, | |
2700 | NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold); | |
2701 | assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX); | |
2702 | sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta); | |
2703 | } | |
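/*
 * A sketch of what sched_clutch_us_to_abstime() is assumed to do (the
 * real macro is defined earlier in this file): convert a per-bucket array
 * of microsecond tunables into mach absolute time units. Array names are
 * placeholders.
 *
 * for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
 *         clock_interval_to_absolutetime_interval(us_values[i],
 *             NSEC_PER_USEC, &abstime_values[i]);
 * }
 */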
2704 | ||
2705 | static void |
2706 | sched_clutch_init(void) | |
2707 | { | |
2708 | if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) { |
2709 | sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT; | |
2710 | } |
2711 | sched_timeshare_init(); | |
2712 | sched_clutch_tunables_init(); | |
2713 | } |
2714 | ||
2715 | static thread_t | |
2716 | sched_clutch_choose_thread( | |
2717 | processor_t processor, | |
2718 | int priority, | |
2719 | __unused ast_t reason) | |
2720 | { | |
2721 | int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); | |
2722 | uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)); | |
2723 | run_queue_t bound_runq = sched_clutch_bound_runq(processor); | |
2724 | boolean_t choose_from_boundq = false; | |
2725 | ||
2726 | if (bound_runq->highq < priority && | |
2727 | clutch_pri < priority) { | |
2728 | return THREAD_NULL; | |
2729 | } | |
2730 | ||
2731 | if (bound_runq->count && clutch_count) { | |
2732 | if (bound_runq->highq >= clutch_pri) { | |
2733 | choose_from_boundq = true; | |
2734 | } | |
2735 | } else if (bound_runq->count) { | |
2736 | choose_from_boundq = true; | |
2737 | } else if (clutch_count) { | |
2738 | choose_from_boundq = false; | |
2739 | } else { | |
2740 | return THREAD_NULL; | |
2741 | } | |
2742 | ||
2743 | thread_t thread = THREAD_NULL; | |
2744 | if (choose_from_boundq == false) { | |
2745 | sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); | |
2746 | thread = sched_clutch_thread_highest_remove(pset_clutch_root); | |
2747 | } else { |
2748 | thread = run_queue_dequeue(bound_runq, SCHED_HEADQ); | |
2749 | } | |
2750 | return thread; | |
2751 | } | |
2752 | ||
2753 | static boolean_t | |
2754 | sched_clutch_processor_enqueue( | |
2755 | processor_t processor, | |
2756 | thread_t thread, | |
2757 | sched_options_t options) | |
2758 | { | |
2759 | boolean_t result; | |
2760 | ||
2761 | thread->runq = processor; | |
2762 | if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { | |
2763 | sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); | |
2764 | result = sched_clutch_thread_insert(pset_clutch_root, thread, options); | |
2765 | } else { | |
2766 | run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread); | |
2767 | result = run_queue_enqueue(rq, thread, options); | |
2768 | } | |
2769 | return result; | |
2770 | } | |
2771 | ||
2772 | static boolean_t | |
2773 | sched_clutch_processor_queue_empty(processor_t processor) | |
2774 | { | |
2775 | return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 && | |
2776 | sched_clutch_bound_runq(processor)->count == 0; | |
2777 | } | |
2778 | ||
2779 | static ast_t | |
2780 | sched_clutch_processor_csw_check(processor_t processor) | |
2781 | { | |
2782 | boolean_t has_higher; | |
2783 | int pri; | |
2784 | ||
2785 | if (sched_clutch_thread_avoid_processor(processor, current_thread())) { | |
2786 | return AST_PREEMPT | AST_URGENT; | |
2787 | } | |
2788 | ||
2789 | run_queue_t bound_runq = sched_clutch_bound_runq(processor); | |
2790 | int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); | |
2791 | ||
2792 | assert(processor->active_thread != NULL); | |
2793 | ||
2794 | pri = MAX(clutch_pri, bound_runq->highq); | |
2795 | ||
2796 | if (processor->first_timeslice) { | |
2797 | has_higher = (pri > processor->current_pri); | |
2798 | } else { | |
2799 | has_higher = (pri >= processor->current_pri); | |
2800 | } | |
2801 | ||
2802 | if (has_higher) { | |
2803 | if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) { | |
2804 | return AST_PREEMPT | AST_URGENT; | |
2805 | } | |
2806 | ||
2807 | if (bound_runq->urgency > 0) { | |
2808 | return AST_PREEMPT | AST_URGENT; | |
2809 | } | |
2810 | ||
2811 | return AST_PREEMPT; | |
2812 | } | |
2813 | ||
2814 | return AST_NONE; | |
2815 | } | |
2816 | ||
2817 | static boolean_t | |
2818 | sched_clutch_processor_queue_has_priority(processor_t processor, | |
2819 | int priority, | |
2820 | boolean_t gte) | |
2821 | { | |
2822 | run_queue_t bound_runq = sched_clutch_bound_runq(processor); | |
2823 | ||
2824 | int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq); | |
2825 | ||
2826 | if (gte) { | |
2827 | return qpri >= priority; | |
2828 | } else { | |
2829 | return qpri > priority; | |
2830 | } | |
2831 | } | |
2832 | ||
2833 | static int | |
2834 | sched_clutch_runq_count(processor_t processor) | |
2835 | { | |
2836 | return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count; | |
2837 | } | |
2838 | ||
2839 | static uint64_t | |
2840 | sched_clutch_runq_stats_count_sum(processor_t processor) | |
2841 | { | |
2842 | uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum; | |
2843 | ||
2844 | if (processor->cpu_id == processor->processor_set->cpu_set_low) { | |
2845 | return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor)); | |
2846 | } else { | |
2847 | return bound_sum; | |
2848 | } | |
2849 | } | |
2850 | static int | |
2851 | sched_clutch_processor_bound_count(processor_t processor) | |
2852 | { | |
2853 | return sched_clutch_bound_runq(processor)->count; | |
2854 | } | |
2855 | ||
2856 | static void | |
2857 | sched_clutch_processor_queue_shutdown(processor_t processor) | |
2858 | { | |
2859 | processor_set_t pset = processor->processor_set; | |
2860 | sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); | |
2861 | thread_t thread; | |
2862 | queue_head_t tqueue; | |
2863 | ||
2864 | /* We only need to migrate threads if this is the last active processor in the pset */ | |
2865 | if (pset->online_processor_count > 0) { | |
2866 | pset_unlock(pset); | |
2867 | return; | |
2868 | } | |
2869 | ||
2870 | queue_init(&tqueue); | |
2871 | while (sched_clutch_root_count(pset_clutch_root) > 0) { | |
2872 | thread = sched_clutch_thread_highest_remove(pset_clutch_root); | |
2873 | enqueue_tail(&tqueue, &thread->runq_links); |
2874 | } | |
2875 | ||
2876 | pset_unlock(pset); | |
2877 | ||
2878 | qe_foreach_element_safe(thread, &tqueue, runq_links) { | |
2879 | remqueue(&thread->runq_links); | |
2880 | thread_lock(thread); | |
2881 | thread_setrun(thread, SCHED_TAILQ); | |
2882 | thread_unlock(thread); |
2883 | } | |
2884 | } | |
2885 | ||
2886 | static boolean_t | |
2887 | sched_clutch_processor_queue_remove( | |
2888 | processor_t processor, | |
2889 | thread_t thread) | |
2890 | { | |
2891 | run_queue_t rq; | |
2892 | processor_set_t pset = processor->processor_set; | |
2893 | ||
2894 | pset_lock(pset); | |
2895 | ||
2896 | if (processor == thread->runq) { | |
2897 | /* | |
2898 | * Thread is on a run queue and we have a lock on | |
2899 | * that run queue. | |
2900 | */ | |
2901 | if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { | |
2902 | sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); | |
2903 | sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE); | |
2904 | } else { |
2905 | rq = sched_clutch_thread_bound_runq(processor, thread); | |
2906 | run_queue_remove(rq, thread); | |
2907 | } | |
2908 | } else { | |
2909 | /* | |
2910 | * The thread left the run queue before we could | |
2911 | * lock the run queue. | |
2912 | */ | |
2913 | assert(thread->runq == PROCESSOR_NULL); | |
2914 | processor = PROCESSOR_NULL; | |
2915 | } | |
2916 | ||
2917 | pset_unlock(pset); | |
2918 | ||
2919 | return processor != PROCESSOR_NULL; | |
2920 | } | |
2921 | ||
2922 | static thread_t | |
2923 | sched_clutch_steal_thread(__unused processor_set_t pset) | |
2924 | { | |
2925 | /* Thread stealing is not enabled for single cluster clutch scheduler platforms */ | |
2926 | return THREAD_NULL; |
2927 | } | |
2928 | ||
2929 | static void | |
2930 | sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context) | |
2931 | { | |
2932 | boolean_t restart_needed = FALSE; | |
2933 | processor_t processor = processor_list; | |
2934 | processor_set_t pset; | |
2935 | thread_t thread; | |
2936 | spl_t s; | |
2937 | ||
2938 | /* | |
2939 | * We update the threads associated with each processor (bound and idle threads) | |
2940 | * and then update the threads in each pset runqueue. | |
2941 | */ | |
2942 | ||
2943 | do { | |
2944 | do { | |
2945 | pset = processor->processor_set; | |
2946 | ||
2947 | s = splsched(); | |
2948 | pset_lock(pset); | |
2949 | ||
2950 | restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context); | |
2951 | ||
2952 | pset_unlock(pset); | |
2953 | splx(s); | |
2954 | ||
2955 | if (restart_needed) { | |
2956 | break; | |
2957 | } | |
2958 | ||
2959 | thread = processor->idle_thread; | |
2960 | if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { | |
2961 | if (thread_update_add_thread(thread) == FALSE) { | |
2962 | restart_needed = TRUE; | |
2963 | break; | |
2964 | } | |
2965 | } | |
2966 | } while ((processor = processor->processor_list) != NULL); | |
2967 | ||
2968 | /* Ok, we now have a collection of candidates -- fix them. */ | |
2969 | thread_update_process_threads(); | |
2970 | } while (restart_needed); | |
2971 | ||
2972 | pset_node_t node = &pset_node0; |
2973 | pset = node->psets; | |
2974 | |
2975 | do { | |
2976 | do { | |
2977 | restart_needed = FALSE; |
2978 | while (pset != NULL) { | |
2979 | s = splsched(); | |
2980 | pset_lock(pset); | |
2981 | ||
2982 | if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) { | |
2983 | for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) { | |
2984 | restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context); | |
2985 | if (restart_needed) { | |
2986 | break; | |
2987 | } | |
2988 | } | |
2989 | queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets; |
2990 | sched_clutch_bucket_t clutch_bucket; | |
2991 | qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) { | |
2992 | sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime); | |
2993 | restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context); | |
2994 | } | |
2995 | } | |
2996 | ||
2997 | pset_unlock(pset); | |
2998 | splx(s); | |
2999 | ||
3000 | if (restart_needed) { | |
3001 | break; | |
cb323159 | 3002 | } |
3003 | pset = pset->pset_list; | |
3004 | } |
3005 | ||
3006 | if (restart_needed) { |
3007 | break; | |
3008 | } | |
3009 | } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); | |
3010 | |
3011 | /* Ok, we now have a collection of candidates -- fix them. */ | |
3012 | thread_update_process_threads(); | |
3013 | } while (restart_needed); | |
3014 | } | |
3015 | ||
3016 | extern int sched_allow_rt_smt; | |
3017 | ||
3018 | /* Return true if this thread should not continue running on this processor */ | |
3019 | static bool | |
3020 | sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread) | |
3021 | { | |
3022 | if (processor->processor_primary != processor) { | |
3023 | /* | |
3024 | * This is a secondary SMT processor. If the primary is running | |
3025 | * a realtime thread, only allow realtime threads on the secondary. | |
3026 | */ | |
3027 | if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) { | |
3028 | return true; | |
3029 | } | |
3030 | } | |
3031 | ||
3032 | return false; | |
3033 | } | |
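/*
 * Worked example of the rule above: if the primary of an SMT pair is
 * running a realtime thread (sched_pri >= BASEPRI_RTQUEUES), a timeshare
 * thread on the secondary returns true here and is sent elsewhere; even a
 * realtime thread on the secondary must move unless sched_allow_rt_smt is
 * enabled.
 */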
3034 | ||
3035 | /* | |
3036 | * For the clutch scheduler, the run counts are maintained in the clutch | |
3037 | * buckets (i.e. the thread group scheduling structure). | |
3038 | */ | |
3039 | static uint32_t | |
3040 | sched_clutch_run_incr(thread_t thread) | |
3041 | { | |
3042 | assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN); | |
3043 | uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed); | |
3044 | sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket); | |
3045 | return new_count; | |
3046 | } | |
3047 | ||
3048 | static uint32_t | |
3049 | sched_clutch_run_decr(thread_t thread) | |
3050 | { | |
3051 | assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN); | |
3052 | uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed); | |
3053 | sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket); | |
3054 | return new_count; | |
3055 | } | |
3056 | ||
3057 | static sched_bucket_t | |
3058 | sched_convert_pri_to_bucket(uint8_t priority) | |
3059 | { | |
3060 | sched_bucket_t bucket = TH_BUCKET_RUN; | |
3061 | ||
3062 | if (priority > BASEPRI_USER_INITIATED) { | |
3063 | bucket = TH_BUCKET_SHARE_FG; | |
3064 | } else if (priority > BASEPRI_DEFAULT) { | |
3065 | bucket = TH_BUCKET_SHARE_IN; | |
3066 | } else if (priority > BASEPRI_UTILITY) { | |
3067 | bucket = TH_BUCKET_SHARE_DF; | |
3068 | } else if (priority > MAXPRI_THROTTLE) { | |
3069 | bucket = TH_BUCKET_SHARE_UT; | |
3070 | } else { | |
3071 | bucket = TH_BUCKET_SHARE_BG; | |
3072 | } | |
3073 | return bucket; | |
3074 | } | |
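/*
 * Summary of the mapping above (priority band -> bucket):
 *
 *   pri > BASEPRI_USER_INITIATED -> TH_BUCKET_SHARE_FG
 *   pri > BASEPRI_DEFAULT        -> TH_BUCKET_SHARE_IN
 *   pri > BASEPRI_UTILITY        -> TH_BUCKET_SHARE_DF
 *   pri > MAXPRI_THROTTLE        -> TH_BUCKET_SHARE_UT
 *   otherwise                    -> TH_BUCKET_SHARE_BG
 */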
3075 | ||
3076 | /* | |
3077 | * For threads that have changed sched_pri without changing the | |
3078 | * base_pri for any reason other than decay, use the sched_pri | |
3079 | * as the bucketizing priority instead of base_pri. All such | |
3080 | * changes are typically due to kernel locking primitives boosts | |
3081 | * or demotions. | |
3082 | */ | |
3083 | static boolean_t | |
3084 | sched_thread_sched_pri_promoted(thread_t thread) | |
3085 | { | |
3086 | return (thread->sched_flags & TH_SFLAG_PROMOTED) || | |
3087 | (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) || | |
3088 | (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) || | |
3089 | (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || | |
3090 | (thread->kern_promotion_schedpri != 0); | |
3091 | } | |
3092 | ||
3093 | /* | |
3094 | * Routine to update the scheduling bucket for the thread. | |
3095 | * | |
3096 | * In the clutch scheduler implementation, the thread's bucket | |
3097 | * is based on sched_pri if it was promoted due to a kernel | |
3098 | * primitive; otherwise it's based on the thread's base_pri. This | |
3099 | * enhancement allows promoted threads to reach a higher priority | |
3100 | * bucket and potentially get selected sooner for scheduling. | |
3101 | * | |
3102 | * Also, the clutch scheduler does not honor fixed priority below | |
3103 | * FG priority. It simply puts those threads in the corresponding | |
3104 | * timeshare bucket. The reason for doing that is that it is | |
3105 | * extremely hard to define the scheduling properties of such threads | |
3106 | * and they typically lead to performance issues. | |
3107 | */ | |
3108 | ||
3109 | void | |
3110 | sched_clutch_update_thread_bucket(thread_t thread) | |
3111 | { | |
3112 | sched_bucket_t old_bucket = thread->th_sched_bucket; | |
3113 | sched_bucket_t new_bucket = TH_BUCKET_RUN; | |
3114 | assert(thread->runq == PROCESSOR_NULL); | |
3115 | int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri; |
3116 | ||
3117 | switch (thread->sched_mode) { | |
3118 | case TH_MODE_FIXED: | |
3119 | if (pri >= BASEPRI_FOREGROUND) { | |
3120 | new_bucket = TH_BUCKET_FIXPRI; | |
3121 | } else { | |
3122 | new_bucket = sched_convert_pri_to_bucket(pri); | |
3123 | } | |
3124 | break; | |
3125 | ||
3126 | case TH_MODE_REALTIME: | |
3127 | new_bucket = TH_BUCKET_FIXPRI; | |
3128 | break; | |
3129 | ||
3130 | case TH_MODE_TIMESHARE: | |
3131 | new_bucket = sched_convert_pri_to_bucket(pri); | |
3132 | break; | |
3133 | ||
3134 | default: | |
3135 | panic("unexpected mode: %d", thread->sched_mode); | |
3136 | break; | |
3137 | } | |
3138 | ||
3139 | if (old_bucket == new_bucket) { | |
3140 | return; | |
3141 | } | |
3142 | ||
3143 | thread->th_sched_bucket = new_bucket; | |
3144 | thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket); | |
3145 | /* |
3146 | * Since this is called after the thread has been removed from the runq, | |
3147 | * only the run counts need to be updated. The re-insert into the runq | |
3148 | * would put the thread into the correct new bucket's runq. | |
3149 | */ | |
3150 | if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) { | |
3151 | sched_clutch_thread_run_bucket_decr(thread, old_bucket); | |
3152 | sched_clutch_thread_run_bucket_incr(thread, new_bucket); | |
3153 | } | |
3154 | } | |
3155 | ||
3156 | #if CONFIG_SCHED_EDGE | |
3157 | |
3158 | /* Implementation of the AMP version of the clutch scheduler */ | |
3159 | ||
3160 | static void |
3161 | sched_edge_init(void); | |
3162 | ||
3163 | static thread_t | |
3164 | sched_edge_processor_idle(processor_set_t pset); | |
3165 | |
3166 | static ast_t | |
3167 | sched_edge_processor_csw_check(processor_t processor); | |
3168 | |
3169 | static boolean_t | |
3170 | sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte); | |
3171 | |
3172 | static boolean_t | |
3173 | sched_edge_processor_queue_empty(processor_t processor); | |
3174 | |
3175 | static thread_t | |
3176 | sched_edge_choose_thread(processor_t processor, int priority, ast_t reason); | |
3177 | |
3178 | static void | |
3179 | sched_edge_processor_queue_shutdown(processor_t processor); | |
3180 | |
3181 | static processor_t | |
3182 | sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread); | |
3183 | |
3184 | static bool | |
3185 | sched_edge_thread_avoid_processor(processor_t processor, thread_t thread); | |
3186 | ||
3187 | static void |
3188 | sched_edge_balance(processor_t cprocessor, processor_set_t cpset); | |
3189 | |
3190 | static void | |
3191 | sched_edge_check_spill(processor_set_t pset, thread_t thread); |
3192 | ||
3193 | static bool | |
3194 | sched_edge_thread_should_yield(processor_t processor, thread_t thread); | |
3195 | |
3196 | static void | |
3197 | sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock); |
3198 | ||
3199 | static bool | |
3200 | sched_edge_steal_thread_enabled(processor_set_t pset); | |
3201 | ||
3202 | static sched_ipi_type_t | |
3203 | sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event); | |
3204 | ||
3205 | static uint32_t | |
3206 | sched_edge_qos_max_parallelism(int qos, uint64_t options); | |
3207 | ||
3208 | const struct sched_dispatch_table sched_edge_dispatch = { |
3209 | .sched_name = "edge", | |
3210 | .init = sched_edge_init, | |
3211 | .timebase_init = sched_timeshare_timebase_init, | |
3212 | .processor_init = sched_clutch_processor_init, |
3213 | .pset_init = sched_clutch_pset_init, | |
3214 | .maintenance_continuation = sched_timeshare_maintenance_continue, | |
3215 | .choose_thread = sched_edge_choose_thread, |
3216 | .steal_thread_enabled = sched_edge_steal_thread_enabled, | |
3217 | .steal_thread = sched_edge_processor_idle, | |
3218 | .compute_timeshare_priority = sched_compute_timeshare_priority, | |
3219 | .choose_node = sched_choose_node, |
3220 | .choose_processor = sched_edge_choose_processor, | |
3221 | .processor_enqueue = sched_clutch_processor_enqueue, | |
3222 | .processor_queue_shutdown = sched_edge_processor_queue_shutdown, | |
3223 | .processor_queue_remove = sched_clutch_processor_queue_remove, | |
3224 | .processor_queue_empty = sched_edge_processor_queue_empty, | |
3225 | .priority_is_urgent = priority_is_urgent, | |
3226 | .processor_csw_check = sched_edge_processor_csw_check, |
3227 | .processor_queue_has_priority = sched_edge_processor_queue_has_priority, | |
3228 | .initial_quantum_size = sched_clutch_initial_quantum_size, |
3229 | .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode, | |
3230 | .can_update_priority = can_update_priority, | |
3231 | .update_priority = update_priority, | |
3232 | .lightweight_update_priority = lightweight_update_priority, | |
3233 | .quantum_expire = sched_default_quantum_expire, | |
3234 | .processor_runq_count = sched_clutch_runq_count, | |
3235 | .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum, | |
3236 | .processor_bound_count = sched_clutch_processor_bound_count, | |
3237 | .thread_update_scan = sched_clutch_thread_update_scan, | |
3238 | .multiple_psets_enabled = TRUE, | |
3239 | .sched_groups_enabled = FALSE, | |
3240 | .avoid_processor_enabled = TRUE, | |
3241 | .thread_avoid_processor = sched_edge_thread_avoid_processor, |
3242 | .processor_balance = sched_edge_balance, | |
3243 | |
3244 | .rt_runq = sched_amp_rt_runq, | |
3245 | .rt_init = sched_amp_rt_init, | |
3246 | .rt_queue_shutdown = sched_amp_rt_queue_shutdown, | |
3247 | .rt_runq_scan = sched_amp_rt_runq_scan, | |
3248 | .rt_runq_count_sum = sched_amp_rt_runq_count_sum, | |
3249 | ||
3250 | .qos_max_parallelism = sched_edge_qos_max_parallelism, |
3251 | .check_spill = sched_edge_check_spill, | |
3252 | .ipi_policy = sched_edge_ipi_policy, | |
3253 | .thread_should_yield = sched_edge_thread_should_yield, | |
3254 | .run_count_incr = sched_clutch_run_incr, |
3255 | .run_count_decr = sched_clutch_run_decr, | |
3256 | .update_thread_bucket = sched_clutch_update_thread_bucket, | |
3257 | .pset_made_schedulable = sched_edge_pset_made_schedulable, |
3258 | .thread_group_recommendation_change = NULL, | |
3259 | }; |
3260 | ||
3261 | static struct processor_set pset1; |
3262 | static struct pset_node pset_node1; | |
3263 | static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)]; | |
3264 | ||
3265 | /* | |
3266 | * sched_edge_pset_available() | |
3267 | * | |
3268 | * Routine to determine if a pset is available for scheduling. | |
3269 | */ | |
3270 | static bool | |
3271 | sched_edge_pset_available(processor_set_t pset) | |
3272 | { | |
3273 | return bitmap_test(sched_edge_available_pset_bitmask, pset->pset_cluster_id); | |
3274 | } | |
3275 | ||
3276 | /* | |
3277 | * sched_edge_thread_bound_cluster_id() | |
3278 | * | |
3279 | * Routine to determine which cluster a particular thread is bound to. Uses | |
3280 | * the sched_flags on the thread to map back to a specific cluster id. | |
3281 | * | |
3282 | * <Edge Multi-cluster Support Needed> | |
3283 | */ | |
3284 | static uint32_t | |
3285 | sched_edge_thread_bound_cluster_id(thread_t thread) | |
3286 | { | |
3287 | assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)); | |
3288 | if (thread->sched_flags & TH_SFLAG_ECORE_ONLY) { | |
3289 | return (pset_array[0]->pset_type == CLUSTER_TYPE_E) ? 0 : 1; | |
3290 | } else { | |
3291 | return (pset_array[0]->pset_type == CLUSTER_TYPE_P) ? 0 : 1; | |
3292 | } | |
3293 | } | |
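/*
 * Example of the mapping above: on a device where pset0 is the E-cluster,
 * a TH_SFLAG_ECORE_ONLY thread maps to cluster 0 and a P-bound thread to
 * cluster 1; the mapping flips when pset0 is a P-cluster.
 */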
3294 | ||
3295 | /* Forward declaration for some thread migration routines */ | |
3296 | static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset); | |
3297 | static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset); | |
3298 | static processor_set_t sched_edge_steal_candidate(processor_set_t pset); | |
3299 | static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks); | |
3300 | ||
3301 | /* | |
3302 | * sched_edge_config_set() | |
3303 | * | |
3304 | * Support to update an edge configuration. Typically used by CLPC to affect thread migration | |
3305 | * policies in the scheduler. | |
3306 | */ | |
3307 | static void | |
3308 | sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config) | |
3309 | { | |
3310 | sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster]; | |
3311 | edge->sce_edge_packed = edge_config.sce_edge_packed; | |
3312 | } | |
3313 | ||
3314 | /* | |
3315 | * sched_edge_config_get() | |
3316 | * | |
3317 | * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide | |
3318 | * if it needs to update edges. | |
3319 | */ | |
3320 | static sched_clutch_edge | |
3321 | sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster) | |
3322 | { | |
3323 | return pset_array[src_cluster]->sched_edges[dst_cluster]; | |
3324 | } | |
3325 | ||
3326 | #if DEVELOPMENT || DEBUG | |
3327 | ||
3328 | /* | |
3329 | * Helper Routines for edge scheduler sysctl configuration | |
3330 | * | |
3331 | * The current support is limited to dual cluster AMP platforms. | |
3332 | * <Edge Multi-cluster Support Needed> | |
3333 | */ | |
3334 | ||
3335 | kern_return_t | |
3336 | sched_edge_sysctl_configure_e_to_p(uint64_t edge_config) | |
3337 | { | |
3338 | pset_array[ecore_cluster_id]->sched_edges[pcore_cluster_id].sce_edge_packed = edge_config; | |
3339 | return KERN_SUCCESS; | |
3340 | } | |
3341 | ||
3342 | kern_return_t | |
3343 | sched_edge_sysctl_configure_p_to_e(uint64_t edge_config) | |
3344 | { | |
3345 | pset_array[pcore_cluster_id]->sched_edges[ecore_cluster_id].sce_edge_packed = edge_config; | |
3346 | return KERN_SUCCESS; | |
3347 | } | |
3348 | ||
3349 | sched_clutch_edge | |
3350 | sched_edge_e_to_p(void) | |
3351 | { | |
3352 | return sched_edge_config_get(ecore_cluster_id, pcore_cluster_id); | |
3353 | } | |
3354 | ||
3355 | sched_clutch_edge | |
3356 | sched_edge_p_to_e(void) | |
3357 | { | |
3358 | return sched_edge_config_get(pcore_cluster_id, ecore_cluster_id); | |
3359 | } | |
3360 | ||
3361 | #endif /* DEVELOPMENT || DEBUG */ | |
3362 | ||
3363 | /* | |
3364 | * sched_edge_matrix_set() | |
3365 | * | |
3366 | * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap | |
3367 | * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap | |
3368 | * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array. | |
3369 | */ | |
3370 | void | |
3371 | sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order) | |
3372 | { | |
3373 | uint32_t edge_index = 0; | |
3374 | for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) { | |
3375 | for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) { | |
3376 | if (edge_changes_bitmap[edge_index]) { | |
3377 | sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]); | |
3378 | } | |
3379 | edge_index++; | |
3380 | } | |
3381 | } | |
3382 | } | |
3383 | ||
3384 | /* | |
3385 | * sched_edge_matrix_get() | |
3386 | * | |
3387 | * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap | |
3388 | * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap | |
3389 | * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array. | |
3390 | */ | |
3391 | void | |
3392 | sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order) | |
3393 | { | |
3394 | uint32_t edge_index = 0; | |
3395 | for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) { | |
3396 | for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) { | |
3397 | if (edge_request_bitmap[edge_index]) { | |
3398 | edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster); | |
3399 | } | |
3400 | edge_index++; | |
3401 | } | |
3402 | } | |
3403 | } | |
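/*
 * Example of the flattened indexing used by both loops above: with
 * matrix_order = 2, the edge from src_cluster 1 to dst_cluster 0 lives at
 * edge_index = (1 * matrix_order) + 0 = 2 in edge_matrix and the bitmap.
 */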
3404 | ||
3405 | /* | |
3406 | * sched_edge_init() | |
3407 | * | |
3408 | * Routine to initialize the data structures for the Edge scheduler. The current implementation | |
3409 | * relies on this being enabled for a dual cluster AMP system. Once a better design for MAX_PSETS, | |
3410 | * edge config etc. is defined, it should be made more generic to handle the multi-cluster | |
3411 | * platforms. | |
3412 | * <Edge Multi-cluster Support Needed> | |
3413 | */ | |
3414 | static void | |
3415 | sched_edge_init(void) | |
3416 | { | |
3417 | processor_set_t ecore_set = &pset0; | |
3418 | processor_set_t pcore_set = &pset1; | |
3419 | ||
3420 | if (ml_get_boot_cluster() == CLUSTER_TYPE_P) { | |
3421 | /* If the device boots on a P-cluster, fixup the IDs etc. */ | |
3422 | pcore_set = &pset0; | |
3423 | ecore_set = &pset1; | |
3424 | bitmap_set(sched_edge_available_pset_bitmask, pcore_cluster_id); | |
3425 | } else { | |
3426 | bitmap_set(sched_edge_available_pset_bitmask, ecore_cluster_id); | |
3427 | } | |
3428 | ||
3429 | ecore_set->pset_cluster_type = PSET_AMP_E; | |
3430 | ecore_set->pset_cluster_id = ecore_cluster_id; | |
3431 | ||
3432 | pcore_set->pset_cluster_type = PSET_AMP_P; | |
3433 | pcore_set->pset_cluster_id = pcore_cluster_id; | |
3434 | ||
3435 | pset_init(&pset1, &pset_node1); | |
3436 | pset_node1.psets = &pset1; | |
3437 | pset_node0.node_list = &pset_node1; | |
3438 | ||
3439 | pset_array[ecore_cluster_id] = ecore_set; | |
3440 | pset_array[ecore_cluster_id]->pset_type = CLUSTER_TYPE_E; | |
3441 | bitmap_set(pset_array[ecore_cluster_id]->foreign_psets, pcore_cluster_id); | |
3442 | ||
3443 | sched_edge_config_set(ecore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}); | |
3444 | sched_edge_config_set(ecore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}); | |
3445 | ||
3446 | pset_array[pcore_cluster_id] = pcore_set; | |
3447 | pset_array[pcore_cluster_id]->pset_type = CLUSTER_TYPE_P; | |
3448 | bitmap_set(pset_array[pcore_cluster_id]->foreign_psets, ecore_cluster_id); | |
3449 | ||
3450 | sched_edge_config_set(pcore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}); | |
3451 | sched_edge_config_set(pcore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1}); | |
3452 | ||
3453 | sched_timeshare_init(); | |
3454 | sched_clutch_tunables_init(); | |
3455 | } | |
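/*
 * Note on the defaults above (an interpretation of the config values, not
 * stated in the source): only the P -> E edge starts with a non-zero
 * migration weight and with migration/steal allowed, so by default work
 * may flow from the P-cluster to the E-cluster but not the reverse; CLPC
 * can rewrite these edges at runtime via sched_edge_matrix_set().
 */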
3456 | |
3457 | static thread_t | |
3458 | sched_edge_choose_thread( | |
3459 | processor_t processor, |
3460 | int priority, | |
3461 | __unused ast_t reason) | |
3462 | { | |
3463 | int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); |
3464 | run_queue_t bound_runq = sched_clutch_bound_runq(processor); | |
3465 | boolean_t choose_from_boundq = false; | |
3466 | ||
3467 | if ((bound_runq->highq < priority) && | |
3468 | (clutch_pri < priority)) { | |
3469 | return THREAD_NULL; |
3470 | } | |
3471 | ||
3472 | if (bound_runq->highq >= clutch_pri) { | |
3473 | choose_from_boundq = true; | |
3474 | } | |
3475 | ||
3476 | thread_t thread = THREAD_NULL; | |
3477 | if (choose_from_boundq == false) { | |
3478 | sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); | |
3479 | thread = sched_clutch_thread_highest_remove(pset_clutch_root); | |
3480 | } else { |
3481 | thread = run_queue_dequeue(bound_runq, SCHED_HEADQ); | |
3482 | } | |
3483 | return thread; | |
3484 | } | |
3485 | ||
3486 | static boolean_t | |
3487 | sched_edge_processor_queue_empty(processor_t processor) | |
3488 | { | |
3489 | return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) && | |
3490 | (sched_clutch_bound_runq(processor)->count == 0); |
3491 | } | |
3492 | ||
3493 | static void | |
3494 | sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread) | |
3495 | { | |
3496 | assert(thread->bound_processor == PROCESSOR_NULL); | |
3497 | } |
3498 | ||
3499 | __options_decl(sched_edge_thread_yield_reason_t, uint32_t, { |
3500 | SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0, | |
3501 | SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1, | |
3502 | SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2, | |
3503 | SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3, | |
3504 | SCHED_EDGE_YIELD_DISALLOW = 0x4, | |
3505 | }); | |
3506 | ||
3507 | static bool | |
3508 | sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread) | |
3509 | { | |
3510 | if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) { |
3511 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, | |
3512 | thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY); | |
3513 | return true; |
3514 | } | |
3515 | ||
3516 | /* |
3517 | * The yield logic should follow the same logic that steal_thread() does. | |
3518 | * thread_should_yield() is effectively a quick check for whether, if the | |
3519 | * current thread gave up the CPU, any other thread would execute on this | |
3520 | * CPU. So it needs to provide the same answer as the steal_thread()/ | |
3521 | * processor idle logic. | |
3522 | */ | |
3523 | if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) { | |
3524 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, | |
3525 | thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE); | |
3526 | return true; | |
3527 | } | |
3528 | if (sched_edge_foreign_running_thread_available(processor->processor_set)) { | |
3529 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, | |
3530 | thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING); | |
3531 | return true; | |
3532 | } | |
3533 | ||
3534 | processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set); | |
3535 | if (steal_candidate != NULL) { | |
3536 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, | |
3537 | thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE); | |
3538 | return true; | |
3539 | } |
3540 | ||
f427ee49 A |
3541 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id, |
3542 | 0, SCHED_EDGE_YIELD_DISALLOW); | |
c6bf4f31 A |
3543 | return false; |
3544 | } | |

static ast_t
sched_edge_processor_csw_check(processor_t processor)
{
    boolean_t has_higher;
    int pri;

    int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);

    assert(processor->active_thread != NULL);

    pri = MAX(clutch_pri, bound_runq->highq);

    if (processor->first_timeslice) {
        has_higher = (pri > processor->current_pri);
    } else {
        has_higher = (pri >= processor->current_pri);
    }

    if (has_higher) {
        if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        if (bound_runq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        return AST_PREEMPT;
    }

    return AST_NONE;
}

static boolean_t
sched_edge_processor_queue_has_priority(processor_t processor,
    int priority,
    boolean_t gte)
{
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);

    int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
    if (gte) {
        return qpri >= priority;
    } else {
        return qpri > priority;
    }
}

static void
sched_edge_processor_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
    thread_t thread;
    queue_head_t tqueue;

    /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
    if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
        pset_unlock(pset);
        return;
    }

    bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);

    queue_init(&tqueue);
    while (sched_clutch_root_count(pset_clutch_root) > 0) {
        thread = sched_clutch_thread_highest_remove(pset_clutch_root);
        enqueue_tail(&tqueue, &thread->runq_links);
    }
    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }
}

/*
 * sched_edge_cluster_load_metric()
 *
 * The load metric for a cluster is a measure of the average scheduling latency
 * experienced by threads on that cluster. It is the product of the average number
 * of threads in the runqueue and the average execution time for threads. The metric
 * has special values in the following cases:
 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
 *   the maximum value to prevent any threads from migrating to this cluster.
 * - 0: If there are idle CPUs in the cluster or the runqueue is empty; this allows
 *   threads to be spread across the platform quickly for ncpu-wide workloads.
 */
static uint32_t
sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
{
    if (sched_edge_pset_available(pset) == false) {
        return UINT32_MAX;
    }
    return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
}
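
/*
 * Illustrative sketch (editorial addition, not part of the original sources):
 * a stand-alone model of the load metric described above. The parameter names
 * (runq_depth_avg, exec_time_avg_us) are hypothetical; the kernel derives the
 * real values through sched_get_pset_load_average(). Guarded by #if 0 so it is
 * never compiled.
 */
#if 0
static uint32_t
sched_edge_load_metric_model(bool pset_available, bool cluster_has_idle_cpu,
    uint32_t runq_depth_avg, uint32_t exec_time_avg_us)
{
    if (!pset_available) {
        /* Unavailable cluster: maximum load disallows all migrations to it */
        return UINT32_MAX;
    }
    if (cluster_has_idle_cpu || (runq_depth_avg == 0)) {
        /* Idle capacity: zero load lets ncpu-wide workloads spread quickly */
        return 0;
    }
    /* Average scheduling latency ~= runnable depth x average execution time */
    return runq_depth_avg * exec_time_avg_us;
}
#endif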

/*
 *
 * Edge Scheduler Steal/Rebalance logic
 *
 * = Generic scheduler logic =
 *
 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
 * find any thread for execution in its runqueue. The aim of the steal operation
 * is to find other threads running/runnable in other clusters which should be
 * executed here.
 *
 * If the steal callout does not return a thread, the thread_select() logic calls
 * the SCHED(processor_balance) callout, which is supposed to IPI other CPUs to rebalance
 * threads and idle out the current CPU.
 *
 * = SCHED(steal_thread) for Edge Scheduler =
 *
 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
 * routine tries to do the following operations in order:
 * (1) Find foreign runnable threads in non-native cluster
 *     runqueues (sched_edge_foreign_runnable_thread_remove())
 * (2) Check if foreign threads are running on the non-native
 *     clusters (sched_edge_foreign_running_thread_available())
 *     - If yes, return THREAD_NULL for the steal callout and
 *       perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
 * (3) Steal a thread from another cluster based on edge
 *     weights (sched_edge_steal_thread())
 *
 * = SCHED(processor_balance) for Edge Scheduler =
 *
 * If steal_thread did not return a thread for the processor, use
 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
 *
 * = Clutch Bucket Preferred Cluster Overrides =
 *
 * Since these operations (just like thread migrations on enqueue)
 * move threads across clusters, they need support for handling clutch
 * bucket group level preferred cluster recommendations.
 * For (1), a clutch bucket will be in the foreign runnable queue based
 * on the clutch bucket group preferred cluster.
 * For (2), the running thread will set the bit on the processor based
 * on its preferred cluster type.
 * For (3), the edge configuration prevents threads from being stolen
 * in the wrong direction.
 *
 * = SCHED(thread_should_yield) =
 * The thread_should_yield() logic needs to follow the same logic as
 * sched_edge_processor_idle() since it is expected to give the same answer
 * as if thread_select() were called on a core with an empty runqueue.
 */

static bool
sched_edge_steal_thread_enabled(__unused processor_set_t pset)
{
    /*
     * For the edge scheduler, steal is gated by sched_edge_steal_candidate().
     */
    return true;
}

static processor_set_t
sched_edge_steal_candidate(processor_set_t pset)
{
    /*
     * Edge Scheduler Optimization
     *
     * Investigate a better policy for stealing. The current implementation looks
     * at all the incoming weights for the pset that just became idle and sees which
     * clusters have loads > edge weights. It is effectively trying to simulate an
     * overload migration as if a thread had become runnable on the candidate cluster.
     *
     * The logic today bails as soon as it finds a cluster where the cluster load is
     * greater than the edge weight. This helps the check to be quick, which is useful
     * for sched_edge_thread_should_yield(), which uses this. Maybe it should have a
     * more advanced version for the actual steal operation which looks for the
     * maximum delta etc.
     */
    processor_set_t target_pset = NULL;
    uint32_t dst_cluster_id = pset->pset_cluster_id;

    for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t candidate_pset = pset_array[cluster_id];

        if (candidate_pset == pset) {
            continue;
        }

        sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
        if (incoming_edge->sce_steal_allowed == false) {
            continue;
        }

        uint32_t incoming_weight = incoming_edge->sce_migration_weight;
        int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        if (highest_runnable_bucket == -1) {
            /* Candidate cluster runq is empty */
            continue;
        }
        /* Use the load metric for the highest runnable bucket since that would be stolen next */
        uint32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
        if (candidate_load > incoming_weight) {
            /* Only steal from the candidate if its load is higher than the incoming edge weight and it has runnable threads */
            target_pset = candidate_pset;
            break;
        }
    }

    return target_pset;
}
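
/*
 * Illustrative sketch (editorial addition): the bail-early steal check above,
 * condensed over plain arrays. The inputs are hypothetical snapshots of the
 * per-cluster load metrics and the incoming edge weights into dst_cluster; the
 * real code reads these from pset_array[]->sched_edges. Guarded by #if 0.
 */
#if 0
static int
sched_edge_steal_candidate_model(const uint32_t *candidate_load,
    const uint32_t *incoming_weight, const bool *steal_allowed,
    int ncluster, int dst_cluster)
{
    for (int cl = 0; cl < ncluster; cl++) {
        if ((cl == dst_cluster) || !steal_allowed[cl]) {
            continue;
        }
        /* First cluster loaded beyond its edge weight wins (quick, not maximal) */
        if (candidate_load[cl] > incoming_weight[cl]) {
            return cl;
        }
    }
    return -1; /* No steal candidate available */
}
#endif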

static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset)
{
    /* Find all the clusters that are foreign for this cluster */
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /*
         * For each cluster, see if there are any runnable foreign threads.
         * This check is currently being done without the pset lock to make it cheap for
         * the common case.
         */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
            return true;
        }
    }
    return false;
}

static thread_t
sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
{
    thread_t thread = THREAD_NULL;

    /* Find all the clusters that are foreign for this cluster */
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /*
         * For each cluster, see if there are any runnable foreign threads.
         * This check is currently being done without the pset lock to make it cheap for
         * the common case.
         */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
            continue;
        }
        /*
         * Looks like there are runnable foreign threads in the hierarchy; lock the pset
         * and get the highest priority thread.
         */
        pset_lock(target_pset);
        if (sched_edge_pset_available(target_pset)) {
            thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
            sched_update_pset_load_average(target_pset, ctime);
        }
        pset_unlock(target_pset);

        /*
         * Edge Scheduler Optimization
         *
         * The current implementation immediately returns as soon as it finds a foreign
         * runnable thread. This could be enhanced to look at highest priority threads
         * from all foreign clusters and pick the highest amongst them. That would need
         * some form of global state across psets to make that kind of a check cheap.
         */
        if (thread != THREAD_NULL) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
            break;
        }
        /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
    }

    return thread;
}

static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset)
{
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /* Skip the pset if it's not schedulable */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
        if (lsb_first(running_foreign_bitmap) != -1) {
            /* Found a non-native CPU running a foreign thread; rebalance is needed */
            return true;
        }
    }
    return false;
}

static thread_t
sched_edge_steal_thread(processor_set_t pset)
{
    thread_t thread = THREAD_NULL;
    processor_set_t steal_from_pset = sched_edge_steal_candidate(pset);
    if (steal_from_pset) {
        /*
         * sched_edge_steal_candidate() has found a pset which is ideal to steal from.
         * Lock the pset and select the highest priority thread in that runqueue.
         */
        pset_lock(steal_from_pset);
        if (bitmap_first(steal_from_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1) {
            uint64_t current_timestamp = mach_absolute_time();
            sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
            thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
            sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
            sched_update_pset_load_average(steal_from_pset, current_timestamp);
        }
        /*
         * Edge Scheduler Optimization
         *
         * Maybe this needs to circle around if the steal candidate did not have any
         * threads by the time the pset lock was taken.
         */
        pset_unlock(steal_from_pset);
    }
    return thread;
}

/*
 * sched_edge_processor_idle()
 *
 * This routine is the implementation of steal_thread() for the Edge scheduler.
 */
static thread_t
sched_edge_processor_idle(processor_set_t pset)
{
    thread_t thread = THREAD_NULL;

    uint64_t ctime = mach_absolute_time();

    /* Each of the operations acquires the lock for the pset it targets */
    pset_unlock(pset);

    /* Find the highest priority runnable thread on all non-native clusters */
    thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
    if (thread != THREAD_NULL) {
        return thread;
    }

    /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
    boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
    if (rebalance_needed) {
        return THREAD_NULL;
    }

    /* No foreign threads found; find a thread to steal from a pset based on weights/loads etc. */
    thread = sched_edge_steal_thread(pset);
    return thread;
}

/* Return true if this thread should not continue running on this processor */
static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
{
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    /*
     * For long running parallel workloads, it is important to rebalance threads across
     * E/P clusters so that they make equal forward progress. This is achieved through
     * threads expiring their quantum on the non-preferred cluster type and explicitly
     * rebalancing to the preferred cluster runqueue.
     *
     * <Edge Multi-Cluster Support Needed>
     * For multi-cluster platforms, it might be useful to move the thread in case its
     * preferred pset is idle now.
     */
    if (processor->processor_set->pset_type != preferred_pset->pset_type) {
        return true;
    }
    /* If the thread is already running on the preferred cluster, do not avoid */
    if (processor->processor_set == preferred_pset) {
        return false;
    }
    /*
     * The thread is running on a processor that is of the same type as the
     * preferred pset, but is not the actual preferred pset. In that case
     * look at edge weights to see if this thread should continue execution
     * here or go back to its preferred cluster.
     *
     * <Edge Multi-Cluster Support Needed>
     * This logic needs to ensure that the current thread is not counted against
     * the load average for the current pset, otherwise it would always end up
     * avoiding the current cluster.
     */
    processor_set_t chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, processor->processor_set, false);
    return chosen_pset != processor->processor_set;
}

static void
sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
{
    assert(cprocessor == current_processor());
    pset_unlock(cpset);

    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /* Skip the pset if it's not schedulable */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        pset_lock(target_pset);
        uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
        for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
            processor_t target_cpu = processor_array[cpuid];
            ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[cpuid] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, cpuid);
            }
        }
        pset_unlock(target_pset);
    }

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
    }
}
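
/*
 * Illustrative sketch (editorial addition): the decide-then-act pattern used by
 * sched_edge_balance() above. IPI decisions are recorded into a CPU bitmap while
 * each pset lock is held; the IPIs themselves are only sent after every lock has
 * been dropped, so no IPI is performed with a pset lock held. send_ipi() is a
 * hypothetical stand-in for sched_ipi_perform(). Guarded by #if 0.
 */
#if 0
static void
sched_edge_ipi_fanout_model(uint64_t ast_processor_map, void (*send_ipi)(int cpuid))
{
    /* Phase 2: locks already dropped; walk the set bits and signal each CPU */
    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0;
        cpuid = lsb_next(ast_processor_map, cpuid)) {
        send_ipi(cpuid);
    }
}
#endif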

/*
 * sched_edge_migrate_edges_evaluate()
 *
 * Routine to find the candidate for thread migration based on edge weights.
 *
 * Returns the most ideal cluster for execution of this thread based on the outgoing
 * edges of the preferred pset. Can return preferred_pset if it is the most ideal
 * destination for this thread.
 */
static processor_set_t
sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
{
    processor_set_t selected_pset = preferred_pset;
    uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
    cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);

    /* Look at edge deltas with other clusters to find the ideal migration candidate */
    sched_clutch_edge *edge = preferred_pset->sched_edges;
    uint32_t max_edge_delta = 0;

    /*
     * Edge Scheduler Optimization
     *
     * For really large cluster count systems, it might make sense to optimize the
     * clusters iterated by using bitmaps and skipping over clusters that are not
     * available for scheduling or have migration disabled from this cluster.
     */
    for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t dst_pset = pset_array[cluster_id];
        if (cluster_id == preferred_cluster_id) {
            continue;
        }

        if (edge[cluster_id].sce_migration_allowed == false) {
            continue;
        }

        uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
        if (dst_load > preferred_cluster_load) {
            continue;
        }

        /*
         * Fast path for an idle dst cluster
         *
         * For extremely parallel workloads, it is important to load up
         * all clusters as quickly as possible. This short-circuit allows
         * that.
         *
         * <Edge Multi-cluster Support Needed>
         * For multi-cluster platforms, the loop should start with the homogeneous
         * clusters first.
         */
        if (dst_load == 0) {
            selected_pset = dst_pset;
            break;
        }

        uint32_t edge_delta = preferred_cluster_load - dst_load;
        if (edge_delta < edge[cluster_id].sce_migration_weight) {
            continue;
        }

        if (edge_delta < max_edge_delta) {
            continue;
        }

        if (edge_delta == max_edge_delta) {
            /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
            boolean_t selected_homogeneous = (pset_type_for_id(selected_pset->pset_cluster_id) == preferred_cluster_type);
            boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
            if (selected_homogeneous || !candidate_homogeneous) {
                continue;
            }
        }
        /* dst_pset seems to be the best candidate for migration */
        max_edge_delta = edge_delta;
        selected_pset = dst_pset;
    }
    return selected_pset;
}
4065 | /* | |
f427ee49 | 4066 | * sched_edge_candidate_alternative() |
c6bf4f31 | 4067 | * |
f427ee49 A |
4068 | * Routine to find an alternative cluster from candidate_cluster_bitmap since the |
4069 | * selected_pset is not available for execution. The logic tries to prefer homogeneous | |
4070 | * clusters over heterogeneous clusters since this is typically used in thread | |
4071 | * placement decisions. | |
c6bf4f31 | 4072 | */ |
f427ee49 A |
4073 | _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask"); |
4074 | static processor_set_t | |
4075 | sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap) | |
4076 | { | |
4077 | /* | |
4078 | * It looks like the most ideal pset is not available for scheduling currently. | |
4079 | * Try to find a homogeneous cluster that is still available. | |
4080 | */ | |
4081 | bitmap_t *foreign_clusters = selected_pset->foreign_psets; | |
4082 | uint64_t available_native_clusters = ~(foreign_clusters[0]) & candidate_cluster_bitmap; | |
4083 | int available_cluster_id = lsb_first(available_native_clusters); | |
4084 | if (available_cluster_id == -1) { | |
4085 | /* Looks like none of the homogeneous clusters are available; pick the first available cluster */ | |
4086 | available_cluster_id = bit_first(candidate_cluster_bitmap); | |
4087 | } | |
4088 | assert(available_cluster_id != -1); | |
4089 | return pset_array[available_cluster_id]; | |
4090 | } | |
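
/*
 * Worked example (editorial addition, hypothetical values): with
 * candidate_cluster_bitmap = 0b1011 (clusters 0, 1 and 3 still available) and
 * a foreign bitmap of 0b1000 for the selected pset (cluster 3 heterogeneous),
 * the native candidates are ~0b1000 & 0b1011 = 0b0011, so cluster 0 is chosen.
 * If no native candidate remained, an available cluster of any type would be
 * used instead. A simplified stand-alone model of that computation (using
 * lsb_first() for both picks), #if 0 guarded:
 */
#if 0
static int
sched_edge_alternative_model(uint64_t candidate_bitmap, uint64_t foreign_bitmap)
{
    uint64_t native_candidates = ~foreign_bitmap & candidate_bitmap;
    int cluster = lsb_first(native_candidates);     /* prefer a homogeneous cluster */
    if (cluster == -1) {
        cluster = lsb_first(candidate_bitmap);      /* else any available cluster */
    }
    return cluster;
}
#endif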

/*
 * sched_edge_switch_pset_lock()
 *
 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
 * switch_pset_locks.
 * Returns the newly locked pset after the switch.
 */
static processor_set_t
sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
{
    if (!switch_pset_locks) {
        return locked_pset;
    }
    if (selected_pset != locked_pset) {
        pset_unlock(locked_pset);
        pset_lock(selected_pset);
        return selected_pset;
    } else {
        return locked_pset;
    }
}

/*
 * sched_edge_migrate_candidate()
 *
 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
 * the thread and the preferred cluster to determine the best available pset for scheduling.
 *
 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns a best effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
 *
 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
 * resultant pset lock is held.
 */
static processor_set_t
sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
{
    __kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
    processor_set_t selected_pset = preferred_pset;

    if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
        /* For bound threads always recommend the cluster they are bound to */
        selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
        locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
        if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
            /*
             * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
             * fall through to the regular cluster selection logic which handles unavailable clusters
             * appropriately. If the thread is hard bound, then always return the bound cluster.
             */
            return selected_pset;
        }
    }

    uint64_t candidate_cluster_bitmap = mask(MAX_PSETS);
    if (thread->sched_pri >= BASEPRI_RTQUEUES) {
        /* For realtime threads, always try to schedule them on the preferred pset */
        goto migrate_candidate_available_check;
    }

    /*
     * If a thread is being rebalanced for achieving equal progress of parallel workloads,
     * it needs to end up on the preferred runqueue.
     */
    uint32_t preferred_cluster_load = sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
    boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
    if ((preferred_cluster_load == 0) || amp_rebalance) {
        goto migrate_candidate_available_check;
    }

    /* Look at edge weights to decide the most ideal migration candidate for this thread */
    selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);

migrate_candidate_available_check:
    locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
    if (sched_edge_pset_available(selected_pset) == true) {
        if (selected_pset != preferred_pset) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
        }
        return selected_pset;
    }
    /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
    bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
    if (__improbable(bitmap_first(&candidate_cluster_bitmap, MAX_PSETS) == -1)) {
        /*
         * None of the clusters are available for scheduling; this situation should be rare but if it happens,
         * simply return the boot cluster.
         */
        selected_pset = &pset0;
        locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
        if (selected_pset != preferred_pset) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
        }
        return selected_pset;
    }
    /* Try and find an alternative for the selected pset */
    selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
    goto migrate_candidate_available_check;
}
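
/*
 * Usage sketch (editorial addition): the locking contract of
 * sched_edge_migrate_candidate() with switch_pset_locks = true. The caller
 * enters holding some pset lock and leaves holding the lock of the returned
 * pset, which may differ from the one it locked. Hypothetical caller shape,
 * #if 0 guarded:
 */
#if 0
static processor_set_t
sched_edge_migrate_candidate_usage(processor_set_t hint_pset, thread_t thread)
{
    pset_lock(hint_pset);
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    /* Returns with chosen_pset locked; hint_pset may have been unlocked on the way */
    processor_set_t chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, hint_pset, true);
    /* ... enqueue the thread on chosen_pset's hierarchy ... */
    pset_unlock(chosen_pset);
    return chosen_pset;
}
#endif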

static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
{
    /* Bound threads don't call this function */
    assert(thread->bound_processor == PROCESSOR_NULL);
    processor_t chosen_processor = PROCESSOR_NULL;

    /*
     * sched_edge_preferred_pset() returns the preferred pset for a given thread.
     * It should take the passed-in "pset" as a hint which represents the recency metric for
     * pset selection logic.
     */
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    processor_set_t chosen_pset = preferred_pset;
    /*
     * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
     * threads to. sched_edge_migrate_candidate() returns the preferred pset
     * if it has capacity; otherwise it finds the best candidate pset to migrate this thread to.
     *
     * <Edge Multi-cluster Support Needed>
     * It might be useful to build a recency metric for the thread for multiple clusters and
     * factor that into the migration decisions.
     */
    chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
    chosen_processor = choose_processor(chosen_pset, processor, thread);
    assert(chosen_processor->processor_set == chosen_pset);
    return chosen_processor;
}

/*
 * sched_edge_clutch_bucket_threads_drain()
 *
 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
 * bucket overrides etc.) into a local thread queue.
 */
static void
sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
{
    thread_t thread = THREAD_NULL;
    uint64_t current_timestamp = mach_approximate_time();
    qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
        sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
        enqueue_tail(clutch_threads, &thread->runq_links);
    }
}

/*
 * sched_edge_run_drained_threads()
 *
 * Makes all drained threads in a local queue runnable.
 */
static void
sched_edge_run_drained_threads(queue_t clutch_threads)
{
    thread_t thread;
    /* Now setrun all the threads in the local queue */
    qe_foreach_element_safe(thread, clutch_threads, runq_links) {
        remqueue(&thread->runq_links);
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }
}

/*
 * sched_edge_update_preferred_cluster()
 *
 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
 */
static void
sched_edge_update_preferred_cluster(
    sched_clutch_t sched_clutch,
    bitmap_t *clutch_bucket_modify_bitmap,
    uint32_t *tg_bucket_preferred_cluster)
{
    for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
        os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
    }
}

/*
 * sched_edge_migrate_thread_group_runnable_threads()
 *
 * Routine to implement the migration of threads on a cluster when the thread group
 * recommendation is updated. The migration works using a 2-phase algorithm.
 *
 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
 * appropriate.
 *
 * Phase 2: After unlocking the pset, drain all the threads from the local thread
 * queue and mark them runnable, which should land them in the right hierarchy.
 *
 * The routine assumes that the preferences for the clutch buckets/clutch bucket
 * groups have already been updated by the caller.
 *
 * - Called with the pset locked and interrupts disabled.
 * - Returns with the pset unlocked.
 */
static void
sched_edge_migrate_thread_group_runnable_threads(
    sched_clutch_t sched_clutch,
    sched_clutch_root_t root_clutch,
    bitmap_t *clutch_bucket_modify_bitmap,
    __unused uint32_t *tg_bucket_preferred_cluster,
    bool migrate_immediately)
{
    /* Queue to hold threads that have been drained from clutch buckets to be migrated */
    queue_head_t clutch_threads;
    queue_init(&clutch_threads);

    for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
        /* Get the clutch bucket for this cluster and sched bucket */
        sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
        sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
        sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
        if (scb_root == NULL) {
            /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
            assert(clutch_bucket->scb_thr_count == 0);
            continue;
        }
        assert(scb_root == root_clutch);
        uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);

        if (migrate_immediately) {
            /*
             * For transitions where threads need to be migrated immediately, drain the threads into a
             * local queue unless we are looking at the clutch buckets for the newly recommended
             * cluster.
             */
            if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
                sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
            } else {
                sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
            }
        } else {
            /* Check if this cluster is the same type as the newly recommended cluster */
            boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
            /*
             * If threads do not have to be migrated immediately, just change the native/foreign
             * flag on the clutch bucket.
             */
            if (homogeneous_cluster) {
                sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
            } else {
                sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
            }
        }
    }

    pset_unlock(root_clutch->scr_pset);
    sched_edge_run_drained_threads(&clutch_threads);
}

/*
 * sched_edge_migrate_thread_group_running_threads()
 *
 * Routine to find all running threads of a thread group on a specific cluster
 * and IPI them if they need to be moved immediately.
 */
static void
sched_edge_migrate_thread_group_running_threads(
    sched_clutch_t sched_clutch,
    sched_clutch_root_t root_clutch,
    __unused bitmap_t *clutch_bucket_modify_bitmap,
    uint32_t *tg_bucket_preferred_cluster,
    bool migrate_immediately)
{
    if (migrate_immediately == false) {
        /* If CLPC has recommended not to move threads immediately, nothing to do here */
        return;
    }

    /*
     * Edge Scheduler Optimization
     *
     * When the system has a large number of clusters and cores, it might be useful to
     * narrow down the iteration by using a thread running bitmap per clutch.
     */
    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
    /*
     * Iterate all CPUs and look for the ones running threads from this thread group
     * that are not restricted to this specific cluster (due to overrides etc.)
     */
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor_t src_processor = processor_array[cpuid];
        boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
        sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
        boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;

        if (expected_tg && non_preferred_cluster) {
            ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[cpuid] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, cpuid);
            } else if (src_processor == current_processor()) {
                ast_on(AST_PREEMPT);
                bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
            }
        }
    }

    /* Perform all the IPIs */
    if (bit_first(ast_processor_map) != -1) {
        for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
            processor_t ast_processor = processor_array[cpuid];
            sched_ipi_perform(ast_processor, ipi_type[cpuid]);
        }
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
    }
}

/*
 * sched_edge_tg_preferred_cluster_change()
 *
 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
 * thread migrations based on the policy specified by CLPC.
 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
 * for each QoS within the thread group.
 */
void
sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
{
    sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
    /*
     * In order to optimize the processing, create a bitmap which represents all QoS buckets
     * for which the preferred cluster has changed.
     */
    bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
    for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
        uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
        uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
        if (old_preferred_cluster != new_preferred_cluster) {
            bitmap_set(clutch_bucket_modify_bitmap, bucket);
        }
    }
    if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
        /* No changes in any clutch buckets; nothing to do here */
        return;
    }

    for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t pset = pset_array[cluster_id];
        spl_t s = splsched();
        pset_lock(pset);
        /*
         * The first operation is to update the preferred cluster for all QoS buckets within the
         * thread group so that any future threads becoming runnable would see the new preferred
         * cluster value.
         */
        sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
        /*
         * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
         * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
         *
         * <Edge Multi-cluster Support Needed>
         */
        /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
        sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
            tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
        /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
        sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
            tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
        /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
        splx(s);
    }
}
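
/*
 * Usage sketch (editorial addition): shape of a CLPC-style call into
 * sched_edge_tg_preferred_cluster_change(). The cluster ids (0 = E, 1 = P) are
 * hypothetical; the array carries one preferred cluster per QoS bucket.
 * Guarded by #if 0.
 */
#if 0
static void
sched_edge_tg_recommendation_example(struct thread_group *tg)
{
    uint32_t tg_bucket_preferred_cluster[TH_BUCKET_SCHED_MAX];
    for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
        tg_bucket_preferred_cluster[bucket] = 0;        /* default every QoS to E */
    }
    tg_bucket_preferred_cluster[TH_BUCKET_FIXPRI] = 1;  /* FIXPRI prefers P */
    /* Ask for both running and runnable threads to be migrated immediately */
    sched_edge_tg_preferred_cluster_change(tg, tg_bucket_preferred_cluster,
        SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING |
        SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE);
}
#endif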

/*
 * sched_edge_pset_made_schedulable()
 *
 * Routine to migrate all the clutch buckets which are not in their recommended
 * pset hierarchy now that a new pset has become runnable. It's possible that this
 * routine is called when the pset is already marked schedulable.
 *
 * Invoked with the pset lock held and interrupts disabled.
 */
static void
sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
{
    if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
        /* Nothing to do here since the pset is already marked schedulable */
        if (drop_lock) {
            pset_unlock(dst_pset);
        }
        return;
    }

    bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);

    thread_t thread = sched_edge_processor_idle(dst_pset);
    if (thread != THREAD_NULL) {
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }

    if (!drop_lock) {
        pset_lock(dst_pset);
    }
}

extern int sched_amp_spill_deferred_ipi;
extern int sched_amp_pcores_preempt_immediate_ipi;

int sched_edge_migrate_ipi_immediate = 1;

sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
    processor_set_t pset = dst->processor_set;
    assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
    assert(dst != current_processor());

    boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
    deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

    switch (event) {
    case SCHED_IPI_EVENT_SPILL:
        /* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
        if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
            return sched_ipi_deferred_policy(pset, dst, event);
        }
        break;
    case SCHED_IPI_EVENT_PREEMPT:
        /*
         * For preemption, the default policy is to use deferred IPIs
         * for non-RT P-core preemption. Override that behavior if
         * sched_amp_pcores_preempt_immediate_ipi is set.
         */
        if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
            if (sched_edge_migrate_ipi_immediate) {
                /*
                 * For workloads that are going wide, it might be useful to use an
                 * immediate IPI to wake up the idle CPU if the scheduler estimates
                 * that the preferred pset will stay busy past the deferred IPI
                 * timeout. The Edge Scheduler uses the average execution latency on
                 * the preferred pset as an estimate of busyness.
                 *
                 * <Edge Multi-cluster Support Needed>
                 */
                processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
                if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
                    return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
                }
            }
            if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
                return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
            }
        }
        break;
    default:
        break;
    }
    /* Default back to the global policy for all other scenarios */
    return sched_ipi_policy(dst, thread, dst_idle, event);
}
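
/*
 * Worked example (editorial addition, hypothetical numbers): if the average
 * thread execution time on the preferred pset is 40us and the deferred IPI
 * timer is 64000ns, then 40 * NSEC_PER_USEC = 40000ns < 64000ns and the
 * deferred path is kept; at an average of 80us (80000ns) the preferred pset is
 * expected to stay busy past the timeout, so an immediate IPI is sent instead.
 * The comparison, modeled stand-alone and #if 0 guarded:
 */
#if 0
static bool
sched_edge_use_immediate_ipi_model(uint64_t avg_exec_time_us, uint64_t deferred_timer_ns)
{
    /* Immediate IPI when the estimated busy period covers the deferred timeout */
    return (avg_exec_time_us * NSEC_PER_USEC) >= deferred_timer_ns;
}
#endif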

/*
 * sched_edge_qos_max_parallelism()
 */
uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options)
{
    uint32_t ecount = 0;
    uint32_t pcount = 0;

    for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t pset = pset_array[cluster_id];
        if (pset_type_for_id(cluster_id) == CLUSTER_TYPE_P) {
            pcount += pset->cpu_set_count;
        } else {
            ecount += pset->cpu_set_count;
        }
    }

    if (options & QOS_PARALLELISM_REALTIME) {
        /*
         * For realtime threads on AMP, limit the width to just the P-cores
         * since we do not spill/rebalance for RT threads.
         */
        return pcount;
    }

    /*
     * The Edge scheduler supports per-QoS recommendations for thread groups.
     * This enables lower QoS buckets (such as UT) to be scheduled on all
     * CPUs on the system.
     *
     * The only restriction is for BG/Maintenance QoS classes, for which the
     * performance controller would never recommend execution on the P-cores.
     * If that policy changes in the future, this value should be changed.
     */
    switch (qos) {
    case THREAD_QOS_BACKGROUND:
    case THREAD_QOS_MAINTENANCE:
        return ecount;
    default:
        return ecount + pcount;
    }
}
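
/*
 * Worked example (editorial addition, hypothetical topology): on a system with
 * 4 E-cores and 4 P-cores, the routine above yields:
 *   sched_edge_qos_max_parallelism(THREAD_QOS_USER_INTERACTIVE, 0)    == 8
 *   sched_edge_qos_max_parallelism(THREAD_QOS_BACKGROUND, 0)          == 4 (E-cores only)
 *   sched_edge_qos_max_parallelism(qos, QOS_PARALLELISM_REALTIME)     == 4 (P-cores only)
 */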


#endif /* CONFIG_SCHED_EDGE */

#endif /* CONFIG_SCHED_CLUTCH */