/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <kern/sched_clutch.h>
#include <sys/kdebug.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */

#if CONFIG_SCHED_CLUTCH

/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
    SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
    SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
    SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
});

static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */

/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
    SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
    /* Round robin clutch bucket on thread removal */
    SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
    /* Insert clutch bucket at head (for thread preemption) */
    SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
    /* Insert clutch bucket at tail (default) */
    SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
});

/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
static boolean_t sched_thread_sched_pri_promoted(thread_t);

#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];

/*
 * Special markers for buckets that have invalid WCELs/quantums etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)

/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for a bucket is defined
 * by its worst-case-execution-latency (WCEL) and the timestamp at
 * which the bucket was made runnable.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
    SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
    0, /* FG */
    37500, /* IN (37.5ms) */
    75000, /* DF (75ms) */
    150000, /* UT (150ms) */
    250000 /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
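
/*
 * Illustrative example (not a code path): if a UT clutch bucket becomes
 * runnable at time T, its EDF deadline is T + 150ms. If an FG bucket becomes
 * runnable at T + 10ms, its deadline is T + 10ms (WCEL of 0), so FG is picked
 * first; the UT bucket is guaranteed to be picked no later than roughly 150ms
 * after it became runnable (modulo warps by higher buckets, described below).
 */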

/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
    SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
    8000, /* FG (8ms) */
    4000, /* IN (4ms) */
    2000, /* DF (2ms) */
    1000, /* UT (1ms) */
    0 /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
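
/*
 * Illustrative example (not a code path): suppose the IN root bucket has the
 * earliest deadline but the FG root bucket is also runnable with warp
 * remaining. FG is allowed to "warp" ahead of IN for up to its 8ms warp
 * window; once the window is consumed, selection falls back to the
 * earliest-deadline bucket until FG is selected naturally again, which
 * replenishes its warp.
 */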

/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * the lower priority levels can preempt the higher priority threads.
 */

#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
    10000, /* FIXPRI (10ms) */
    10000, /* FG (10ms) */
    10000, /* IN (10ms) */
    10000, /* DF (10ms) */
    4000, /* UT (4ms) */
    2000 /* BG (2ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
    10000, /* FIXPRI (10ms) */
    10000, /* FG (10ms) */
    8000, /* IN (8ms) */
    6000, /* DF (6ms) */
    4000, /* UT (4ms) */
    2000 /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
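
/*
 * Illustrative example (not a code path): even if the BG root bucket wins the
 * EDF/warp selection while higher buckets are runnable, a BG thread only runs
 * for its 2ms quantum before the hierarchy is re-evaluated, which bounds how
 * long lower buckets can hold the CPU away from higher buckets.
 */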

/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
    for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
        if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
            abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
        } else {
            clock_interval_to_absolutetime_interval(us_vals[i],
                NSEC_PER_USEC, &abstime_vals[i]);
        }
    }
}
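
/*
 * For reference, the scheduler init path (not shown in this excerpt) converts
 * the microsecond tables above into abstime using this helper, e.g.:
 *
 *   sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *   sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *   sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */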

/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
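
/*
 * The pack macro above lays out three 16-bit thread counts in one 64-bit
 * tracepoint argument: c occupies bits [15:0], b occupies bits [31:16] and
 * a occupies bits [47:32].
 */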

#if DEVELOPMENT || DEBUG

/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
    sched_clutch_root_t root_clutch)
{
    pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
    __unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */

/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
    uint16_t *thr_count)
{
    if (__improbable(os_inc_overflow(thr_count))) {
        panic("sched_clutch thread count overflowed!");
    }
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
    uint16_t *thr_count)
{
    if (__improbable(os_dec_overflow(thr_count))) {
        panic("sched_clutch thread count underflowed!");
    }
}

/*
 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to keep the load as a global counter
 * as well.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];

/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
    sched_clutch_root_t root_clutch,
    processor_set_t pset)
{
    root_clutch->scr_thr_count = 0;
    root_clutch->scr_priority = NOPRI;
    root_clutch->scr_urgency = 0;
    root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
    root_clutch->scr_cluster_id = pset->pset_cluster_id;
#else /* CONFIG_SCHED_EDGE */
    root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

    /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
    queue_init(&root_clutch->scr_clutch_buckets);

    /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
    priority_queue_init(&root_clutch->scr_foreign_buckets);
    bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
    bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
    bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
    priority_queue_init(&root_clutch->scr_bound_root_buckets);

    /* Initialize the bitmap and priority queue of runnable root buckets */
    priority_queue_init(&root_clutch->scr_unbound_root_buckets);
    bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
    bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

    /* Initialize all the root buckets */
    for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
        sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
        sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
    }
}

/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round-robined at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */

/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
    sched_clutch_bucket_runq_t clutch_buckets_rq)
{
    clutch_buckets_rq->scbrq_highq = NOPRI;
    for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
        clutch_buckets_rq->scbrq_bitmap[i] = 0;
    }
    clutch_buckets_rq->scbrq_count = 0;
    for (int i = 0; i < NRQS; i++) {
        circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
    }
}

/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns whether a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
    sched_clutch_bucket_runq_t clutch_buckets_rq)
{
    return clutch_buckets_rq->scbrq_count == 0;
}

/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
    sched_clutch_bucket_runq_t clutch_buckets_rq)
{
    if (clutch_buckets_rq->scbrq_count > 0) {
        circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
        return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
    } else {
        return NULL;
    }
}

/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
    sched_clutch_bucket_runq_t clutch_buckets_rq,
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_bucket_options_t options)
{
    circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
    if (circle_queue_empty(queue)) {
        circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
        bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
        if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
            clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
        }
    } else {
        if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
            circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
        } else {
            /*
             * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
             * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
             */
            circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
        }
    }
    clutch_buckets_rq->scbrq_count++;
}

/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
    sched_clutch_bucket_runq_t clutch_buckets_rq,
    sched_clutch_bucket_t clutch_bucket)
{
    circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
    circle_dequeue(queue, &clutch_bucket->scb_runqlink);
    assert(clutch_buckets_rq->scbrq_count > 0);
    clutch_buckets_rq->scbrq_count--;
    if (circle_queue_empty(queue)) {
        bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
        clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
    }
}

/*
 * sched_clutch_bucket_runq_rotate()
 *
 * Rotate the queue at the clutch bucket's priority level so that the next
 * clutch bucket at that priority gets picked; used for same-priority round
 * robin on thread removal.
 */
static void
sched_clutch_bucket_runq_rotate(
    sched_clutch_bucket_runq_t clutch_buckets_rq,
    sched_clutch_bucket_t clutch_bucket)
{
    circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
    assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
    circle_queue_rotate_head_forward(queue);
}
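
/*
 * Usage sketch (illustrative only): a typical caller holds the pset lock and
 * manipulates the per-root-bucket runq roughly as follows:
 *
 *   sched_clutch_bucket_runq_t runq = &root_bucket->scrb_clutch_buckets;
 *   sched_clutch_bucket_runq_enqueue(runq, clutch_bucket, SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ);
 *   sched_clutch_bucket_t highest = sched_clutch_bucket_runq_peek(runq);
 *   sched_clutch_bucket_runq_rotate(runq, highest);   (same-priority round robin)
 *   sched_clutch_bucket_runq_remove(runq, highest);   (when the bucket goes empty)
 */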

/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
    sched_clutch_root_bucket_t root_bucket,
    sched_bucket_t bucket,
    bool bound_root_bucket)
{
    root_bucket->scrb_bucket = bucket;
    if (bound_root_bucket) {
        /* For bound root buckets, initialize the bound thread runq. */
        root_bucket->scrb_bound = true;
        run_queue_init(&root_bucket->scrb_bound_thread_runq);
    } else {
        /*
         * The unbounded root buckets contain a runq of runnable clutch buckets
         * which then hold the runnable threads.
         */
        root_bucket->scrb_bound = false;
        sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
    }
    priority_queue_entry_init(&root_bucket->scrb_pqlink);
    root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
    root_bucket->scrb_warped_deadline = 0;
    root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
    root_bucket->scrb_starvation_avoidance = false;
    root_bucket->scrb_starvation_ts = 0;
}

/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency
 * which is why they are handled specially.
 *
 * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */
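
/*
 * Illustrative example (not a code path): if the highest AboveUI clutch bucket
 * sits at a lower priority than the highest FG timeshare clutch bucket (say
 * the FG bucket holds a promoted thread), the comparison favors the FG bucket
 * and selection falls through to the EDF pass. Once the timeshare bucket
 * burns CPU and its priority decays, the AboveUI bucket wins the comparison
 * and is scheduled immediately.
 */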

/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_unbound_select_aboveui(
    sched_clutch_root_t root_clutch)
{
    if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
        sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
        sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
        if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
            /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
            return true;
        }
        sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
        sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
        if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
            return true;
        }
    }
    return false;
}

/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_bound_select_aboveui(
    sched_clutch_root_t root_clutch)
{
    sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
    sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
    if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
        return false;
    }
    return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
}

/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts; so it is
 * crucial to keep this O(1). The options parameter determines if
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting next thread for execution on current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_highest_root_bucket_type_t type)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    int highest_runnable_bucket = -1;
    if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
        highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
    } else {
        int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
    }

    if (highest_runnable_bucket == -1) {
        return NULL;
    }

    /* Above UI root bucket selection (see comment above for more details on this special case handling) */
    bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
    if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
        if (unbound_aboveui) {
            return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
        }
        /* Fall through to selecting a timeshare root bucket */
    } else {
        bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
        sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
        sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];

        if (unbound_aboveui && bound_aboveui) {
            /*
             * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
             * highest runnable priority in both the buckets.
             */
            int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
            sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
            int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
            return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
        }
        if (unbound_aboveui) {
            return unbound_aboveui_root_bucket;
        }
        if (bound_aboveui) {
            return bound_aboveui_root_bucket;
        }
        /* Fall through to selecting a timeshare root bucket */
    }

    /*
     * Above UI bucket is not runnable or has a low priority runnable thread; use the
     * earliest deadline model to schedule threads. The idea is that as the timeshare
     * buckets use CPU, they will drop their interactivity score/sched priority and
     * allow the low priority AboveUI buckets to be scheduled.
     */

    /* Find the earliest deadline bucket */
    sched_clutch_root_bucket_t edf_bucket = NULL;
    sched_clutch_root_bucket_t warp_bucket = NULL;
    int warp_bucket_index = -1;

evaluate_root_buckets:
    if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
        edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
    } else {
        sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
        sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
        if (bound_bucket && unbound_bucket) {
            /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
            edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
        } else {
            edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
        }
    }
    /*
     * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
     * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
     * makes sense between root buckets of the same type since it is effectively a scheduling advantage over a lower
     * QoS root bucket.
     */
    bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
    warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

    if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
        /* No higher buckets have warp left; best choice is the EDF based bucket */
        if (edf_bucket->scrb_starvation_avoidance) {
            /*
             * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
             * starvation avoidance window is still open and return this bucket if it is.
             *
             * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
             * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
             * avoidance across all CPUs.
             */
            uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / root_clutch->scr_pset->online_processor_count;
            if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
                return edf_bucket;
            } else {
                /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
                edf_bucket->scrb_starvation_avoidance = false;
                edf_bucket->scrb_starvation_ts = 0;
                sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
            }
            goto evaluate_root_buckets;
        }

        /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
        if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
            /* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
            edf_bucket->scrb_starvation_avoidance = true;
            edf_bucket->scrb_starvation_ts = timestamp;
        } else {
            /* EDF bucket is being selected in the natural order; update deadline and reset warp */
            sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
            edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
            edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
            if (edf_bucket->scrb_bound) {
                bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
            } else {
                bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
            }
        }
        return edf_bucket;
    }

    /*
     * Looks like there is a root bucket which is higher in the natural priority
     * order than edf_bucket and might have some warp remaining.
     */
    warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
    if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
        /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
        warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
        sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
        return warp_bucket;
    }
    if (warp_bucket->scrb_warped_deadline > timestamp) {
        /* Root bucket already has a warp window open with some warp remaining */
        sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
        return warp_bucket;
    }

    /*
     * For this bucket, the warp window was opened sometime in the past but has now
     * expired. Mark the bucket as not available for warp anymore and re-run the
     * warp bucket selection logic.
     */
    warp_bucket->scrb_warp_remaining = 0;
    if (warp_bucket->scrb_bound) {
        bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
    } else {
        bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
    }
    goto evaluate_root_buckets;
}

/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
    sched_clutch_root_bucket_t root_bucket,
    uint64_t timestamp)
{
    /* For fixpri AboveUI bucket always return it as the earliest deadline */
    if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
        return 0;
    }

    /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
    return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}

/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority
 * queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
    sched_clutch_root_bucket_t root_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp)
{
    if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
        /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
        return;
    }

    uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
    uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
    if (__improbable(old_deadline > new_deadline)) {
        panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
    }
    if (old_deadline != new_deadline) {
        root_bucket->scrb_pqlink.deadline = new_deadline;
        struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
        priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
    }
}

/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
    sched_clutch_root_bucket_t root_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp)
{
    /* Mark the root bucket as runnable */
    bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
    bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

    if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
        /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
        return;
    }

    if (root_bucket->scrb_starvation_avoidance == false) {
        /*
         * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
         * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
         * that and fix it up.
         */
        root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
    }
    struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
    priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
    if (root_bucket->scrb_warp_remaining) {
        /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
        bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
        bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
    }
}

/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
    sched_clutch_root_bucket_t root_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp)
{
    bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
    bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

    if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
        /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
        return;
    }

    struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
    priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

    bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
    bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

    if (root_bucket->scrb_warped_deadline > timestamp) {
        /*
         * For root buckets that were using the warp, check if the warp
         * deadline is in the future. If yes, remove the wall time the
         * warp was active and update the warp remaining. This allows
         * the root bucket to use the remaining warp the next time it
         * becomes runnable.
         */
        root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
    } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
        /*
         * If the root bucket's warped deadline is in the past, it has used up
         * all the warp it was assigned. Empty out its warp remaining.
         */
        root_bucket->scrb_warp_remaining = 0;
    }
}

static int
sched_clutch_global_bucket_load_get(
    sched_bucket_t bucket)
{
    return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
}

/*
 * sched_clutch_root_pri_update()
 *
 * The root level priority is used for thread selection and preemption
 * logic.
 *
 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
 * used for priority calculation, the logic is slightly different from thread
 * selection, because thread selection considers deadlines, warps etc. to
 * decide the most optimal bucket at a given timestamp. Since the priority
 * value is used for preemption decisions only, it needs to be based on the
 * highest runnable thread available in the timeshare domain. This logic can
 * be made more sophisticated if there are cases of unnecessary preemption
 * being seen in workloads.
 */
static void
sched_clutch_root_pri_update(
    sched_clutch_root_t root_clutch)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    int16_t root_bound_pri = NOPRI;
    int16_t root_unbound_pri = NOPRI;

    if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
        goto root_pri_update_unbound;
    }
    sched_clutch_root_bucket_t root_bucket_bound = NULL;
    if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
        root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
    } else {
        int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
        assert(root_bucket_index != -1);
        root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
    }
    root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;

root_pri_update_unbound:
    if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
        goto root_pri_update_complete;
    }
    sched_clutch_root_bucket_t root_bucket_unbound = NULL;
    if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
        root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
    } else {
        int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
        assert(root_bucket_index != -1);
        root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
    }
    /* For the selected root bucket, find the highest priority clutch bucket */
    sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
    root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);

root_pri_update_complete:
    root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
}

/*
 * sched_clutch_root_urgency_inc()
 *
 * Routine to increment the urgency at the root level based on the thread
 * priority that is being inserted into the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_inc(
    sched_clutch_root_t root_clutch,
    thread_t thread)
{
    if (SCHED(priority_is_urgent)(thread->sched_pri)) {
        root_clutch->scr_urgency++;
    }
}

/*
 * sched_clutch_root_urgency_dec()
 *
 * Routine to decrement the urgency at the root level based on the thread
 * priority that is being removed from the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_dec(
    sched_clutch_root_t root_clutch,
    thread_t thread)
{
    if (SCHED(priority_is_urgent)(thread->sched_pri)) {
        root_clutch->scr_urgency--;
    }
}

/*
 * Clutch bucket level scheduling
 *
 * The second level of scheduling is the clutch bucket level scheduling
 * which tries to schedule thread groups within root_buckets. Each
 * clutch represents a thread group and a clutch_bucket_group represents
 * threads at a particular sched_bucket within that thread group. The
 * clutch_bucket_group contains a clutch_bucket per cluster on the system
 * where it holds the runnable threads destined for execution on that
 * cluster.
 *
 * The goal of this level of scheduling is to allow interactive thread
 * groups low latency access to the CPU. It also provides slight
 * scheduling preference for App and unrestricted thread groups.
 *
 * The clutch bucket scheduling algorithm measures an interactivity
 * score for all clutch bucket groups. The interactivity score is based
 * on the ratio of the CPU used and the voluntary blocking of threads
 * within the clutch bucket group. The algorithm is very close to the ULE
 * scheduler on FreeBSD in terms of calculations. The interactivity
 * score provides an interactivity boost in the range of
 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
 * thread groups to win over CPU spinners.
 *
 * The interactivity score of the clutch bucket group is combined with the
 * highest base/promoted priority of threads in the clutch bucket to form
 * the overall priority of the clutch bucket.
 */
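
/*
 * Illustrative example (not a code path): a thread group that mostly blocks
 * (e.g. UI work) earns an interactivity score near the top of the
 * [0:interactive_pri * 2] range, so its clutch bucket priority is its highest
 * thread priority plus a large boost. A CPU-spinning thread group at the same
 * thread priority earns a score near 0 and therefore loses the clutch bucket
 * priority comparison, even though the individual thread priorities match.
 */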

/* Priority boost range for interactivity */
#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;

/* Window used to scale the cpu usage and blocked values (currently 500ms); it is the threshold for used + blocked time */
uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)

/* The ratio to scale the cpu/blocked time per window */
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
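
/*
 * Illustrative example (not a code path): once a clutch bucket group has
 * accumulated roughly 500ms of combined CPU-used + voluntarily-blocked time,
 * the adjustment routine (sched_clutch_bucket_group_cpu_adjust(), not shown
 * in this excerpt) scales both values down by the ratio above, preserving the
 * used:blocked ratio that drives the interactivity score while decaying old
 * history.
 */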
cb323159
A
1004
1005/*
1006 * In order to allow App thread groups some preference over daemon thread
f427ee49 1007 * groups, the App clutch_buckets get a priority boost. The boost value should
cb323159 1008 * be chosen such that badly behaved apps are still penalized over well
f427ee49 1009 * behaved interactive daemons.
cb323159 1010 */
f427ee49
A
1011static uint8_t sched_clutch_bucket_group_pri_boost[SCHED_CLUTCH_TG_PRI_MAX] = {
1012 [SCHED_CLUTCH_TG_PRI_LOW] = 0,
1013 [SCHED_CLUTCH_TG_PRI_MED] = 2,
1014 [SCHED_CLUTCH_TG_PRI_HIGH] = 4,
1015};
cb323159
A
1016
1017/* Initial value for voluntary blocking time for the clutch_bucket */
f427ee49
A
1018#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1019
1020/* Value indicating the clutch bucket is not pending execution */
1021#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1022
1023/*
1024 * Thread group CPU starvation avoidance
1025 *
1026 * In heavily CPU contended scenarios, it is possible that some thread groups
1027 * which have a low interactivity score do not get CPU time at all. In order to
1028 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1029 * bucket group when it has been pending execution for a certain time as defined
1030 * by the sched_clutch_bucket_group_pending_delta_us values below.
1031 *
1032 * The values chosen here are very close to the WCEL values for each sched bucket.
1033 * These values are multiplied by the load average of the relevant root bucket to
1034 * provide an estimate of the actual clutch bucket load.
1035 */
1036static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1037 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1038 10000, /* FG */
1039 37500, /* IN */
1040 75000, /* DF */
1041 150000, /* UT */
1042 250000, /* BG */
1043};
1044static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
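
/*
 * Illustrative example (not a code path): a DF clutch bucket group that is
 * pending execution uses a base delta of 75ms; with a root bucket load of 4
 * the effective window is ~300ms. Each such window that elapses while the
 * group is still pending ages out some of its accumulated CPU usage, raising
 * its interactivity score so it eventually gets picked.
 */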

/*
 * sched_clutch_bucket_init()
 *
 * Initializer for clutch buckets.
 */
static void
sched_clutch_bucket_init(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_bucket_group_t clutch_bucket_group,
    sched_bucket_t bucket)
{
    bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));

    clutch_bucket->scb_bucket = bucket;
    /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
    clutch_bucket->scb_priority = 0;
#if CONFIG_SCHED_EDGE
    clutch_bucket->scb_foreign = false;
    priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
#endif /* CONFIG_SCHED_EDGE */
    clutch_bucket->scb_group = clutch_bucket_group;
    clutch_bucket->scb_root = NULL;
    priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
    priority_queue_init(&clutch_bucket->scb_thread_runq);
    queue_init(&clutch_bucket->scb_thread_timeshare_queue);
}

/*
 * sched_clutch_bucket_group_init()
 *
 * Initializer for clutch bucket groups.
 */
static void
sched_clutch_bucket_group_init(
    sched_clutch_bucket_group_t clutch_bucket_group,
    sched_clutch_t clutch,
    sched_bucket_t bucket)
{
    bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
    clutch_bucket_group->scbg_bucket = bucket;
    clutch_bucket_group->scbg_clutch = clutch;
    for (int i = 0; i < MAX_PSETS; i++) {
        sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
    }
    os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
    os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
    os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
    /*
     * All thread groups should be initialized to be interactive; this allows the newly launched
     * thread groups to fairly compete with already running thread groups.
     */
    clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
    clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
    os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
#if !__LP64__
    lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
#endif /* !__LP64__ */
    clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
    clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
}

/*
 * sched_clutch_init_with_thread_group()
 *
 * Initialize the sched_clutch when the thread group is being created
 */
void
sched_clutch_init_with_thread_group(
    sched_clutch_t clutch,
    struct thread_group *tg)
{
    os_atomic_store(&clutch->sc_thr_count, 0, relaxed);

    /* Initialize all the clutch buckets */
    for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
        sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
    }

    /* Grouping specific fields */
    clutch->sc_tg = tg;
    os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
}

/*
 * sched_clutch_destroy()
 *
 * Destructor for clutch; called from thread group release code.
 */
void
sched_clutch_destroy(
    __unused sched_clutch_t clutch)
{
    assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
}

#if CONFIG_SCHED_EDGE

/*
 * The current edge scheduler still relies on globals for E & P clusters. It uses these
 * globals for the following operations:
 * - Sysctl support for configuring edges
 * - Edge scheduler initialization
 *
 * These should be removed for multi-cluster platforms once a clear policy for the above
 * operations is defined.
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t ecore_cluster_id = 0;
static uint32_t pcore_cluster_id = 1;

/*
 * Edge Scheduler Preferred Cluster Mechanism
 *
 * In order to have better control over various QoS buckets within a thread group, the Edge
 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
 * preferences are stored at the sched_clutch_bucket_group level since that represents all
 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
 */

static uint32_t
sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
{
    return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
}

static uint32_t
sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
{
    return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
}

uint32_t
sched_edge_thread_preferred_cluster(thread_t thread)
{
    if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
        /* For threads bound to a specific cluster, return the bound cluster id */
        return sched_edge_thread_bound_cluster_id(thread);
    }

    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
    return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
}

/*
 * Edge Scheduler Foreign Bucket Support
 *
 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
 * threads that are not native to the cluster. A clutch bucket is considered native if its
 * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
 * bucket priority queue is used for rebalance operations to get threads back to their native
 * cluster quickly.
 *
 * It is possible to make this policy even more aggressive by considering all clusters that
 * are not the preferred cluster as foreign, but that would mean a lot of thread
 * migrations which might have performance implications.
 */
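
/*
 * Illustrative example (not a code path): a clutch bucket whose preferred
 * cluster is a P-cluster but which is enqueued on an E-cluster is marked
 * foreign on that E-cluster's hierarchy; the rebalance path uses
 * scr_foreign_buckets to find and migrate such threads back to a P-cluster.
 * The same bucket enqueued on any P-cluster is considered native, since only
 * the cluster type (not the exact cluster id) is compared.
 */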

static void
sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
    if (clutch_bucket->scb_foreign) {
        clutch_bucket->scb_foreign = false;
        priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
    }
}

static void
sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
    if (!clutch_bucket->scb_foreign) {
        clutch_bucket->scb_foreign = true;
        priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
        priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
    }
}

/*
 * Edge Scheduler Cumulative Load Average
 *
 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
 * making thread migration decisions. The per-bucket load is maintained as a
 * cumulative count since higher scheduling buckets impact load on lower buckets
 * for thread migration decisions.
 */
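
/*
 * Illustrative example (not a code path): making one TH_BUCKET_SHARE_IN thread
 * runnable increments the cumulative run counts for IN, DF, UT and BG (every
 * bucket at or below IN), so the count read for a bucket reflects all runnable
 * threads at that bucket or any higher one.
 */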

static void
sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
    switch (bucket) {
    case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
    default:
        panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
    }
}

static void
sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
    switch (bucket) {
    case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
    case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
    default:
        panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
    }
}

uint16_t
sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
    return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
}

#endif /* CONFIG_SCHED_EDGE */

/*
 * sched_clutch_bucket_hierarchy_insert()
 *
 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
 */
static void
sched_clutch_bucket_hierarchy_insert(
    sched_clutch_root_t root_clutch,
    sched_clutch_bucket_t clutch_bucket,
    sched_bucket_t bucket,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    if (bucket > TH_BUCKET_FIXPRI) {
        /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
        enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
    }
#if CONFIG_SCHED_EDGE
    /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
    uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
    if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
        sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];

    /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
    if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
        sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
    }

    /* Insert the clutch bucket into the root bucket run queue with order based on options */
    sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
    os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
    os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
}
1309
1310/*
1311 * sched_clutch_bucket_hierarchy_remove()
1312 *
 1313 * Routine to remove an empty clutch bucket from the root hierarchy.
1314 */
1315static void
1316sched_clutch_bucket_hierarchy_remove(
1317 sched_clutch_root_t root_clutch,
1318 sched_clutch_bucket_t clutch_bucket,
1319 sched_bucket_t bucket,
ea3f0419
A
1320 uint64_t timestamp,
1321 __unused sched_clutch_bucket_options_t options)
cb323159
A
1322{
1323 sched_clutch_hierarchy_locked_assert(root_clutch);
1324 if (bucket > TH_BUCKET_FIXPRI) {
1325 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1326 remqueue(&clutch_bucket->scb_listlink);
1327 }
f427ee49
A
1328#if CONFIG_SCHED_EDGE
1329 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1330#endif /* CONFIG_SCHED_EDGE */
cb323159 1331
f427ee49 1332 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
cb323159
A
1333
1334 /* Remove the clutch bucket from the root bucket priority queue */
ea3f0419 1335 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
cb323159 1336 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
cb323159
A
1337
1338 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
ea3f0419 1339 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
cb323159
A
1340 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1341 }
f427ee49 1342 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
cb323159
A
1343}
1344
1345/*
1346 * sched_clutch_bucket_base_pri()
1347 *
 1348 * Calculates the "base" priority of the clutch bucket. The base
 1349 * priority of the clutch bucket is the max of the highest base_pri
 1350 * and the highest sched_pri among its threads, plus any grouping
 1351 * specific (App/Daemon...) boost applicable to the
 1352 * clutch_bucket.
1353 */
1354static uint8_t
1355sched_clutch_bucket_base_pri(
1356 sched_clutch_bucket_t clutch_bucket)
1357{
1358 uint8_t clutch_boost = 0;
f427ee49 1359 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
cb323159 1360
f427ee49 1361 sched_clutch_t clutch = clutch_bucket->scb_group->scbg_clutch;
cb323159
A
1362
1363 /*
1364 * Since the clutch bucket can contain threads that are members of the group due
1365 * to the sched_pri being promoted or due to their base pri, the base priority of
1366 * the entire clutch bucket should be based on the highest thread (promoted or base)
1367 * in the clutch bucket.
1368 */
f427ee49
A
1369 uint8_t max_pri = 0;
1370 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1371 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
cb323159 1372 }
f427ee49
A
1373
1374 sched_clutch_tg_priority_t tg_pri = os_atomic_load(&clutch->sc_tg_priority, relaxed);
1375 clutch_boost = sched_clutch_bucket_group_pri_boost[tg_pri];
cb323159
A
1376 return max_pri + clutch_boost;
1377}
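/*
 * Illustrative example (the boost value is hypothetical, not a stated default):
 * if the highest base/promoted priority among the threads in the clutch bucket
 * is 31 and sched_clutch_bucket_group_pri_boost[] yields 8 for this thread
 * group's priority class, the clutch bucket's base priority is 31 + 8 = 39.
 */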
1378
1379/*
f427ee49 1380 * sched_clutch_interactivity_from_cpu_data()
cb323159 1381 *
f427ee49 1382 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
cb323159
A
1383 */
1384static uint8_t
f427ee49 1385sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
cb323159 1386{
cb323159 1387 sched_clutch_bucket_cpu_data_t scb_cpu_data;
f427ee49 1388 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
cb323159
A
1389 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1390 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
f427ee49 1391 uint8_t interactive_score = 0;
cb323159 1392
cb323159 1393 if ((cpu_blocked == 0) && (cpu_used == 0)) {
f427ee49 1394 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
cb323159 1395 }
cb323159
A
1396 /*
 1397 * For all timeshare buckets, calculate the interactivity score of the
 1398 * bucket group based on the ratio of its blocked time to its CPU usage
1399 */
cb323159
A
1400 if (cpu_blocked > cpu_used) {
1401 /* Interactive clutch_bucket case */
f427ee49
A
1402 interactive_score = sched_clutch_bucket_group_interactive_pri +
1403 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
cb323159
A
1404 } else {
1405 /* Non-interactive clutch_bucket case */
f427ee49 1406 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
cb323159 1407 }
cb323159
A
1408 return interactive_score;
1409}
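/*
 * Worked example (numbers chosen purely for illustration; the interactive
 * priority is a tunable): assuming sched_clutch_bucket_group_interactive_pri
 * is 8, a bucket group with cpu_blocked = 30 and cpu_used = 10 takes the
 * interactive branch:
 *
 *     score = 8 + (8 * (30 - 10)) / 30 = 8 + 5 = 13
 *
 * while cpu_blocked = 10 and cpu_used = 30 takes the non-interactive branch:
 *
 *     score = (8 * 10) / 30 = 2
 *
 * The score therefore ranges from 0 up to twice the interactive priority as
 * the blocked:used ratio improves.
 */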
1410
1411/*
1412 * sched_clutch_bucket_pri_calculate()
1413 *
1414 * The priority calculation algorithm for the clutch_bucket is a slight
 1415 * modification of the ULE interactivity score. It uses the base priority
1416 * of the clutch bucket and applies an interactivity score boost to the
1417 * highly responsive clutch buckets.
1418 */
cb323159
A
1419static uint8_t
1420sched_clutch_bucket_pri_calculate(
1421 sched_clutch_bucket_t clutch_bucket,
1422 uint64_t timestamp)
1423{
1424 /* For empty clutch buckets, return priority 0 */
1425 if (clutch_bucket->scb_thr_count == 0) {
1426 return 0;
1427 }
1428
1429 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
f427ee49 1430 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
cb323159
A
1431
1432 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1433 uint8_t pri = base_pri + interactive_score;
1434 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
f427ee49 1435 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
cb323159
A
1436 return pri;
1437}
1438
1439/*
1440 * sched_clutch_root_bucket_highest_clutch_bucket()
1441 *
1442 * Routine to find the highest priority clutch bucket
1443 * within the root bucket.
1444 */
1445static sched_clutch_bucket_t
1446sched_clutch_root_bucket_highest_clutch_bucket(
1447 sched_clutch_root_bucket_t root_bucket)
1448{
ea3f0419 1449 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
cb323159
A
1450 return NULL;
1451 }
ea3f0419 1452 return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
cb323159
A
1453}
1454
1455/*
1456 * sched_clutch_bucket_runnable()
1457 *
1458 * Perform all operations needed when a new clutch bucket becomes runnable.
1459 * It involves inserting the clutch_bucket into the hierarchy and updating the
1460 * root priority appropriately.
1461 */
1462static boolean_t
1463sched_clutch_bucket_runnable(
1464 sched_clutch_bucket_t clutch_bucket,
1465 sched_clutch_root_t root_clutch,
ea3f0419
A
1466 uint64_t timestamp,
1467 sched_clutch_bucket_options_t options)
cb323159
A
1468{
1469 sched_clutch_hierarchy_locked_assert(root_clutch);
f427ee49 1470 /* Since the clutch bucket became newly runnable, update its pending timestamp */
cb323159 1471 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
ea3f0419 1472 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
f427ee49 1473
cb323159 1474 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
f427ee49 1475 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
cb323159
A
1476 int16_t root_old_pri = root_clutch->scr_priority;
1477 sched_clutch_root_pri_update(root_clutch);
1478 return root_clutch->scr_priority > root_old_pri;
1479}
1480
1481/*
1482 * sched_clutch_bucket_update()
1483 *
ea3f0419
A
1484 * Update the clutch_bucket's position in the hierarchy. This routine is
1485 * called when a new thread is inserted or removed from a runnable clutch
1486 * bucket. The options specify some properties about the clutch bucket
1487 * insertion order into the clutch bucket runq.
cb323159
A
1488 */
1489static boolean_t
1490sched_clutch_bucket_update(
1491 sched_clutch_bucket_t clutch_bucket,
1492 sched_clutch_root_t root_clutch,
ea3f0419
A
1493 uint64_t timestamp,
1494 sched_clutch_bucket_options_t options)
cb323159
A
1495{
1496 sched_clutch_hierarchy_locked_assert(root_clutch);
1497 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
f427ee49 1498 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
cb323159 1499 if (new_pri == clutch_bucket->scb_priority) {
ea3f0419
A
1500 /*
1501 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1502 * to the end of the runq. Typically used when a thread is selected for execution
1503 * from a clutch bucket.
1504 */
1505 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1506 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1507 }
cb323159
A
1508 return false;
1509 }
ea3f0419 1510 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
f427ee49
A
1511#if CONFIG_SCHED_EDGE
1512 if (clutch_bucket->scb_foreign) {
1513 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1514 }
1515#endif /* CONFIG_SCHED_EDGE */
ea3f0419 1516 clutch_bucket->scb_priority = new_pri;
f427ee49
A
1517#if CONFIG_SCHED_EDGE
1518 if (clutch_bucket->scb_foreign) {
1519 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1520 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1521 }
1522#endif /* CONFIG_SCHED_EDGE */
ea3f0419 1523 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
cb323159
A
1524
1525 int16_t root_old_pri = root_clutch->scr_priority;
1526 sched_clutch_root_pri_update(root_clutch);
1527 return root_clutch->scr_priority > root_old_pri;
1528}
1529
1530/*
1531 * sched_clutch_bucket_empty()
1532 *
1533 * Perform all the operations needed when a clutch_bucket is no longer runnable.
 1534 * It involves removing the clutch bucket from the hierarchy and updating the root
1535 * priority appropriately.
1536 */
1537static void
1538sched_clutch_bucket_empty(
1539 sched_clutch_bucket_t clutch_bucket,
1540 sched_clutch_root_t root_clutch,
ea3f0419
A
1541 uint64_t timestamp,
1542 sched_clutch_bucket_options_t options)
cb323159
A
1543{
1544 sched_clutch_hierarchy_locked_assert(root_clutch);
ea3f0419 1545 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
cb323159
A
1546 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1547 sched_clutch_root_pri_update(root_clutch);
1548}
1549
1550/*
1551 * sched_clutch_cpu_usage_update()
1552 *
1553 * Routine to update CPU usage of the thread in the hierarchy.
1554 */
1555void
1556sched_clutch_cpu_usage_update(
1557 thread_t thread,
1558 uint64_t delta)
1559{
f427ee49 1560 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
cb323159
A
1561 return;
1562 }
f427ee49 1563
cb323159 1564 sched_clutch_t clutch = sched_clutch_for_thread(thread);
f427ee49
A
1565 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1566 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
cb323159
A
1567}
1568
1569/*
f427ee49 1570 * sched_clutch_bucket_group_cpu_usage_update()
cb323159
A
1571 *
1572 * Routine to update the CPU usage of the clutch_bucket.
1573 */
1574static void
f427ee49
A
1575sched_clutch_bucket_group_cpu_usage_update(
1576 sched_clutch_bucket_group_t clutch_bucket_group,
cb323159
A
1577 uint64_t delta)
1578{
f427ee49 1579 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
cb323159
A
1580 /* Since Above UI bucket has maximum interactivity score always, nothing to do here */
1581 return;
1582 }
f427ee49
A
1583 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1584 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
cb323159
A
1585}
1586
1587/*
f427ee49 1588 * sched_clutch_bucket_group_cpu_pending_adjust()
cb323159 1589 *
f427ee49
A
1590 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1591 * such that one "pending interval" provides one point improvement in interactivity score.
cb323159 1592 */
f427ee49
A
1593static inline uint64_t
1594sched_clutch_bucket_group_cpu_pending_adjust(
1595 uint64_t cpu_used,
1596 uint64_t cpu_blocked,
1597 uint8_t pending_intervals)
cb323159 1598{
f427ee49
A
1599 uint64_t cpu_used_adjusted = 0;
1600 if (cpu_blocked < cpu_used) {
1601 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1602 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1603 } else {
1604 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1605 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
cb323159 1606 }
f427ee49 1607 return cpu_used_adjusted;
cb323159
A
1608}
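/*
 * Derivation sketch (with I = sched_clutch_bucket_group_interactive_pri): the
 * non-interactive score is I * blocked / used, so solving
 * I * blocked / used' = I * blocked / used + pending for used' gives
 *
 *     used' = (I * blocked * used) / (I * blocked + used * pending)
 *
 * which is the first branch above. The interactive score is
 * 2I - (I * used) / blocked, so raising it by `pending` points means lowering
 * used by (blocked * pending) / I, which is the adjust_factor in the second
 * branch (clamped at zero).
 */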
1609
1610/*
f427ee49 1611 * sched_clutch_bucket_group_cpu_adjust()
cb323159
A
1612 *
1613 * Routine to scale the cpu usage and blocked time once the sum gets bigger
f427ee49 1614 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
cb323159
A
1615 * manageable and maintain the same ratio while allowing clutch buckets to
1616 * adjust behavior and reflect in the interactivity score in a reasonable
f427ee49
A
1617 * amount of time. Also adjusts the CPU usage based on pending_intervals
1618 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
cb323159
A
1619 */
1620static void
f427ee49
A
1621sched_clutch_bucket_group_cpu_adjust(
1622 sched_clutch_bucket_group_t clutch_bucket_group,
1623 uint8_t pending_intervals)
cb323159
A
1624{
1625 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1626 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
f427ee49 1627 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
cb323159
A
1628 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1629 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
cb323159 1630
f427ee49
A
1631 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1632 /* No changes to the CPU used and blocked values */
1633 os_atomic_rmw_loop_give_up();
1634 }
1635 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1636 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1637 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1638 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1639 }
1640 /* Use the shift passed in to ageout the CPU usage */
1641 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1642 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1643 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1644 });
cb323159
A
1645}
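/*
 * Illustrative example (threshold and ratio values are hypothetical): with an
 * adjust threshold of 800000 and an adjust ratio of 2, a bucket group that has
 * accumulated cpu_used = 500000 and cpu_blocked = 400000 is rescaled to
 * 250000 / 200000 on the next adjustment. The blocked:used ratio, and hence
 * the interactivity score, is preserved while the counters are kept from
 * saturating.
 */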
1646
1647/*
1648 * Thread level scheduling algorithm
1649 *
1650 * The thread level scheduling algorithm uses the mach timeshare
1651 * decay based algorithm to achieve sharing between threads within the
1652 * same clutch bucket. The load/priority shifts etc. are all maintained
1653 * at the clutch bucket level and used for decay calculation of the
1654 * threads. The load sampling is still driven off the scheduler tick
1655 * for runnable clutch buckets (it does not use the new higher frequency
1656 * EWMA based load calculation). The idea is that the contention and load
1657 * within clutch_buckets should be limited enough to not see heavy decay
1658 * and timeshare effectively.
1659 */
1660
1661/*
1662 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1663 *
1664 * Increment the run count for the clutch bucket associated with the
1665 * thread.
1666 */
1667uint32_t
1668sched_clutch_thread_run_bucket_incr(
1669 thread_t thread,
1670 sched_bucket_t bucket)
1671{
1672 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1673 return 0;
1674 }
1675 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1676 return sched_clutch_run_bucket_incr(clutch, bucket);
1677}
1678
1679static uint32_t
1680sched_clutch_run_bucket_incr(
1681 sched_clutch_t clutch,
1682 sched_bucket_t bucket)
1683{
1684 assert(bucket != TH_BUCKET_RUN);
f427ee49
A
1685 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1686 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
cb323159
A
1687}
1688
1689/*
1690 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1691 *
1692 * Decrement the run count for the clutch bucket associated with the
1693 * thread.
1694 */
1695uint32_t
1696sched_clutch_thread_run_bucket_decr(
1697 thread_t thread,
1698 sched_bucket_t bucket)
1699{
1700 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1701 return 0;
1702 }
1703 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1704 return sched_clutch_run_bucket_decr(clutch, bucket);
1705}
1706
1707static uint32_t
1708sched_clutch_run_bucket_decr(
1709 sched_clutch_t clutch,
1710 sched_bucket_t bucket)
1711{
1712 assert(bucket != TH_BUCKET_RUN);
f427ee49
A
1713 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1714 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
cb323159
A
1715}
1716
1717/*
f427ee49 1718 * sched_clutch_bucket_group_timeshare_update()
cb323159 1719 *
f427ee49
A
1720 * Routine to update the load and priority shift for the clutch_bucket_group
1721 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1722 * clutch buckets with runnable threads in them. So it is important to maintain
1723 * the timesharing information at the clutch_bucket_group level instead of
1724 * individual clutch buckets (because the algorithm is trying to timeshare all
1725 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1726 *
1727 * The routine is called from the sched tick handling code to make sure this value
1728 * is updated at least once every sched tick. For clutch bucket groups which have
1729 * not been runnable for very long, the clutch_bucket_group maintains a "last
1730 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1731 * if this value is outdated, the load and shifts are updated.
cb323159
A
1732 *
1733 * Possible optimization:
1734 * - The current algorithm samples the load every sched tick (125ms).
1735 * This is prone to spikes in runnable counts; if that turns out to be
1736 * a problem, a simple solution would be to do the EWMA trick to sample
1737 * load at every load_tick (30ms) and use the averaged value for the pri
1738 * shift calculation.
1739 */
1740static void
f427ee49
A
1741sched_clutch_bucket_group_timeshare_update(
1742 sched_clutch_bucket_group_t clutch_bucket_group,
1743 sched_clutch_bucket_t clutch_bucket,
1744 uint64_t ctime)
cb323159 1745{
f427ee49
A
1746 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1747 /* No timesharing needed for fixed priority Above UI threads */
cb323159
A
1748 return;
1749 }
1750
1751 /*
f427ee49
A
1752 * Update the timeshare parameters for the clutch bucket group
1753 * if they havent been updated in this tick.
cb323159 1754 */
f427ee49 1755 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
cb323159 1756 uint32_t current_sched_ts = sched_tick;
f427ee49
A
1757 if (sched_ts < current_sched_ts) {
1758 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1759 /* NCPU wide workloads should not experience decay */
1760 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1761 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1762 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1763 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1764 /* Ensure that the pri_shift value is reasonable */
1765 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1766 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
cb323159 1767 }
f427ee49
A
1768
1769 /*
1770 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1771 * for a long time to get an updated interactivity score.
1772 */
1773 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
cb323159
A
1774}
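/*
 * Illustrative example (CPU and thread counts are hypothetical): on an 8-CPU
 * system with 33 runnable/running threads in a bucket group,
 * bucket_group_run_count is 32 (one is subtracted so that a one-thread-per-CPU
 * workload sees no decay), bucket_group_load is 32 / 8 = 4 and the priority
 * shift becomes sched_fixed_shift - sched_load_shifts[4]. Higher loads index
 * further into sched_load_shifts[], shrinking the shift and making thread
 * priorities decay faster with CPU usage, while lightly loaded groups keep a
 * large shift and see little decay.
 */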
1775
1776/*
1777 * sched_clutch_thread_clutch_update()
1778 *
1779 * Routine called when the thread changes its thread group. The current
1780 * implementation relies on the fact that the thread group is changed only
1781 * from the context of the thread itself. Due to this fact, the thread
1782 * group change causes only counter updates in the old & new clutch
1783 * buckets and no hierarchy changes. The routine also attributes the CPU
1784 * used so far to the old clutch.
1785 */
1786void
1787sched_clutch_thread_clutch_update(
1788 thread_t thread,
1789 sched_clutch_t old_clutch,
1790 sched_clutch_t new_clutch)
1791{
1792 uint32_t cpu_delta;
1793 assert(current_thread() == thread);
1794
1795 if (old_clutch) {
1796 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1797 /*
1798 * Calculate the CPU used by this thread in the old bucket and
1799 * add it to the old clutch bucket. This uses the same CPU usage
1800 * logic as update_priority etc.
1801 */
1802 thread_timer_delta(thread, cpu_delta);
1803 if (thread->pri_shift < INT8_MAX) {
1804 thread->sched_usage += cpu_delta;
1805 }
1806 thread->cpu_delta += cpu_delta;
f427ee49
A
1807 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1808 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1809 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1810 }
cb323159
A
1811 }
1812
1813 if (new_clutch) {
1814 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1815 }
1816}
1817
1818/* Thread Insertion/Removal/Selection routines */
1819
f427ee49
A
1820#if CONFIG_SCHED_EDGE
1821
1822/*
1823 * Edge Scheduler Bound Thread Support
1824 *
1825 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1826 * maintains a separate runq on the clutch root to hold these bound threads. These
1827 * bound threads count towards the root priority and thread count, but are ignored
1828 * for thread migration/steal decisions. Bound threads that are enqueued in the
1829 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1830 * removal.
1831 *
1832 * Bound Threads Timesharing
1833 * The bound threads share the timesharing properties of the clutch bucket group they are
1834 * part of. They contribute to the load and use priority shifts/decay values from the
1835 * clutch bucket group.
1836 */
1837
1838static boolean_t
1839sched_edge_bound_thread_insert(
1840 sched_clutch_root_t root_clutch,
1841 thread_t thread,
1842 integer_t options)
1843{
1844 /* Update the clutch runnable count and priority */
1845 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1846 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1847 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1848 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1849 }
1850
1851 assert((thread->th_bound_cluster_enqueued) == false);
1852 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1853 thread->th_bound_cluster_enqueued = true;
1854
1855 int16_t root_old_pri = root_clutch->scr_priority;
1856 sched_clutch_root_pri_update(root_clutch);
1857 return root_clutch->scr_priority > root_old_pri;
1858}
1859
1860static void
1861sched_edge_bound_thread_remove(
1862 sched_clutch_root_t root_clutch,
1863 thread_t thread)
1864{
1865 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1866 assert((thread->th_bound_cluster_enqueued) == true);
1867 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1868 thread->th_bound_cluster_enqueued = false;
1869
1870 /* Update the clutch runnable count and priority */
1871 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1872 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1873 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1874 }
1875 sched_clutch_root_pri_update(root_clutch);
1876}
1877
1878#endif /* CONFIG_SCHED_EDGE */
1879
1880/*
1881 * sched_clutch_thread_bound_lookup()
1882 *
1883 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
1884 */
1885static thread_t
1886sched_clutch_thread_bound_lookup(
1887 __unused sched_clutch_root_t root_clutch,
1888 sched_clutch_root_bucket_t root_bucket)
1889{
1890 return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
1891}
1892
1893/*
1894 * Clutch Bucket Group Thread Counts and Pending time calculation
1895 *
1896 * The pending time on the clutch_bucket_group allows the scheduler to track if it
1897 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
1898 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
1899 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
1900 * pending time is advanced to the time of thread selection.
1901 *
1902 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
 1903 * simultaneously, it is important that the updates to thread counts and pending timestamps
1904 * happen atomically. The implementation relies on the following aspects to make that work
1905 * as expected:
1906 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
1907 * is held when threads are added/removed and pending timestamps are updated
1908 * - The edge scheduler would have support for double wide 128 bit atomics which allow the
1909 * thread count and pending timestamp to be updated atomically.
1910 *
1911 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
1912 * above to atomically update the interactivity score for a clutch bucket group.
1913 */
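/*
 * Sketch of the counter/timestamp packing assumed by the rmw loops below (the
 * authoritative definition lives in the sched_clutch header; field names match
 * the accessors used in this file). Packing both fields into one double-wide
 * value is what lets a single atomic keep them consistent:
 *
 *     typedef union {
 *         struct {
 *             uint64_t scct_count;      // runnable / pending thread count
 *             uint64_t scct_timestamp;  // timestamp paired with that count
 *         };
 *         __uint128_t scct_packed;      // updated via os_atomic_rmw_loop()
 *     } sched_clutch_counter_time_t;
 */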
1914
1915#if CONFIG_SCHED_EDGE
1916
1917static void
1918sched_clutch_bucket_group_thr_count_inc(
1919 sched_clutch_bucket_group_t clutch_bucket_group,
1920 uint64_t timestamp)
1921{
1922 sched_clutch_counter_time_t old_pending_data;
1923 sched_clutch_counter_time_t new_pending_data;
1924 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
1925 new_pending_data.scct_count = old_pending_data.scct_count + 1;
1926 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
1927 if (old_pending_data.scct_count == 0) {
1928 new_pending_data.scct_timestamp = timestamp;
1929 }
1930 });
1931}
1932
1933static void
1934sched_clutch_bucket_group_thr_count_dec(
1935 sched_clutch_bucket_group_t clutch_bucket_group,
1936 uint64_t timestamp)
1937{
1938 sched_clutch_counter_time_t old_pending_data;
1939 sched_clutch_counter_time_t new_pending_data;
1940 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
1941 new_pending_data.scct_count = old_pending_data.scct_count - 1;
1942 if (new_pending_data.scct_count == 0) {
1943 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1944 } else {
1945 new_pending_data.scct_timestamp = timestamp;
1946 }
1947 });
1948}
1949
1950static uint8_t
1951sched_clutch_bucket_group_pending_ageout(
1952 sched_clutch_bucket_group_t clutch_bucket_group,
1953 uint64_t timestamp)
1954{
1955 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
1956 sched_clutch_counter_time_t old_pending_data;
1957 sched_clutch_counter_time_t new_pending_data;
1958 uint8_t cpu_usage_shift = 0;
1959
1960 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
1961 cpu_usage_shift = 0;
1962 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
1963 bool old_update = (old_pending_ts >= timestamp);
1964 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
1965 bool no_bucket_load = (bucket_load == 0);
1966 if (old_update || no_pending_time || no_bucket_load) {
1967 os_atomic_rmw_loop_give_up();
1968 }
1969
1970 /* Calculate the time the clutch bucket group has been pending */
1971 uint64_t pending_delta = timestamp - old_pending_ts;
1972 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
1973 if (pending_delta < interactivity_delta) {
1974 os_atomic_rmw_loop_give_up();
1975 }
1976 cpu_usage_shift = (pending_delta / interactivity_delta);
1977 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
1978 new_pending_data.scct_count = old_pending_data.scct_count;
1979 });
1980 return cpu_usage_shift;
1981}
1982
1983static uint8_t
1984sched_clutch_bucket_group_interactivity_score_calculate(
1985 sched_clutch_bucket_group_t clutch_bucket_group,
1986 uint64_t timestamp)
1987{
1988 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1989 /*
1990 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
1991 * priorities, make sure all AboveUI buckets are marked interactive.
1992 */
1993 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
1994 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1995 }
1996 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
1997 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
1998 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
1999 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2000 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2001 sched_clutch_counter_time_t old_interactivity_data;
2002 sched_clutch_counter_time_t new_interactivity_data;
2003
2004 bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2005 if (old_interactivity_data.scct_timestamp >= timestamp) {
2006 os_atomic_rmw_loop_give_up();
2007 }
2008 new_interactivity_data.scct_timestamp = timestamp;
2009 if (old_interactivity_data.scct_timestamp != 0) {
2010 new_interactivity_data.scct_count = interactivity_score;
2011 }
2012 });
2013 if (score_updated) {
2014 return (uint8_t)new_interactivity_data.scct_count;
2015 } else {
2016 return (uint8_t)old_interactivity_data.scct_count;
2017 }
2018}
2019
2020#else /* CONFIG_SCHED_EDGE */
2021
2022/*
2023 * For the clutch scheduler, atomicity is ensured by making sure all operations
2024 * are happening under the pset lock of the only cluster present on the platform.
2025 */
2026static void
2027sched_clutch_bucket_group_thr_count_inc(
2028 sched_clutch_bucket_group_t clutch_bucket_group,
2029 uint64_t timestamp)
2030{
2031 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2032 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2033 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2034 }
2035 clutch_bucket_group->scbg_pending_data.scct_count++;
2036}
2037
2038static void
2039sched_clutch_bucket_group_thr_count_dec(
2040 sched_clutch_bucket_group_t clutch_bucket_group,
2041 uint64_t timestamp)
2042{
2043 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2044 clutch_bucket_group->scbg_pending_data.scct_count--;
2045 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2046 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2047 } else {
2048 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2049 }
2050}
2051
2052static uint8_t
2053sched_clutch_bucket_group_pending_ageout(
2054 sched_clutch_bucket_group_t clutch_bucket_group,
2055 uint64_t timestamp)
2056{
2057 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2058 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2059 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2060 bool old_update = (old_pending_ts >= timestamp);
2061 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2062 bool no_bucket_load = (bucket_load == 0);
2063 if (old_update || no_pending_time || no_bucket_load) {
2064 return 0;
2065 }
2066 uint64_t pending_delta = timestamp - old_pending_ts;
2067 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2068 if (pending_delta < interactivity_delta) {
2069 return 0;
2070 }
2071 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2072 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2073 return cpu_usage_shift;
2074}
2075
2076static uint8_t
2077sched_clutch_bucket_group_interactivity_score_calculate(
2078 sched_clutch_bucket_group_t clutch_bucket_group,
2079 uint64_t timestamp)
2080{
2081 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2082 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2083 /*
2084 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2085 * priorities, make sure all AboveUI buckets are marked interactive.
2086 */
2087 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2088 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2089 }
2090 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2091 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2092 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2093 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2094 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2095 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2096 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2097 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2098 return interactivity_score;
2099 } else {
2100 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2101 }
2102}
2103
2104#endif /* CONFIG_SCHED_EDGE */
2105
2106/*
2107 * Clutch Bucket Group Run Count and Blocked Time Accounting
2108 *
2109 * The clutch bucket group maintains the number of runnable/running threads in the group.
2110 * Since the blocked time of the clutch bucket group is based on this count, it is
2111 * important to make sure the blocking timestamp and the run count are updated atomically.
2112 *
2113 * Since the run count increments happen without any pset locks held, the scheduler makes
2114 * these updates atomic in the following way:
2115 * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp
2116 * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update
2117 */
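/*
 * Illustrative walk-through: when the last runnable thread of a clutch bucket
 * group blocks, the run count drops to 0 and the blocked timestamp is armed in
 * the same atomic update. When a thread in the group becomes runnable again,
 * the elapsed blocked time (capped at the adjust threshold) is credited to
 * scbcd_cpu_blocked before the count returns to 1; that blocked time is what
 * feeds the interactivity score calculation above.
 */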
2118
2119#if !__LP64__
2120
2121static uint32_t
2122sched_clutch_bucket_group_run_count_inc(
2123 sched_clutch_bucket_group_t clutch_bucket_group)
2124{
2125 uint64_t blocked_time = 0;
2126 uint64_t updated_run_count = 0;
2127
2128 lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2129 if ((clutch_bucket_group->scbg_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID) &&
2130 (clutch_bucket_group->scbg_blocked_data.scct_count == 0)) {
2131 /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */
2132 blocked_time = mach_absolute_time() - clutch_bucket_group->scbg_blocked_data.scct_timestamp;
2133 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2134 }
2135 clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count + 1;
2136 updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2137 lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2138
2139 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2140 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2141 return (uint32_t)updated_run_count;
2142}
2143
2144static uint32_t
2145sched_clutch_bucket_group_run_count_dec(
2146 sched_clutch_bucket_group_t clutch_bucket_group)
2147{
2148 uint64_t updated_run_count = 0;
2149
2150 lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2151 clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count - 1;
2152 if (clutch_bucket_group->scbg_blocked_data.scct_count == 0) {
2153 /* Run count is transitioning from 1 to 0; start the blocked timer */
2154 clutch_bucket_group->scbg_blocked_data.scct_timestamp = mach_absolute_time();
2155 }
2156 updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2157 lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2158 return (uint32_t)updated_run_count;
2159}
2160
2161#else /* !__LP64__ */
2162
2163static uint32_t
2164sched_clutch_bucket_group_run_count_inc(
2165 sched_clutch_bucket_group_t clutch_bucket_group)
2166{
2167 sched_clutch_counter_time_t old_blocked_data;
2168 sched_clutch_counter_time_t new_blocked_data;
2169
2170 bool update_blocked_time = false;
2171 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2172 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2173 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2174 update_blocked_time = false;
2175 if (old_blocked_data.scct_count == 0) {
2176 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2177 update_blocked_time = true;
2178 }
2179 });
2180 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2181 uint64_t ctime = mach_absolute_time();
2182 if (ctime > old_blocked_data.scct_timestamp) {
2183 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2184 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2185 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2186 }
2187 }
2188 return (uint32_t)new_blocked_data.scct_count;
2189}
2190
2191static uint32_t
2192sched_clutch_bucket_group_run_count_dec(
2193 sched_clutch_bucket_group_t clutch_bucket_group)
2194{
2195 sched_clutch_counter_time_t old_blocked_data;
2196 sched_clutch_counter_time_t new_blocked_data;
2197
2198 uint64_t ctime = mach_absolute_time();
2199 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2200 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2201 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2202 if (new_blocked_data.scct_count == 0) {
2203 new_blocked_data.scct_timestamp = ctime;
2204 }
2205 });
2206 return (uint32_t)new_blocked_data.scct_count;
2207}
2208
2209#endif /* !__LP64__ */
2210
cb323159
A
2211/*
2212 * sched_clutch_thread_insert()
2213 *
2214 * Routine to insert a thread into the sched clutch hierarchy.
2215 * Update the counts at all levels of the hierarchy and insert the nodes
2216 * as they become runnable. Always called with the pset lock held.
2217 */
2218static boolean_t
2219sched_clutch_thread_insert(
2220 sched_clutch_root_t root_clutch,
2221 thread_t thread,
2222 integer_t options)
2223{
2224 boolean_t result = FALSE;
2225
2226 sched_clutch_hierarchy_locked_assert(root_clutch);
f427ee49
A
2227#if CONFIG_SCHED_EDGE
2228 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2229 /*
2230 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2231 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2232 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2233 * de-recommended. In that case, the thread should be treated as an unbound
2234 * thread.
2235 */
2236 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2237 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2238 }
2239#endif /* CONFIG_SCHED_EDGE */
cb323159
A
2240 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2241 assert(thread->thread_group == clutch->sc_tg);
2242
2243 uint64_t current_timestamp = mach_absolute_time();
f427ee49
A
2244 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2245 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
cb323159
A
2246 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2247
f427ee49
A
2248 /*
2249 * Thread linkage in clutch_bucket
2250 *
2251 * A thread has a few linkages within the clutch bucket:
2252 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2253 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2254 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2255 */
2256
2257 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2258 thread->th_clutch_runq_link.stamp = current_timestamp;
2259 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2260 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2261 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2262
2263 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2264 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2265 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2266 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2267
2268 /* Insert thread into timesharing queue of the clutch bucket */
2269 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2270
cb323159
A
2271 /* Increment the urgency counter for the root if necessary */
2272 sched_clutch_root_urgency_inc(root_clutch, thread);
2273
cb323159 2274 os_atomic_inc(&clutch->sc_thr_count, relaxed);
f427ee49 2275 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
cb323159 2276
ea3f0419
A
2277 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2278 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
cb323159
A
2279 if (clutch_bucket->scb_thr_count == 0) {
2280 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2281 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
ea3f0419 2282 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
cb323159
A
2283 } else {
2284 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2285 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
ea3f0419 2286 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
cb323159 2287 }
f427ee49
A
2288
2289 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2290 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2291 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
cb323159
A
2292 return result;
2293}
2294
2295/*
2296 * sched_clutch_thread_remove()
2297 *
2298 * Routine to remove a thread from the sched clutch hierarchy.
2299 * Update the counts at all levels of the hierarchy and remove the nodes
2300 * as they become empty. Always called with the pset lock held.
2301 */
2302static void
2303sched_clutch_thread_remove(
2304 sched_clutch_root_t root_clutch,
2305 thread_t thread,
ea3f0419
A
2306 uint64_t current_timestamp,
2307 sched_clutch_bucket_options_t options)
cb323159
A
2308{
2309 sched_clutch_hierarchy_locked_assert(root_clutch);
f427ee49
A
2310#if CONFIG_SCHED_EDGE
2311 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2312 if (thread->th_bound_cluster_enqueued) {
2313 sched_edge_bound_thread_remove(root_clutch, thread);
2314 return;
2315 }
2316#endif /* CONFIG_SCHED_EDGE */
cb323159
A
2317 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2318 assert(thread->thread_group == clutch->sc_tg);
2319 assert(thread->runq != PROCESSOR_NULL);
2320
f427ee49
A
2321 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2322 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
cb323159
A
2323 assert(clutch_bucket->scb_root == root_clutch);
2324
2325 /* Decrement the urgency counter for the root if necessary */
2326 sched_clutch_root_urgency_dec(root_clutch, thread);
2327 /* Remove thread from the clutch_bucket */
f427ee49
A
2328 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2329 remqueue(&thread->th_clutch_timeshare_link);
2330 thread->runq = PROCESSOR_NULL;
cb323159 2331
f427ee49 2332 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
cb323159
A
2333
2334 /* Update counts at various levels of the hierarchy */
2335 os_atomic_dec(&clutch->sc_thr_count, relaxed);
f427ee49 2336 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
cb323159
A
2337 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2338 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2339
2340 /* Remove the clutch from hierarchy (if needed) and update properties */
2341 if (clutch_bucket->scb_thr_count == 0) {
ea3f0419 2342 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
cb323159 2343 } else {
ea3f0419 2344 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
cb323159 2345 }
f427ee49
A
2346 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2347 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2348 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2349}
2350
2351/*
2352 * sched_clutch_thread_unbound_lookup()
2353 *
2354 * Routine to find the highest unbound thread in the root clutch.
2355 * Helps find threads easily for steal/migrate scenarios in the
2356 * Edge scheduler.
2357 */
2358static thread_t
2359sched_clutch_thread_unbound_lookup(
2360 sched_clutch_root_t root_clutch,
2361 sched_clutch_root_bucket_t root_bucket)
2362{
2363 sched_clutch_hierarchy_locked_assert(root_clutch);
2364
2365 /* Find the highest priority clutch bucket in this root bucket */
2366 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2367 assert(clutch_bucket != NULL);
2368
2369 /* Find the highest priority runnable thread in this clutch bucket */
2370 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2371 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2372 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2373 return thread;
cb323159
A
2374}
2375
2376/*
f427ee49 2377 * sched_clutch_thread_highest_remove()
cb323159
A
2378 *
2379 * Routine to find and remove the highest priority thread
2380 * from the sched clutch hierarchy. The algorithm looks at the
2381 * hierarchy for the most eligible runnable thread and calls
2382 * sched_clutch_thread_remove(). Always called with the
2383 * pset lock held.
2384 */
2385static thread_t
f427ee49 2386sched_clutch_thread_highest_remove(
cb323159
A
2387 sched_clutch_root_t root_clutch)
2388{
2389 sched_clutch_hierarchy_locked_assert(root_clutch);
2390 uint64_t current_timestamp = mach_absolute_time();
2391
f427ee49 2392 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
cb323159
A
2393 if (root_bucket == NULL) {
2394 return THREAD_NULL;
2395 }
cb323159 2396
f427ee49
A
2397 thread_t highest_thread = THREAD_NULL;
2398 if (root_bucket->scrb_bound) {
2399 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2400 } else {
2401 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2402 }
2403 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2404 return highest_thread;
cb323159
A
2405}
2406
cb323159
A
2407/* High level global accessor routines */
2408
2409/*
2410 * sched_clutch_root_urgency()
2411 *
2412 * Routine to get the urgency of the highest runnable
2413 * thread in the hierarchy.
2414 */
2415static uint32_t
2416sched_clutch_root_urgency(
2417 sched_clutch_root_t root_clutch)
2418{
2419 return root_clutch->scr_urgency;
2420}
2421
2422/*
2423 * sched_clutch_root_count_sum()
2424 *
2425 * The count_sum mechanism is used for scheduler runq
 2426 * statistics calculation. It's only useful for debugging
 2427 * purposes; since it takes a mach_absolute_time() on
 2428 * other scheduler implementations, it's better to avoid
2429 * populating this until absolutely necessary.
2430 */
2431static uint32_t
2432sched_clutch_root_count_sum(
2433 __unused sched_clutch_root_t root_clutch)
2434{
2435 return 0;
2436}
2437
2438/*
2439 * sched_clutch_root_priority()
2440 *
2441 * Routine to get the priority of the highest runnable
2442 * thread in the hierarchy.
2443 */
2444static int
2445sched_clutch_root_priority(
2446 sched_clutch_root_t root_clutch)
2447{
2448 return root_clutch->scr_priority;
2449}
2450
2451/*
2452 * sched_clutch_root_count()
2453 *
2454 * Returns total number of runnable threads in the hierarchy.
2455 */
2456uint32_t
2457sched_clutch_root_count(
2458 sched_clutch_root_t root_clutch)
2459{
2460 return root_clutch->scr_thr_count;
2461}
2462
f427ee49
A
2463#if CONFIG_SCHED_EDGE
2464
2465/*
2466 * sched_clutch_root_foreign_empty()
2467 *
2468 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2469 */
2470static boolean_t
2471sched_clutch_root_foreign_empty(
2472 sched_clutch_root_t root_clutch)
2473{
2474 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2475}
2476
2477/*
2478 * sched_clutch_root_highest_foreign_thread_remove()
2479 *
2480 * Routine to return the thread in the highest priority clutch bucket in a cluster.
2481 * Must be called with the pset for the cluster locked.
2482 */
2483static thread_t
2484sched_clutch_root_highest_foreign_thread_remove(
2485 sched_clutch_root_t root_clutch)
2486{
2487 thread_t thread = THREAD_NULL;
2488 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2489 return thread;
2490 }
2491 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2492 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2493 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2494 return thread;
2495}
2496
2497#endif /* CONFIG_SCHED_EDGE */
2498
cb323159
A
2499/*
2500 * sched_clutch_thread_pri_shift()
2501 *
2502 * Routine to get the priority shift value for a thread.
 2503 * Since the timesharing is done at the clutch bucket group level,
 2504 * this routine gets the clutch_bucket_group and retrieves the
 2505 * pri_shift value from there.
2506 */
2507uint32_t
2508sched_clutch_thread_pri_shift(
2509 thread_t thread,
2510 sched_bucket_t bucket)
2511{
2512 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
f427ee49 2513 return INT8_MAX;
cb323159
A
2514 }
2515 assert(bucket != TH_BUCKET_RUN);
2516 sched_clutch_t clutch = sched_clutch_for_thread(thread);
f427ee49
A
2517 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2518 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
cb323159
A
2519}
2520
2521#pragma mark -- Clutch Scheduler Algorithm
2522
2523static void
2524sched_clutch_init(void);
2525
cb323159
A
2526static thread_t
2527sched_clutch_steal_thread(processor_set_t pset);
2528
2529static void
2530sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2531
2532static boolean_t
2533sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2534 sched_options_t options);
2535
2536static boolean_t
2537sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2538
2539static ast_t
2540sched_clutch_processor_csw_check(processor_t processor);
2541
2542static boolean_t
2543sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2544
2545static int
2546sched_clutch_runq_count(processor_t processor);
2547
2548static boolean_t
2549sched_clutch_processor_queue_empty(processor_t processor);
2550
2551static uint64_t
2552sched_clutch_runq_stats_count_sum(processor_t processor);
2553
2554static int
2555sched_clutch_processor_bound_count(processor_t processor);
2556
2557static void
2558sched_clutch_pset_init(processor_set_t pset);
2559
2560static void
2561sched_clutch_processor_init(processor_t processor);
2562
2563static thread_t
2564sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2565
2566static void
2567sched_clutch_processor_queue_shutdown(processor_t processor);
2568
2569static sched_mode_t
2570sched_clutch_initial_thread_sched_mode(task_t parent_task);
2571
2572static uint32_t
2573sched_clutch_initial_quantum_size(thread_t thread);
2574
2575static bool
2576sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
2577
2578static uint32_t
2579sched_clutch_run_incr(thread_t thread);
2580
2581static uint32_t
2582sched_clutch_run_decr(thread_t thread);
2583
2584static void
2585sched_clutch_update_thread_bucket(thread_t thread);
2586
2587const struct sched_dispatch_table sched_clutch_dispatch = {
2588 .sched_name = "clutch",
2589 .init = sched_clutch_init,
f427ee49 2590 .timebase_init = sched_timeshare_timebase_init,
2591 .processor_init = sched_clutch_processor_init,
2592 .pset_init = sched_clutch_pset_init,
2593 .maintenance_continuation = sched_timeshare_maintenance_continue,
2594 .choose_thread = sched_clutch_choose_thread,
2595 .steal_thread_enabled = sched_steal_thread_enabled,
2596 .steal_thread = sched_clutch_steal_thread,
2597 .compute_timeshare_priority = sched_compute_timeshare_priority,
f427ee49 2598 .choose_node = sched_choose_node,
2599 .choose_processor = choose_processor,
2600 .processor_enqueue = sched_clutch_processor_enqueue,
2601 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
2602 .processor_queue_remove = sched_clutch_processor_queue_remove,
2603 .processor_queue_empty = sched_clutch_processor_queue_empty,
2604 .priority_is_urgent = priority_is_urgent,
2605 .processor_csw_check = sched_clutch_processor_csw_check,
2606 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
2607 .initial_quantum_size = sched_clutch_initial_quantum_size,
2608 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2609 .can_update_priority = can_update_priority,
2610 .update_priority = update_priority,
2611 .lightweight_update_priority = lightweight_update_priority,
2612 .quantum_expire = sched_default_quantum_expire,
2613 .processor_runq_count = sched_clutch_runq_count,
2614 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2615 .processor_bound_count = sched_clutch_processor_bound_count,
2616 .thread_update_scan = sched_clutch_thread_update_scan,
2617 .multiple_psets_enabled = TRUE,
2618 .sched_groups_enabled = FALSE,
2619 .avoid_processor_enabled = TRUE,
2620 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
2621 .processor_balance = sched_SMT_balance,
2622
2623 .rt_runq = sched_rtlocal_runq,
2624 .rt_init = sched_rtlocal_init,
2625 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
2626 .rt_runq_scan = sched_rtlocal_runq_scan,
2627 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
2628
2629 .qos_max_parallelism = sched_qos_max_parallelism,
2630 .check_spill = sched_check_spill,
2631 .ipi_policy = sched_ipi_policy,
2632 .thread_should_yield = sched_thread_should_yield,
2633 .run_count_incr = sched_clutch_run_incr,
2634 .run_count_decr = sched_clutch_run_decr,
2635 .update_thread_bucket = sched_clutch_update_thread_bucket,
2636 .pset_made_schedulable = sched_pset_made_schedulable,
2637};
2638
2639__attribute__((always_inline))
2640static inline run_queue_t
2641sched_clutch_bound_runq(processor_t processor)
2642{
2643 return &processor->runq;
2644}
2645
2646__attribute__((always_inline))
2647static inline sched_clutch_root_t
2648sched_clutch_processor_root_clutch(processor_t processor)
2649{
2650 return &processor->processor_set->pset_clutch_root;
2651}
2652
2653__attribute__((always_inline))
2654static inline run_queue_t
2655sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2656{
2657 assert(thread->bound_processor == processor);
2658 return sched_clutch_bound_runq(processor);
2659}
2660
2661static uint32_t
2662sched_clutch_initial_quantum_size(thread_t thread)
2663{
2664 if (thread == THREAD_NULL) {
2665 return std_quantum;
2666 }
2667 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2668 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2669}
2670
2671static sched_mode_t
2672sched_clutch_initial_thread_sched_mode(task_t parent_task)
2673{
2674 if (parent_task == kernel_task) {
2675 return TH_MODE_FIXED;
2676 } else {
2677 return TH_MODE_TIMESHARE;
2678 }
2679}
2680
2681static void
2682sched_clutch_processor_init(processor_t processor)
2683{
2684 run_queue_init(&processor->runq);
2685}
2686
2687static void
2688sched_clutch_pset_init(processor_set_t pset)
2689{
2690 sched_clutch_root_init(&pset->pset_clutch_root, pset);
2691}
2692
2693static void
2694sched_clutch_tunables_init(void)
2695{
2696 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2697 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2698 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2699 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2700 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2701 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2702 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2703}
2704
2705static void
2706sched_clutch_init(void)
2707{
2708 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2709 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2710 }
2711 sched_timeshare_init();
f427ee49 2712 sched_clutch_tunables_init();
2713}
2714
2715static thread_t
2716sched_clutch_choose_thread(
2717 processor_t processor,
2718 int priority,
2719 __unused ast_t reason)
2720{
2721 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2722 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2723 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2724 boolean_t choose_from_boundq = false;
2725
2726 if (bound_runq->highq < priority &&
2727 clutch_pri < priority) {
2728 return THREAD_NULL;
2729 }
2730
2731 if (bound_runq->count && clutch_count) {
2732 if (bound_runq->highq >= clutch_pri) {
2733 choose_from_boundq = true;
2734 }
2735 } else if (bound_runq->count) {
2736 choose_from_boundq = true;
2737 } else if (clutch_count) {
2738 choose_from_boundq = false;
2739 } else {
2740 return THREAD_NULL;
2741 }
2742
2743 thread_t thread = THREAD_NULL;
2744 if (choose_from_boundq == false) {
2745 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
f427ee49 2746 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2747 } else {
2748 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2749 }
2750 return thread;
2751}
2752
2753static boolean_t
2754sched_clutch_processor_enqueue(
2755 processor_t processor,
2756 thread_t thread,
2757 sched_options_t options)
2758{
2759 boolean_t result;
2760
2761 thread->runq = processor;
2762 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2763 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2764 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2765 } else {
2766 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2767 result = run_queue_enqueue(rq, thread, options);
2768 }
2769 return result;
2770}
2771
2772static boolean_t
2773sched_clutch_processor_queue_empty(processor_t processor)
2774{
2775 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2776 sched_clutch_bound_runq(processor)->count == 0;
2777}
2778
2779static ast_t
2780sched_clutch_processor_csw_check(processor_t processor)
2781{
2782 boolean_t has_higher;
2783 int pri;
2784
2785 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
2786 return AST_PREEMPT | AST_URGENT;
2787 }
2788
2789 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2790 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2791
2792 assert(processor->active_thread != NULL);
2793
2794 pri = MAX(clutch_pri, bound_runq->highq);
2795
2796 if (processor->first_timeslice) {
2797 has_higher = (pri > processor->current_pri);
2798 } else {
2799 has_higher = (pri >= processor->current_pri);
2800 }
2801
2802 if (has_higher) {
2803 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2804 return AST_PREEMPT | AST_URGENT;
2805 }
2806
2807 if (bound_runq->urgency > 0) {
2808 return AST_PREEMPT | AST_URGENT;
2809 }
2810
2811 return AST_PREEMPT;
2812 }
2813
2814 return AST_NONE;
2815}
2816
2817static boolean_t
2818sched_clutch_processor_queue_has_priority(processor_t processor,
2819 int priority,
2820 boolean_t gte)
2821{
2822 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2823
2824 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2825
2826 if (gte) {
2827 return qpri >= priority;
2828 } else {
2829 return qpri > priority;
2830 }
2831}
2832
2833static int
2834sched_clutch_runq_count(processor_t processor)
2835{
2836 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
2837}
2838
2839static uint64_t
2840sched_clutch_runq_stats_count_sum(processor_t processor)
2841{
2842 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
2843
2844 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
2845 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
2846 } else {
2847 return bound_sum;
2848 }
2849}
2850static int
2851sched_clutch_processor_bound_count(processor_t processor)
2852{
2853 return sched_clutch_bound_runq(processor)->count;
2854}
2855
2856static void
2857sched_clutch_processor_queue_shutdown(processor_t processor)
2858{
2859 processor_set_t pset = processor->processor_set;
2860 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2861 thread_t thread;
2862 queue_head_t tqueue;
2863
2864 /* We only need to migrate threads if this is the last active processor in the pset */
2865 if (pset->online_processor_count > 0) {
2866 pset_unlock(pset);
2867 return;
2868 }
2869
2870 queue_init(&tqueue);
2871 while (sched_clutch_root_count(pset_clutch_root) > 0) {
f427ee49 2872 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2873 enqueue_tail(&tqueue, &thread->runq_links);
2874 }
2875
2876 pset_unlock(pset);
2877
2878 qe_foreach_element_safe(thread, &tqueue, runq_links) {
2879 remqueue(&thread->runq_links);
cb323159 2880 thread_lock(thread);
cb323159 2881 thread_setrun(thread, SCHED_TAILQ);
2882 thread_unlock(thread);
2883 }
2884}
2885
2886static boolean_t
2887sched_clutch_processor_queue_remove(
2888 processor_t processor,
2889 thread_t thread)
2890{
2891 run_queue_t rq;
2892 processor_set_t pset = processor->processor_set;
2893
2894 pset_lock(pset);
2895
2896 if (processor == thread->runq) {
2897 /*
2898 * Thread is on a run queue and we have a lock on
2899 * that run queue.
2900 */
2901 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2902 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
ea3f0419 2903 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2904 } else {
2905 rq = sched_clutch_thread_bound_runq(processor, thread);
2906 run_queue_remove(rq, thread);
2907 }
2908 } else {
2909 /*
2910 * The thread left the run queue before we could
2911 * lock the run queue.
2912 */
2913 assert(thread->runq == PROCESSOR_NULL);
2914 processor = PROCESSOR_NULL;
2915 }
2916
2917 pset_unlock(pset);
2918
2919 return processor != PROCESSOR_NULL;
2920}
2921
2922static thread_t
f427ee49 2923sched_clutch_steal_thread(__unused processor_set_t pset)
cb323159 2924{
f427ee49 2925 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
2926 return THREAD_NULL;
2927}
2928
2929static void
2930sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
2931{
2932 boolean_t restart_needed = FALSE;
2933 processor_t processor = processor_list;
2934 processor_set_t pset;
2935 thread_t thread;
2936 spl_t s;
2937
2938 /*
2939 * We update the threads associated with each processor (bound and idle threads)
2940 * and then update the threads in each pset runqueue.
2941 */
2942
2943 do {
2944 do {
2945 pset = processor->processor_set;
2946
2947 s = splsched();
2948 pset_lock(pset);
2949
2950 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
2951
2952 pset_unlock(pset);
2953 splx(s);
2954
2955 if (restart_needed) {
2956 break;
2957 }
2958
2959 thread = processor->idle_thread;
2960 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
2961 if (thread_update_add_thread(thread) == FALSE) {
2962 restart_needed = TRUE;
2963 break;
2964 }
2965 }
2966 } while ((processor = processor->processor_list) != NULL);
2967
2968 /* Ok, we now have a collection of candidates -- fix them. */
2969 thread_update_process_threads();
2970 } while (restart_needed);
2971
2972 pset_node_t node = &pset_node0;
2973 pset = node->psets;
2974
2975 do {
2976 do {
2977 restart_needed = FALSE;
2978 while (pset != NULL) {
2979 s = splsched();
2980 pset_lock(pset);
2981
2982 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
2983 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
2984 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
2985 if (restart_needed) {
2986 break;
2987 }
cb323159 2988 }
2989 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
2990 sched_clutch_bucket_t clutch_bucket;
2991 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
2992 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
2993 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
2994 }
2995 }
2996
2997 pset_unlock(pset);
2998 splx(s);
2999
3000 if (restart_needed) {
3001 break;
cb323159 3002 }
f427ee49 3003 pset = pset->pset_list;
3004 }
3005
3006 if (restart_needed) {
3007 break;
3008 }
f427ee49 3009 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3010
3011 /* Ok, we now have a collection of candidates -- fix them. */
3012 thread_update_process_threads();
3013 } while (restart_needed);
3014}
3015
3016extern int sched_allow_rt_smt;
3017
3018/* Return true if this thread should not continue running on this processor */
3019static bool
3020sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
3021{
3022 if (processor->processor_primary != processor) {
3023 /*
3024 * This is a secondary SMT processor. If the primary is running
3025 * a realtime thread, only allow realtime threads on the secondary.
3026 */
3027 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3028 return true;
3029 }
3030 }
3031
3032 return false;
3033}
3034
3035/*
3036 * For the clutch scheduler, the run counts are maintained in the clutch
 3037 * buckets (i.e. the thread group scheduling structure).
3038 */
3039static uint32_t
3040sched_clutch_run_incr(thread_t thread)
3041{
3042 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3043 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3044 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3045 return new_count;
3046}
3047
3048static uint32_t
3049sched_clutch_run_decr(thread_t thread)
3050{
3051 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3052 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3053 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3054 return new_count;
3055}
3056
3057static sched_bucket_t
3058sched_convert_pri_to_bucket(uint8_t priority)
3059{
3060 sched_bucket_t bucket = TH_BUCKET_RUN;
3061
3062 if (priority > BASEPRI_USER_INITIATED) {
3063 bucket = TH_BUCKET_SHARE_FG;
3064 } else if (priority > BASEPRI_DEFAULT) {
3065 bucket = TH_BUCKET_SHARE_IN;
3066 } else if (priority > BASEPRI_UTILITY) {
3067 bucket = TH_BUCKET_SHARE_DF;
3068 } else if (priority > MAXPRI_THROTTLE) {
3069 bucket = TH_BUCKET_SHARE_UT;
3070 } else {
3071 bucket = TH_BUCKET_SHARE_BG;
3072 }
3073 return bucket;
3074}
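/*
 * Worked example (assuming the usual XNU priority band constants,
 * BASEPRI_USER_INITIATED = 37, BASEPRI_DEFAULT = 31, BASEPRI_UTILITY = 20,
 * MAXPRI_THROTTLE = 4):
 *
 *   sched_convert_pri_to_bucket(46);  // > 37           -> TH_BUCKET_SHARE_FG
 *   sched_convert_pri_to_bucket(37);  // > 31, not > 37 -> TH_BUCKET_SHARE_IN
 *   sched_convert_pri_to_bucket(31);  // > 20           -> TH_BUCKET_SHARE_DF
 *   sched_convert_pri_to_bucket(20);  // > 4            -> TH_BUCKET_SHARE_UT
 *   sched_convert_pri_to_bucket(4);   // otherwise      -> TH_BUCKET_SHARE_BG
 */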
3075
3076/*
3077 * For threads that have changed sched_pri without changing the
3078 * base_pri for any reason other than decay, use the sched_pri
3079 * as the bucketizing priority instead of base_pri. All such
 3080 * changes are typically due to boosts or demotions caused by
 3081 * kernel locking primitives.
3082 */
3083static boolean_t
3084sched_thread_sched_pri_promoted(thread_t thread)
3085{
3086 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3087 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3088 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3089 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3090 (thread->kern_promotion_schedpri != 0);
3091}
3092
3093/*
3094 * Routine to update the scheduling bucket for the thread.
3095 *
3096 * In the clutch scheduler implementation, the thread's bucket
3097 * is based on sched_pri if it was promoted due to a kernel
 3098 * primitive; otherwise it is based on the thread's base_pri. This
3099 * enhancement allows promoted threads to reach a higher priority
3100 * bucket and potentially get selected sooner for scheduling.
3101 *
3102 * Also, the clutch scheduler does not honor fixed priority below
3103 * FG priority. It simply puts those threads in the corresponding
 3104 * timeshare bucket. The reason for doing this is that it is
3105 * extremely hard to define the scheduling properties of such threads
3106 * and they typically lead to performance issues.
3107 */
3108
3109void
3110sched_clutch_update_thread_bucket(thread_t thread)
3111{
3112 sched_bucket_t old_bucket = thread->th_sched_bucket;
3113 sched_bucket_t new_bucket = TH_BUCKET_RUN;
3114 assert(thread->runq == PROCESSOR_NULL);
3115 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3116
3117 switch (thread->sched_mode) {
3118 case TH_MODE_FIXED:
3119 if (pri >= BASEPRI_FOREGROUND) {
3120 new_bucket = TH_BUCKET_FIXPRI;
3121 } else {
3122 new_bucket = sched_convert_pri_to_bucket(pri);
3123 }
3124 break;
3125
3126 case TH_MODE_REALTIME:
3127 new_bucket = TH_BUCKET_FIXPRI;
3128 break;
3129
3130 case TH_MODE_TIMESHARE:
3131 new_bucket = sched_convert_pri_to_bucket(pri);
3132 break;
3133
3134 default:
3135 panic("unexpected mode: %d", thread->sched_mode);
3136 break;
3137 }
3138
3139 if (old_bucket == new_bucket) {
3140 return;
3141 }
3142
3143 thread->th_sched_bucket = new_bucket;
3144 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3145 /*
3146 * Since this is called after the thread has been removed from the runq,
3147 * only the run counts need to be updated. The re-insert into the runq
3148 * would put the thread into the correct new bucket's runq.
3149 */
3150 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3151 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3152 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3153 }
3154}
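/*
 * Illustrative example: a TH_MODE_TIMESHARE thread with base_pri 31 that gets
 * promoted to sched_pri 80 while holding a kernel mutex is bucketized by its
 * promoted priority, so it moves from TH_BUCKET_SHARE_DF to TH_BUCKET_SHARE_FG
 * until the promotion is dropped and this routine runs again.
 */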
3155
f427ee49 3156#if CONFIG_SCHED_EDGE
3157
3158/* Implementation of the AMP version of the clutch scheduler */
3159
3160static void
3161sched_edge_init(void);
3162
c6bf4f31 3163static thread_t
f427ee49 3164sched_edge_processor_idle(processor_set_t pset);
3165
3166static ast_t
f427ee49 3167sched_edge_processor_csw_check(processor_t processor);
3168
3169static boolean_t
f427ee49 3170sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3171
3172static boolean_t
f427ee49 3173sched_edge_processor_queue_empty(processor_t processor);
3174
3175static thread_t
f427ee49 3176sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3177
3178static void
f427ee49 3179sched_edge_processor_queue_shutdown(processor_t processor);
3180
3181static processor_t
f427ee49 3182sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3183
3184static bool
f427ee49 3185sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);
c6bf4f31 3186
3187static void
3188sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3189
3190static void
3191sched_edge_check_spill(processor_set_t pset, thread_t thread);
3192
3193static bool
3194sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3195
3196static void
3197sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3198
3199static bool
3200sched_edge_steal_thread_enabled(processor_set_t pset);
3201
3202static sched_ipi_type_t
3203sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3204
3205static uint32_t
3206sched_edge_qos_max_parallelism(int qos, uint64_t options);
c6bf4f31 3207
3208const struct sched_dispatch_table sched_edge_dispatch = {
3209 .sched_name = "edge",
3210 .init = sched_edge_init,
3211 .timebase_init = sched_timeshare_timebase_init,
3212 .processor_init = sched_clutch_processor_init,
3213 .pset_init = sched_clutch_pset_init,
3214 .maintenance_continuation = sched_timeshare_maintenance_continue,
3215 .choose_thread = sched_edge_choose_thread,
3216 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3217 .steal_thread = sched_edge_processor_idle,
c6bf4f31 3218 .compute_timeshare_priority = sched_compute_timeshare_priority,
3219 .choose_node = sched_choose_node,
3220 .choose_processor = sched_edge_choose_processor,
c6bf4f31 3221 .processor_enqueue = sched_clutch_processor_enqueue,
f427ee49 3222 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
c6bf4f31 3223 .processor_queue_remove = sched_clutch_processor_queue_remove,
f427ee49 3224 .processor_queue_empty = sched_edge_processor_queue_empty,
c6bf4f31 3225 .priority_is_urgent = priority_is_urgent,
3226 .processor_csw_check = sched_edge_processor_csw_check,
3227 .processor_queue_has_priority = sched_edge_processor_queue_has_priority,
3228 .initial_quantum_size = sched_clutch_initial_quantum_size,
3229 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3230 .can_update_priority = can_update_priority,
3231 .update_priority = update_priority,
3232 .lightweight_update_priority = lightweight_update_priority,
3233 .quantum_expire = sched_default_quantum_expire,
3234 .processor_runq_count = sched_clutch_runq_count,
3235 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3236 .processor_bound_count = sched_clutch_processor_bound_count,
3237 .thread_update_scan = sched_clutch_thread_update_scan,
3238 .multiple_psets_enabled = TRUE,
3239 .sched_groups_enabled = FALSE,
3240 .avoid_processor_enabled = TRUE,
3241 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3242 .processor_balance = sched_edge_balance,
3243
3244 .rt_runq = sched_amp_rt_runq,
3245 .rt_init = sched_amp_rt_init,
3246 .rt_queue_shutdown = sched_amp_rt_queue_shutdown,
3247 .rt_runq_scan = sched_amp_rt_runq_scan,
3248 .rt_runq_count_sum = sched_amp_rt_runq_count_sum,
3249
3250 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3251 .check_spill = sched_edge_check_spill,
3252 .ipi_policy = sched_edge_ipi_policy,
3253 .thread_should_yield = sched_edge_thread_should_yield,
3254 .run_count_incr = sched_clutch_run_incr,
3255 .run_count_decr = sched_clutch_run_decr,
3256 .update_thread_bucket = sched_clutch_update_thread_bucket,
3257 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3258 .thread_group_recommendation_change = NULL,
3259};
3260
3261static struct processor_set pset1;
3262static struct pset_node pset_node1;
3263static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3264
3265/*
3266 * sched_edge_pset_available()
3267 *
3268 * Routine to determine if a pset is available for scheduling.
3269 */
3270static bool
3271sched_edge_pset_available(processor_set_t pset)
3272{
3273 return bitmap_test(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3274}
3275
3276/*
3277 * sched_edge_thread_bound_cluster_id()
3278 *
3279 * Routine to determine which cluster a particular thread is bound to. Uses
3280 * the sched_flags on the thread to map back to a specific cluster id.
3281 *
3282 * <Edge Multi-cluster Support Needed>
3283 */
3284static uint32_t
3285sched_edge_thread_bound_cluster_id(thread_t thread)
3286{
3287 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3288 if (thread->sched_flags & TH_SFLAG_ECORE_ONLY) {
3289 return (pset_array[0]->pset_type == CLUSTER_TYPE_E) ? 0 : 1;
3290 } else {
3291 return (pset_array[0]->pset_type == CLUSTER_TYPE_P) ? 0 : 1;
3292 }
3293}
3294
3295/* Forward declaration for some thread migration routines */
3296static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3297static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3298static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3299static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3300
3301/*
3302 * sched_edge_config_set()
3303 *
3304 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3305 * policies in the scheduler.
3306 */
3307static void
3308sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3309{
3310 sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3311 edge->sce_edge_packed = edge_config.sce_edge_packed;
3312}
3313
3314/*
3315 * sched_edge_config_get()
3316 *
3317 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3318 * if it needs to update edges.
3319 */
3320static sched_clutch_edge
3321sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3322{
3323 return pset_array[src_cluster]->sched_edges[dst_cluster];
3324}
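/*
 * Usage sketch (hypothetical values, mirroring how sched_edge_init() seeds the
 * edge matrix below): allow migration and stealing from src_cluster to
 * dst_cluster once the load delta exceeds a weight of 64.
 *
 *   sched_clutch_edge e = {
 *       .sce_migration_weight  = 64,
 *       .sce_migration_allowed = 1,
 *       .sce_steal_allowed     = 1,
 *   };
 *   sched_edge_config_set(src_cluster, dst_cluster, e);
 */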
3325
3326#if DEVELOPMENT || DEBUG
3327
3328/*
3329 * Helper Routines for edge scheduler sysctl configuration
3330 *
3331 * The current support is limited to dual cluster AMP platforms.
3332 * <Edge Multi-cluster Support Needed>
3333 */
3334
3335kern_return_t
3336sched_edge_sysctl_configure_e_to_p(uint64_t edge_config)
3337{
3338 pset_array[ecore_cluster_id]->sched_edges[pcore_cluster_id].sce_edge_packed = edge_config;
3339 return KERN_SUCCESS;
3340}
3341
3342kern_return_t
3343sched_edge_sysctl_configure_p_to_e(uint64_t edge_config)
3344{
3345 pset_array[pcore_cluster_id]->sched_edges[ecore_cluster_id].sce_edge_packed = edge_config;
3346 return KERN_SUCCESS;
3347}
3348
3349sched_clutch_edge
3350sched_edge_e_to_p(void)
3351{
3352 return sched_edge_config_get(ecore_cluster_id, pcore_cluster_id);
3353}
3354
3355sched_clutch_edge
3356sched_edge_p_to_e(void)
3357{
3358 return sched_edge_config_get(pcore_cluster_id, ecore_cluster_id);
3359}
3360
3361#endif /* DEVELOPMENT || DEBUG */
3362
3363/*
3364 * sched_edge_matrix_set()
3365 *
3366 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3367 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3368 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3369 */
3370void
3371sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3372{
3373 uint32_t edge_index = 0;
3374 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3375 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3376 if (edge_changes_bitmap[edge_index]) {
3377 sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3378 }
3379 edge_index++;
3380 }
3381 }
3382}
3383
3384/*
3385 * sched_edge_matrix_get()
3386 *
3387 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3388 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3389 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3390 */
3391void
3392sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3393{
3394 uint32_t edge_index = 0;
3395 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3396 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3397 if (edge_request_bitmap[edge_index]) {
3398 edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3399 }
3400 edge_index++;
3401 }
3402 }
3403}
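/*
 * Indexing sketch: both matrices are flattened row-major, so the edge from
 * src_cluster to dst_cluster lives at index (src_cluster * matrix_order) + dst_cluster.
 * A hypothetical caller requesting a single edge could therefore do:
 *
 *   sched_clutch_edge edge_matrix[MAX_PSETS * MAX_PSETS];
 *   bool edge_request_bitmap[MAX_PSETS * MAX_PSETS] = {false};
 *   edge_request_bitmap[(src_cluster * MAX_PSETS) + dst_cluster] = true;
 *   sched_edge_matrix_get(edge_matrix, edge_request_bitmap, 0, MAX_PSETS);
 */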
3404
3405/*
3406 * sched_edge_init()
3407 *
3408 * Routine to initialize the data structures for the Edge scheduler. The current implementation
3409 * relies on this being enabled for a dual cluster AMP system. Once a better design for MAX_PSETS,
3410 * edge config etc. is defined, it should be made more generic to handle the multi-cluster
 3411 * platforms.
3412 * <Edge Multi-cluster Support Needed>
3413 */
3414static void
3415sched_edge_init(void)
3416{
3417 processor_set_t ecore_set = &pset0;
3418 processor_set_t pcore_set = &pset1;
3419
3420 if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
3421 /* If the device boots on a P-cluster, fixup the IDs etc. */
3422 pcore_set = &pset0;
3423 ecore_set = &pset1;
3424 bitmap_set(sched_edge_available_pset_bitmask, pcore_cluster_id);
3425 } else {
3426 bitmap_set(sched_edge_available_pset_bitmask, ecore_cluster_id);
3427 }
3428
3429 ecore_set->pset_cluster_type = PSET_AMP_E;
3430 ecore_set->pset_cluster_id = ecore_cluster_id;
3431
3432 pcore_set->pset_cluster_type = PSET_AMP_P;
3433 pcore_set->pset_cluster_id = pcore_cluster_id;
3434
3435 pset_init(&pset1, &pset_node1);
3436 pset_node1.psets = &pset1;
3437 pset_node0.node_list = &pset_node1;
3438
3439 pset_array[ecore_cluster_id] = ecore_set;
3440 pset_array[ecore_cluster_id]->pset_type = CLUSTER_TYPE_E;
3441 bitmap_set(pset_array[ecore_cluster_id]->foreign_psets, pcore_cluster_id);
3442
3443 sched_edge_config_set(ecore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
3444 sched_edge_config_set(ecore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
3445
3446 pset_array[pcore_cluster_id] = pcore_set;
3447 pset_array[pcore_cluster_id]->pset_type = CLUSTER_TYPE_P;
3448 bitmap_set(pset_array[pcore_cluster_id]->foreign_psets, ecore_cluster_id);
3449
3450 sched_edge_config_set(pcore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
3451 sched_edge_config_set(pcore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
3452
3453 sched_timeshare_init();
3454 sched_clutch_tunables_init();
3455}
3456
3457static thread_t
f427ee49 3458sched_edge_choose_thread(
3459 processor_t processor,
3460 int priority,
3461 __unused ast_t reason)
3462{
3463 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3464 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3465 boolean_t choose_from_boundq = false;
3466
3467 if ((bound_runq->highq < priority) &&
f427ee49 3468 (clutch_pri < priority)) {
3469 return THREAD_NULL;
3470 }
3471
3472 if (bound_runq->highq >= clutch_pri) {
3473 choose_from_boundq = true;
3474 }
3475
3476 thread_t thread = THREAD_NULL;
3477 if (choose_from_boundq == false) {
3478 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
f427ee49 3479 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3480 } else {
3481 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3482 }
3483 return thread;
3484}
3485
3486static boolean_t
f427ee49 3487sched_edge_processor_queue_empty(processor_t processor)
c6bf4f31 3488{
c6bf4f31 3489 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3490 (sched_clutch_bound_runq(processor)->count == 0);
3491}
3492
3493static void
3494sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3495{
3496 assert(thread->bound_processor == PROCESSOR_NULL);
3497}
3498
3499__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3500 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
3501 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
3502 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
3503 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
3504 SCHED_EDGE_YIELD_DISALLOW = 0x4,
3505});
3506
c6bf4f31 3507static bool
f427ee49 3508sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
c6bf4f31 3509{
3510 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3511 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3512 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3513 return true;
3514 }
3515
3516 /*
 3517 * The yield logic should follow the same logic that steal_thread() does. The
3518 * thread_should_yield() is effectively trying to quickly check that if the
3519 * current thread gave up CPU, is there any other thread that would execute
3520 * on this CPU. So it needs to provide the same answer as the steal_thread()/
3521 * processor Idle logic.
3522 */
3523 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3524 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3525 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3526 return true;
3527 }
3528 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3529 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3530 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3531 return true;
3532 }
3533
3534 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3535 if (steal_candidate != NULL) {
3536 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3537 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3538 return true;
3539 }
3540
3541 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3542 0, SCHED_EDGE_YIELD_DISALLOW);
3543 return false;
3544}
3545
3546static ast_t
f427ee49 3547sched_edge_processor_csw_check(processor_t processor)
3548{
3549 boolean_t has_higher;
3550 int pri;
3551
3552 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3553 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3554
3555 assert(processor->active_thread != NULL);
3556
c6bf4f31 3557 pri = MAX(clutch_pri, bound_runq->highq);
3558
3559 if (processor->first_timeslice) {
3560 has_higher = (pri > processor->current_pri);
3561 } else {
3562 has_higher = (pri >= processor->current_pri);
3563 }
3564
3565 if (has_higher) {
3566 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3567 return AST_PREEMPT | AST_URGENT;
3568 }
3569
3570 if (bound_runq->urgency > 0) {
3571 return AST_PREEMPT | AST_URGENT;
3572 }
3573
3574 return AST_PREEMPT;
3575 }
3576
3577 return AST_NONE;
3578}
3579
3580static boolean_t
f427ee49 3581sched_edge_processor_queue_has_priority(processor_t processor,
3582 int priority,
3583 boolean_t gte)
3584{
3585 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3586
3587 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3588 if (gte) {
3589 return qpri >= priority;
3590 } else {
3591 return qpri > priority;
3592 }
3593}
3594
c6bf4f31 3595static void
f427ee49 3596sched_edge_processor_queue_shutdown(processor_t processor)
3597{
3598 processor_set_t pset = processor->processor_set;
3599 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3600 thread_t thread;
3601 queue_head_t tqueue;
3602
3603 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3604 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3605 pset_unlock(pset);
3606 return;
3607 }
3608
3609 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3610
3611 queue_init(&tqueue);
3612 while (sched_clutch_root_count(pset_clutch_root) > 0) {
f427ee49 3613 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3614 enqueue_tail(&tqueue, &thread->runq_links);
3615 }
3616 pset_unlock(pset);
3617
3618 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3619 remqueue(&thread->runq_links);
3620 thread_lock(thread);
3621 thread_setrun(thread, SCHED_TAILQ);
3622 thread_unlock(thread);
3623 }
3624}
3625
3626/*
3627 * sched_edge_cluster_load_metric()
3628 *
3629 * The load metric for a cluster is a measure of the average scheduling latency
3630 * experienced by threads on that cluster. It is a product of the average number
3631 * of threads in the runqueue and the average execution time for threads. The metric
3632 * has special values in the following cases:
3633 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3634 * the maximum value to disallow any threads to migrate to this cluster.
3635 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
3636 * to be spread across the platform quickly for ncpu wide workloads.
3637 */
3638static uint32_t
3639sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
c6bf4f31 3640{
3641 if (sched_edge_pset_available(pset) == false) {
3642 return UINT32_MAX;
c6bf4f31 3643 }
3644 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3645}
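/*
 * Interpretation sketch: callers compare this metric against an edge's
 * sce_migration_weight. For example, with a hypothetical migration weight of 64,
 * an idle destination cluster (load 0) is taken immediately via the fast path,
 * while a loaded cluster only sheds threads once its load exceeds the
 * destination's load by at least 64 (see sched_edge_migrate_edges_evaluate() below).
 */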
c6bf4f31 3646
3647/*
3648 *
3649 * Edge Scheduler Steal/Rebalance logic
3650 *
3651 * = Generic scheduler logic =
3652 *
3653 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3654 * find any thread for execution in its runqueue. The aim of the steal operation
3655 * is to find other threads running/runnable in other clusters which should be
3656 * executed here.
3657 *
3658 * If the steal callout does not return a thread, the thread_select() logic calls
3659 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3660 * threads and idle out the current CPU.
3661 *
3662 * = SCHED(steal_thread) for Edge Scheduler =
3663 *
3664 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3665 * routine tries to do the following operations in order:
 3666 * (1) Find foreign runnable threads in non-native cluster
3667 * runqueues (sched_edge_foreign_runnable_thread_remove())
3668 * (2) Check if foreign threads are running on the non-native
3669 * clusters (sched_edge_foreign_running_thread_available())
3670 * - If yes, return THREAD_NULL for the steal callout and
3671 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3672 * (3) Steal a thread from another cluster based on edge
3673 * weights (sched_edge_steal_thread())
3674 *
3675 * = SCHED(processor_balance) for Edge Scheduler =
3676 *
3677 * If steal_thread did not return a thread for the processor, use
3678 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3679 *
3680 * = Clutch Bucket Preferred Cluster Overrides =
3681 *
3682 * Since these operations (just like thread migrations on enqueue)
3683 * move threads across clusters, they need support for handling clutch
3684 * bucket group level preferred cluster recommendations.
3685 * For (1), a clutch bucket will be in the foreign runnable queue based
3686 * on the clutch bucket group preferred cluster.
3687 * For (2), the running thread will set the bit on the processor based
3688 * on its preferred cluster type.
3689 * For (3), the edge configuration would prevent threads from being stolen
3690 * in the wrong direction.
3691 *
3692 * = SCHED(thread_should_yield) =
3693 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3694 * since that is expecting the same answer as if thread_select() was called on a core
3695 * with an empty runqueue.
3696 */
c6bf4f31 3697
3698static bool
3699sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3700{
3701 /*
3702 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
3703 */
3704 return true;
3705}
c6bf4f31 3706
3707static processor_set_t
3708sched_edge_steal_candidate(processor_set_t pset)
3709{
3710 /*
3711 * Edge Scheduler Optimization
3712 *
3713 * Investigate a better policy for stealing. The current implementation looks
3714 * at all the incoming weights for the pset that just became idle and sees which
 3715 * clusters have loads > edge weights. It is effectively trying to simulate an
3716 * overload migration as if a thread had become runnable on the candidate cluster.
3717 *
3718 * The logic today bails as soon as it finds a cluster where the cluster load is
 3719 * greater than the edge weight. This keeps the check quick, which is useful
3720 * for sched_edge_thread_should_yield() which uses this. Maybe it should have a
3721 * more advanced version for the actual steal operation which looks for the
3722 * maximum delta etc.
3723 */
3724 processor_set_t target_pset = NULL;
3725 uint32_t dst_cluster_id = pset->pset_cluster_id;
c6bf4f31 3726
3727 for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
3728 processor_set_t candidate_pset = pset_array[cluster_id];
c6bf4f31 3729
3730 if (candidate_pset == pset) {
3731 continue;
3732 }
c6bf4f31 3733
3734 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3735 if (incoming_edge->sce_steal_allowed == false) {
3736 continue;
3737 }
c6bf4f31 3738
3739 uint32_t incoming_weight = incoming_edge->sce_migration_weight;
3740 int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3741 if (highest_runnable_bucket == -1) {
3742 /* Candidate cluster runq is empty */
3743 continue;
3744 }
3745 /* Use the load metrics for highest runnable bucket since that would be stolen next */
3746 uint32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3747 if (candidate_load > incoming_weight) {
3748 /* Only steal from the candidate if its load is higher than the incoming edge and it has runnable threads */
3749 target_pset = candidate_pset;
3750 break;
3751 }
3752 }
3753
f427ee49 3754 return target_pset;
3755}
3756
3757static boolean_t
3758sched_edge_foreign_runnable_thread_available(processor_set_t pset)
c6bf4f31 3759{
3760 /* Find all the clusters that are foreign for this cluster */
3761 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3762 for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3763 /*
3764 * For each cluster, see if there are any runnable foreign threads.
3765 * This check is currently being done without the pset lock to make it cheap for
3766 * the common case.
3767 */
3768 processor_set_t target_pset = pset_array[cluster];
3769 if (sched_edge_pset_available(target_pset) == false) {
3770 continue;
c6bf4f31 3771 }
3772
3773 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3774 return true;
3775 }
3776 }
3777 return false;
3778}
3779
3780static thread_t
3781sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
c6bf4f31 3782{
f427ee49 3783 thread_t thread = THREAD_NULL;
c6bf4f31 3784
3785 /* Find all the clusters that are foreign for this cluster */
3786 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3787 for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3788 /*
3789 * For each cluster, see if there are any runnable foreign threads.
3790 * This check is currently being done without the pset lock to make it cheap for
3791 * the common case.
3792 */
3793 processor_set_t target_pset = pset_array[cluster];
3794 if (sched_edge_pset_available(target_pset) == false) {
3795 continue;
3796 }
c6bf4f31 3797
3798 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3799 continue;
3800 }
3801 /*
3802 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3803 * and get the highest priority thread.
3804 */
3805 pset_lock(target_pset);
3806 if (sched_edge_pset_available(target_pset)) {
3807 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3808 sched_update_pset_load_average(target_pset, ctime);
3809 }
3810 pset_unlock(target_pset);
3811
3812 /*
3813 * Edge Scheduler Optimization
3814 *
3815 * The current implementation immediately returns as soon as it finds a foreign
3816 * runnable thread. This could be enhanced to look at highest priority threads
3817 * from all foreign clusters and pick the highest amongst them. That would need
3818 * some form of global state across psets to make that kind of a check cheap.
3819 */
3820 if (thread != THREAD_NULL) {
3821 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3822 break;
3823 }
3824 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3825 }
3826
3827 return thread;
3828}
3829
3830static boolean_t
3831sched_edge_foreign_running_thread_available(processor_set_t pset)
3832{
3833 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3834 for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
 3835 /* Skip the pset if it's not schedulable */
3836 processor_set_t target_pset = pset_array[cluster];
3837 if (sched_edge_pset_available(target_pset) == false) {
3838 continue;
3839 }
3840
3841 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3842 if (lsb_first(running_foreign_bitmap) != -1) {
3843 /* Found a non-native CPU running a foreign thread; rebalance is needed */
3844 return true;
3845 }
c6bf4f31 3846 }
3847 return false;
3848}
c6bf4f31 3849
3850static thread_t
3851sched_edge_steal_thread(processor_set_t pset)
3852{
3853 thread_t thread = THREAD_NULL;
3854 processor_set_t steal_from_pset = sched_edge_steal_candidate(pset);
3855 if (steal_from_pset) {
c6bf4f31 3856 /*
3857 * sched_edge_steal_candidate() has found a pset which is ideal to steal from.
3858 * Lock the pset and select the highest thread in that runqueue.
c6bf4f31 3859 */
3860 pset_lock(steal_from_pset);
3861 if (bitmap_first(steal_from_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1) {
3862 uint64_t current_timestamp = mach_absolute_time();
3863 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
3864 thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
3865 sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3866 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
3867 sched_update_pset_load_average(steal_from_pset, current_timestamp);
c6bf4f31 3868 }
3869 /*
3870 * Edge Scheduler Optimization
 3871 * Maybe this needs to circle around if the steal candidate did not have any threads
 3872 * by the time the pset lock was taken.
3873 */
3874 pset_unlock(steal_from_pset);
c6bf4f31 3875 }
3876 return thread;
3877}
3878
3879/*
3880 * sched_edge_processor_idle()
3881 *
3882 * The routine is the implementation for steal_thread() for the Edge scheduler.
3883 */
3884static thread_t
3885sched_edge_processor_idle(processor_set_t pset)
3886{
3887 thread_t thread = THREAD_NULL;
3888
3889 uint64_t ctime = mach_absolute_time();
3890
 3891 /* Each of the operations acquires the lock for the pset it targets */
3892 pset_unlock(pset);
3893
3894 /* Find highest priority runnable thread on all non-native clusters */
3895 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
3896 if (thread != THREAD_NULL) {
3897 return thread;
3898 }
3899
3900 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
3901 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
3902 if (rebalance_needed) {
3903 return THREAD_NULL;
3904 }
3905
3906 /* No foreign threads found; find a thread to steal from a pset based on weights/loads etc. */
3907 thread = sched_edge_steal_thread(pset);
3908 return thread;
3909}
3910
3911/* Return true if this thread should not continue running on this processor */
3912static bool
3913sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
3914{
3915 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
3916 /*
3917 * For long running parallel workloads, it is important to rebalance threads across
3918 * E/P clusters so that they make equal forward progress. This is achieved through
3919 * threads expiring their quantum on the non-preferred cluster type and explicitly
3920 * rebalancing to the preferred cluster runqueue.
3921 *
3922 * <Edge Multi-Cluster Support Needed>
 3923 * For multi-cluster platforms, it might be useful to move the thread in case its
3924 * preferred pset is idle now.
3925 */
3926 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
3927 return true;
3928 }
3929 /* If thread already running on preferred cluster, do not avoid */
3930 if (processor->processor_set == preferred_pset) {
3931 return false;
3932 }
3933 /*
3934 * The thread is running on a processor that is of the same type as the
3935 * preferred pset, but is not the actual preferred pset. In that case
3936 * look at edge weights to see if this thread should continue execution
3937 * here or go back to its preferred cluster.
3938 *
3939 * <Edge Multi-Cluster Support Needed>
3940 * This logic needs to ensure that the current thread is not counted against
3941 * the load average for the current pset otherwise it would always end up avoiding
3942 * the current cluster.
3943 */
3944 processor_set_t chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, processor->processor_set, false);
3945 return chosen_pset != processor->processor_set;
3946}
3947
3948static void
3949sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
3950{
3951 assert(cprocessor == current_processor());
3952 pset_unlock(cpset);
3953
3954 uint64_t ast_processor_map = 0;
3955 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
3956
3957 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
3958 for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
 3959 /* Skip the pset if it's not schedulable */
3960 processor_set_t target_pset = pset_array[cluster];
3961 if (sched_edge_pset_available(target_pset) == false) {
3962 continue;
3963 }
3964
3965 pset_lock(target_pset);
3966 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
3967 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
3968 processor_t target_cpu = processor_array[cpuid];
3969 ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, false, SCHED_IPI_EVENT_REBALANCE);
3970 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
3971 bit_set(ast_processor_map, cpuid);
3972 }
3973 }
3974 pset_unlock(target_pset);
3975 }
3976
3977 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
3978 processor_t ast_processor = processor_array[cpuid];
3979 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
3980 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
3981 }
3982}
3983
3984/*
3985 * sched_edge_migrate_edges_evaluate()
3986 *
3987 * Routine to find the candidate for thread migration based on edge weights.
3988 *
3989 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 3990 * return preferred_pset if it is the most ideal destination for this thread.
3991 */
3992static processor_set_t
3993sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
3994{
3995 processor_set_t selected_pset = preferred_pset;
3996 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
3997 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
3998
3999 /* Look at edge deltas with other clusters to find the ideal migration candidate */
4000 sched_clutch_edge *edge = preferred_pset->sched_edges;
4001 uint32_t max_edge_delta = 0;
4002
4003 /*
4004 * Edge Scheduler Optimization
4005 *
4006 * For really large cluster count systems, it might make sense to optimize the
4007 * clusters iterated by using bitmaps and skipping over clusters that are not
4008 * available for scheduling or have migration disabled from this cluster.
4009 */
4010 for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
4011 processor_set_t dst_pset = pset_array[cluster_id];
4012 if (cluster_id == preferred_cluster_id) {
4013 continue;
4014 }
4015
4016 if (edge[cluster_id].sce_migration_allowed == false) {
4017 continue;
4018 }
4019
4020 uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4021 if (dst_load > preferred_cluster_load) {
4022 continue;
4023 }
4024
4025 /*
4026 * Fast path for idle dst cluster
4027 *
4028 * For extremely parallel workloads, it is important to load up
4029 * all clusters as quickly as possible. This short-circuit allows
4030 * that.
4031 * <Edge Multi-cluster Support Needed>
4032 *
4033 * For multi-cluster platforms, the loop should start with the homogeneous
4034 * clusters first.
4035 */
4036 if (dst_load == 0) {
4037 selected_pset = dst_pset;
4038 break;
4039 }
4040
4041 uint32_t edge_delta = preferred_cluster_load - dst_load;
4042 if (edge_delta < edge[cluster_id].sce_migration_weight) {
4043 continue;
4044 }
4045
4046 if (edge_delta < max_edge_delta) {
4047 continue;
4048 }
4049
4050 if (edge_delta == max_edge_delta) {
4051 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4052 boolean_t selected_homogeneous = (pset_type_for_id(selected_pset->pset_cluster_id) == preferred_cluster_type);
4053 boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4054 if (selected_homogeneous || !candidate_homogeneous) {
4055 continue;
4056 }
4057 }
4058 /* dst_pset seems to be the best candidate for migration */
4059 max_edge_delta = edge_delta;
4060 selected_pset = dst_pset;
4061 }
4062 return selected_pset;
4063}
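
To make the edge-delta rule above concrete, here is a minimal user-space sketch of the same selection logic with plain integers standing in for pset structures; the loads and migration weights are made-up values, and the homogeneous-cluster tie-break and availability checks of the real routine are omitted.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-cluster snapshot: load metric and the migration weight of the
 * edge from the preferred cluster to this cluster. */
struct edge_snapshot {
	uint32_t load;
	uint32_t migration_weight;
	int migration_allowed;
};

/* Returns the index of the chosen cluster, or -1 to stay on the preferred cluster */
static int
pick_migration_candidate(uint32_t preferred_load, const struct edge_snapshot *edges, int ncl)
{
	int selected = -1;
	uint32_t max_delta = 0;
	for (int i = 0; i < ncl; i++) {
		if (!edges[i].migration_allowed || edges[i].load > preferred_load) {
			continue;
		}
		if (edges[i].load == 0) {
			return i; /* idle-cluster fast path */
		}
		uint32_t delta = preferred_load - edges[i].load;
		if (delta < edges[i].migration_weight || delta <= max_delta) {
			continue; /* below the edge weight, or no better than the current pick */
		}
		max_delta = delta;
		selected = i;
	}
	return selected;
}

int
main(void)
{
	struct edge_snapshot edges[] = {
		{ .load = 6, .migration_weight = 2, .migration_allowed = 1 },
		{ .load = 9, .migration_weight = 2, .migration_allowed = 1 },
	};
	/* Preferred cluster load 10: cluster 0 wins with delta 4 >= weight 2 */
	printf("candidate = %d\n", pick_migration_candidate(10, edges, 2));
	return 0;
}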
4064
4065/*
f427ee49 4066 * sched_edge_candidate_alternative()
c6bf4f31 4067 *
4068 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4069 * selected_pset is not available for execution. The logic tries to prefer homogeneous
4070 * clusters over heterogeneous clusters since this is typically used in thread
4071 * placement decisions.
c6bf4f31 4072 */
4073_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4074static processor_set_t
4075sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4076{
4077 /*
4078 * The best pset is not currently available for scheduling.
4079 * Try to find a homogeneous cluster that is still available.
4080 */
4081 bitmap_t *foreign_clusters = selected_pset->foreign_psets;
4082 uint64_t available_native_clusters = ~(foreign_clusters[0]) & candidate_cluster_bitmap;
4083 int available_cluster_id = lsb_first(available_native_clusters);
4084 if (available_cluster_id == -1) {
4085 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4086 available_cluster_id = bit_first(candidate_cluster_bitmap);
4087 }
4088 assert(available_cluster_id != -1);
4089 return pset_array[available_cluster_id];
4090}
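
The fallback above is essentially a pair of masked bitmap searches: first restrict the candidates to clusters that are not foreign to the selected pset, then, only if that comes up empty, accept any remaining candidate. A small sketch with plain 64-bit masks (the bit layouts are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Lowest set bit index, or -1 if the mask is empty (stand-in for lsb_first()) */
static int
first_bit(uint64_t mask)
{
	return (mask == 0) ? -1 : __builtin_ctzll(mask);
}

/* Prefer candidates that are homogeneous with the selected pset (i.e. not foreign) */
static int
pick_alternative(uint64_t candidate_bitmap, uint64_t foreign_bitmap)
{
	int cluster = first_bit(candidate_bitmap & ~foreign_bitmap);
	if (cluster == -1) {
		cluster = first_bit(candidate_bitmap); /* no homogeneous candidate left */
	}
	return cluster;
}

int
main(void)
{
	/* Candidates are clusters 1 and 2; cluster 1 is foreign to the selected pset */
	uint64_t candidates = (1ULL << 1) | (1ULL << 2);
	uint64_t foreign = (1ULL << 1);
	printf("alternative cluster = %d\n", pick_alternative(candidates, foreign)); /* 2 */
	return 0;
}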
4091
4092/*
f427ee49 4093 * sched_edge_switch_pset_lock()
c6bf4f31 4094 *
4095 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4096 * switch_pset_locks.
4097 * Returns the newly locked pset after the switch.
4098 */
4099static processor_set_t
f427ee49 4100sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
c6bf4f31 4101{
4102 if (!switch_pset_locks) {
4103 return locked_pset;
c6bf4f31 4104 }
4105 if (selected_pset != locked_pset) {
4106 pset_unlock(locked_pset);
4107 pset_lock(selected_pset);
4108 return selected_pset;
4109 } else {
4110 return locked_pset;
4111 }
4112}
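
The helper above is a plain lock hand-off: when the decision has landed on a pset other than the one currently locked, drop the held lock and take the new one. A pthread-based sketch of the same shape (these mutexes are stand-ins, not the kernel's pset locks):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Hand off from the currently held lock to the selected one, if they differ.
 * Returns the lock that is held on exit. */
static pthread_mutex_t *
switch_lock(pthread_mutex_t *selected, pthread_mutex_t *locked)
{
	if (selected == locked) {
		return locked;
	}
	pthread_mutex_unlock(locked);
	pthread_mutex_lock(selected);
	return selected;
}

int
main(void)
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_t *held = switch_lock(&lock_b, &lock_a); /* now holds lock_b */
	printf("holding %s\n", (held == &lock_b) ? "lock_b" : "lock_a");
	pthread_mutex_unlock(held);
	return 0;
}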
c6bf4f31 4113
4114/*
4115 * sched_edge_migrate_candidate()
4116 *
4117 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4118 * the thread and the preferred cluster to determine the best available pset for scheduling.
4119 *
4120 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4121 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4122 * decision has to be synchronized with another CPU changing the recommendation of clusters available
4123 * on the system. If this parameter is set to false, this routine returns a best-effort indication of
4124 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
4125 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4126 * more appropriate cluster.
4127 *
4128 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4129 * resultant pset lock is held.
4130 */
4131static processor_set_t
4132sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4133{
4134 __kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4135 processor_set_t selected_pset = preferred_pset;
4136
4137 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4138	/* For bound threads, always recommend the cluster they are bound to */
4139 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4140 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4141 if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4142 /*
4143 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4144 * fall through to the regular cluster selection logic which handles unavailable clusters
4145 * appropriately. If the thread is hard bound, then return the bound cluster always.
4146 */
4147 return selected_pset;
4148 }
4149 }
4150
4151 uint64_t candidate_cluster_bitmap = mask(MAX_PSETS);
4152 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4153		/* For realtime threads, always try to schedule them on the preferred pset */
4154 goto migrate_candidate_available_check;
4155 }
4156
4157 /*
4158	 * If a thread is being rebalanced to achieve equal progress of parallel workloads,
4159 * it needs to end up on the preferred runqueue.
4160 */
4161 uint32_t preferred_cluster_load = sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4162 boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4163 if ((preferred_cluster_load == 0) || amp_rebalance) {
4164 goto migrate_candidate_available_check;
4165 }
4166
4167 /* Look at edge weights to decide the most ideal migration candidate for this thread */
4168 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4169
4170migrate_candidate_available_check:
4171 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4172	if (sched_edge_pset_available(selected_pset)) {
4173 if (selected_pset != preferred_pset) {
4174 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4175 }
4176 return selected_pset;
4177 }
4178 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4179 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4180 if (__improbable(bitmap_first(&candidate_cluster_bitmap, MAX_PSETS) == -1)) {
4181 /*
4182 * None of the clusters are available for scheduling; this situation should be rare but if it happens,
4183 * simply return the boot cluster.
4184 */
4185 selected_pset = &pset0;
4186 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4187 if (selected_pset != preferred_pset) {
4188 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4189 }
4190 return selected_pset;
c6bf4f31 4191 }
4192 /* Try and find an alternative for the selected pset */
4193 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4194 goto migrate_candidate_available_check;
4195}
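
The goto-based retry in the routine above boils down to: test the selected cluster for availability, clear it from the candidate mask on failure, and fall back to the boot cluster once the mask is empty. A compact loop-shaped sketch of that control flow, with availability modeled as a hypothetical bitmask and the homogeneity-aware alternative selection simplified to the lowest remaining bit:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_MAX_CLUSTERS 3

/* Pick a cluster, retrying against an availability mask; fall back to cluster 0 (boot) */
static int
pick_available_cluster(int selected, uint64_t available_mask)
{
	uint64_t candidates = (1ULL << EXAMPLE_MAX_CLUSTERS) - 1;
	for (;;) {
		if (available_mask & (1ULL << selected)) {
			return selected;
		}
		candidates &= ~(1ULL << selected);
		if (candidates == 0) {
			return 0; /* boot cluster fallback */
		}
		selected = __builtin_ctzll(candidates); /* simplified alternative selection */
	}
}

int
main(void)
{
	/* Clusters 0 and 2 are available; the initial pick is cluster 1 */
	printf("chosen cluster = %d\n", pick_available_cluster(1, (1ULL << 0) | (1ULL << 2)));
	return 0;
}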
4196
4197static processor_t
4198sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4199{
4200 /* Bound threads don't call this function */
4201 assert(thread->bound_processor == PROCESSOR_NULL);
4202 processor_t chosen_processor = PROCESSOR_NULL;
4203
4204 /*
4205 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4206	 * It should take the passed-in "pset" as a hint representing the recency metric for
4207	 * the pset selection logic.
c6bf4f31 4208 */
4209 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4210 processor_set_t chosen_pset = preferred_pset;
4211 /*
4212 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4213 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4214 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4215 *
4216 * <Edge Multi-cluster Support Needed>
4217 * It might be useful to build a recency metric for the thread for multiple clusters and
4218 * factor that into the migration decisions.
4219 */
4220 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4221 chosen_processor = choose_processor(chosen_pset, processor, thread);
4222 assert(chosen_processor->processor_set == chosen_pset);
4223 return chosen_processor;
4224}
4225
4226/*
4227 * sched_edge_clutch_bucket_threads_drain()
4228 *
4229 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4230 * bucket overrides etc.) into a local thread queue.
4231 */
c6bf4f31 4232static void
f427ee49 4233sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
c6bf4f31 4234{
f427ee49 4235 thread_t thread = THREAD_NULL;
c6bf4f31 4236 uint64_t current_timestamp = mach_approximate_time();
f427ee49 4237 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
ea3f0419 4238 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
c6bf4f31 4239 enqueue_tail(clutch_threads, &thread->runq_links);
c6bf4f31 4240 }
f427ee49 4241}
c6bf4f31 4242
4243/*
4244 * sched_edge_run_drained_threads()
4245 *
4246 * Makes all drained threads in a local queue runnable.
4247 */
4248static void
4249sched_edge_run_drained_threads(queue_t clutch_threads)
4250{
4251 thread_t thread;
4252 /* Now setrun all the threads in the local queue */
4253 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4254 remqueue(&thread->runq_links);
4255 thread_lock(thread);
4256 thread_setrun(thread, SCHED_TAILQ);
4257 thread_unlock(thread);
4258 }
4259}
4260
4261/*
4262 * sched_edge_update_preferred_cluster()
4263 *
4264 * Routine to update the preferred cluster for QoS buckets within a thread group.
4265 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4266 */
4267static void
4268sched_edge_update_preferred_cluster(
4269 sched_clutch_t sched_clutch,
4270 bitmap_t *clutch_bucket_modify_bitmap,
4271 uint32_t *tg_bucket_preferred_cluster)
4272{
4273 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4274 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4275 }
4276}
4277
4278/*
f427ee49 4279 * sched_edge_migrate_thread_group_runnable_threads()
c6bf4f31 4280 *
f427ee49 4281 * Routine to implement the migration of threads on a cluster when the thread group
4282 * recommendation is updated. The migration works using a 2-phase
4283 * algorithm.
4284 *
4285 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4286 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4287 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4288 * appropriate.
4289 *
4290 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4291 * queue and mark them runnable which should land them in the right hierarchy.
4292 *
4293 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4294 * groups have already been updated by the caller.
c6bf4f31 4295 *
4296 * - Called with the pset locked and interrupts disabled.
4297 * - Returns with the pset unlocked.
4298 */
4299static void
4300sched_edge_migrate_thread_group_runnable_threads(
4301 sched_clutch_t sched_clutch,
4302 sched_clutch_root_t root_clutch,
4303 bitmap_t *clutch_bucket_modify_bitmap,
4304 __unused uint32_t *tg_bucket_preferred_cluster,
4305 bool migrate_immediately)
c6bf4f31 4306{
f427ee49 4307 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4308 queue_head_t clutch_threads;
4309 queue_init(&clutch_threads);
4310
4311 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4312 /* Get the clutch bucket for this cluster and sched bucket */
4313 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4314 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
c6bf4f31 4315 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
f427ee49 4316 if (scb_root == NULL) {
c6bf4f31 4317 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
f427ee49 4318 assert(clutch_bucket->scb_thr_count == 0);
4319 continue;
4320 }
4321 assert(scb_root == root_clutch);
4322 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4323
4324 if (migrate_immediately) {
4325 /*
4326 * For transitions where threads need to be migrated immediately, drain the threads into a
4327 * local queue unless we are looking at the clutch buckets for the newly recommended
4328 * cluster.
4329 */
4330 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4331 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4332 } else {
4333 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4334 }
4335 } else {
4336 /* Check if this cluster is the same type as the newly recommended cluster */
4337 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4338 /*
4339 * If threads do not have to be migrated immediately, just change the native/foreign
4340 * flag on the clutch bucket.
4341 */
4342 if (homogeneous_cluster) {
4343 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4344 } else {
4345 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4346 }
4347 }
4348 }
4349
4350 pset_unlock(root_clutch->scr_pset);
4351 sched_edge_run_drained_threads(&clutch_threads);
4352}
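
The drain-into-a-local-queue shape above exists so that thread_setrun(), which may need to take other pset locks, is never called while the original pset lock is held. A rough user-space analogue of the two-phase pattern, with a pthread mutex standing in for the pset lock and a linked list standing in for the run queues:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int id;
	struct item *next;
};

static pthread_mutex_t source_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *source_list = NULL;

/* Phase 1: with the source lock held, move everything onto a local list */
static struct item *
drain_locked(void)
{
	pthread_mutex_lock(&source_lock);
	struct item *drained = source_list;
	source_list = NULL;
	pthread_mutex_unlock(&source_lock);
	return drained;
}

/* Phase 2: with no locks held, re-dispatch each drained item */
static void
redispatch(struct item *drained)
{
	while (drained != NULL) {
		struct item *next = drained->next;
		printf("re-dispatching item %d\n", drained->id);
		free(drained);
		drained = next;
	}
}

int
main(void)
{
	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));
		it->id = i;
		it->next = source_list;
		source_list = it;
	}
	redispatch(drain_locked());
	return 0;
}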
4353
4354/*
4355 * sched_edge_migrate_thread_group_running_threads()
4356 *
4357 * Routine to find all running threads of a thread group on a specific cluster
4358 * and IPI them if they need to be moved immediately.
4359 */
4360static void
4361sched_edge_migrate_thread_group_running_threads(
4362 sched_clutch_t sched_clutch,
4363 sched_clutch_root_t root_clutch,
4364 __unused bitmap_t *clutch_bucket_modify_bitmap,
4365 uint32_t *tg_bucket_preferred_cluster,
4366 bool migrate_immediately)
4367{
4368 if (migrate_immediately == false) {
4369 /* If CLPC has recommended not to move threads immediately, nothing to do here */
4370 return;
4371 }
4372
4373 /*
4374 * Edge Scheduler Optimization
4375 *
4376 * When the system has a large number of clusters and cores, it might be useful to
4377 * narrow down the iteration by using a thread running bitmap per clutch.
c6bf4f31 4378 */
4379 uint64_t ast_processor_map = 0;
4380 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
c6bf4f31 4381
4382 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4383 /*
4384	 * Iterate all CPUs and look for the ones running threads from this thread group that are
4385	 * not restricted to the specific cluster (due to overrides etc.)
4386 */
4387 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4388 processor_t src_processor = processor_array[cpuid];
4389 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4390 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4391 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4392
4393 if (expected_tg && non_preferred_cluster) {
4394 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
4395 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4396 bit_set(ast_processor_map, cpuid);
4397 } else if (src_processor == current_processor()) {
4398 ast_on(AST_PREEMPT);
4399 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4400 }
4401 }
4402 }
4403
4404 /* Perform all the IPIs */
4405 if (bit_first(ast_processor_map) != -1) {
4406 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4407 processor_t ast_processor = processor_array[cpuid];
4408 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4409 }
4410 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4411 }
4412}
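
The lsb_first()/lsb_next() walk over the running-CPU map is simply an iteration over the set bits of a 64-bit mask. A standalone sketch of that idiom; the helpers below are simplified stand-ins for the kernel's bitmap routines and the CPU map is a hypothetical value:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for lsb_first()/lsb_next() */
static int
mask_lsb_first(uint64_t map)
{
	return (map == 0) ? -1 : __builtin_ctzll(map);
}

static int
mask_lsb_next(uint64_t map, int prev)
{
	uint64_t below = (prev >= 63) ? ~0ULL : ((1ULL << (prev + 1)) - 1);
	return mask_lsb_first(map & ~below); /* clear bits at or below prev, find the next */
}

int
main(void)
{
	/* Hypothetical map of CPUs currently running threads of interest */
	uint64_t running_map = (1ULL << 1) | (1ULL << 4) | (1ULL << 6);
	for (int cpuid = mask_lsb_first(running_map); cpuid >= 0; cpuid = mask_lsb_next(running_map, cpuid)) {
		printf("would evaluate an IPI for cpu %d\n", cpuid);
	}
	return 0;
}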
4413
4414/*
4415 * sched_edge_tg_preferred_cluster_change()
4416 *
4417 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4418 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4419 * thread migrations based on the policy specified by CLPC.
4420 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4421 * for each QoS within the thread group.
4422 */
4423void
4424sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
c6bf4f31 4425{
f427ee49 4426 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
c6bf4f31 4427 /*
4428 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4429 * for which the preferred cluster has changed.
c6bf4f31 4430 */
4431 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4432 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4433 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4434 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4435 if (old_preferred_cluster != new_preferred_cluster) {
4436 bitmap_set(clutch_bucket_modify_bitmap, bucket);
4437 }
4438 }
4439 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4440 /* No changes in any clutch buckets; nothing to do here */
4441 return;
4442 }
4443
4444 for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
4445 processor_set_t pset = pset_array[cluster_id];
4446 spl_t s = splsched();
4447 pset_lock(pset);
4448 /*
4449 * The first operation is to update the preferred cluster for all QoS buckets within the
4450 * thread group so that any future threads becoming runnable would see the new preferred
4451 * cluster value.
4452 */
4453 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4454 /*
4455 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4456 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4457 *
4458 * <Edge Multi-cluster Support Needed>
4459 */
4460 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4461 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4462 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4463 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4464 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4465 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4466 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4467 splx(s);
4468 }
4469}
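
As a rough illustration of how a performance-controller-side caller might drive this interface: build a per-bucket preferred-cluster array and pass the migration options. The cluster id and the decision to migrate running threads below are hypothetical, and the fragment assumes the surrounding kernel headers rather than being standalone:

/*
 * Hypothetical caller: recommend cluster 1 for every QoS bucket of a thread group
 * and ask for running threads to be migrated immediately.
 */
static void
example_recommend_cluster_one(struct thread_group *tg)
{
	uint32_t preferred[TH_BUCKET_SCHED_MAX];
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		preferred[i] = 1;
	}
	sched_edge_tg_preferred_cluster_change(tg, preferred,
	    SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING);
}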
4470
4471/*
f427ee49 4472 * sched_edge_pset_made_schedulable()
4473 *
4474 * Routine to migrate all the clutch buckets which are not in their recommended
4475 * pset hierarchy now that a new pset has become schedulable. It's possible that this
4476 * routine is called when the pset is already marked schedulable.
c6bf4f31 4477 *
f427ee49 4478 * Invoked with the pset lock held and interrupts disabled.
4479 */
4480static void
f427ee49 4481sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
c6bf4f31 4482{
4483 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4484 /* Nothing to do here since pset is already marked schedulable */
4485 if (drop_lock) {
4486 pset_unlock(dst_pset);
4487 }
4488 return;
4489 }
c6bf4f31 4490
f427ee49 4491 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
c6bf4f31 4492
4493 thread_t thread = sched_edge_processor_idle(dst_pset);
4494 if (thread != THREAD_NULL) {
4495 thread_lock(thread);
4496 thread_setrun(thread, SCHED_TAILQ);
4497 thread_unlock(thread);
4498 }
4499
4500 if (!drop_lock) {
4501 pset_lock(dst_pset);
4502 }
4503}
4504
4505extern int sched_amp_spill_deferred_ipi;
4506extern int sched_amp_pcores_preempt_immediate_ipi;
4507
4508int sched_edge_migrate_ipi_immediate = 1;
4509
4510sched_ipi_type_t
4511sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4512{
4513 processor_set_t pset = dst->processor_set;
4514 assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
4515 assert(dst != current_processor());
4516
4517 boolean_t deferred_ipi_supported = false;
4518#if defined(CONFIG_SCHED_DEFERRED_AST)
4519 deferred_ipi_supported = true;
4520#endif /* CONFIG_SCHED_DEFERRED_AST */
4521
4522 switch (event) {
4523 case SCHED_IPI_EVENT_SPILL:
4524 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
4525 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
4526 return sched_ipi_deferred_policy(pset, dst, event);
4527 }
4528 break;
4529 case SCHED_IPI_EVENT_PREEMPT:
4530 /* For preemption, the default policy is to use deferred IPIs
4531 * for Non-RT P-core preemption. Override that behavior if
4532 * sched_amp_pcores_preempt_immediate_ipi is set
4533 */
4534 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4535 if (sched_edge_migrate_ipi_immediate) {
4536 /*
4537 * For workloads that are going wide, it might be useful use Immediate IPI to
4538 * wakeup the idle CPU if the scheduler estimates that the preferred pset will
4539 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
4540 * latency on the preferred pset as an estimate of busyness.
4541 *
4542 * <Edge Multi-cluster Support Needed>
4543 */
4544 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4545 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4546 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4547 }
4548 }
4549 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4550 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4551 }
4552 }
4553 break;
4554 default:
4555 break;
4556 }
4557 /* Default back to the global policy for all other scenarios */
4558 return sched_ipi_policy(dst, thread, dst_idle, event);
4559}
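
The immediate-versus-deferred choice above is a unit-consistent comparison: the per-bucket average execution time is kept in microseconds, so it is scaled by NSEC_PER_USEC before being compared against the deferred-IPI timeout in nanoseconds. A toy arithmetic sketch with hypothetical numbers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_NSEC_PER_USEC 1000ULL

/* True if the preferred cluster is expected to stay busy past the deferred-IPI timeout,
 * in which case an immediate IPI is worthwhile. */
static bool
prefer_immediate_ipi(uint64_t avg_execution_usec, uint64_t deferred_timeout_nsec)
{
	return (avg_execution_usec * EXAMPLE_NSEC_PER_USEC) >= deferred_timeout_nsec;
}

int
main(void)
{
	/* Hypothetical: 80us average execution time vs. a 64000ns deferred-IPI timer */
	printf("immediate IPI? %s\n", prefer_immediate_ipi(80, 64000) ? "yes" : "no");
	return 0;
}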
4560
4561/*
4562 * sched_edge_qos_max_parallelism()
4563 */
4564uint32_t
4565sched_edge_qos_max_parallelism(int qos, uint64_t options)
4566{
4567 uint32_t ecount = 0;
4568 uint32_t pcount = 0;
4569
4570 for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
4571 processor_set_t pset = pset_array[cluster_id];
4572 if (pset_type_for_id(cluster_id) == CLUSTER_TYPE_P) {
4573 pcount += pset->cpu_set_count;
4574 } else {
4575 ecount += pset->cpu_set_count;
4576 }
4577 }
4578
4579 if (options & QOS_PARALLELISM_REALTIME) {
4580		/* For realtime threads on AMP, we want to limit
4581		 * the width to just the P-cores since we
4582		 * do not spill/rebalance for RT threads.
4583 */
4584 return pcount;
4585 }
4586
4587 /*
4588 * The Edge scheduler supports per-QoS recommendations for thread groups.
4589 * This enables lower QoS buckets (such as UT) to be scheduled on all
4590 * CPUs on the system.
4591 *
4592 * The only restriction is for BG/Maintenance QoS classes for which the
4593 * performance controller would never recommend execution on the P-cores.
4594 * If that policy changes in the future, this value should be changed.
4595 */
4596 switch (qos) {
4597 case THREAD_QOS_BACKGROUND:
4598 case THREAD_QOS_MAINTENANCE:
4599 return ecount;
4600 default:
4601 return ecount + pcount;
4602 }
4603}
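
For example, on a hypothetical system with 4 E-cores and 2 P-cores the routine above reports a width of 2 for realtime work, 4 for background/maintenance QoS, and 6 for everything else. A minimal sketch of that policy with the core counts hard-coded as assumptions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_ECORE_COUNT 4u
#define EXAMPLE_PCORE_COUNT 2u

/* Mirrors the policy above: RT width = P-cores only, BG/maintenance = E-cores only,
 * everything else = all cores. */
static uint32_t
example_max_parallelism(bool realtime, bool background_qos)
{
	if (realtime) {
		return EXAMPLE_PCORE_COUNT;
	}
	return background_qos ? EXAMPLE_ECORE_COUNT : (EXAMPLE_ECORE_COUNT + EXAMPLE_PCORE_COUNT);
}

int
main(void)
{
	printf("RT width      = %u\n", example_max_parallelism(true, false));  /* 2 */
	printf("BG width      = %u\n", example_max_parallelism(false, true));  /* 4 */
	printf("default width = %u\n", example_max_parallelism(false, false)); /* 6 */
	return 0;
}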
4604
4605
4606
4607#endif /* CONFIG_SCHED_EDGE */
4608
4609#endif /* CONFIG_SCHED_CLUTCH */