1/*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31#include <machine/machine_routines.h>
32#include <machine/sched_param.h>
33#include <machine/machine_cpu.h>
34#include <kern/kern_types.h>
35#include <kern/debug.h>
36#include <kern/machine.h>
37#include <kern/misc_protos.h>
38#include <kern/processor.h>
39#include <kern/queue.h>
40#include <kern/sched.h>
41#include <kern/sched_prim.h>
42#include <kern/task.h>
43#include <kern/thread.h>
44#include <kern/sched_clutch.h>
45#include <machine/atomic.h>
47#include <sys/kdebug.h>
48
49#if __AMP__
50#include <kern/sched_amp_common.h>
51#endif /* __AMP__ */
52
53#if CONFIG_SCHED_CLUTCH
54
55/* Forward declarations of static routines */
56
57/* Root level hierarchy management */
58static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t);
60static void sched_clutch_root_pri_update(sched_clutch_root_t);
61static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t);
62static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
63static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
64
65/* Root bucket level hierarchy management */
66static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
67static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
68static int sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t);
69
70/* Clutch bucket level hierarchy management */
71static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
72static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
73static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
74static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
75static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
76
77static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t);
78static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t);
79static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
80static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
81
82/* Clutch timeshare properties updates */
83static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
84static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
85static void sched_clutch_bucket_cpu_adjust(sched_clutch_bucket_t);
86static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t);
87static boolean_t sched_thread_sched_pri_promoted(thread_t);
88/* Clutch membership management */
89static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
90static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t);
91static thread_t sched_clutch_thread_highest(sched_clutch_root_t);
92
93/* Clutch properties updates */
94static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
95static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
96static int sched_clutch_root_priority(sched_clutch_root_t);
97
98#if __AMP__
99/* System based routines */
100static bool sched_clutch_pset_available(processor_set_t);
101#endif /* __AMP__ */
102
103/* Helper debugging routines */
104static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
105
106
107
108/*
109 * Global priority queue comparator routine for root buckets. The
110 * routine implements the priority queue as a minimum deadline queue
111 * to achieve EDF scheduling.
112 */
113priority_queue_compare_fn_t sched_clutch_root_bucket_compare;
114
115
116/*
117 * Special markers for buckets that have invalid WCELs/quantums etc.
118 */
119#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
120#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
121
122/*
123 * Root level bucket WCELs
124 *
125 * The root level bucket selection algorithm is an Earliest Deadline
126 * First (EDF) algorithm where the deadline for buckets are defined
127 * by the worst-case-execution-latency and the make runnable timestamp
128 * for the bucket.
129 *
130 */
131static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
132 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
133 0, /* FG */
134 37500, /* IN (37.5ms) */
135 75000, /* DF (75ms) */
136 150000, /* UT (150ms) */
137 250000 /* BG (250ms) */
138};
139static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
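
/*
 * Worked example (illustrative timestamps, editor's note): if the UT root
 * bucket becomes runnable at time T and the DF root bucket at T + 100ms,
 * their deadlines are T + 150ms and T + 175ms respectively, so plain EDF
 * would pick UT first even though DF is higher in the natural priority
 * order. The warp mechanism below bounds how long such an inversion lasts.
 */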
140
141/*
142 * Root level bucket warp
143 *
144 * Each root level bucket has a warp value associated with it as well.
145 * The warp value allows the root bucket to effectively warp ahead of
146 * lower priority buckets for a limited time even if it has a later
147 * deadline. The warping behavior provides extra (but limited)
148 * opportunity for high priority buckets to remain responsive.
149 */
150
151/* Special warp deadline value to indicate that the bucket has not used any warp yet */
152#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
153
154/* Warp window durations for various tiers */
155static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
156 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
157 8000, /* FG (8ms)*/
158 4000, /* IN (4ms) */
159 2000, /* DF (2ms) */
160 1000, /* UT (1ms) */
161 0 /* BG (0ms) */
162};
163static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
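
/*
 * Worked example (illustrative timestamps, editor's note): suppose the UT
 * root bucket has deadline T + 150ms and the IN root bucket, made runnable
 * later, has deadline T + 157.5ms. Plain EDF would pick UT; since IN is
 * higher in the natural priority order and has warp available, it is chosen
 * instead and may warp ahead for up to its 4ms warp window, after which
 * selection falls back to the earliest-deadline bucket.
 */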
164
165/*
166 * Thread level quantum
167 *
168 * The algorithm defines quantums for threads at various buckets. This
169 * (combined with the root level bucket quantums) restricts how much
170 * the lower priority levels can preempt the higher priority threads.
171 */
172static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
173 10000, /* FIXPRI (10ms) */
174 10000, /* FG (10ms) */
175 8000, /* IN (8ms) */
176 6000, /* DF (6ms) */
177 4000, /* UT (4ms) */
178 2000 /* BG (2ms) */
179};
180static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
181
182enum sched_clutch_state {
183 SCHED_CLUTCH_STATE_EMPTY = 0,
184 SCHED_CLUTCH_STATE_RUNNABLE,
185};
186
187/*
188 * sched_clutch_us_to_abstime()
189 *
190 * Initializer for converting all durations in usec to abstime
191 */
192static void
193sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
194{
195 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
196 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
197 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
198 } else {
199 clock_interval_to_absolutetime_interval(us_vals[i],
200 NSEC_PER_USEC, &abstime_vals[i]);
201 }
202 }
203}
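
/*
 * Illustrative usage (editor's sketch; the actual call sites live in the
 * scheduler init/timebase setup code, which is not part of this excerpt):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */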
204
205#if DEVELOPMENT || DEBUG
206
207/*
208 * sched_clutch_hierarchy_locked_assert()
209 *
210 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
211 * for the hierarchy depends on where the hierarchy is hooked. The current
212 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
213 * using the pset lock.
214 */
215static inline void
216sched_clutch_hierarchy_locked_assert(
217 sched_clutch_root_t root_clutch)
218{
219 pset_assert_locked(root_clutch->scr_pset);
220}
221
222#else /* DEVELOPMENT || DEBUG */
223
224static inline void
225sched_clutch_hierarchy_locked_assert(
226 __unused sched_clutch_root_t root_clutch)
227{
228}
229
230#endif /* DEVELOPMENT || DEBUG */
231
232/*
233 * sched_clutch_thr_count_inc()
234 *
235 * Increment thread count at a hierarchy level with overflow checks.
236 */
237static void
238sched_clutch_thr_count_inc(
239 uint16_t *thr_count)
240{
241 if (__improbable(os_inc_overflow(thr_count))) {
242 panic("sched_clutch thread count overflowed!");
243 }
244}
245
246/*
247 * sched_clutch_thr_count_dec()
248 *
249 * Decrement thread count at a hierarchy level with underflow checks.
250 */
251static void
252sched_clutch_thr_count_dec(
253 uint16_t *thr_count)
254{
255 if (__improbable(os_dec_overflow(thr_count))) {
256 panic("sched_clutch thread count underflowed!");
257 }
258}
259
260#if __AMP__
261
262/*
263 * sched_clutch_pset_available()
264 *
265 * Routine to determine if a pset is available for scheduling.
266 */
267static bool
268sched_clutch_pset_available(processor_set_t pset)
269{
270 /* Check if cluster has none of the CPUs available */
271 if (pset->online_processor_count == 0) {
272 return false;
273 }
274
275 /* Check if the cluster is not recommended by CLPC */
276 if (!pset_is_recommended(pset)) {
277 return false;
278 }
279
280 return true;
281}
282
283#endif /* __AMP__ */
284
285/*
286 * sched_clutch_root_init()
287 *
288 * Routine to initialize the scheduler hierarchy root.
289 */
290static void
291sched_clutch_root_init(
292 sched_clutch_root_t root_clutch,
293 processor_set_t pset)
294{
295 root_clutch->scr_thr_count = 0;
296 root_clutch->scr_priority = NOPRI;
297 root_clutch->scr_urgency = 0;
298 root_clutch->scr_pset = pset;
299
300 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
301 queue_init(&root_clutch->scr_clutch_buckets);
302
303 /* Initialize the queue which maintains all runnable foreign clutch buckets */
304 queue_init(&root_clutch->scr_foreign_buckets);
305
306 /* Initialize the bitmap and priority queue of runnable root buckets */
307 sched_clutch_root_bucket_compare = priority_heap_make_comparator(a, b, struct sched_clutch_root_bucket, scrb_pqlink, {
308 return (a->scrb_deadline < b->scrb_deadline) ? 1 : ((a->scrb_deadline == b->scrb_deadline) ? 0 : -1);
309 });
310 priority_queue_init(&root_clutch->scr_root_buckets, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MIN_HEAP);
311 bitmap_zero(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX);
312 bitmap_zero(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
313
314 /* Initialize all the root buckets */
315 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
316 sched_clutch_root_bucket_init(&root_clutch->scr_buckets[i], i);
317 }
318}
319
320/*
321 * sched_clutch_root_bucket_init()
322 *
323 * Routine to initialize root buckets.
324 */
325static void
326sched_clutch_root_bucket_init(
327 sched_clutch_root_bucket_t root_bucket,
328 sched_bucket_t bucket)
329{
330 root_bucket->scrb_bucket = bucket;
331 priority_queue_init(&root_bucket->scrb_clutch_buckets, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
332 priority_queue_entry_init(&root_bucket->scrb_pqlink);
333 root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64;
334 root_bucket->scrb_warped_deadline = 0;
335 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
336}
337
338/*
339 * sched_clutch_root_bucket_pri_compare()
340 *
341 * Routine to compare root buckets based on the highest runnable clutch
342 * bucket priorities in the root buckets.
343 */
344static int
345sched_clutch_root_bucket_pri_compare(
346 sched_clutch_root_bucket_t a,
347 sched_clutch_root_bucket_t b)
348{
349 sched_clutch_bucket_t a_highest = sched_clutch_root_bucket_highest_clutch_bucket(a);
350 sched_clutch_bucket_t b_highest = sched_clutch_root_bucket_highest_clutch_bucket(b);
351 return (a_highest->scb_priority > b_highest->scb_priority) ?
352 1 : ((a_highest->scb_priority == b_highest->scb_priority) ? 0 : -1);
353}
354
355/*
356 * sched_clutch_root_select_aboveui()
357 *
358 * Special case scheduling for Above UI bucket.
359 *
360 * AboveUI threads are typically system critical threads that need low latency
361 * which is why they are handled specially.
362 *
363 * Since the priority ranges for AboveUI and FG Timeshare buckets overlap, it is
364 * important to maintain some native priority order between those buckets. The policy
365 * implemented here is to compare the highest clutch buckets of both buckets; if the
366 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
367 * deadline-based scheduling, which should pick up the timeshare buckets.
368 *
369 * The implementation allows extremely low latency CPU access for Above UI threads
370 * while supporting the use case of high priority timeshare threads contending with
371 * lower priority fixed priority threads.
372 */
373static boolean_t
374sched_clutch_root_select_aboveui(
375 sched_clutch_root_t root_clutch)
376{
377 if (bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_FIXPRI)) {
378 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
379 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_buckets[TH_BUCKET_SHARE_FG];
380
381 if (!bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
382 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
383 return true;
384 }
385 if (sched_clutch_root_bucket_pri_compare(root_bucket_aboveui, root_bucket_sharefg) >= 0) {
386 /* If the aboveUI bucket has a higher native clutch bucket priority, schedule it */
387 return true;
388 }
389 }
390 return false;
391}
392
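/*
 * Worked example (hypothetical priorities, editor's note): if the highest
 * AboveUI clutch bucket sits at priority 55 and the highest timeshare FG
 * clutch bucket at priority 60, the pri_compare above returns -1 and
 * selection falls through to the EDF path; at 60 or above, the AboveUI
 * bucket would be scheduled immediately (ties favor AboveUI).
 */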
393
394/*
395 * sched_clutch_root_highest_root_bucket()
396 *
397 * Main routine to find the highest runnable root level bucket.
398 * This routine is called from performance sensitive contexts; so it is
399 * crucial to keep this O(1).
400 *
401 */
402static sched_clutch_root_bucket_t
403sched_clutch_root_highest_root_bucket(
404 sched_clutch_root_t root_clutch,
405 uint64_t timestamp)
406{
407 sched_clutch_hierarchy_locked_assert(root_clutch);
408 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
409 return NULL;
410 }
411
412 if (sched_clutch_root_select_aboveui(root_clutch)) {
413 return &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
414 }
415
416 /*
417 * Above UI bucket is not runnable or has a low priority clutch bucket; use the earliest deadline model
418 * to schedule threads. The idea is that as the timeshare buckets use CPU, they will drop their
419 * interactivity score and allow low priority AboveUI clutch buckets to be scheduled.
420 */
421
422 /* Find the earliest deadline bucket */
423 sched_clutch_root_bucket_t edf_bucket = priority_queue_min(&root_clutch->scr_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
424
425 sched_clutch_root_bucket_t warp_bucket = NULL;
426 int warp_bucket_index = -1;
427evaluate_warp_buckets:
428 /* Check if any higher runnable buckets have warp available */
429 warp_bucket_index = bitmap_lsb_first(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
430
431 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
432 /* No higher buckets have warp available; choose the edf bucket and replenish its warp */
433 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
434 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
435 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
436 bitmap_set(root_clutch->scr_warp_available, edf_bucket->scrb_bucket);
437 return edf_bucket;
438 }
439
440 /*
441 * Looks like there is a root bucket which is higher in the natural priority
442 * order than edf_bucket and might have some warp remaining.
443 */
444 warp_bucket = &root_clutch->scr_buckets[warp_bucket_index];
445 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
446 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
447 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
448 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
449 return warp_bucket;
450 }
451 if (warp_bucket->scrb_warped_deadline > timestamp) {
452 /* Root bucket already has a warp window open with some warp remaining */
453 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
454 return warp_bucket;
455 }
456
457	/* For this bucket, the warp window was opened sometime in the past but has now
458	 * expired. Mark the bucket as not available for warp anymore and re-run the
459 * warp bucket selection logic.
460 */
461 warp_bucket->scrb_warp_remaining = 0;
462 bitmap_clear(root_clutch->scr_warp_available, warp_bucket->scrb_bucket);
463 goto evaluate_warp_buckets;
464}
465
466/*
467 * sched_clutch_root_bucket_deadline_calculate()
468 *
469 * Calculate the deadline for the bucket based on its WCEL
470 */
471static uint64_t
472sched_clutch_root_bucket_deadline_calculate(
473 sched_clutch_root_bucket_t root_bucket,
474 uint64_t timestamp)
475{
476 /* For fixpri AboveUI bucket always return it as the earliest deadline */
477 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
478 return 0;
479 }
480
481 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
482 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
483}
484
485/*
486 * sched_clutch_root_bucket_deadline_update()
487 *
488 * Routine to update the deadline of the root bucket when it is selected.
489 * Updating the deadline also moves the root_bucket in the EDF priority
490 * queue.
491 */
492static void
493sched_clutch_root_bucket_deadline_update(
494 sched_clutch_root_bucket_t root_bucket,
495 sched_clutch_root_t root_clutch,
496 uint64_t timestamp)
497{
498 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
499 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
500 return;
501 }
502 uint64_t old_deadline = root_bucket->scrb_deadline;
503 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
504 assert(old_deadline <= new_deadline);
505 if (old_deadline != new_deadline) {
506 root_bucket->scrb_deadline = new_deadline;
507 /* Since the priority queue is a min-heap, use the decrease routine even though the deadline has a larger value now */
508 priority_queue_entry_decrease(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
509 }
510}
511
512/*
513 * sched_clutch_root_bucket_runnable()
514 *
515 * Routine to insert a newly runnable root bucket into the hierarchy.
516 * Also updates the deadline and warp parameters as necessary.
517 */
518static void
519sched_clutch_root_bucket_runnable(
520 sched_clutch_root_bucket_t root_bucket,
521 sched_clutch_root_t root_clutch,
522 uint64_t timestamp)
523{
524 /* Mark the root bucket as runnable */
525 bitmap_set(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
526 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
527 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, 0, 0, 0);
528
529 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
530 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
531 return;
532 }
533
534 root_bucket->scrb_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
535 priority_queue_insert(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
536
537 if (root_bucket->scrb_warp_remaining) {
538	/* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
539 bitmap_set(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
540 }
541}
542
543/*
544 * sched_clutch_root_bucket_empty()
545 *
546 * Routine to remove an empty root bucket from the hierarchy.
547 * Also updates the deadline and warp parameters as necessary.
548 */
549static void
550sched_clutch_root_bucket_empty(
551 sched_clutch_root_bucket_t root_bucket,
552 sched_clutch_root_t root_clutch,
553 uint64_t timestamp)
554{
555 bitmap_clear(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
556 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
557 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0, 0);
558
559 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
560 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
561 return;
562 }
563
564 priority_queue_remove(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, sched_clutch_root_bucket_compare);
565
566 bitmap_clear(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
567 if (root_bucket->scrb_warped_deadline > timestamp) {
568 /*
569 * For root buckets that were using the warp, check if the warp
570 * deadline is in the future. If yes, remove the wall time the
571 * warp was active and update the warp remaining. This allows
572 * the root bucket to use the remaining warp the next time it
573 * becomes runnable.
574 */
575 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
576 } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
577 /*
578 * If the root bucket's warped deadline is in the past, it has used up
579 * all the warp it was assigned. Empty out its warp remaining.
580 */
581 root_bucket->scrb_warp_remaining = 0;
582 }
583}
584
585/*
586 * sched_clutch_root_pri_update()
587 *
588 * The root level priority is used for thread selection and preemption
589 * logic.
590 */
591static void
592sched_clutch_root_pri_update(
593 sched_clutch_root_t root_clutch)
594{
595 sched_clutch_hierarchy_locked_assert(root_clutch);
596 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
597 /* No runnable root buckets */
598 root_clutch->scr_priority = NOPRI;
599 assert(root_clutch->scr_urgency == 0);
600 return;
601 }
602 sched_clutch_root_bucket_t root_bucket = NULL;
603 /* Special case for AboveUI (uses same logic as thread selection) */
604 if (sched_clutch_root_select_aboveui(root_clutch)) {
605 root_bucket = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
606 } else {
607 /*
608 * AboveUI bucket is not runnable or has a low clutch bucket priority,
609 * select the next runnable root bucket in natural priority order. This logic
610 * is slightly different from thread selection, because thread selection
611 * considers deadlines, warps etc. to decide the most optimal bucket at a
612 * given timestamp. Since the priority value is used for preemption decisions
613 * only, it needs to be based on the highest runnable thread available in
614 * the timeshare domain.
615 */
616 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
617 assert(root_bucket_index != -1);
618 root_bucket = &root_clutch->scr_buckets[root_bucket_index];
619 }
620 /* For the selected root bucket, find the highest priority clutch bucket */
621 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
622 root_clutch->scr_priority = priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
623}
624
625/*
626 * sched_clutch_root_urgency_inc()
627 *
628 * Routine to increment the urgency at the root level based on the thread
629 * priority that is being inserted into the hierarchy. The root urgency
630 * counter is updated based on the urgency of threads in any of the
631 * clutch buckets which are part of the hierarchy.
632 *
633 * Always called with the pset lock held.
634 */
635static void
636sched_clutch_root_urgency_inc(
637 sched_clutch_root_t root_clutch,
638 thread_t thread)
639{
640 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
641 root_clutch->scr_urgency++;
642 }
643}
644
645/*
646 * sched_clutch_root_urgency_dec()
647 *
648 * Routine to decrement the urgency at the root level based on the thread
649 * priority that is being removed from the hierarchy. The root urgency
650 * counter is updated based on the urgency of threads in any of the
651 * clutch buckets which are part of the hierarchy.
652 *
653 * Always called with the pset lock held.
654 */
655static void
656sched_clutch_root_urgency_dec(
657 sched_clutch_root_t root_clutch,
658 thread_t thread)
659{
660 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
661 root_clutch->scr_urgency--;
662 }
663}
664
665/*
666 * Clutch bucket level scheduling
667 *
668 * The second level of scheduling is the clutch bucket level scheduling
669 * which tries to schedule thread groups within root_buckets. Each
670 * clutch represents a thread group and a clutch_bucket represents
671 * threads at a particular sched_bucket within that thread group. The
672 * goal of this level of scheduling is to allow interactive thread
673 * groups low latency access to the CPU. It also provides slight
674 * scheduling preference for App and unrestricted thread groups.
675 *
676 * The clutch bucket scheduling algorithm measures an interactivity
677 * score for all clutch buckets. The interactivity score is based
678 * on the ratio of the CPU used and the voluntary blocking of threads
679 * within the clutch bucket. The algorithm is very close to the ULE
680 * scheduler on FreeBSD in terms of calculations. The interactivity
681 * score provides an interactivity boost in the range of
682 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
683 * thread groups to win over CPU spinners.
684 */
685
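/*
 * Editor's sketch (not part of the kernel build): the interactivity score
 * arithmetic described above, lifted into a standalone function so the two
 * branches are easy to see. The helper name and the prev_score parameter
 * are hypothetical; INTERACTIVE_PRI mirrors the default value of
 * sched_clutch_bucket_interactive_pri below.
 *
 *	#include <stdint.h>
 *
 *	#define INTERACTIVE_PRI 8
 *
 *	static uint8_t
 *	example_interactivity_score(uint64_t cpu_used, uint64_t cpu_blocked, uint8_t prev_score)
 *	{
 *		if ((cpu_used == 0) && (cpu_blocked == 0)) {
 *			// No usage samples yet (e.g. heavily contended system); keep the previous score
 *			return prev_score;
 *		}
 *		if (cpu_blocked > cpu_used) {
 *			// Mostly blocked => interactive; score is at least INTERACTIVE_PRI
 *			return (uint8_t)(INTERACTIVE_PRI + ((INTERACTIVE_PRI * (cpu_blocked - cpu_used)) / cpu_blocked));
 *		}
 *		// Mostly on-CPU => non-interactive; score is at most INTERACTIVE_PRI
 *		return (uint8_t)((INTERACTIVE_PRI * cpu_blocked) / cpu_used);
 *	}
 */
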
686/* Priority boost range for interactivity */
687#define SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT (8)
688uint8_t sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
689
690/* Window used to scale the cpu usage and blocked values (currently 500ms); it is the threshold on used + blocked time */
691uint64_t sched_clutch_bucket_adjust_threshold = 0;
692#define SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS (500000)
693
694/* The ratio to scale the cpu/blocked time per window */
695#define SCHED_CLUTCH_BUCKET_ADJUST_RATIO (10)
696
697/* Rate at which the interactivity score is recalculated; this keeps the score smooth even for extremely bursty behavior */
698uint64_t sched_clutch_bucket_interactivity_delta = 0;
699#define SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT (25000)
700
701/*
702 * In order to allow App thread groups some preference over daemon thread
703 * groups, the App clutch_buckets get an 8 point boost. The boost value should
704 * be chosen such that badly behaved apps still end up penalized relative to
705 * well behaved interactive daemon clutch_buckets.
706 */
707#define SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT (8)
708uint8_t sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
709
710/* Initial value for voluntary blocking time for the clutch_bucket */
711#define SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID (uint32_t)(~0)
712
713/*
714 * sched_clutch_bucket_init()
715 *
716 * Initializer for clutch buckets.
717 */
718static void
719sched_clutch_bucket_init(
720 sched_clutch_bucket_t clutch_bucket,
721 sched_clutch_t clutch,
722 sched_bucket_t bucket)
723{
724 bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));
725
726 clutch_bucket->scb_bucket = bucket;
727 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
728 clutch_bucket->scb_priority = 0;
729 /*
730 * All thread groups should be initialized to be interactive; this allows the newly launched
731 * thread groups to fairly compete with already running thread groups.
732 */
733 clutch_bucket->scb_interactivity_score = (sched_clutch_bucket_interactive_pri * 2);
734 clutch_bucket->scb_foreign = false;
735
736 os_atomic_store(&clutch_bucket->scb_timeshare_tick, 0, relaxed);
737 os_atomic_store(&clutch_bucket->scb_pri_shift, INT8_MAX, relaxed);
738
739 clutch_bucket->scb_interactivity_ts = 0;
740 clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID;
741 priority_queue_entry_init(&clutch_bucket->scb_pqlink);
742 clutch_bucket->scb_clutch = clutch;
743 clutch_bucket->scb_root = NULL;
744 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
745 run_queue_init(&clutch_bucket->scb_runq);
746}
747
748/*
749 * sched_clutch_init_with_thread_group()
750 *
751 * Initialize the sched_clutch when the thread group is being created
752 */
753void
754sched_clutch_init_with_thread_group(
755 sched_clutch_t clutch,
756 struct thread_group *tg)
757{
758 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
759
760 /* Initialize all the clutch buckets */
761 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
762 sched_clutch_bucket_init(&(clutch->sc_clutch_buckets[i]), clutch, i);
763 }
764
765 /* Grouping specific fields */
766 clutch->sc_tg = tg;
767 os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
768}
769
770/*
771 * sched_clutch_destroy()
772 *
773 * Destructor for clutch; called from thread group release code.
774 */
775void
776sched_clutch_destroy(
777 __unused sched_clutch_t clutch)
778{
779 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
780}
781
782#if __AMP__
783
784/*
785 * sched_clutch_bucket_foreign()
786 *
787 * Identifies if the clutch bucket is foreign to (i.e. not recommended for) this
788 * hierarchy. This can happen when the recommended hierarchy/pset is not
789 * currently available for scheduling.
790 */
791static boolean_t
792sched_clutch_bucket_foreign(sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket)
793{
794 assert(clutch_bucket->scb_thr_count > 0);
795 if (!sched_clutch_pset_available(root_clutch->scr_pset)) {
796 /* Even though the pset was not available for scheduling, threads
797 * are being put in its runq (this might be due to the other pset
798 * being turned off and this being the master processor pset).
799 * Mark the clutch bucket as foreign so that when the other
800 * pset becomes available, it moves the clutch bucket accordingly.
801 */
802 return true;
803 }
804 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
805 pset_cluster_type_t pset_type = recommended_pset_type(thread);
806 return pset_type != root_clutch->scr_pset->pset_cluster_type;
807}
808
809#endif /* __AMP__ */
810
811/*
812 * sched_clutch_bucket_hierarchy_insert()
813 *
814 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
815 */
816static void
817sched_clutch_bucket_hierarchy_insert(
818 sched_clutch_root_t root_clutch,
819 sched_clutch_bucket_t clutch_bucket,
820 sched_bucket_t bucket,
821 uint64_t timestamp)
822{
823 sched_clutch_hierarchy_locked_assert(root_clutch);
824 if (bucket > TH_BUCKET_FIXPRI) {
825 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
826 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
827 }
828#if __AMP__
829 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
830 if (sched_clutch_bucket_foreign(root_clutch, clutch_bucket)) {
831 clutch_bucket->scb_foreign = true;
832 enqueue_tail(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
833 }
834#endif /* __AMP__ */
835 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
836
837 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
838 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
839 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
840 }
841
842 /* Insert the clutch bucket into the root bucket priority queue */
843 priority_queue_insert(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
844 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
845 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
846 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0);
847}
848
849/*
850 * sched_clutch_bucket_hierarchy_remove()
851 *
852 * Routine to remove an empty clutch bucket from the root hierarchy.
853 */
854static void
855sched_clutch_bucket_hierarchy_remove(
856 sched_clutch_root_t root_clutch,
857 sched_clutch_bucket_t clutch_bucket,
858 sched_bucket_t bucket,
859 uint64_t timestamp)
860{
861 sched_clutch_hierarchy_locked_assert(root_clutch);
862 if (bucket > TH_BUCKET_FIXPRI) {
863 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
864 remqueue(&clutch_bucket->scb_listlink);
865 }
866#if __AMP__
867 if (clutch_bucket->scb_foreign) {
868 clutch_bucket->scb_foreign = false;
869 remqueue(&clutch_bucket->scb_foreignlink);
870 }
871#endif /* __AMP__ */
872
873 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
874
875 /* Remove the clutch bucket from the root bucket priority queue */
876 priority_queue_remove(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
877 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
878 clutch_bucket->scb_blocked_ts = timestamp;
879 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
880 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0);
881
882 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
883 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
884 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
885 }
886}
887
888/*
889 * sched_clutch_bucket_base_pri()
890 *
891 * Calculates the "base" priority of the clutch bucket. The base
892 * priority of the clutch bucket is the max of the highest base_pri
893 * and the highest sched_pri among its threads, plus any grouping
894 * specific (App/Daemon...) boost applicable to the
895 * clutch_bucket.
896 */
897static uint8_t
898sched_clutch_bucket_base_pri(
899 sched_clutch_bucket_t clutch_bucket)
900{
901 uint8_t clutch_boost = 0;
902 assert(clutch_bucket->scb_runq.count != 0);
903
904 sched_clutch_t clutch = clutch_bucket->scb_clutch;
905
906 /*
907 * Since the clutch bucket can contain threads that are members of the group due
908 * to the sched_pri being promoted or due to their base pri, the base priority of
909 * the entire clutch bucket should be based on the highest thread (promoted or base)
910 * in the clutch bucket.
911 */
912 uint8_t max_pri = priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq) ? 0 : priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
913
914 /*
915 * For all AboveUI clutch buckets and clutch buckets for thread groups that
916 * have not been specified as SCHED_CLUTCH_TG_PRI_LOW, give a priority boost
917 */
918 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
919 (os_atomic_load(&clutch->sc_tg_priority, relaxed) != SCHED_CLUTCH_TG_PRI_LOW)) {
920 clutch_boost = sched_clutch_bucket_pri_boost;
921 }
922 return max_pri + clutch_boost;
923}
924
925/*
926 * sched_clutch_bucket_interactivity_score_calculate()
927 *
928 * Routine to calculate the interactivity score for the clutch bucket. The
929 * interactivity score is based on the ratio of CPU used by all threads in
930 * the bucket and the blocked time of the bucket as a whole.
931 */
932static uint8_t
933sched_clutch_bucket_interactivity_score_calculate(
934 sched_clutch_bucket_t clutch_bucket,
935 uint64_t timestamp)
936{
937 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
938 /*
939 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
940 * priorities, make sure all AboveUI buckets are marked interactive.
941 */
942 assert(clutch_bucket->scb_interactivity_score == (2 * sched_clutch_bucket_interactive_pri));
943 return clutch_bucket->scb_interactivity_score;
944 }
945
946 if (clutch_bucket->scb_interactivity_ts == 0) {
947 /*
948 * This indicates a newly initialized clutch bucket; return the default interactivity score
949 * and update timestamp.
950 */
951 clutch_bucket->scb_interactivity_ts = timestamp;
952 return clutch_bucket->scb_interactivity_score;
953 }
954
955 if (timestamp < (clutch_bucket->scb_interactivity_ts + sched_clutch_bucket_interactivity_delta)) {
956 return clutch_bucket->scb_interactivity_score;
957 }
958
959 /* Check if the clutch bucket accounting needs to be scaled */
960 sched_clutch_bucket_cpu_adjust(clutch_bucket);
961 clutch_bucket->scb_interactivity_ts = timestamp;
962
963 sched_clutch_bucket_cpu_data_t scb_cpu_data;
964 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
965 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
966 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
967
968 /*
969 * In extremely CPU contended cases, it is possible that the clutch bucket has been runnable
970 * for a long time but none of its threads have been picked up for execution. In that case, both
971 * the CPU used and blocked would be 0.
972 */
973 if ((cpu_blocked == 0) && (cpu_used == 0)) {
974 return clutch_bucket->scb_interactivity_score;
975 }
976
977 /*
978 * For all timeshare buckets, calculate the interactivity score of the bucket
979 * and add it to the base priority
980 */
981 uint8_t interactive_score = 0;
982 if (cpu_blocked > cpu_used) {
983 /* Interactive clutch_bucket case */
984 interactive_score = sched_clutch_bucket_interactive_pri +
985 ((sched_clutch_bucket_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
986 } else {
987 /* Non-interactive clutch_bucket case */
988 interactive_score = ((sched_clutch_bucket_interactive_pri * cpu_blocked) / cpu_used);
989 }
990 clutch_bucket->scb_interactivity_score = interactive_score;
991 return interactive_score;
992}
993
994/*
995 * sched_clutch_bucket_pri_calculate()
996 *
997 * The priority calculation algorithm for the clutch_bucket is a slight
998 * modification on the ULE interactivity score. It uses the base priority
999 * of the clutch bucket and applies an interactivity score boost to the
1000 * highly responsive clutch buckets.
1001 */
1002
1003static uint8_t
1004sched_clutch_bucket_pri_calculate(
1005 sched_clutch_bucket_t clutch_bucket,
1006 uint64_t timestamp)
1007{
1008 /* For empty clutch buckets, return priority 0 */
1009 if (clutch_bucket->scb_thr_count == 0) {
1010 return 0;
1011 }
1012
1013 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1014 uint8_t interactive_score = sched_clutch_bucket_interactivity_score_calculate(clutch_bucket, timestamp);
1015
1016 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1017 uint8_t pri = base_pri + interactive_score;
1018 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1019 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1020 return pri;
1021}
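
/*
 * Worked example (purely illustrative numbers, editor's note): if the highest
 * thread in the clutch bucket has base_pri 31 and the thread group is not
 * SCHED_CLUTCH_TG_PRI_LOW, the base priority is 31 + 8 = 39; with
 * cpu_blocked = 300ms and cpu_used = 100ms the interactivity score is
 * 8 + (8 * 200) / 300 = 13, giving a clutch bucket priority of 52.
 */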
1022
1023/*
1024 * sched_clutch_root_bucket_highest_clutch_bucket()
1025 *
1026 * Routine to find the highest priority clutch bucket
1027 * within the root bucket.
1028 */
1029static sched_clutch_bucket_t
1030sched_clutch_root_bucket_highest_clutch_bucket(
1031 sched_clutch_root_bucket_t root_bucket)
1032{
1033 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
1034 return NULL;
1035 }
1036 return priority_queue_max(&root_bucket->scrb_clutch_buckets, struct sched_clutch_bucket, scb_pqlink);
1037}
1038
1039/*
1040 * sched_clutch_bucket_runnable()
1041 *
1042 * Perform all operations needed when a new clutch bucket becomes runnable.
1043 * It involves inserting the clutch_bucket into the hierarchy and updating the
1044 * root priority appropriately.
1045 */
1046static boolean_t
1047sched_clutch_bucket_runnable(
1048 sched_clutch_bucket_t clutch_bucket,
1049 sched_clutch_root_t root_clutch,
1050 uint64_t timestamp)
1051{
1052 sched_clutch_hierarchy_locked_assert(root_clutch);
1053 sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp);
1054 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1055 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
1056 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1057 sched_clutch_bucket_timeshare_update(clutch_bucket);
1058 int16_t root_old_pri = root_clutch->scr_priority;
1059 sched_clutch_root_pri_update(root_clutch);
1060 return root_clutch->scr_priority > root_old_pri;
1061}
1062
1063/*
1064 * sched_clutch_bucket_update()
1065 *
1066 * Update the clutch_bucket's position in the hierarchy based on whether
1067 * the newly runnable thread changes its priority. Also update the root
1068 * priority accordingly.
1069 */
1070static boolean_t
1071sched_clutch_bucket_update(
1072 sched_clutch_bucket_t clutch_bucket,
1073 sched_clutch_root_t root_clutch,
1074 uint64_t timestamp)
1075{
1076 sched_clutch_hierarchy_locked_assert(root_clutch);
1077 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1078 if (new_pri == clutch_bucket->scb_priority) {
1079 return false;
1080 }
1081 struct priority_queue *bucket_prioq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1082
1083 if (new_pri < clutch_bucket->scb_priority) {
1084 clutch_bucket->scb_priority = new_pri;
1085 priority_queue_entry_decrease(bucket_prioq, &clutch_bucket->scb_pqlink,
1086 clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1087 } else {
1088 clutch_bucket->scb_priority = new_pri;
1089 priority_queue_entry_increase(bucket_prioq, &clutch_bucket->scb_pqlink,
1090 clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1091 }
1092
1093 int16_t root_old_pri = root_clutch->scr_priority;
1094 sched_clutch_root_pri_update(root_clutch);
1095 return root_clutch->scr_priority > root_old_pri;
1096}
1097
1098/*
1099 * sched_clutch_bucket_empty()
1100 *
1101 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1102 * It involves removing the clutch bucket from the hierarchy and updating the root
1103 * priority appropriately.
1104 */
1105static void
1106sched_clutch_bucket_empty(
1107 sched_clutch_bucket_t clutch_bucket,
1108 sched_clutch_root_t root_clutch,
1109 uint64_t timestamp)
1110{
1111 sched_clutch_hierarchy_locked_assert(root_clutch);
1112 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
1113 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1114 sched_clutch_root_pri_update(root_clutch);
1115}
1116
1117/*
1118 * sched_clutch_cpu_usage_update()
1119 *
1120 * Routine to update CPU usage of the thread in the hierarchy.
1121 */
1122void
1123sched_clutch_cpu_usage_update(
1124 thread_t thread,
1125 uint64_t delta)
1126{
1127 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1128 return;
1129 }
1130 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1131 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1132 sched_clutch_bucket_cpu_usage_update(clutch_bucket, delta);
1133}
1134
1135/*
1136 * sched_clutch_bucket_cpu_usage_update()
1137 *
1138 * Routine to update the CPU usage of the clutch_bucket.
1139 */
1140static void
1141sched_clutch_bucket_cpu_usage_update(
1142 sched_clutch_bucket_t clutch_bucket,
1143 uint64_t delta)
1144{
1145 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
1146 /* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1147 return;
1148 }
1149
1150 /*
1151 * The CPU usage should not overflow the clutch_cpu_data_t type. Since the usage is used to
1152 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1153 */
1154 delta = MIN(delta, CLUTCH_CPU_DATA_MAX);
1155 os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1156}
1157
1158/*
1159 * sched_clutch_bucket_cpu_blocked_update()
1160 *
1161 * Routine to update CPU blocked time for clutch_bucket.
1162 */
1163static void
1164sched_clutch_bucket_cpu_blocked_update(
1165 sched_clutch_bucket_t clutch_bucket,
1166 uint64_t timestamp)
1167{
1168 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
1169 (clutch_bucket->scb_blocked_ts == SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID)) {
1170 /* For Above UI bucket and a newly initialized clutch bucket, nothing to do here */
1171 return;
1172 }
1173
1174 uint64_t blocked_time = timestamp - clutch_bucket->scb_blocked_ts;
1175 if (blocked_time > sched_clutch_bucket_adjust_threshold) {
1176 blocked_time = sched_clutch_bucket_adjust_threshold;
1177 }
1178
1179 /*
1180 * The CPU blocked time should not overflow the clutch_cpu_data_t type. Since the blocked time is used to
1181 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1182 */
1183 blocked_time = MIN(blocked_time, CLUTCH_CPU_DATA_MAX);
1184 clutch_cpu_data_t __assert_only cpu_blocked_orig = os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
1185 /* The blocked time is scaled every so often, so it should never overflow */
1186 assert(blocked_time <= (CLUTCH_CPU_DATA_MAX - cpu_blocked_orig));
1187}
1188
1189/*
1190 * sched_clutch_bucket_cpu_adjust()
1191 *
1192 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1193 * than sched_clutch_bucket_adjust_threshold. Allows the values to remain
1194 * manageable and maintain the same ratio, while still allowing changes in
1195 * clutch bucket behavior to be reflected in the interactivity score in a
1196 * reasonable amount of time.
1197 */
1198static void
1199sched_clutch_bucket_cpu_adjust(
1200 sched_clutch_bucket_t clutch_bucket)
1201{
1202 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1203 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1204 do {
1205 old_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
1206 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1207 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1208 if ((cpu_used + cpu_blocked) < sched_clutch_bucket_adjust_threshold) {
1209 return;
1210 }
1211
1212 /*
1213 * The accumulation of CPU used and blocked is past the threshold; scale it
1214 * down to lose old history.
1215 */
1216 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1217 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1218 } while (!os_atomic_cmpxchg(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed));
1219}
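
/*
 * Worked example (illustrative values, editor's note): with cpu_used worth
 * 350ms and cpu_blocked worth 200ms, the sum (550ms) exceeds the 500ms
 * adjust threshold, so both are divided by SCHED_CLUTCH_BUCKET_ADJUST_RATIO
 * down to 35ms and 20ms. The used:blocked ratio (and hence the interactivity
 * score) is preserved, but new activity now shifts that ratio much faster.
 */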
1220
1221/*
1222 * Thread level scheduling algorithm
1223 *
1224 * The thread level scheduling algorithm uses the mach timeshare
1225 * decay based algorithm to achieve sharing between threads within the
1226 * same clutch bucket. The load/priority shifts etc. are all maintained
1227 * at the clutch bucket level and used for decay calculation of the
1228 * threads. The load sampling is still driven off the scheduler tick
1229 * for runnable clutch buckets (it does not use the new higher frequency
1230 * EWMA based load calculation). The idea is that the contention and load
1231 * within clutch_buckets should be limited enough to not see heavy decay
1232 * and timeshare effectively.
1233 */
1234
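/*
 * Editor's sketch (hedged; not part of the kernel build): how the per
 * clutch-bucket scb_pri_shift computed below is intended to feed the
 * standard Mach timeshare decay. The actual thread priority computation
 * lives in the core timeshare/priority code, not in this file; the helper
 * below is hypothetical and only illustrates the shape of the calculation.
 *
 *	#include <stdint.h>
 *
 *	static int
 *	example_timeshare_pri(int base_pri, uint32_t sched_usage, uint32_t pri_shift)
 *	{
 *		if (pri_shift >= 32) {
 *			// An INT8_MAX pri_shift means CPU usage does not depress the priority
 *			return base_pri;
 *		}
 *		// A higher bucket load yields a smaller pri_shift, so the same CPU
 *		// usage depresses the effective priority further below base_pri.
 *		int depressed = base_pri - (int)(sched_usage >> pri_shift);
 *		return (depressed < 0) ? 0 : depressed;
 *	}
 */
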
1235/*
1236 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1237 *
1238 * Increment the run count for the clutch bucket associated with the
1239 * thread.
1240 */
1241uint32_t
1242sched_clutch_thread_run_bucket_incr(
1243 thread_t thread,
1244 sched_bucket_t bucket)
1245{
1246 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1247 return 0;
1248 }
1249 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1250 return sched_clutch_run_bucket_incr(clutch, bucket);
1251}
1252
1253static uint32_t
1254sched_clutch_run_bucket_incr(
1255 sched_clutch_t clutch,
1256 sched_bucket_t bucket)
1257{
1258 assert(bucket != TH_BUCKET_RUN);
1259 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1260 uint32_t result = os_atomic_inc(&(clutch_bucket->scb_run_count), relaxed);
1261 return result;
1262}
1263
1264/*
1265 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1266 *
1267 * Decrement the run count for the clutch bucket associated with the
1268 * thread.
1269 */
1270uint32_t
1271sched_clutch_thread_run_bucket_decr(
1272 thread_t thread,
1273 sched_bucket_t bucket)
1274{
1275 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1276 return 0;
1277 }
1278 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1279 return sched_clutch_run_bucket_decr(clutch, bucket);
1280}
1281
1282static uint32_t
1283sched_clutch_run_bucket_decr(
1284 sched_clutch_t clutch,
1285 sched_bucket_t bucket)
1286{
1287 assert(bucket != TH_BUCKET_RUN);
1288 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1289 uint32_t result = os_atomic_dec(&(clutch_bucket->scb_run_count), relaxed);
1290 return result;
1291}
1292
1293/*
1294 * sched_clutch_bucket_timeshare_update()
1295 *
1296 * Routine to update the load and priority shift for the clutch_bucket every
1297 * sched_tick. For runnable clutch_buckets, the sched tick handling code
1298 * iterates the clutch buckets and calls this routine. For all others, the
1299 * clutch_bucket maintains a "last updated schedtick" parameter. As threads
1300 * become runnable in the clutch bucket, if this value is outdated, the load
1301 * and shifts are updated.
1302 *
1303 * Possible optimization:
1304 * - The current algorithm samples the load every sched tick (125ms).
1305 * This is prone to spikes in runnable counts; if that turns out to be
1306 * a problem, a simple solution would be to do the EWMA trick to sample
1307 * load at every load_tick (30ms) and use the averaged value for the pri
1308 * shift calculation.
1309 */
1310static void
1311sched_clutch_bucket_timeshare_update(
1312 sched_clutch_bucket_t clutch_bucket)
1313{
1314 if (clutch_bucket->scb_bucket < TH_BUCKET_SHARE_FG) {
1315 return;
1316 }
1317
1318 /*
1319 * Update the timeshare parameters for the clutch bucket if they have not been updated
1320 * in this tick.
1321 */
1322 uint32_t bucket_sched_ts = os_atomic_load(&clutch_bucket->scb_timeshare_tick, relaxed);
1323 uint32_t current_sched_ts = sched_tick;
1324 if (bucket_sched_ts != current_sched_ts) {
1325 os_atomic_store(&clutch_bucket->scb_timeshare_tick, current_sched_ts, relaxed);
1326 uint32_t bucket_load = (os_atomic_load(&clutch_bucket->scb_run_count, relaxed) / processor_avail_count);
1327 bucket_load = MIN(bucket_load, NRQS - 1);
1328 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_load];
1329 os_atomic_store(&clutch_bucket->scb_pri_shift, pri_shift, relaxed);
1330 }
1331}
1332
1333/*
1334 * sched_clutch_thread_clutch_update()
1335 *
1336 * Routine called when the thread changes its thread group. The current
1337 * implementation relies on the fact that the thread group is changed only
1338 * from the context of the thread itself. Due to this fact, the thread
1339 * group change causes only counter updates in the old & new clutch
1340 * buckets and no hierarchy changes. The routine also attributes the CPU
1341 * used so far to the old clutch.
1342 */
1343void
1344sched_clutch_thread_clutch_update(
1345 thread_t thread,
1346 sched_clutch_t old_clutch,
1347 sched_clutch_t new_clutch)
1348{
1349 uint32_t cpu_delta;
1350 assert(current_thread() == thread);
1351
1352 if (old_clutch) {
1353 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1354 /*
1355 * Calculate the CPU used by this thread in the old bucket and
1356 * add it to the old clutch bucket. This uses the same CPU usage
1357 * logic as update_priority etc.
1358 */
1359 thread_timer_delta(thread, cpu_delta);
1360 if (thread->pri_shift < INT8_MAX) {
1361 thread->sched_usage += cpu_delta;
1362 }
1363 thread->cpu_delta += cpu_delta;
1364 sched_clutch_bucket_cpu_usage_update(&(old_clutch->sc_clutch_buckets[thread->th_sched_bucket]), cpu_delta);
1365 }
1366
1367 if (new_clutch) {
1368 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1369 }
1370}
1371
1372/* Thread Insertion/Removal/Selection routines */
1373
1374/*
1375 * sched_clutch_thread_insert()
1376 *
1377 * Routine to insert a thread into the sched clutch hierarchy.
1378 * Update the counts at all levels of the hierarchy and insert the nodes
1379 * as they become runnable. Always called with the pset lock held.
1380 */
1381static boolean_t
1382sched_clutch_thread_insert(
1383 sched_clutch_root_t root_clutch,
1384 thread_t thread,
1385 integer_t options)
1386{
1387 boolean_t result = FALSE;
1388
1389 sched_clutch_hierarchy_locked_assert(root_clutch);
1390 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1391 assert(thread->thread_group == clutch->sc_tg);
1392
1393 uint64_t current_timestamp = mach_absolute_time();
1394 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1395 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
1396
1397 /* Insert thread into the clutch_bucket runq using sched_pri */
1398 run_queue_enqueue(&clutch_bucket->scb_runq, thread, options);
1399 /* Increment the urgency counter for the root if necessary */
1400 sched_clutch_root_urgency_inc(root_clutch, thread);
1401
1402 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
1403 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1404 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri,
1405 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1406 os_atomic_inc(&clutch->sc_thr_count, relaxed);
1407 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1408 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0);
1409
1410 /* Enqueue the clutch into the hierarchy (if needed) and update properties */
1411 if (clutch_bucket->scb_thr_count == 0) {
1412 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1413 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1414 /* Insert the newly runnable clutch bucket into the hierarchy */
1415 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp);
1416 } else {
1417 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1418 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1419 /* Update the position of the clutch bucket in the hierarchy */
1420 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
1421 }
1422 return result;
1423}
1424
1425/*
1426 * sched_clutch_thread_remove()
1427 *
1428 * Routine to remove a thread from the sched clutch hierarchy.
1429 * Update the counts at all levels of the hierarchy and remove the nodes
1430 * as they become empty. Always called with the pset lock held.
1431 */
1432static void
1433sched_clutch_thread_remove(
1434 sched_clutch_root_t root_clutch,
1435 thread_t thread,
1436 uint64_t current_timestamp)
1437{
1438 sched_clutch_hierarchy_locked_assert(root_clutch);
1439 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1440 assert(thread->thread_group == clutch->sc_tg);
1441 assert(thread->runq != PROCESSOR_NULL);
1442
1443 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1444 assert(clutch_bucket->scb_root == root_clutch);
1445
1446 /* Decrement the urgency counter for the root if necessary */
1447 sched_clutch_root_urgency_dec(root_clutch, thread);
1448 /* Remove thread from the clutch_bucket */
1449 run_queue_remove(&clutch_bucket->scb_runq, thread);
1450
1451 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1452 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1453 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1454 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_EMPTY, 0);
1455
1456 /* Update counts at various levels of the hierarchy */
1457 os_atomic_dec(&clutch->sc_thr_count, relaxed);
1458 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1459 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
1460
1461 /* Remove the clutch bucket from the hierarchy (if needed) and update its properties */
1462 if (clutch_bucket->scb_thr_count == 0) {
1463 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp);
1464 } else {
1465 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
1466 }
1467}
1468
1469/*
1470 * sched_clutch_thread_highest()
1471 *
1472 * Routine to find and remove the highest priority thread
1473 * from the sched clutch hierarchy. The algorithm looks at the
1474 * hierarchy for the most eligible runnable thread and calls
1475 * sched_clutch_thread_remove(). Always called with the
1476 * pset lock held.
1477 */
1478static thread_t
1479sched_clutch_thread_highest(
1480 sched_clutch_root_t root_clutch)
1481{
1482 sched_clutch_hierarchy_locked_assert(root_clutch);
1483 uint64_t current_timestamp = mach_absolute_time();
1484
1485 /* Select the highest priority root bucket */
1486 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp);
1487 if (root_bucket == NULL) {
1488 return THREAD_NULL;
1489 }
1490 /* Since a thread is being picked from this root bucket, update its deadline */
1491 sched_clutch_root_bucket_deadline_update(root_bucket, root_clutch, current_timestamp);
1492
1493 /* Find the highest priority clutch bucket in this root bucket */
1494 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
1495 assert(clutch_bucket != NULL);
1496
1497 /* Find the highest priority runnable thread in this clutch bucket */
1498 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
1499 assert(thread != NULL);
1500
1501 /* Remove and return the thread from the hierarchy */
1502 sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
1503 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
1504 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
1505 return thread;
1506}
1507
1508
1509/* High level global accessor routines */
1510
1511/*
1512 * sched_clutch_root_urgency()
1513 *
1514 * Routine to get the urgency of the highest runnable
1515 * thread in the hierarchy.
1516 */
1517static uint32_t
1518sched_clutch_root_urgency(
1519 sched_clutch_root_t root_clutch)
1520{
1521 return root_clutch->scr_urgency;
1522}
1523
1524/*
1525 * sched_clutch_root_count_sum()
1526 *
1527 * The count_sum mechanism is used for scheduler runq
1528 * statistics calculation. It is only useful for debugging
1529 * purposes; since it requires a mach_absolute_time() call on
1530 * other scheduler implementations, it is better to avoid
1531 * populating this until absolutely necessary.
1532 */
1533static uint32_t
1534sched_clutch_root_count_sum(
1535 __unused sched_clutch_root_t root_clutch)
1536{
1537 return 0;
1538}
1539
1540/*
1541 * sched_clutch_root_priority()
1542 *
1543 * Routine to get the priority of the highest runnable
1544 * thread in the hierarchy.
1545 */
1546static int
1547sched_clutch_root_priority(
1548 sched_clutch_root_t root_clutch)
1549{
1550 return root_clutch->scr_priority;
1551}
1552
1553/*
1554 * sched_clutch_root_count()
1555 *
1556 * Returns total number of runnable threads in the hierarchy.
1557 */
1558uint32_t
1559sched_clutch_root_count(
1560 sched_clutch_root_t root_clutch)
1561{
1562 return root_clutch->scr_thr_count;
1563}
1564
1565/*
1566 * sched_clutch_thread_pri_shift()
1567 *
1568 * Routine to get the priority shift value for a thread.
1569 * Since the timesharing is done at the clutch_bucket level,
1570 * this routine gets the clutch_bucket and retrieves the
1571 * values from there.
1572 */
1573uint32_t
1574sched_clutch_thread_pri_shift(
1575 thread_t thread,
1576 sched_bucket_t bucket)
1577{
1578 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1579 return UINT8_MAX;
1580 }
1581 assert(bucket != TH_BUCKET_RUN);
1582 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1583 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1584 return os_atomic_load(&clutch_bucket->scb_pri_shift, relaxed);
1585}
1586
1587#pragma mark -- Clutch Scheduler Algorithm
1588
1589static void
1590sched_clutch_init(void);
1591
1592static void
1593sched_clutch_timebase_init(void);
1594
1595static thread_t
1596sched_clutch_steal_thread(processor_set_t pset);
1597
1598static void
1599sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
1600
1601static boolean_t
1602sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
1603 sched_options_t options);
1604
1605static boolean_t
1606sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
1607
1608static ast_t
1609sched_clutch_processor_csw_check(processor_t processor);
1610
1611static boolean_t
1612sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
1613
1614static int
1615sched_clutch_runq_count(processor_t processor);
1616
1617static boolean_t
1618sched_clutch_processor_queue_empty(processor_t processor);
1619
1620static uint64_t
1621sched_clutch_runq_stats_count_sum(processor_t processor);
1622
1623static int
1624sched_clutch_processor_bound_count(processor_t processor);
1625
1626static void
1627sched_clutch_pset_init(processor_set_t pset);
1628
1629static void
1630sched_clutch_processor_init(processor_t processor);
1631
1632static thread_t
1633sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
1634
1635static void
1636sched_clutch_processor_queue_shutdown(processor_t processor);
1637
1638static sched_mode_t
1639sched_clutch_initial_thread_sched_mode(task_t parent_task);
1640
1641static uint32_t
1642sched_clutch_initial_quantum_size(thread_t thread);
1643
1644static bool
1645sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
1646
1647static uint32_t
1648sched_clutch_run_incr(thread_t thread);
1649
1650static uint32_t
1651sched_clutch_run_decr(thread_t thread);
1652
1653static void
1654sched_clutch_update_thread_bucket(thread_t thread);
1655
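/*
 * sched_clutch_dispatch
 *
 * Dispatch table that plugs the clutch scheduler into the generic scheduler
 * infrastructure. Thread selection, enqueue/remove and preemption checks are
 * implemented by the clutch specific routines in this file; priority decay
 * and maintenance reuse the standard timeshare implementations.
 */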
1656const struct sched_dispatch_table sched_clutch_dispatch = {
1657 .sched_name = "clutch",
1658 .init = sched_clutch_init,
1659 .timebase_init = sched_clutch_timebase_init,
1660 .processor_init = sched_clutch_processor_init,
1661 .pset_init = sched_clutch_pset_init,
1662 .maintenance_continuation = sched_timeshare_maintenance_continue,
1663 .choose_thread = sched_clutch_choose_thread,
1664 .steal_thread_enabled = sched_steal_thread_enabled,
1665 .steal_thread = sched_clutch_steal_thread,
1666 .compute_timeshare_priority = sched_compute_timeshare_priority,
1667 .choose_processor = choose_processor,
1668 .processor_enqueue = sched_clutch_processor_enqueue,
1669 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
1670 .processor_queue_remove = sched_clutch_processor_queue_remove,
1671 .processor_queue_empty = sched_clutch_processor_queue_empty,
1672 .priority_is_urgent = priority_is_urgent,
1673 .processor_csw_check = sched_clutch_processor_csw_check,
1674 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
1675 .initial_quantum_size = sched_clutch_initial_quantum_size,
1676 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
1677 .can_update_priority = can_update_priority,
1678 .update_priority = update_priority,
1679 .lightweight_update_priority = lightweight_update_priority,
1680 .quantum_expire = sched_default_quantum_expire,
1681 .processor_runq_count = sched_clutch_runq_count,
1682 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
1683 .processor_bound_count = sched_clutch_processor_bound_count,
1684 .thread_update_scan = sched_clutch_thread_update_scan,
1685 .multiple_psets_enabled = TRUE,
1686 .sched_groups_enabled = FALSE,
1687 .avoid_processor_enabled = TRUE,
1688 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
1689 .processor_balance = sched_SMT_balance,
1690
1691 .rt_runq = sched_rtglobal_runq,
1692 .rt_init = sched_rtglobal_init,
1693 .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
1694 .rt_runq_scan = sched_rtglobal_runq_scan,
1695 .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
1696
1697 .qos_max_parallelism = sched_qos_max_parallelism,
1698 .check_spill = sched_check_spill,
1699 .ipi_policy = sched_ipi_policy,
1700 .thread_should_yield = sched_thread_should_yield,
1701 .run_count_incr = sched_clutch_run_incr,
1702 .run_count_decr = sched_clutch_run_decr,
1703 .update_thread_bucket = sched_clutch_update_thread_bucket,
1704 .pset_made_schedulable = sched_pset_made_schedulable,
1705};
1706
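/*
 * Accessors for the two runnable-thread containers used by the clutch
 * scheduler: bound threads live on a traditional run queue embedded in the
 * processor, while all other runnable threads live in the clutch hierarchy
 * rooted at the processor set.
 */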
1707__attribute__((always_inline))
1708static inline run_queue_t
1709sched_clutch_bound_runq(processor_t processor)
1710{
1711 return &processor->runq;
1712}
1713
1714__attribute__((always_inline))
1715static inline sched_clutch_root_t
1716sched_clutch_processor_root_clutch(processor_t processor)
1717{
1718 return &processor->processor_set->pset_clutch_root;
1719}
1720
1721__attribute__((always_inline))
1722static inline run_queue_t
1723sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
1724{
1725 assert(thread->bound_processor == processor);
1726 return sched_clutch_bound_runq(processor);
1727}
1728
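/*
 * sched_clutch_initial_quantum_size()
 *
 * The quantum for a thread is selected based on its scheduler bucket; the
 * per-bucket values are converted to abstime in sched_clutch_timebase_init().
 * Roughly, for a background thread:
 *
 *	thread->th_sched_bucket == TH_BUCKET_SHARE_BG
 *	    => quantum = (uint32_t)sched_clutch_thread_quantum[TH_BUCKET_SHARE_BG]
 */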
1729static uint32_t
1730sched_clutch_initial_quantum_size(thread_t thread)
1731{
1732 if (thread == THREAD_NULL) {
1733 return std_quantum;
1734 }
1735 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
1736 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
1737}
1738
1739static sched_mode_t
1740sched_clutch_initial_thread_sched_mode(task_t parent_task)
1741{
1742 if (parent_task == kernel_task) {
1743 return TH_MODE_FIXED;
1744 } else {
1745 return TH_MODE_TIMESHARE;
1746 }
1747}
1748
1749static void
1750sched_clutch_processor_init(processor_t processor)
1751{
1752 run_queue_init(&processor->runq);
1753}
1754
1755static void
1756sched_clutch_pset_init(processor_set_t pset)
1757{
1758 sched_clutch_root_init(&pset->pset_clutch_root, pset);
1759}
1760
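/*
 * sched_clutch_init()
 *
 * Initializes the clutch scheduler tunables. The clutch bucket interactivity
 * priority and priority boost can be overridden via the
 * "sched_clutch_bucket_interactive_pri" and "sched_clutch_bucket_pri_boost"
 * boot-args; otherwise the compile-time defaults are used. The remaining
 * initialization is shared with the timeshare scheduler.
 */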
1761static void
1762sched_clutch_init(void)
1763{
1764 if (!PE_parse_boot_argn("sched_clutch_bucket_interactive_pri", &sched_clutch_bucket_interactive_pri, sizeof(sched_clutch_bucket_interactive_pri))) {
1765 sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
1766 }
1767 if (!PE_parse_boot_argn("sched_clutch_bucket_pri_boost", &sched_clutch_bucket_pri_boost, sizeof(sched_clutch_bucket_pri_boost))) {
1768 sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
1769 }
1770 sched_timeshare_init();
1771}
1772
1773static void
1774sched_clutch_timebase_init(void)
1775{
1776 sched_timeshare_timebase_init();
1777 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
1778 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
1779 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
1780 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS,
1781 NSEC_PER_USEC, &sched_clutch_bucket_adjust_threshold);
1782
1783 uint32_t interactivity_delta = 0;
1784 if (!PE_parse_boot_argn("sched_clutch_bucket_interactivity_delta_usecs", &interactivity_delta, sizeof(interactivity_delta))) {
1785 interactivity_delta = SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT;
1786 }
1787 clock_interval_to_absolutetime_interval(interactivity_delta, NSEC_PER_USEC, &sched_clutch_bucket_interactivity_delta);
1788}
1789
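/*
 * sched_clutch_choose_thread()
 *
 * Chooses between the processor bound runq and the pset-wide clutch
 * hierarchy. If neither holds a thread at or above the specified priority,
 * THREAD_NULL is returned; when both are populated, the bound runq wins
 * priority ties against the clutch root.
 */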
1790static thread_t
1791sched_clutch_choose_thread(
1792 processor_t processor,
1793 int priority,
1794 __unused ast_t reason)
1795{
1796 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1797 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
1798 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1799 boolean_t choose_from_boundq = false;
1800
1801 if (bound_runq->highq < priority &&
1802 clutch_pri < priority) {
1803 return THREAD_NULL;
1804 }
1805
1806 if (bound_runq->count && clutch_count) {
1807 if (bound_runq->highq >= clutch_pri) {
1808 choose_from_boundq = true;
1809 }
1810 } else if (bound_runq->count) {
1811 choose_from_boundq = true;
1812 } else if (clutch_count) {
1813 choose_from_boundq = false;
1814 } else {
1815 return THREAD_NULL;
1816 }
1817
1818 thread_t thread = THREAD_NULL;
1819 if (choose_from_boundq == false) {
1820 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1821 thread = sched_clutch_thread_highest(pset_clutch_root);
1822 } else {
1823 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
1824 }
1825 return thread;
1826}
1827
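/*
 * sched_clutch_processor_enqueue()
 *
 * Threads eligible for the clutch hierarchy (per SCHED_CLUTCH_THREAD_ELIGIBLE())
 * are inserted into the pset-wide hierarchy; processor bound threads fall back
 * to the traditional per-processor run queue.
 */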
1828static boolean_t
1829sched_clutch_processor_enqueue(
1830 processor_t processor,
1831 thread_t thread,
1832 sched_options_t options)
1833{
1834 boolean_t result;
1835
1836 thread->runq = processor;
1837 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1838 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1839 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
1840 } else {
1841 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
1842 result = run_queue_enqueue(rq, thread, options);
1843 }
1844 return result;
1845}
1846
1847static boolean_t
1848sched_clutch_processor_queue_empty(processor_t processor)
1849{
1850 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
1851 sched_clutch_bound_runq(processor)->count == 0;
1852}
1853
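/*
 * sched_clutch_processor_csw_check()
 *
 * Preemption check for the processor. A running thread that should avoid this
 * processor is preempted immediately; otherwise the higher of the clutch root
 * priority and the bound runq priority is compared against the running thread,
 * and the resulting AST is marked urgent if either the clutch root or the
 * bound runq has a non-zero urgency count.
 */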
1854static ast_t
1855sched_clutch_processor_csw_check(processor_t processor)
1856{
1857 boolean_t has_higher;
1858 int pri;
1859
1860 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
1861 return AST_PREEMPT | AST_URGENT;
1862 }
1863
1864 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1865 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1866
1867 assert(processor->active_thread != NULL);
1868
1869 pri = MAX(clutch_pri, bound_runq->highq);
1870
1871 if (processor->first_timeslice) {
1872 has_higher = (pri > processor->current_pri);
1873 } else {
1874 has_higher = (pri >= processor->current_pri);
1875 }
1876
1877 if (has_higher) {
1878 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
1879 return AST_PREEMPT | AST_URGENT;
1880 }
1881
1882 if (bound_runq->urgency > 0) {
1883 return AST_PREEMPT | AST_URGENT;
1884 }
1885
1886 return AST_PREEMPT;
1887 }
1888
1889 return AST_NONE;
1890}
1891
1892static boolean_t
1893sched_clutch_processor_queue_has_priority(processor_t processor,
1894 int priority,
1895 boolean_t gte)
1896{
1897 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1898
1899 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
1900
1901 if (gte) {
1902 return qpri >= priority;
1903 } else {
1904 return qpri > priority;
1905 }
1906}
1907
1908static int
1909sched_clutch_runq_count(processor_t processor)
1910{
1911 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
1912}
1913
1914static uint64_t
1915sched_clutch_runq_stats_count_sum(processor_t processor)
1916{
1917 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
1918
1919 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
1920 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
1921 } else {
1922 return bound_sum;
1923 }
1924}

1925static int
1926sched_clutch_processor_bound_count(processor_t processor)
1927{
1928 return sched_clutch_bound_runq(processor)->count;
1929}
1930
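/*
 * sched_clutch_processor_queue_shutdown()
 *
 * Invoked when a processor goes offline. If this was the last active processor
 * in the pset, the clutch hierarchy is drained into a local queue with the
 * pset lock held and the threads are re-enqueued via thread_setrun() so that
 * they migrate to a different pset.
 */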
1931static void
1932sched_clutch_processor_queue_shutdown(processor_t processor)
1933{
1934 processor_set_t pset = processor->processor_set;
1935 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1936 thread_t thread;
1937 queue_head_t tqueue;
1938
1939 /* We only need to migrate threads if this is the last active processor in the pset */
1940 if (pset->online_processor_count > 0) {
1941 pset_unlock(pset);
1942 return;
1943 }
1944
1945 queue_init(&tqueue);
1946 while (sched_clutch_root_count(pset_clutch_root) > 0) {
1947 thread = sched_clutch_thread_highest(pset_clutch_root);
1948 enqueue_tail(&tqueue, &thread->runq_links);
1949 }
1950
1951 pset_unlock(pset);
1952
1953 qe_foreach_element_safe(thread, &tqueue, runq_links) {
1954 remqueue(&thread->runq_links);
1955
1956 thread_lock(thread);
1957
1958 thread_setrun(thread, SCHED_TAILQ);
1959
1960 thread_unlock(thread);
1961 }
1962}
1963
1964static boolean_t
1965sched_clutch_processor_queue_remove(
1966 processor_t processor,
1967 thread_t thread)
1968{
1969 run_queue_t rq;
1970 processor_set_t pset = processor->processor_set;
1971
1972 pset_lock(pset);
1973
1974 if (processor == thread->runq) {
1975 /*
1976 * Thread is on a run queue and we have a lock on
1977 * that run queue.
1978 */
1979 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1980 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1981 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time());
1982 } else {
1983 rq = sched_clutch_thread_bound_runq(processor, thread);
1984 run_queue_remove(rq, thread);
1985 }
1986 } else {
1987 /*
1988 * The thread left the run queue before we could
1989 * lock the run queue.
1990 */
1991 assert(thread->runq == PROCESSOR_NULL);
1992 processor = PROCESSOR_NULL;
1993 }
1994
1995 pset_unlock(pset);
1996
1997 return processor != PROCESSOR_NULL;
1998}
1999
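/*
 * sched_clutch_steal_thread()
 *
 * Walks the psets starting with the caller's pset and returns the highest
 * priority thread from the first non-empty clutch hierarchy found, dropping
 * and re-acquiring the pset locks as it iterates.
 */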
2000static thread_t
2001sched_clutch_steal_thread(processor_set_t pset)
2002{
2003 processor_set_t nset, cset = pset;
2004 thread_t thread;
2005
2006 do {
2007 sched_clutch_root_t pset_clutch_root = &cset->pset_clutch_root;
2008 if (sched_clutch_root_count(pset_clutch_root) > 0) {
2009 thread = sched_clutch_thread_highest(pset_clutch_root);
2010 pset_unlock(cset);
2011 return thread;
2012 }
2013
2014 nset = next_pset(cset);
2015
2016 if (nset != pset) {
2017 pset_unlock(cset);
2018
2019 cset = nset;
2020 pset_lock(cset);
2021 }
2022 } while (nset != pset);
2023
2024 pset_unlock(cset);
2025
2026 return THREAD_NULL;
2027}
2028
2029static void
2030sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
2031{
2032 boolean_t restart_needed = FALSE;
2033 processor_t processor = processor_list;
2034 processor_set_t pset;
2035 thread_t thread;
2036 spl_t s;
2037
2038 /*
2039 * We update the threads associated with each processor (bound and idle threads)
2040 * and then update the threads in each pset runqueue.
2041 */
2042
2043 do {
2044 do {
2045 pset = processor->processor_set;
2046
2047 s = splsched();
2048 pset_lock(pset);
2049
2050 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
2051
2052 pset_unlock(pset);
2053 splx(s);
2054
2055 if (restart_needed) {
2056 break;
2057 }
2058
2059 thread = processor->idle_thread;
2060 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
2061 if (thread_update_add_thread(thread) == FALSE) {
2062 restart_needed = TRUE;
2063 break;
2064 }
2065 }
2066 } while ((processor = processor->processor_list) != NULL);
2067
2068 /* Ok, we now have a collection of candidates -- fix them. */
2069 thread_update_process_threads();
2070 } while (restart_needed);
2071
2072 pset = &pset0;
2073
2074 do {
2075 do {
2076 s = splsched();
2077 pset_lock(pset);
2078
2079 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
2080 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
2081 sched_clutch_bucket_t clutch_bucket;
2082 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
2083 sched_clutch_bucket_timeshare_update(clutch_bucket);
2084 restart_needed = runq_scan(&clutch_bucket->scb_runq, scan_context);
2085 if (restart_needed) {
2086 break;
2087 }
2088 }
2089 }
2090
2091 pset_unlock(pset);
2092 splx(s);
2093 if (restart_needed) {
2094 break;
2095 }
2096 } while ((pset = pset->pset_list) != NULL);
2097
2098 /* Ok, we now have a collection of candidates -- fix them. */
2099 thread_update_process_threads();
2100 } while (restart_needed);
2101}
2102
2103extern int sched_allow_rt_smt;
2104
2105/* Return true if this thread should not continue running on this processor */
2106static bool
2107sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
2108{
2109 if (processor->processor_primary != processor) {
2110 /*
2111 * This is a secondary SMT processor. If the primary is running
2112 * a realtime thread, only allow realtime threads on the secondary.
2113 */
2114 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
2115 return true;
2116 }
2117 }
2118
2119 return false;
2120}
2121
2122/*
2123 * For the clutch scheduler, the run counts are maintained in the clutch
2124 * buckets (i.e. the thread group scheduling structure).
2125 */
2126static uint32_t
2127sched_clutch_run_incr(thread_t thread)
2128{
2129 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2130 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2131 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
2132 return new_count;
2133}
2134
2135static uint32_t
2136sched_clutch_run_decr(thread_t thread)
2137{
2138 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
2139 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2140 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
2141 return new_count;
2142}
2143
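/*
 * sched_convert_pri_to_bucket()
 *
 * Maps a priority to a timeshare bucket using the standard priority band
 * boundaries:
 *
 *	pri >  BASEPRI_USER_INITIATED	=> TH_BUCKET_SHARE_FG
 *	pri >  BASEPRI_DEFAULT		=> TH_BUCKET_SHARE_IN
 *	pri >  BASEPRI_UTILITY		=> TH_BUCKET_SHARE_DF
 *	pri >  MAXPRI_THROTTLE		=> TH_BUCKET_SHARE_UT
 *	otherwise			=> TH_BUCKET_SHARE_BG
 *
 * High priority fixed and realtime threads are mapped directly to
 * TH_BUCKET_FIXPRI by the caller (sched_clutch_update_thread_bucket()).
 */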
2144static sched_bucket_t
2145sched_convert_pri_to_bucket(uint8_t priority)
2146{
2147 sched_bucket_t bucket = TH_BUCKET_RUN;
2148
2149 if (priority > BASEPRI_USER_INITIATED) {
2150 bucket = TH_BUCKET_SHARE_FG;
2151 } else if (priority > BASEPRI_DEFAULT) {
2152 bucket = TH_BUCKET_SHARE_IN;
2153 } else if (priority > BASEPRI_UTILITY) {
2154 bucket = TH_BUCKET_SHARE_DF;
2155 } else if (priority > MAXPRI_THROTTLE) {
2156 bucket = TH_BUCKET_SHARE_UT;
2157 } else {
2158 bucket = TH_BUCKET_SHARE_BG;
2159 }
2160 return bucket;
2161}
2162
2163/*
2164 * For threads that have changed sched_pri without changing the
2165 * base_pri for any reason other than decay, use the sched_pri
2166 * as the bucketizing priority instead of base_pri. All such
2167 * changes are typically due to boosts or demotions applied by kernel
2168 * locking primitives.
2169 */
2170static boolean_t
2171sched_thread_sched_pri_promoted(thread_t thread)
2172{
2173 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
2174 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
2175 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
2176 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
2177 (thread->kern_promotion_schedpri != 0);
2178}
2179
2180/*
2181 * Routine to update the scheduling bucket for the thread.
2182 *
2183 * In the clutch scheduler implementation, the thread's bucket
2184 * is based on sched_pri if it was promoted due to a kernel
2185 * primitive; otherwise it is based on the thread's base_pri. This
2186 * enhancement allows promoted threads to reach a higher priority
2187 * bucket and potentially get selected sooner for scheduling.
2188 *
2189 * Also, the clutch scheduler does not honor fixed priority below
2190 * FG priority. It simply puts those threads in the corresponding
2191 * timeshare bucket. This is done because it is
2192 * extremely hard to define the scheduling properties of such threads
2193 * and they typically lead to performance issues.
2194 */
2195
2196static void
2197sched_clutch_update_thread_bucket(thread_t thread)
2198{
2199 sched_bucket_t old_bucket = thread->th_sched_bucket;
2200 sched_bucket_t new_bucket = TH_BUCKET_RUN;
2201 assert(thread->runq == PROCESSOR_NULL);
2202
2203 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
2204
2205 switch (thread->sched_mode) {
2206 case TH_MODE_FIXED:
2207 if (pri >= BASEPRI_FOREGROUND) {
2208 new_bucket = TH_BUCKET_FIXPRI;
2209 } else {
2210 new_bucket = sched_convert_pri_to_bucket(pri);
2211 }
2212 break;
2213
2214 case TH_MODE_REALTIME:
2215 new_bucket = TH_BUCKET_FIXPRI;
2216 break;
2217
2218 case TH_MODE_TIMESHARE:
2219 new_bucket = sched_convert_pri_to_bucket(pri);
2220 break;
2221
2222 default:
2223 panic("unexpected mode: %d", thread->sched_mode);
2224 break;
2225 }
2226
2227 if (old_bucket == new_bucket) {
2228 return;
2229 }
2230
2231 thread->th_sched_bucket = new_bucket;
2232 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
2233
2234 /*
2235 * Since this is called after the thread has been removed from the runq,
2236 * only the run counts need to be updated. The re-insert into the runq
2237 * would put the thread into the correct new bucket's runq.
2238 */
2239 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
2240 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
2241 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
2242 }
2243}
2244
2245#if __AMP__
2246
2247/* Implementation of the AMP version of the clutch scheduler */
2248
2249static thread_t
2250sched_clutch_amp_steal_thread(processor_set_t pset);
2251
2252static ast_t
2253sched_clutch_amp_processor_csw_check(processor_t processor);
2254
2255static boolean_t
2256sched_clutch_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2257
2258static boolean_t
2259sched_clutch_amp_processor_queue_empty(processor_t processor);
2260
2261static thread_t
2262sched_clutch_amp_choose_thread(processor_t processor, int priority, ast_t reason);
2263
2264static void
2265sched_clutch_amp_processor_queue_shutdown(processor_t processor);
2266
2267static processor_t
2268sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
2269
2270static bool
2271sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread);
2272
2273static bool
2274sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread);
2275
2276static void
2277sched_clutch_migrate_foreign_buckets(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
2278
2279static void
2280sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
2281
2282const struct sched_dispatch_table sched_clutch_amp_dispatch = {
2283 .sched_name = "clutch_amp",
2284 .init = sched_amp_init,
2285 .timebase_init = sched_clutch_timebase_init,
2286 .processor_init = sched_clutch_processor_init,
2287 .pset_init = sched_clutch_pset_init,
2288 .maintenance_continuation = sched_timeshare_maintenance_continue,
2289 .choose_thread = sched_clutch_amp_choose_thread,
2290 .steal_thread_enabled = sched_amp_steal_thread_enabled,
2291 .steal_thread = sched_clutch_amp_steal_thread,
2292 .compute_timeshare_priority = sched_compute_timeshare_priority,
2293 .choose_processor = sched_clutch_amp_choose_processor,
2294 .processor_enqueue = sched_clutch_processor_enqueue,
2295 .processor_queue_shutdown = sched_clutch_amp_processor_queue_shutdown,
2296 .processor_queue_remove = sched_clutch_processor_queue_remove,
2297 .processor_queue_empty = sched_clutch_amp_processor_queue_empty,
2298 .priority_is_urgent = priority_is_urgent,
2299 .processor_csw_check = sched_clutch_amp_processor_csw_check,
2300 .processor_queue_has_priority = sched_clutch_amp_processor_queue_has_priority,
2301 .initial_quantum_size = sched_clutch_initial_quantum_size,
2302 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2303 .can_update_priority = can_update_priority,
2304 .update_priority = update_priority,
2305 .lightweight_update_priority = lightweight_update_priority,
2306 .quantum_expire = sched_default_quantum_expire,
2307 .processor_runq_count = sched_clutch_runq_count,
2308 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2309 .processor_bound_count = sched_clutch_processor_bound_count,
2310 .thread_update_scan = sched_clutch_thread_update_scan,
2311 .multiple_psets_enabled = TRUE,
2312 .sched_groups_enabled = FALSE,
2313 .avoid_processor_enabled = TRUE,
2314 .thread_avoid_processor = sched_clutch_amp_thread_avoid_processor,
2315 .processor_balance = sched_amp_balance,
2316
2317 .rt_runq = sched_amp_rt_runq,
2318 .rt_init = sched_amp_rt_init,
2319 .rt_queue_shutdown = sched_amp_rt_queue_shutdown,
2320 .rt_runq_scan = sched_amp_rt_runq_scan,
2321 .rt_runq_count_sum = sched_amp_rt_runq_count_sum,
2322
2323 .qos_max_parallelism = sched_amp_qos_max_parallelism,
2324 .check_spill = sched_amp_check_spill,
2325 .ipi_policy = sched_amp_ipi_policy,
2326 .thread_should_yield = sched_clutch_amp_thread_should_yield,
2327 .run_count_incr = sched_clutch_run_incr,
2328 .run_count_decr = sched_clutch_run_decr,
2329 .update_thread_bucket = sched_clutch_update_thread_bucket,
2330 .pset_made_schedulable = sched_clutch_migrate_foreign_buckets,
2331 .thread_group_recommendation_change = sched_clutch_amp_thread_group_recommendation_change,
2332};
2333
2334extern processor_set_t ecore_set;
2335extern processor_set_t pcore_set;
2336
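/*
 * sched_clutch_amp_choose_thread()
 *
 * AMP variant of thread selection. In addition to the bound runq and the local
 * clutch hierarchy, an E-core processor with a pending spill also considers the
 * priority of the P-core hierarchy; returning THREAD_NULL in that case lets
 * thread_select() pull the spilled thread via sched_clutch_amp_steal_thread().
 */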
2337static thread_t
2338sched_clutch_amp_choose_thread(
2339 processor_t processor,
2340 int priority,
2341 __unused ast_t reason)
2342{
2343 processor_set_t pset = processor->processor_set;
2344 bool spill_pending = false;
2345 int spill_pri = -1;
2346
2347 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2348 spill_pending = true;
2349 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2350 }
2351
2352 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2353 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2354 boolean_t choose_from_boundq = false;
2355
2356 if ((bound_runq->highq < priority) &&
2357 (clutch_pri < priority) &&
2358 (spill_pri < priority)) {
2359 return THREAD_NULL;
2360 }
2361
2362 if ((spill_pri > bound_runq->highq) &&
2363 (spill_pri > clutch_pri)) {
2364 /*
2365 * There is a higher priority thread on the P-core runq,
2366 * so returning THREAD_NULL here will cause thread_select()
2367 * to call sched_clutch_amp_steal_thread() to try to get it.
2368 */
2369 return THREAD_NULL;
2370 }
2371
2372 if (bound_runq->highq >= clutch_pri) {
2373 choose_from_boundq = true;
2374 }
2375
2376 thread_t thread = THREAD_NULL;
2377 if (choose_from_boundq == false) {
2378 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2379 thread = sched_clutch_thread_highest(pset_clutch_root);
2380 } else {
2381 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2382 }
2383 return thread;
2384}
2385
2386static boolean_t
2387sched_clutch_amp_processor_queue_empty(processor_t processor)
2388{
2389 processor_set_t pset = processor->processor_set;
2390 bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
2391
2392 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
2393 (sched_clutch_bound_runq(processor)->count == 0) &&
2394 !spill_pending;
2395}
2396
2397static bool
2398sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread)
2399{
2400 if (!sched_clutch_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
2401 return true;
2402 }
2403
2404 if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) {
2405 return sched_clutch_root_count(&pcore_set->pset_clutch_root) > 0;
2406 }
2407
2408 return false;
2409}
2410
2411static ast_t
2412sched_clutch_amp_processor_csw_check(processor_t processor)
2413{
2414 boolean_t has_higher;
2415 int pri;
2416
2417 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2418 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2419
2420 assert(processor->active_thread != NULL);
2421
2422 processor_set_t pset = processor->processor_set;
2423 bool spill_pending = false;
2424 int spill_pri = -1;
2425 int spill_urgency = 0;
2426
2427 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2428 spill_pending = true;
2429 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2430 spill_urgency = sched_clutch_root_urgency(&pcore_set->pset_clutch_root);
2431 }
2432
2433 pri = MAX(clutch_pri, bound_runq->highq);
2434 if (spill_pending) {
2435 pri = MAX(pri, spill_pri);
2436 }
2437
2438 if (processor->first_timeslice) {
2439 has_higher = (pri > processor->current_pri);
2440 } else {
2441 has_higher = (pri >= processor->current_pri);
2442 }
2443
2444 if (has_higher) {
2445 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2446 return AST_PREEMPT | AST_URGENT;
2447 }
2448
2449 if (bound_runq->urgency > 0) {
2450 return AST_PREEMPT | AST_URGENT;
2451 }
2452
2453 if (spill_urgency > 0) {
2454 return AST_PREEMPT | AST_URGENT;
2455 }
2456
2457 return AST_PREEMPT;
2458 }
2459
2460 return AST_NONE;
2461}
2462
2463static boolean_t
2464sched_clutch_amp_processor_queue_has_priority(processor_t processor,
2465 int priority,
2466 boolean_t gte)
2467{
2468 bool spill_pending = false;
2469 int spill_pri = -1;
2470 processor_set_t pset = processor->processor_set;
2471
2472 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2473 spill_pending = true;
2474 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2475 }
2476 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2477
2478 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2479 if (spill_pending) {
2480 qpri = MAX(qpri, spill_pri);
2481 }
2482
2483 if (gte) {
2484 return qpri >= priority;
2485 } else {
2486 return qpri > priority;
2487 }
2488}
2489
2490/*
2491 * sched_clutch_hierarchy_thread_pset()
2492 *
2493 * Routine to determine where a thread should be enqueued: based on its
2494 * recommendation if this is the first runnable thread in the clutch_bucket,
2495 * or on its clutch bucket's existing hierarchy membership otherwise.
2496 */
2497static processor_set_t
2498sched_clutch_hierarchy_thread_pset(thread_t thread)
2499{
2500 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread) == false) {
2501 return (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_set : ecore_set;
2502 }
2503
2504 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2505 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
2506 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2507 if (scb_root) {
2508 /* Clutch bucket is already runnable, return the pset hierarchy it is part of */
2509 return scb_root->scr_pset;
2510 }
2511 return (recommended_pset_type(thread) == PSET_AMP_E) ? ecore_set : pcore_set;
2512}
2513
2514/*
2515 * sched_clutch_thread_pset_recommended()
2516 *
2517 * Routine to determine if the thread should be placed on the provided pset.
2518 * The routine first makes sure the cluster is available for scheduling. If
2519 * it is available, it looks at the thread's recommendation. Called
2520 * with the pset lock held.
2521 */
2522static bool
2523sched_clutch_thread_pset_recommended(thread_t thread, processor_set_t pset)
2524{
2525 if (!sched_clutch_pset_available(pset)) {
2526 return false;
2527 }
2528
2529 /* At this point, all clusters should be available and recommended */
2530 if (sched_clutch_hierarchy_thread_pset(thread) != pset) {
2531 return false;
2532 }
2533
2534 return true;
2535}
2536
2537
2538static void
2539sched_clutch_amp_processor_queue_shutdown(processor_t processor)
2540{
2541 processor_set_t pset = processor->processor_set;
2542 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2543 thread_t thread;
2544 queue_head_t tqueue;
2545
2546 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
2547 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
2548 pset_unlock(pset);
2549 return;
2550 }
2551
2552 queue_init(&tqueue);
2553 while (sched_clutch_root_count(pset_clutch_root) > 0) {
2554 thread = sched_clutch_thread_highest(pset_clutch_root);
2555 enqueue_tail(&tqueue, &thread->runq_links);
2556 }
2557 pset_unlock(pset);
2558
2559 qe_foreach_element_safe(thread, &tqueue, runq_links) {
2560 remqueue(&thread->runq_links);
2561 thread_lock(thread);
2562 thread_setrun(thread, SCHED_TAILQ);
2563 thread_unlock(thread);
2564 }
2565}
2566
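/*
 * sched_clutch_amp_steal_thread()
 *
 * Allows E-cores to pull work from the P-core hierarchy when the P-cluster
 * load average is at or above the steal threshold (which accounts for any
 * pending spill), the P-cluster has more runnable threads than cores currently
 * dispatching, and no recommended P-cores are idle. P-cores never steal from
 * E-cores.
 */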
2567static thread_t
2568sched_clutch_amp_steal_thread(processor_set_t pset)
2569{
2570 thread_t thread = THREAD_NULL;
2571 processor_set_t nset = pset;
2572
2573 if (pcore_set->online_processor_count == 0) {
2574 /* Nothing to steal from */
2575 goto out;
2576 }
2577
2578 if (pset->pset_cluster_type == PSET_AMP_P) {
2579 /* P cores don't steal from E cores */
2580 goto out;
2581 }
2582
2583 processor_t processor = current_processor();
2584 assert(pset == processor->processor_set);
2585
2586 bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
2587 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
2588
2589 nset = pcore_set;
2590
2591 assert(nset != pset);
2592
2593 if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) {
2594 pset_unlock(pset);
2595
2596 pset = nset;
2597
2598 pset_lock(pset);
2599
2600 /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */
2601 if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) &&
2602 ((int)sched_clutch_root_count(&pset->pset_clutch_root) > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) &&
2603 (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) {
2604 thread = sched_clutch_thread_highest(&pset->pset_clutch_root);
2605 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0);
2606 sched_update_pset_load_average(pset);
2607 }
2608 }
2609
2610out:
2611 pset_unlock(pset);
2612 return thread;
2613}
2614
2615/* Return true if this thread should not continue running on this processor */
2616static bool
2617sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread)
2618{
2619 if (processor->processor_set->pset_cluster_type == PSET_AMP_E) {
2620 if (sched_clutch_thread_pset_recommended(thread, pcore_set)) {
2621 return true;
2622 }
2623 } else if (processor->processor_set->pset_cluster_type == PSET_AMP_P) {
2624 if (!sched_clutch_thread_pset_recommended(thread, pcore_set)) {
2625 return true;
2626 }
2627 }
2628
2629 return false;
2630}
2631
2632static processor_t
2633sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
2634{
2635 /* Bound threads don't call this function */
2636 assert(thread->bound_processor == PROCESSOR_NULL);
2637
2638 processor_set_t nset;
2639 processor_t chosen_processor = PROCESSOR_NULL;
2640
2641select_pset:
2642 nset = (pset == ecore_set) ? pcore_set : ecore_set;
2643 if (!sched_clutch_pset_available(pset)) {
2644 /* If the current pset is not available for scheduling, just use the other pset */
2645 pset_unlock(pset);
2646 pset_lock(nset);
2647 goto select_processor;
2648 }
2649
2650 /* Check if the thread is recommended to run on this pset */
2651 if (sched_clutch_thread_pset_recommended(thread, pset)) {
2652 nset = pset;
2653 goto select_processor;
2654 } else {
2655 /* pset not recommended; try the other pset */
2656 pset_unlock(pset);
2657 pset_lock(nset);
2658 pset = nset;
2659 goto select_pset;
2660 }
2661
2662select_processor:
2663 if (!sched_clutch_pset_available(nset)) {
2664 /*
2665 * It looks like both psets are not available due to some
2666 * reason. In that case, just use the master processor's
2667 * pset for scheduling.
2668 */
2669 if (master_processor->processor_set != nset) {
2670 pset_unlock(nset);
2671 nset = master_processor->processor_set;
2672 pset_lock(nset);
2673 }
2674 }
2675 chosen_processor = choose_processor(nset, processor, thread);
2676 assert(chosen_processor->processor_set == nset);
2677 return chosen_processor;
2678}
2679
2680/*
2681 * AMP Clutch Scheduler Thread Migration
2682 *
2683 * For the AMP version of the clutch scheduler the thread is always scheduled via its
2684 * thread group. So it is important to make sure that the thread group is part of the
2685 * correct processor set hierarchy. In order to do that, the clutch scheduler moves
2686 * all eligible clutch buckets to the correct hierarchy when the recommendation of a
2687 * thread group is changed by CLPC.
2688 */
2689
2690/*
2691 * sched_clutch_recommended_pset()
2692 *
2693 * Routine to decide which hierarchy the thread group should be in based on the
2694 * recommendation and other thread group and system properties. This routine is
2695 * used to determine if thread group migration is necessary and should mimic the
2696 * logic in sched_clutch_thread_pset_recommended() & recommended_pset_type().
2697 */
2698static processor_set_t
2699sched_clutch_recommended_pset(sched_clutch_t sched_clutch, cluster_type_t recommendation)
2700{
2701 if (!sched_clutch_pset_available(pcore_set)) {
2702 return ecore_set;
2703 }
2704
2705 if (!sched_clutch_pset_available(ecore_set)) {
2706 return pcore_set;
2707 }
2708
2709 /*
2710 * If all clusters are available and recommended, use the recommendation
2711 * to decide which cluster to use.
2712 */
2713 pset_cluster_type_t type = thread_group_pset_recommendation(sched_clutch->sc_tg, recommendation);
2714 return (type == PSET_AMP_E) ? ecore_set : pcore_set;
2715}
2716
2717static void
2718sched_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
2719{
2720 uint16_t thread_count = clutch_bucket->scb_thr_count;
2721 thread_t thread;
2722 uint64_t current_timestamp = mach_approximate_time();
2723 while (thread_count > 0) {
2724 thread = run_queue_peek(&clutch_bucket->scb_runq);
2725 sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
2726 enqueue_tail(clutch_threads, &thread->runq_links);
2727 thread_count--;
2728 }
2729
2730 /*
2731 * This operation should have drained the clutch bucket and pulled it out of the
2732 * hierarchy.
2733 */
2734 assert(clutch_bucket->scb_thr_count == 0);
2735 assert(clutch_bucket->scb_root == NULL);
2736}
2737
2738/*
2739 * sched_clutch_migrate_thread_group()
2740 *
2741 * Routine to implement the migration of threads when the thread group
2742 * recommendation is updated. The migration works using a 2-phase
2743 * algorithm.
2744 *
2745 * Phase 1: With the source pset (the pset opposite to the one returned by
2746 * sched_clutch_recommended_pset()) locked, drain all the runnable threads into
2747 * a local queue and update the TG recommendation.
2748 *
2749 * Phase 2: Call thread_setrun() on all the drained threads. Since the TG recommendation
2750 * has been updated, these should all end up in the right hierarchy.
2751 */
2752static void
2753sched_clutch_migrate_thread_group(sched_clutch_t sched_clutch, cluster_type_t new_recommendation)
2754{
2755 thread_t thread;
2756
2757 /* If the thread group is empty, just update the recommendation */
2758 if (os_atomic_load(&sched_clutch->sc_thr_count, relaxed) == 0) {
2759 thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
2760 return;
2761 }
2762
2763 processor_set_t dst_pset = sched_clutch_recommended_pset(sched_clutch, new_recommendation);
2764 processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
2765
2766 queue_head_t clutch_threads;
2767 queue_init(&clutch_threads);
2768
2769 /* Interrupts need to be disabled to make sure threads won't become runnable during the
2770 * migration and attempt to grab the pset/thread locks.
2771 */
2772 spl_t s = splsched();
2773
2774 pset_lock(src_pset);
2775 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
2776 sched_clutch_bucket_t clutch_bucket = &(sched_clutch->sc_clutch_buckets[bucket]);
2777 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2778 if ((scb_root == NULL) || (scb_root->scr_pset == dst_pset)) {
2779 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
2780 continue;
2781 }
2782 assert(scb_root->scr_pset == src_pset);
2783 /* Now remove all the threads from the runq so that thread->runq is set correctly */
2784 sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
2785 }
2786
2787 /*
2788 * Now that all the clutch buckets have been drained, update the TG recommendation.
2789 * This operation needs to be done with the pset lock held to make sure that anyone
2790 * coming in before the migration started would get the original pset as the root
2791 * of this sched_clutch and attempt to hold the src_pset lock. Once the TG changes,
2792 * all threads that are becoming runnable would find the clutch bucket empty and
2793 * the TG recommendation would coax them to enqueue it in the new recommended
2794 * hierarchy. This effectively synchronizes with other threads calling
2795 * thread_setrun() and trying to decide which pset the thread/clutch_bucket
2796 * belongs in.
2797 */
2798 thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
2799 pset_unlock(src_pset);
2800
2801 /* Now setrun all the threads in the local queue */
2802 qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
2803 remqueue(&thread->runq_links);
2804 thread_lock(thread);
2805 thread_setrun(thread, SCHED_TAILQ);
2806 thread_unlock(thread);
2807 }
2808
2809 splx(s);
2810}
2811
2812static void
2813sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation)
2814{
2815 /*
2816 * For the clutch scheduler, the change in recommendation moves the thread group
2817 * to the right hierarchy. sched_clutch_migrate_thread_group() is also responsible
2818 * for updating the recommendation of the thread group.
2819 */
2820 sched_clutch_migrate_thread_group(&tg->tg_sched_clutch, new_recommendation);
2821
2822 if (new_recommendation != CLUSTER_TYPE_P) {
2823 return;
2824 }
2825
2826 sched_amp_bounce_thread_group_from_ecores(ecore_set, tg);
2827}
2828
2829/*
2830 * sched_clutch_migrate_foreign_buckets()
2831 *
2832 * Routine to migrate all the clutch buckets which are not in their recommended
2833 * pset hierarchy now that a new pset has become schedulable. The algorithm is
2834 * similar to sched_clutch_migrate_thread_group().
2835 *
2836 * Invoked with the newly recommended pset lock held and interrupts disabled.
2837 */
2838static void
2839sched_clutch_migrate_foreign_buckets(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
2840{
2841 thread_t thread;
2842 processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
2843
2844 if (!sched_clutch_pset_available(dst_pset)) {
2845 /*
2846 * It is possible that some state about the pset changed,
2847 * but it is still not available for scheduling. Nothing to
2848 * do here in that case.
2849 */
2850 if (drop_lock) {
2851 pset_unlock(dst_pset);
2852 }
2853 return;
2854 }
2855 pset_unlock(dst_pset);
2856
2857 queue_head_t clutch_threads;
2858 queue_init(&clutch_threads);
2859 sched_clutch_root_t src_root = &src_pset->pset_clutch_root;
2860
2861 pset_lock(src_pset);
2862 queue_t clutch_bucket_list = &src_pset->pset_clutch_root.scr_foreign_buckets;
2863
2864 if (sched_clutch_root_count(src_root) == 0) {
2865 /* No threads present in this hierarchy */
2866 pset_unlock(src_pset);
2867 goto migration_complete;
2868 }
2869
2870 sched_clutch_bucket_t clutch_bucket;
2871 qe_foreach_element_safe(clutch_bucket, clutch_bucket_list, scb_foreignlink) {
2872 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2873 assert(scb_root->scr_pset == src_pset);
2874 /* Now remove all the threads from the runq so that thread->runq is set correctly */
2875 sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
2876 assert(clutch_bucket->scb_foreign == false);
2877 }
2878 pset_unlock(src_pset);
2879
2880 /* Now setrun all the threads in the local queue */
2881 qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
2882 remqueue(&thread->runq_links);
2883 thread_lock(thread);
2884 thread_setrun(thread, SCHED_TAILQ);
2885 thread_unlock(thread);
2886 }
2887
2888migration_complete:
2889 if (!drop_lock) {
2890 pset_lock(dst_pset);
2891 }
2892}
2893
2894#endif /* __AMP__ */
2895
2896#endif /* CONFIG_SCHED_CLUTCH */