1/*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31#include <machine/machine_routines.h>
32#include <machine/sched_param.h>
33#include <machine/machine_cpu.h>
34#include <kern/kern_types.h>
35#include <kern/debug.h>
36#include <kern/machine.h>
37#include <kern/misc_protos.h>
38#include <kern/processor.h>
39#include <kern/queue.h>
40#include <kern/sched.h>
41#include <kern/sched_prim.h>
42#include <kern/task.h>
43#include <kern/thread.h>
44#include <kern/sched_clutch.h>
45#include <machine/atomic.h>
46#include <kern/sched_clutch.h>
47#include <sys/kdebug.h>
48
49
50#if CONFIG_SCHED_CLUTCH
51
52/* Forward declarations of static routines */
53
54/* Root level hierarchy management */
55static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
56static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t);
57static void sched_clutch_root_pri_update(sched_clutch_root_t);
58static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t);
59static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
60static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
61
62/* Root bucket level hierarchy management */
63static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
64static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
65static int sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t);
66
67/* Clutch bucket level hierarchy management */
68static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
69static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
70static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
71static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
72static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
73
74static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t);
75static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t);
76static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
77static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
78
79/* Clutch timeshare properties updates */
80static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
81static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
82static void sched_clutch_bucket_cpu_adjust(sched_clutch_bucket_t);
83static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t);
84static boolean_t sched_thread_sched_pri_promoted(thread_t);
85/* Clutch membership management */
86static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
87static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t);
88static thread_t sched_clutch_thread_highest(sched_clutch_root_t);
89
90/* Clutch properties updates */
91static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
92static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
93static int sched_clutch_root_priority(sched_clutch_root_t);
94
95
96/* Helper debugging routines */
97static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
98
99
100
101/*
102 * Global priority queue comparator routine for root buckets. The
103 * routine implements the priority queue as a minimum deadline queue
104 * to achieve EDF scheduling.
105 */
106priority_queue_compare_fn_t sched_clutch_root_bucket_compare;
107
108
109/*
110 * Special markers for buckets that have invalid WCELs/quantums etc.
111 */
112#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
113#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
114
115/*
116 * Root level bucket WCELs
117 *
118 * The root level bucket selection algorithm is an Earliest Deadline
 119 * First (EDF) algorithm where the deadline for a bucket is defined
 120 * by its worst-case-execution-latency and the make-runnable timestamp
 121 * of the bucket.
122 *
123 */
124static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
125 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
126 0, /* FG */
127 37500, /* IN (37.5ms) */
128 75000, /* DF (75ms) */
129 150000, /* UT (150ms) */
130 250000 /* BG (250ms) */
131};
132static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
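
/*
 * For exposition: a root bucket that becomes runnable at time T receives an
 * EDF deadline of roughly T + WCEL, so under sustained higher-bucket load it
 * is still picked within about its WCEL of becoming runnable (modulo warp,
 * described below). The helper is hypothetical and uses the microsecond table
 * for readability; the real computation uses the converted abstime table in
 * sched_clutch_root_bucket_deadline_calculate().
 */
#if 0 /* exposition only, not compiled */
static uint64_t
sched_clutch_example_deadline_us(sched_bucket_t bucket, uint64_t runnable_time_us)
{
	uint32_t wcel_us = sched_clutch_root_bucket_wcel_us[bucket];
	if (wcel_us == SCHED_CLUTCH_INVALID_TIME_32) {
		/* FIXPRI has no WCEL; it is never scheduled via EDF */
		return 0;
	}
	/* e.g. a UT bucket runnable at t = 0 gets a deadline 150ms out */
	return runnable_time_us + wcel_us;
}
#endif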
133
134/*
135 * Root level bucket warp
136 *
137 * Each root level bucket has a warp value associated with it as well.
138 * The warp value allows the root bucket to effectively warp ahead of
139 * lower priority buckets for a limited time even if it has a later
140 * deadline. The warping behavior provides extra (but limited)
141 * opportunity for high priority buckets to remain responsive.
142 */
143
144/* Special warp deadline value to indicate that the bucket has not used any warp yet */
145#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
146
147/* Warp window durations for various tiers */
148static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
149 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
150 8000, /* FG (8ms) */
151 4000, /* IN (4ms) */
152 2000, /* DF (2ms) */
153 1000, /* UT (1ms) */
154 0 /* BG (0ms) */
155};
156static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
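
/*
 * For exposition: warp lets a root bucket that is naturally higher (lower
 * bucket index) than the current EDF winner run anyway, for at most its warp
 * window. For example, if IN holds the earliest deadline but FG is runnable
 * with warp remaining, FG is picked and a warp deadline of "now + remaining
 * warp" is opened; once that passes, FG competes purely on deadlines until its
 * warp is replenished. The hypothetical helper below only shows the selection
 * condition used by sched_clutch_root_highest_root_bucket().
 */
#if 0 /* exposition only, not compiled */
static bool
sched_clutch_example_warp_wins(int warp_bucket_index, int edf_bucket_index)
{
	/* Lower index == naturally higher bucket (FIXPRI < FG < IN < DF < UT < BG) */
	return (warp_bucket_index != -1) && (warp_bucket_index < edf_bucket_index);
}
#endif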
157
158/*
159 * Thread level quantum
160 *
161 * The algorithm defines quantums for threads at various buckets. This
 162 * (combined with the root level bucket quantums) limits how long
 163 * lower priority levels can delay higher priority threads.
164 */
165static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
166 10000, /* FIXPRI (10ms) */
167 10000, /* FG (10ms) */
168 8000, /* IN (8ms) */
169 6000, /* DF (6ms) */
170 4000, /* UT (4ms) */
171 2000 /* BG (2ms) */
172};
173static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
174
175enum sched_clutch_state {
176 SCHED_CLUTCH_STATE_EMPTY = 0,
177 SCHED_CLUTCH_STATE_RUNNABLE,
178};
179
180/*
181 * sched_clutch_us_to_abstime()
182 *
183 * Initializer for converting all durations in usec to abstime
184 */
185static void
186sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
187{
188 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
189 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
190 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
191 } else {
192 clock_interval_to_absolutetime_interval(us_vals[i],
193 NSEC_PER_USEC, &abstime_vals[i]);
194 }
195 }
196}
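
/*
 * For exposition: the clutch scheduler init path is expected to run this
 * converter once for each of the tunable tables above, after which all
 * deadline/warp/quantum arithmetic in this file is in mach_absolute_time()
 * units. The wrapper below is hypothetical.
 */
#if 0 /* exposition only, not compiled */
static void
sched_clutch_example_convert_tunables(void)
{
	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
}
#endif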
197
198#if DEVELOPMENT || DEBUG
199
200/*
201 * sched_clutch_hierarchy_locked_assert()
202 *
203 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
204 * for the hierarchy depends on where the hierarchy is hooked. The current
205 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
206 * using the pset lock.
207 */
208static inline void
209sched_clutch_hierarchy_locked_assert(
210 sched_clutch_root_t root_clutch)
211{
212 pset_assert_locked(root_clutch->scr_pset);
213}
214
215#else /* DEVELOPMENT || DEBUG */
216
217static inline void
218sched_clutch_hierarchy_locked_assert(
219 __unused sched_clutch_root_t root_clutch)
220{
221}
222
223#endif /* DEVELOPMENT || DEBUG */
224
225/*
226 * sched_clutch_thr_count_inc()
227 *
228 * Increment thread count at a hierarchy level with overflow checks.
229 */
230static void
231sched_clutch_thr_count_inc(
232 uint16_t *thr_count)
233{
234 if (__improbable(os_inc_overflow(thr_count))) {
235 panic("sched_clutch thread count overflowed!");
236 }
237}
238
239/*
240 * sched_clutch_thr_count_dec()
241 *
242 * Decrement thread count at a hierarchy level with underflow checks.
243 */
244static void
245sched_clutch_thr_count_dec(
246 uint16_t *thr_count)
247{
248 if (__improbable(os_dec_overflow(thr_count))) {
249 panic("sched_clutch thread count underflowed!");
250 }
251}
252
253
254/*
255 * sched_clutch_root_init()
256 *
257 * Routine to initialize the scheduler hierarchy root.
258 */
259static void
260sched_clutch_root_init(
261 sched_clutch_root_t root_clutch,
262 processor_set_t pset)
263{
264 root_clutch->scr_thr_count = 0;
265 root_clutch->scr_priority = NOPRI;
266 root_clutch->scr_urgency = 0;
267 root_clutch->scr_pset = pset;
268
269 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
270 queue_init(&root_clutch->scr_clutch_buckets);
271
272 /* Initialize the queue which maintains all runnable foreign clutch buckets */
273 queue_init(&root_clutch->scr_foreign_buckets);
274
275 /* Initialize the bitmap and priority queue of runnable root buckets */
276 sched_clutch_root_bucket_compare = priority_heap_make_comparator(a, b, struct sched_clutch_root_bucket, scrb_pqlink, {
277 return (a->scrb_deadline < b->scrb_deadline) ? 1 : ((a->scrb_deadline == b->scrb_deadline) ? 0 : -1);
278 });
279 priority_queue_init(&root_clutch->scr_root_buckets, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MIN_HEAP);
280 bitmap_zero(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX);
281 bitmap_zero(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
282
283 /* Initialize all the root buckets */
284 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
285 sched_clutch_root_bucket_init(&root_clutch->scr_buckets[i], i);
286 }
287}
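
/*
 * For exposition: since the hierarchy is hooked at the pset, the pset
 * initialization path is expected to call this routine once per pset, roughly
 * as below. The pset_clutch_root field name is an assumption made purely for
 * illustration.
 */
#if 0 /* exposition only, not compiled */
static void
sched_clutch_example_pset_init(processor_set_t pset)
{
	sched_clutch_root_init(&pset->pset_clutch_root, pset);
}
#endif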
288
289/*
290 * sched_clutch_root_bucket_init()
291 *
292 * Routine to initialize root buckets.
293 */
294static void
295sched_clutch_root_bucket_init(
296 sched_clutch_root_bucket_t root_bucket,
297 sched_bucket_t bucket)
298{
299 root_bucket->scrb_bucket = bucket;
300 priority_queue_init(&root_bucket->scrb_clutch_buckets, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
301 priority_queue_entry_init(&root_bucket->scrb_pqlink);
302 root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64;
303 root_bucket->scrb_warped_deadline = 0;
304 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
305}
306
307/*
308 * sched_clutch_root_bucket_pri_compare()
309 *
310 * Routine to compare root buckets based on the highest runnable clutch
311 * bucket priorities in the root buckets.
312 */
313static int
314sched_clutch_root_bucket_pri_compare(
315 sched_clutch_root_bucket_t a,
316 sched_clutch_root_bucket_t b)
317{
318 sched_clutch_bucket_t a_highest = sched_clutch_root_bucket_highest_clutch_bucket(a);
319 sched_clutch_bucket_t b_highest = sched_clutch_root_bucket_highest_clutch_bucket(b);
320 return (a_highest->scb_priority > b_highest->scb_priority) ?
321 1 : ((a_highest->scb_priority == b_highest->scb_priority) ? 0 : -1);
322}
323
324/*
325 * sched_clutch_root_select_aboveui()
326 *
327 * Special case scheduling for Above UI bucket.
328 *
329 * AboveUI threads are typically system critical threads that need low latency
330 * which is why they are handled specially.
331 *
 332 * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
 333 * important to maintain some native priority order between those buckets. The policy
 334 * implemented here is to compare the highest clutch buckets of both root buckets; if the
 335 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 336 * deadline based scheduling which should pick up the timeshare buckets.
337 *
338 * The implementation allows extremely low latency CPU access for Above UI threads
339 * while supporting the use case of high priority timeshare threads contending with
340 * lower priority fixed priority threads.
341 */
342static boolean_t
343sched_clutch_root_select_aboveui(
344 sched_clutch_root_t root_clutch)
345{
346 if (bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_FIXPRI)) {
347 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
348 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_buckets[TH_BUCKET_SHARE_FG];
349
350 if (!bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
351 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
352 return true;
353 }
354 if (sched_clutch_root_bucket_pri_compare(root_bucket_aboveui, root_bucket_sharefg) >= 0) {
355 /* If the aboveUI bucket has a higher native clutch bucket priority, schedule it */
356 return true;
357 }
358 }
359 return false;
360}
361
362
363/*
364 * sched_clutch_root_highest_root_bucket()
365 *
366 * Main routine to find the highest runnable root level bucket.
 367 * This routine is called from performance sensitive contexts, so it is
368 * crucial to keep this O(1).
369 *
370 */
371static sched_clutch_root_bucket_t
372sched_clutch_root_highest_root_bucket(
373 sched_clutch_root_t root_clutch,
374 uint64_t timestamp)
375{
376 sched_clutch_hierarchy_locked_assert(root_clutch);
377 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
378 return NULL;
379 }
380
381 if (sched_clutch_root_select_aboveui(root_clutch)) {
382 return &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
383 }
384
385 /*
386 * Above UI bucket is not runnable or has a low priority clutch bucket; use the earliest deadline model
387 * to schedule threads. The idea is that as the timeshare buckets use CPU, they will drop their
388 * interactivity score and allow low priority AboveUI clutch buckets to be scheduled.
389 */
390
391 /* Find the earliest deadline bucket */
392 sched_clutch_root_bucket_t edf_bucket = priority_queue_min(&root_clutch->scr_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
393
394 sched_clutch_root_bucket_t warp_bucket = NULL;
395 int warp_bucket_index = -1;
396evaluate_warp_buckets:
397 /* Check if any higher runnable buckets have warp available */
398 warp_bucket_index = bitmap_lsb_first(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
399
400 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
401 /* No higher buckets have warp available; choose the edf bucket and replenish its warp */
402 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
403 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
404 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
405 bitmap_set(root_clutch->scr_warp_available, edf_bucket->scrb_bucket);
406 return edf_bucket;
407 }
408
409 /*
410 * Looks like there is a root bucket which is higher in the natural priority
411 * order than edf_bucket and might have some warp remaining.
412 */
413 warp_bucket = &root_clutch->scr_buckets[warp_bucket_index];
414 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
415 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
416 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
417 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
418 return warp_bucket;
419 }
420 if (warp_bucket->scrb_warped_deadline > timestamp) {
421 /* Root bucket already has a warp window open with some warp remaining */
422 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
423 return warp_bucket;
424 }
425
 426 /* For this bucket, the warp window was opened sometime in the past but has now
 427 * expired. Mark the bucket as not available for warp anymore and re-run the
428 * warp bucket selection logic.
429 */
430 warp_bucket->scrb_warp_remaining = 0;
431 bitmap_clear(root_clutch->scr_warp_available, warp_bucket->scrb_bucket);
432 goto evaluate_warp_buckets;
433}
434
435/*
436 * sched_clutch_root_bucket_deadline_calculate()
437 *
438 * Calculate the deadline for the bucket based on its WCEL
439 */
440static uint64_t
441sched_clutch_root_bucket_deadline_calculate(
442 sched_clutch_root_bucket_t root_bucket,
443 uint64_t timestamp)
444{
 445 /* For the fixpri AboveUI bucket, always return 0, i.e. the earliest possible deadline */
446 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
447 return 0;
448 }
449
450 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
451 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
452}
453
454/*
455 * sched_clutch_root_bucket_deadline_update()
456 *
457 * Routine to update the deadline of the root bucket when it is selected.
458 * Updating the deadline also moves the root_bucket in the EDF priority
459 * queue.
460 */
461static void
462sched_clutch_root_bucket_deadline_update(
463 sched_clutch_root_bucket_t root_bucket,
464 sched_clutch_root_t root_clutch,
465 uint64_t timestamp)
466{
467 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
468 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
469 return;
470 }
471 uint64_t old_deadline = root_bucket->scrb_deadline;
472 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
473 assert(old_deadline <= new_deadline);
474 if (old_deadline != new_deadline) {
475 root_bucket->scrb_deadline = new_deadline;
476 /* Since the priority queue is a min-heap, use the decrease routine even though the deadline has a larger value now */
477 priority_queue_entry_decrease(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
478 }
479}
480
481/*
482 * sched_clutch_root_bucket_runnable()
483 *
484 * Routine to insert a newly runnable root bucket into the hierarchy.
485 * Also updates the deadline and warp parameters as necessary.
486 */
487static void
488sched_clutch_root_bucket_runnable(
489 sched_clutch_root_bucket_t root_bucket,
490 sched_clutch_root_t root_clutch,
491 uint64_t timestamp)
492{
493 /* Mark the root bucket as runnable */
494 bitmap_set(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
495 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
496 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, 0, 0, 0);
497
498 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
499 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
500 return;
501 }
502
503 root_bucket->scrb_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
504 priority_queue_insert(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
505
506 if (root_bucket->scrb_warp_remaining) {
 507 /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
508 bitmap_set(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
509 }
510}
511
512/*
513 * sched_clutch_root_bucket_empty()
514 *
515 * Routine to remove an empty root bucket from the hierarchy.
516 * Also updates the deadline and warp parameters as necessary.
517 */
518static void
519sched_clutch_root_bucket_empty(
520 sched_clutch_root_bucket_t root_bucket,
521 sched_clutch_root_t root_clutch,
522 uint64_t timestamp)
523{
524 bitmap_clear(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
525 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
526 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0, 0);
527
528 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
529 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
530 return;
531 }
532
533 priority_queue_remove(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, sched_clutch_root_bucket_compare);
534
535 bitmap_clear(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
536 if (root_bucket->scrb_warped_deadline > timestamp) {
537 /*
538 * For root buckets that were using the warp, check if the warp
 539 * deadline is in the future. If so, subtract the wall time the
 540 * warp was active and update the remaining warp. This allows
541 * the root bucket to use the remaining warp the next time it
542 * becomes runnable.
543 */
544 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
545 } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
546 /*
547 * If the root bucket's warped deadline is in the past, it has used up
548 * all the warp it was assigned. Empty out its warp remaining.
549 */
550 root_bucket->scrb_warp_remaining = 0;
551 }
552}
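
/*
 * For exposition, a worked example of the warp bookkeeping above: a bucket
 * with the default 8ms of FG warp that opens its warp window at time t
 * (warped deadline t + 8ms) and goes empty at t + 3ms keeps 5ms of warp for
 * the next time it becomes runnable; if it instead goes empty after t + 8ms,
 * the remaining warp is zeroed. The hypothetical helper assumes a warp window
 * was actually opened.
 */
#if 0 /* exposition only, not compiled */
static uint64_t
sched_clutch_example_warp_left(uint64_t warped_deadline, uint64_t empty_timestamp)
{
	/* e.g. (t + 8ms) - (t + 3ms) = 5ms of warp carried over */
	return (warped_deadline > empty_timestamp) ? (warped_deadline - empty_timestamp) : 0;
}
#endif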
553
554/*
555 * sched_clutch_root_pri_update()
556 *
557 * The root level priority is used for thread selection and preemption
558 * logic.
559 */
560static void
561sched_clutch_root_pri_update(
562 sched_clutch_root_t root_clutch)
563{
564 sched_clutch_hierarchy_locked_assert(root_clutch);
565 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
566 /* No runnable root buckets */
567 root_clutch->scr_priority = NOPRI;
568 assert(root_clutch->scr_urgency == 0);
569 return;
570 }
571 sched_clutch_root_bucket_t root_bucket = NULL;
572 /* Special case for AboveUI (uses same logic as thread selection) */
573 if (sched_clutch_root_select_aboveui(root_clutch)) {
574 root_bucket = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
575 } else {
576 /*
577 * AboveUI bucket is not runnable or has a low clutch bucket priority,
578 * select the next runnable root bucket in natural priority order. This logic
579 * is slightly different from thread selection, because thread selection
 580 * considers deadlines, warps etc. to decide the best bucket at a
581 * given timestamp. Since the priority value is used for preemption decisions
582 * only, it needs to be based on the highest runnable thread available in
583 * the timeshare domain.
584 */
585 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
586 assert(root_bucket_index != -1);
587 root_bucket = &root_clutch->scr_buckets[root_bucket_index];
588 }
589 /* For the selected root bucket, find the highest priority clutch bucket */
590 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
591 root_clutch->scr_priority = priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
592}
593
594/*
595 * sched_clutch_root_urgency_inc()
596 *
597 * Routine to increment the urgency at the root level based on the thread
598 * priority that is being inserted into the hierarchy. The root urgency
599 * counter is updated based on the urgency of threads in any of the
600 * clutch buckets which are part of the hierarchy.
601 *
602 * Always called with the pset lock held.
603 */
604static void
605sched_clutch_root_urgency_inc(
606 sched_clutch_root_t root_clutch,
607 thread_t thread)
608{
609 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
610 root_clutch->scr_urgency++;
611 }
612}
613
614/*
615 * sched_clutch_root_urgency_dec()
616 *
617 * Routine to decrement the urgency at the root level based on the thread
618 * priority that is being removed from the hierarchy. The root urgency
619 * counter is updated based on the urgency of threads in any of the
620 * clutch buckets which are part of the hierarchy.
621 *
622 * Always called with the pset lock held.
623 */
624static void
625sched_clutch_root_urgency_dec(
626 sched_clutch_root_t root_clutch,
627 thread_t thread)
628{
629 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
630 root_clutch->scr_urgency--;
631 }
632}
633
634/*
635 * Clutch bucket level scheduling
636 *
637 * The second level of scheduling is the clutch bucket level scheduling
638 * which tries to schedule thread groups within root_buckets. Each
639 * clutch represents a thread group and a clutch_bucket represents
640 * threads at a particular sched_bucket within that thread group. The
641 * goal of this level of scheduling is to allow interactive thread
642 * groups low latency access to the CPU. It also provides slight
643 * scheduling preference for App and unrestricted thread groups.
644 *
645 * The clutch bucket scheduling algorithm measures an interactivity
646 * score for all clutch buckets. The interactivity score is based
647 * on the ratio of the CPU used and the voluntary blocking of threads
648 * within the clutch bucket. The algorithm is very close to the ULE
649 * scheduler on FreeBSD in terms of calculations. The interactivity
650 * score provides an interactivity boost in the range of
651 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
652 * thread groups to win over CPU spinners.
653 */
654
655/* Priority boost range for interactivity */
656#define SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT (8)
657uint8_t sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
658
659/* Window for scaling the cpu usage and blocked values (currently 500ms); it is the threshold for used+blocked */
660uint64_t sched_clutch_bucket_adjust_threshold = 0;
661#define SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS (500000)
662
663/* The ratio to scale the cpu/blocked time per window */
664#define SCHED_CLUTCH_BUCKET_ADJUST_RATIO (10)
665
666/* Rate at which the interactivity score is recalculated. This keeps the score smooth even for extremely bursty behavior */
667uint64_t sched_clutch_bucket_interactivity_delta = 0;
668#define SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT (25000)
669
670/*
671 * In order to allow App thread groups some preference over daemon thread
 672 * groups, the App clutch_buckets get an 8 point boost. The boost value should
673 * be chosen such that badly behaved apps are still penalized over well
674 * behaved interactive daemon clutch_buckets.
675 */
676#define SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT (8)
677uint8_t sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
678
679/* Initial value for voluntary blocking time for the clutch_bucket */
680#define SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID (uint32_t)(~0)
681
682/*
683 * sched_clutch_bucket_init()
684 *
685 * Initializer for clutch buckets.
686 */
687static void
688sched_clutch_bucket_init(
689 sched_clutch_bucket_t clutch_bucket,
690 sched_clutch_t clutch,
691 sched_bucket_t bucket)
692{
693 bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));
694
695 clutch_bucket->scb_bucket = bucket;
696 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
697 clutch_bucket->scb_priority = 0;
698 /*
699 * All thread groups should be initialized to be interactive; this allows the newly launched
700 * thread groups to fairly compete with already running thread groups.
701 */
702 clutch_bucket->scb_interactivity_score = (sched_clutch_bucket_interactive_pri * 2);
703 clutch_bucket->scb_foreign = false;
704
705 os_atomic_store(&clutch_bucket->scb_timeshare_tick, 0, relaxed);
706 os_atomic_store(&clutch_bucket->scb_pri_shift, INT8_MAX, relaxed);
707
708 clutch_bucket->scb_interactivity_ts = 0;
709 clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID;
710 priority_queue_entry_init(&clutch_bucket->scb_pqlink);
711 clutch_bucket->scb_clutch = clutch;
712 clutch_bucket->scb_root = NULL;
713 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
714 run_queue_init(&clutch_bucket->scb_runq);
715}
716
717/*
718 * sched_clutch_init_with_thread_group()
719 *
720 * Initialize the sched_clutch when the thread group is being created
721 */
722void
723sched_clutch_init_with_thread_group(
724 sched_clutch_t clutch,
725 struct thread_group *tg)
726{
727 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
728
729 /* Initialize all the clutch buckets */
730 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
731 sched_clutch_bucket_init(&(clutch->sc_clutch_buckets[i]), clutch, i);
732 }
733
734 /* Grouping specific fields */
735 clutch->sc_tg = tg;
736 os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
737}
738
739/*
740 * sched_clutch_destroy()
741 *
742 * Destructor for clutch; called from thread group release code.
743 */
744void
745sched_clutch_destroy(
746 __unused sched_clutch_t clutch)
747{
748 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
749}
750
751
752/*
753 * sched_clutch_bucket_hierarchy_insert()
754 *
755 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
756 */
757static void
758sched_clutch_bucket_hierarchy_insert(
759 sched_clutch_root_t root_clutch,
760 sched_clutch_bucket_t clutch_bucket,
761 sched_bucket_t bucket,
762 uint64_t timestamp)
763{
764 sched_clutch_hierarchy_locked_assert(root_clutch);
765 if (bucket > TH_BUCKET_FIXPRI) {
 766 /* Enqueue the timeshare clutch bucket into the global runnable clutch_bucket list; used for sched tick operations */
767 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
768 }
769 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
770
771 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
772 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
773 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
774 }
775
776 /* Insert the clutch bucket into the root bucket priority queue */
777 priority_queue_insert(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
778 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
779 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
780 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0);
781}
782
783/*
784 * sched_clutch_bucket_hierarchy_remove()
785 *
 786 * Routine to remove an empty clutch bucket from the root hierarchy.
787 */
788static void
789sched_clutch_bucket_hierarchy_remove(
790 sched_clutch_root_t root_clutch,
791 sched_clutch_bucket_t clutch_bucket,
792 sched_bucket_t bucket,
793 uint64_t timestamp)
794{
795 sched_clutch_hierarchy_locked_assert(root_clutch);
796 if (bucket > TH_BUCKET_FIXPRI) {
797 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
798 remqueue(&clutch_bucket->scb_listlink);
799 }
800
801 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
802
803 /* Remove the clutch bucket from the root bucket priority queue */
804 priority_queue_remove(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
805 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
806 clutch_bucket->scb_blocked_ts = timestamp;
807 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
808 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0);
809
810 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
811 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
812 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
813 }
814}
815
816/*
817 * sched_clutch_bucket_base_pri()
818 *
819 * Calculates the "base" priority of the clutch bucket. The base
 820 * priority of the clutch bucket is the max of the highest base_pri and
 821 * highest promoted sched_pri among the threads in the clutch bucket, plus any
 822 * grouping specific (App/Daemon...) boost applicable to the
 823 * clutch_bucket.
824 */
825static uint8_t
826sched_clutch_bucket_base_pri(
827 sched_clutch_bucket_t clutch_bucket)
828{
829 uint8_t clutch_boost = 0;
830 assert(clutch_bucket->scb_runq.count != 0);
831
832 sched_clutch_t clutch = clutch_bucket->scb_clutch;
833
834 /*
835 * Since the clutch bucket can contain threads that are members of the group due
836 * to the sched_pri being promoted or due to their base pri, the base priority of
837 * the entire clutch bucket should be based on the highest thread (promoted or base)
838 * in the clutch bucket.
839 */
840 uint8_t max_pri = priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq) ? 0 : priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
841
842 /*
843 * For all AboveUI clutch buckets and clutch buckets for thread groups that
 844 * haven't been specified as SCHED_CLUTCH_TG_PRI_LOW, give a priority boost
845 */
846 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
847 (os_atomic_load(&clutch->sc_tg_priority, relaxed) != SCHED_CLUTCH_TG_PRI_LOW)) {
848 clutch_boost = sched_clutch_bucket_pri_boost;
849 }
850 return max_pri + clutch_boost;
851}
852
853/*
854 * sched_clutch_bucket_interactivity_score_calculate()
855 *
856 * Routine to calculate the interactivity score for the clutch bucket. The
857 * interactivity score is based on the ratio of CPU used by all threads in
858 * the bucket and the blocked time of the bucket as a whole.
859 */
860static uint8_t
861sched_clutch_bucket_interactivity_score_calculate(
862 sched_clutch_bucket_t clutch_bucket,
863 uint64_t timestamp)
864{
865 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
866 /*
867 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
868 * priorities, make sure all AboveUI buckets are marked interactive.
869 */
870 assert(clutch_bucket->scb_interactivity_score == (2 * sched_clutch_bucket_interactive_pri));
871 return clutch_bucket->scb_interactivity_score;
872 }
873
874 if (clutch_bucket->scb_interactivity_ts == 0) {
875 /*
876 * This indicates a newly initialized clutch bucket; return the default interactivity score
877 * and update timestamp.
878 */
879 clutch_bucket->scb_interactivity_ts = timestamp;
880 return clutch_bucket->scb_interactivity_score;
881 }
882
883 if (timestamp < (clutch_bucket->scb_interactivity_ts + sched_clutch_bucket_interactivity_delta)) {
884 return clutch_bucket->scb_interactivity_score;
885 }
886
887 /* Check if the clutch bucket accounting needs to be scaled */
888 sched_clutch_bucket_cpu_adjust(clutch_bucket);
889 clutch_bucket->scb_interactivity_ts = timestamp;
890
891 sched_clutch_bucket_cpu_data_t scb_cpu_data;
892 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
893 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
894 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
895
896 /*
897 * In extremely CPU contended cases, it is possible that the clutch bucket has been runnable
898 * for a long time but none of its threads have been picked up for execution. In that case, both
899 * the CPU used and blocked would be 0.
900 */
901 if ((cpu_blocked == 0) && (cpu_used == 0)) {
902 return clutch_bucket->scb_interactivity_score;
903 }
904
905 /*
906 * For all timeshare buckets, calculate the interactivity score of the bucket
907 * and add it to the base priority
908 */
909 uint8_t interactive_score = 0;
910 if (cpu_blocked > cpu_used) {
911 /* Interactive clutch_bucket case */
912 interactive_score = sched_clutch_bucket_interactive_pri +
913 ((sched_clutch_bucket_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
914 } else {
915 /* Non-interactive clutch_bucket case */
916 interactive_score = ((sched_clutch_bucket_interactive_pri * cpu_blocked) / cpu_used);
917 }
918 clutch_bucket->scb_interactivity_score = interactive_score;
919 return interactive_score;
920}
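
/*
 * For exposition, a worked example of the score above with the default
 * interactive pri of 8: a clutch bucket that blocked for 30 units and used 10
 * units of CPU scores 8 + (8 * 20) / 30 = 13 (interactive), while one that
 * blocked for 10 and used 30 scores (8 * 10) / 30 = 2 (CPU bound). The score
 * therefore spans [0 : 2 * sched_clutch_bucket_interactive_pri]. The helper
 * below is hypothetical and mirrors the calculation in the routine above.
 */
#if 0 /* exposition only, not compiled */
static uint8_t
sched_clutch_example_interactivity_score(uint64_t cpu_used, uint64_t cpu_blocked)
{
	/* Assumes the (cpu_used == 0 && cpu_blocked == 0) case was filtered out, as above */
	uint8_t pri = sched_clutch_bucket_interactive_pri;
	if (cpu_blocked > cpu_used) {
		/* Interactive: blocked more than it ran */
		return pri + (uint8_t)((pri * (cpu_blocked - cpu_used)) / cpu_blocked);
	}
	/* CPU bound: ran more than it blocked */
	return (uint8_t)((pri * cpu_blocked) / cpu_used);
}
#endif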
921
922/*
923 * sched_clutch_bucket_pri_calculate()
924 *
925 * The priority calculation algorithm for the clutch_bucket is a slight
 926 * modification of the ULE interactivity score. It uses the base priority
927 * of the clutch bucket and applies an interactivity score boost to the
928 * highly responsive clutch buckets.
929 */
930
931static uint8_t
932sched_clutch_bucket_pri_calculate(
933 sched_clutch_bucket_t clutch_bucket,
934 uint64_t timestamp)
935{
936 /* For empty clutch buckets, return priority 0 */
937 if (clutch_bucket->scb_thr_count == 0) {
938 return 0;
939 }
940
941 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
942 uint8_t interactive_score = sched_clutch_bucket_interactivity_score_calculate(clutch_bucket, timestamp);
943
944 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
945 uint8_t pri = base_pri + interactive_score;
946 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
947 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
948 return pri;
949}
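
/*
 * For exposition, a worked example of the priority above: an App FG clutch
 * bucket whose highest thread is at priority 31 gets a base priority of
 * 31 + 8 (clutch boost) = 39; with an interactivity score of 13 its clutch
 * bucket priority is 52. A CPU-bound SCHED_CLUTCH_TG_PRI_LOW bucket with the
 * same highest thread and a score of 2 ends up at 31 + 0 + 2 = 33. The helper
 * below is a simplified, hypothetical sketch of that arithmetic.
 */
#if 0 /* exposition only, not compiled */
static uint8_t
sched_clutch_example_bucket_pri(uint8_t highest_thread_pri, bool tg_pri_low, uint8_t interactivity_score)
{
	/* Simplified: in the real routine AboveUI buckets always receive the boost */
	uint8_t clutch_boost = tg_pri_low ? 0 : sched_clutch_bucket_pri_boost;
	return highest_thread_pri + clutch_boost + interactivity_score;
}
#endif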
950
951/*
952 * sched_clutch_root_bucket_highest_clutch_bucket()
953 *
954 * Routine to find the highest priority clutch bucket
955 * within the root bucket.
956 */
957static sched_clutch_bucket_t
958sched_clutch_root_bucket_highest_clutch_bucket(
959 sched_clutch_root_bucket_t root_bucket)
960{
961 if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
962 return NULL;
963 }
964 return priority_queue_max(&root_bucket->scrb_clutch_buckets, struct sched_clutch_bucket, scb_pqlink);
965}
966
967/*
968 * sched_clutch_bucket_runnable()
969 *
970 * Perform all operations needed when a new clutch bucket becomes runnable.
971 * It involves inserting the clutch_bucket into the hierarchy and updating the
972 * root priority appropriately.
973 */
974static boolean_t
975sched_clutch_bucket_runnable(
976 sched_clutch_bucket_t clutch_bucket,
977 sched_clutch_root_t root_clutch,
978 uint64_t timestamp)
979{
980 sched_clutch_hierarchy_locked_assert(root_clutch);
981 sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp);
982 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
983 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
984 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
985 sched_clutch_bucket_timeshare_update(clutch_bucket);
986 int16_t root_old_pri = root_clutch->scr_priority;
987 sched_clutch_root_pri_update(root_clutch);
988 return root_clutch->scr_priority > root_old_pri;
989}
990
991/*
992 * sched_clutch_bucket_update()
993 *
 994 * Update the clutch_bucket's position in the hierarchy if the newly
 995 * runnable thread changes the clutch_bucket's priority. Also update the root
996 * priority accordingly.
997 */
998static boolean_t
999sched_clutch_bucket_update(
1000 sched_clutch_bucket_t clutch_bucket,
1001 sched_clutch_root_t root_clutch,
1002 uint64_t timestamp)
1003{
1004 sched_clutch_hierarchy_locked_assert(root_clutch);
1005 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1006 if (new_pri == clutch_bucket->scb_priority) {
1007 return false;
1008 }
1009 struct priority_queue *bucket_prioq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1010
1011 if (new_pri < clutch_bucket->scb_priority) {
1012 clutch_bucket->scb_priority = new_pri;
1013 priority_queue_entry_decrease(bucket_prioq, &clutch_bucket->scb_pqlink,
1014 clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1015 } else {
1016 clutch_bucket->scb_priority = new_pri;
1017 priority_queue_entry_increase(bucket_prioq, &clutch_bucket->scb_pqlink,
1018 clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1019 }
1020
1021 int16_t root_old_pri = root_clutch->scr_priority;
1022 sched_clutch_root_pri_update(root_clutch);
1023 return root_clutch->scr_priority > root_old_pri;
1024}
1025
1026/*
1027 * sched_clutch_bucket_empty()
1028 *
1029 * Perform all the operations needed when a clutch_bucket is no longer runnable.
 1030 * It involves removing the clutch bucket from the hierarchy and updating the root
1031 * priority appropriately.
1032 */
1033static void
1034sched_clutch_bucket_empty(
1035 sched_clutch_bucket_t clutch_bucket,
1036 sched_clutch_root_t root_clutch,
1037 uint64_t timestamp)
1038{
1039 sched_clutch_hierarchy_locked_assert(root_clutch);
1040 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
1041 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1042 sched_clutch_root_pri_update(root_clutch);
1043}
1044
1045/*
1046 * sched_clutch_cpu_usage_update()
1047 *
1048 * Routine to update CPU usage of the thread in the hierarchy.
1049 */
1050void
1051sched_clutch_cpu_usage_update(
1052 thread_t thread,
1053 uint64_t delta)
1054{
1055 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1056 return;
1057 }
1058 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1059 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1060 sched_clutch_bucket_cpu_usage_update(clutch_bucket, delta);
1061}
1062
1063/*
1064 * sched_clutch_bucket_cpu_usage_update()
1065 *
1066 * Routine to update the CPU usage of the clutch_bucket.
1067 */
1068static void
1069sched_clutch_bucket_cpu_usage_update(
1070 sched_clutch_bucket_t clutch_bucket,
1071 uint64_t delta)
1072{
1073 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
1074 /* Since Above UI bucket has maximum interactivity score always, nothing to do here */
1075 return;
1076 }
1077
1078 /*
1079 * The CPU usage should not overflow the clutch_cpu_data_t type. Since the usage is used to
1080 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1081 */
1082 delta = MIN(delta, CLUTCH_CPU_DATA_MAX);
1083 os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1084}
1085
1086/*
1087 * sched_clutch_bucket_cpu_blocked_update()
1088 *
1089 * Routine to update CPU blocked time for clutch_bucket.
1090 */
1091static void
1092sched_clutch_bucket_cpu_blocked_update(
1093 sched_clutch_bucket_t clutch_bucket,
1094 uint64_t timestamp)
1095{
1096 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
1097 (clutch_bucket->scb_blocked_ts == SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID)) {
1098 /* For Above UI bucket and a newly initialized clutch bucket, nothing to do here */
1099 return;
1100 }
1101
1102 uint64_t blocked_time = timestamp - clutch_bucket->scb_blocked_ts;
1103 if (blocked_time > sched_clutch_bucket_adjust_threshold) {
1104 blocked_time = sched_clutch_bucket_adjust_threshold;
1105 }
1106
1107 /*
1108 * The CPU blocked should not overflow the clutch_cpu_data_t type. Since the blocked is used to
1109 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1110 */
1111 blocked_time = MIN(blocked_time, CLUTCH_CPU_DATA_MAX);
1112 clutch_cpu_data_t __assert_only cpu_blocked_orig = os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
 1113 /* The blocked time is scaled every so often, so it should never overflow */
1114 assert(blocked_time <= (CLUTCH_CPU_DATA_MAX - cpu_blocked_orig));
1115}
1116
1117/*
1118 * sched_clutch_bucket_cpu_adjust()
1119 *
1120 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1121 * than sched_clutch_bucket_adjust_threshold. Allows the values to remain
 1122 * manageable and maintain the same ratio, while ensuring that changes in
 1123 * clutch bucket behavior are reflected in the interactivity score in a
 1124 * reasonable amount of time.
1125 */
1126static void
1127sched_clutch_bucket_cpu_adjust(
1128 sched_clutch_bucket_t clutch_bucket)
1129{
1130 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1131 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1132 do {
1133 old_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
1134 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1135 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1136 if ((cpu_used + cpu_blocked) < sched_clutch_bucket_adjust_threshold) {
1137 return;
1138 }
1139
1140 /*
1141 * The accumulation of CPU used and blocked is past the threshold; scale it
1142 * down to lose old history.
1143 */
1144 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1145 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1146 } while (!os_atomic_cmpxchg(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed));
1147}
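
/*
 * For exposition, a worked example of the scaling above: with the default
 * 500ms threshold and adjust ratio of 10, a clutch bucket that has accumulated
 * the equivalent of 400ms of CPU used and 150ms blocked (sum 550ms) is scaled
 * down to 40ms / 15ms. The used:blocked ratio, and hence the interactivity
 * score, is preserved while older history carries one tenth of its previous
 * weight. The helper below is hypothetical.
 */
#if 0 /* exposition only, not compiled */
static void
sched_clutch_example_scale(clutch_cpu_data_t *cpu_used, clutch_cpu_data_t *cpu_blocked)
{
	/* e.g. used = 400ms-equivalent, blocked = 150ms-equivalent -> 40ms / 15ms */
	*cpu_used = *cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
	*cpu_blocked = *cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
}
#endif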
1148
1149/*
1150 * Thread level scheduling algorithm
1151 *
1152 * The thread level scheduling algorithm uses the mach timeshare
1153 * decay based algorithm to achieve sharing between threads within the
1154 * same clutch bucket. The load/priority shifts etc. are all maintained
1155 * at the clutch bucket level and used for decay calculation of the
1156 * threads. The load sampling is still driven off the scheduler tick
1157 * for runnable clutch buckets (it does not use the new higher frequency
1158 * EWMA based load calculation). The idea is that the contention and load
 1159 * within clutch_buckets should be limited enough that threads do not see
 1160 * heavy decay and can timeshare effectively.
1161 */
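
/*
 * For exposition: the per-clutch-bucket pri_shift computed by
 * sched_clutch_bucket_timeshare_update() below is returned via
 * sched_clutch_thread_pri_shift() and plugged into the usual Mach timeshare
 * decay, which is roughly of the form sketched here; the exact decay macro
 * lives in the core scheduler and is an assumption for illustration.
 */
#if 0 /* exposition only, not compiled */
static int
sched_clutch_example_timeshare_pri(int base_pri, uint32_t sched_usage, uint32_t pri_shift)
{
	/* Higher clutch bucket load => smaller pri_shift => faster decay of sched_pri */
	return base_pri - (int)(sched_usage >> pri_shift);
}
#endif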
1162
1163/*
1164 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1165 *
1166 * Increment the run count for the clutch bucket associated with the
1167 * thread.
1168 */
1169uint32_t
1170sched_clutch_thread_run_bucket_incr(
1171 thread_t thread,
1172 sched_bucket_t bucket)
1173{
1174 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1175 return 0;
1176 }
1177 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1178 return sched_clutch_run_bucket_incr(clutch, bucket);
1179}
1180
1181static uint32_t
1182sched_clutch_run_bucket_incr(
1183 sched_clutch_t clutch,
1184 sched_bucket_t bucket)
1185{
1186 assert(bucket != TH_BUCKET_RUN);
1187 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1188 uint32_t result = os_atomic_inc(&(clutch_bucket->scb_run_count), relaxed);
1189 return result;
1190}
1191
1192/*
1193 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1194 *
1195 * Decrement the run count for the clutch bucket associated with the
1196 * thread.
1197 */
1198uint32_t
1199sched_clutch_thread_run_bucket_decr(
1200 thread_t thread,
1201 sched_bucket_t bucket)
1202{
1203 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1204 return 0;
1205 }
1206 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1207 return sched_clutch_run_bucket_decr(clutch, bucket);
1208}
1209
1210static uint32_t
1211sched_clutch_run_bucket_decr(
1212 sched_clutch_t clutch,
1213 sched_bucket_t bucket)
1214{
1215 assert(bucket != TH_BUCKET_RUN);
1216 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1217 uint32_t result = os_atomic_dec(&(clutch_bucket->scb_run_count), relaxed);
1218 return result;
1219}
1220
1221/*
1222 * sched_clutch_bucket_timeshare_update()
1223 *
1224 * Routine to update the load and priority shift for the clutch_bucket every
1225 * sched_tick. For runnable clutch_buckets, the sched tick handling code
1226 * iterates the clutch buckets and calls this routine. For all others, the
1227 * clutch_bucket maintains a "last updated schedtick" parameter. As threads
1228 * become runnable in the clutch bucket, if this value is outdated, the load
1229 * and shifts are updated.
1230 *
1231 * Possible optimization:
1232 * - The current algorithm samples the load every sched tick (125ms).
1233 * This is prone to spikes in runnable counts; if that turns out to be
1234 * a problem, a simple solution would be to do the EWMA trick to sample
1235 * load at every load_tick (30ms) and use the averaged value for the pri
1236 * shift calculation.
1237 */
1238static void
1239sched_clutch_bucket_timeshare_update(
1240 sched_clutch_bucket_t clutch_bucket)
1241{
1242 if (clutch_bucket->scb_bucket < TH_BUCKET_SHARE_FG) {
1243 return;
1244 }
1245
1246 /*
 1247 * Update the timeshare parameters for the clutch bucket if they haven't been updated
1248 * in this tick.
1249 */
1250 uint32_t bucket_sched_ts = os_atomic_load(&clutch_bucket->scb_timeshare_tick, relaxed);
1251 uint32_t current_sched_ts = sched_tick;
1252 if (bucket_sched_ts != current_sched_ts) {
1253 os_atomic_store(&clutch_bucket->scb_timeshare_tick, current_sched_ts, relaxed);
1254 uint32_t bucket_load = (os_atomic_load(&clutch_bucket->scb_run_count, relaxed) / processor_avail_count);
1255 bucket_load = MIN(bucket_load, NRQS - 1);
1256 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_load];
1257 os_atomic_store(&clutch_bucket->scb_pri_shift, pri_shift, relaxed);
1258 }
1259}
1260
1261/*
1262 * sched_clutch_thread_clutch_update()
1263 *
1264 * Routine called when the thread changes its thread group. The current
1265 * implementation relies on the fact that the thread group is changed only
1266 * from the context of the thread itself. Due to this fact, the thread
1267 * group change causes only counter updates in the old & new clutch
1268 * buckets and no hierarchy changes. The routine also attributes the CPU
1269 * used so far to the old clutch.
1270 */
1271void
1272sched_clutch_thread_clutch_update(
1273 thread_t thread,
1274 sched_clutch_t old_clutch,
1275 sched_clutch_t new_clutch)
1276{
1277 uint32_t cpu_delta;
1278 assert(current_thread() == thread);
1279
1280 if (old_clutch) {
1281 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1282 /*
1283 * Calculate the CPU used by this thread in the old bucket and
1284 * add it to the old clutch bucket. This uses the same CPU usage
1285 * logic as update_priority etc.
1286 */
1287 thread_timer_delta(thread, cpu_delta);
1288 if (thread->pri_shift < INT8_MAX) {
1289 thread->sched_usage += cpu_delta;
1290 }
1291 thread->cpu_delta += cpu_delta;
1292 sched_clutch_bucket_cpu_usage_update(&(old_clutch->sc_clutch_buckets[thread->th_sched_bucket]), cpu_delta);
1293 }
1294
1295 if (new_clutch) {
1296 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1297 }
1298}
1299
1300/* Thread Insertion/Removal/Selection routines */
1301
1302/*
1303 * sched_clutch_thread_insert()
1304 *
1305 * Routine to insert a thread into the sched clutch hierarchy.
1306 * Update the counts at all levels of the hierarchy and insert the nodes
1307 * as they become runnable. Always called with the pset lock held.
1308 */
1309static boolean_t
1310sched_clutch_thread_insert(
1311 sched_clutch_root_t root_clutch,
1312 thread_t thread,
1313 integer_t options)
1314{
1315 boolean_t result = FALSE;
1316
1317 sched_clutch_hierarchy_locked_assert(root_clutch);
1318 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1319 assert(thread->thread_group == clutch->sc_tg);
1320
1321 uint64_t current_timestamp = mach_absolute_time();
1322 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1323 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
1324
1325 /* Insert thread into the clutch_bucket runq using sched_pri */
1326 run_queue_enqueue(&clutch_bucket->scb_runq, thread, options);
1327 /* Increment the urgency counter for the root if necessary */
1328 sched_clutch_root_urgency_inc(root_clutch, thread);
1329
1330 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
1331 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1332 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri,
1333 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1334 os_atomic_inc(&clutch->sc_thr_count, relaxed);
1335 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1336 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0);
1337
1338 /* Enqueue the clutch into the hierarchy (if needed) and update properties */
1339 if (clutch_bucket->scb_thr_count == 0) {
1340 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1341 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1342 /* Insert the newly runnable clutch bucket into the hierarchy */
1343 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp);
1344 } else {
1345 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1346 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1347 /* Update the position of the clutch bucket in the hierarchy */
1348 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
1349 }
1350 return result;
1351}
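
/*
 * For exposition: the boolean returned here (propagated from
 * sched_clutch_bucket_runnable() / sched_clutch_bucket_update()) indicates
 * that this insertion raised the root priority. A caller such as the
 * processor enqueue path is expected to use it to decide whether a preemption
 * check is needed; the shape below is hypothetical.
 */
#if 0 /* exposition only, not compiled */
static void
sched_clutch_example_enqueue(sched_clutch_root_t root_clutch, thread_t thread, integer_t options)
{
	if (sched_clutch_thread_insert(root_clutch, thread, options)) {
		/* Root priority increased; check for preemption of the running thread */
	}
}
#endif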
1352
1353/*
1354 * sched_clutch_thread_remove()
1355 *
1356 * Routine to remove a thread from the sched clutch hierarchy.
1357 * Update the counts at all levels of the hierarchy and remove the nodes
1358 * as they become empty. Always called with the pset lock held.
1359 */
1360static void
1361sched_clutch_thread_remove(
1362 sched_clutch_root_t root_clutch,
1363 thread_t thread,
1364 uint64_t current_timestamp)
1365{
1366 sched_clutch_hierarchy_locked_assert(root_clutch);
1367 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1368 assert(thread->thread_group == clutch->sc_tg);
1369 assert(thread->runq != PROCESSOR_NULL);
1370
1371 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1372 assert(clutch_bucket->scb_root == root_clutch);
1373
1374 /* Decrement the urgency counter for the root if necessary */
1375 sched_clutch_root_urgency_dec(root_clutch, thread);
1376 /* Remove thread from the clutch_bucket */
1377 run_queue_remove(&clutch_bucket->scb_runq, thread);
1378
1379 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1380 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1381 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1382 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_EMPTY, 0);
1383
1384 /* Update counts at various levels of the hierarchy */
1385 os_atomic_dec(&clutch->sc_thr_count, relaxed);
1386 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1387 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
1388
1389 /* Remove the clutch from hierarchy (if needed) and update properties */
1390 if (clutch_bucket->scb_thr_count == 0) {
1391 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp);
1392 } else {
1393 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
1394 }
1395}
1396
1397/*
1398 * sched_clutch_thread_highest()
1399 *
1400 * Routine to find and remove the highest priority thread
1401 * from the sched clutch hierarchy. The algorithm looks at the
1402 * hierarchy for the most eligible runnable thread and calls
1403 * sched_clutch_thread_remove(). Always called with the
1404 * pset lock held.
1405 */
1406static thread_t
1407sched_clutch_thread_highest(
1408 sched_clutch_root_t root_clutch)
1409{
1410 sched_clutch_hierarchy_locked_assert(root_clutch);
1411 uint64_t current_timestamp = mach_absolute_time();
1412
1413 /* Select the highest priority root bucket */
1414 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp);
1415 if (root_bucket == NULL) {
1416 return THREAD_NULL;
1417 }
1418 /* Since a thread is being picked from this root bucket, update its deadline */
1419 sched_clutch_root_bucket_deadline_update(root_bucket, root_clutch, current_timestamp);
1420
1421 /* Find the highest priority clutch bucket in this root bucket */
1422 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
1423 assert(clutch_bucket != NULL);
1424
1425 /* Find the highest priority runnable thread in this clutch bucket */
1426 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
1427 assert(thread != NULL);
1428
1429 /* Remove and return the thread from the hierarchy */
1430 sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
1431 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
1432 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
1433 return thread;
1434}
1435
1436
1437/* High level global accessor routines */
1438
1439/*
1440 * sched_clutch_root_urgency()
1441 *
1442 * Routine to get the urgency of the highest runnable
1443 * thread in the hierarchy.
1444 */
1445static uint32_t
1446sched_clutch_root_urgency(
1447 sched_clutch_root_t root_clutch)
1448{
1449 return root_clutch->scr_urgency;
1450}
1451
1452/*
1453 * sched_clutch_root_count_sum()
1454 *
1455 * The count_sum mechanism is used for scheduler runq
1456 * statistics calculation. It is only useful for debugging
1457 * purposes; since maintaining it costs a mach_absolute_time()
1458 * call in other scheduler implementations, it is better to avoid
1459 * populating it until absolutely necessary.
1460 */
1461static uint32_t
1462sched_clutch_root_count_sum(
1463 __unused sched_clutch_root_t root_clutch)
1464{
1465 return 0;
1466}
1467
1468/*
1469 * sched_clutch_root_priority()
1470 *
1471 * Routine to get the priority of the highest runnable
1472 * thread in the hierarchy.
1473 */
1474static int
1475sched_clutch_root_priority(
1476 sched_clutch_root_t root_clutch)
1477{
1478 return root_clutch->scr_priority;
1479}
1480
1481/*
1482 * sched_clutch_root_count()
1483 *
1484 * Returns total number of runnable threads in the hierarchy.
1485 */
1486uint32_t
1487sched_clutch_root_count(
1488 sched_clutch_root_t root_clutch)
1489{
1490 return root_clutch->scr_thr_count;
1491}
1492
1493/*
1494 * sched_clutch_thread_pri_shift()
1495 *
1496 * Routine to get the priority shift value for a thread.
1497 * Since the timesharing is done at the clutch_bucket level,
1498 * this routine looks up the thread's clutch_bucket and
1499 * returns its pri_shift value.
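 * The returned shift feeds the timeshare priority decay, which lowers
 * a thread's priority as a function of (sched_usage >> pri_shift).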
1500 */
1501uint32_t
1502sched_clutch_thread_pri_shift(
1503 thread_t thread,
1504 sched_bucket_t bucket)
1505{
1506 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1507 return UINT8_MAX;
1508 }
1509 assert(bucket != TH_BUCKET_RUN);
1510 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1511 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1512 return os_atomic_load(&clutch_bucket->scb_pri_shift, relaxed);
1513}
1514
1515#pragma mark -- Clutch Scheduler Algorithm
1516
1517static void
1518sched_clutch_init(void);
1519
1520static void
1521sched_clutch_timebase_init(void);
1522
1523static thread_t
1524sched_clutch_steal_thread(processor_set_t pset);
1525
1526static void
1527sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
1528
1529static boolean_t
1530sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
1531 sched_options_t options);
1532
1533static boolean_t
1534sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
1535
1536static ast_t
1537sched_clutch_processor_csw_check(processor_t processor);
1538
1539static boolean_t
1540sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
1541
1542static int
1543sched_clutch_runq_count(processor_t processor);
1544
1545static boolean_t
1546sched_clutch_processor_queue_empty(processor_t processor);
1547
1548static uint64_t
1549sched_clutch_runq_stats_count_sum(processor_t processor);
1550
1551static int
1552sched_clutch_processor_bound_count(processor_t processor);
1553
1554static void
1555sched_clutch_pset_init(processor_set_t pset);
1556
1557static void
1558sched_clutch_processor_init(processor_t processor);
1559
1560static thread_t
1561sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
1562
1563static void
1564sched_clutch_processor_queue_shutdown(processor_t processor);
1565
1566static sched_mode_t
1567sched_clutch_initial_thread_sched_mode(task_t parent_task);
1568
1569static uint32_t
1570sched_clutch_initial_quantum_size(thread_t thread);
1571
1572static bool
1573sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
1574
1575static uint32_t
1576sched_clutch_run_incr(thread_t thread);
1577
1578static uint32_t
1579sched_clutch_run_decr(thread_t thread);
1580
1581static void
1582sched_clutch_update_thread_bucket(thread_t thread);
1583
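/*
 * Dispatch table wiring the clutch scheduler's entry points into the
 * scheduler framework. It is typically selected at boot by matching
 * .sched_name against the "sched" boot-arg (or the platform default).
 */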
1584const struct sched_dispatch_table sched_clutch_dispatch = {
1585 .sched_name = "clutch",
1586 .init = sched_clutch_init,
1587 .timebase_init = sched_clutch_timebase_init,
1588 .processor_init = sched_clutch_processor_init,
1589 .pset_init = sched_clutch_pset_init,
1590 .maintenance_continuation = sched_timeshare_maintenance_continue,
1591 .choose_thread = sched_clutch_choose_thread,
1592 .steal_thread_enabled = sched_steal_thread_enabled,
1593 .steal_thread = sched_clutch_steal_thread,
1594 .compute_timeshare_priority = sched_compute_timeshare_priority,
1595 .choose_processor = choose_processor,
1596 .processor_enqueue = sched_clutch_processor_enqueue,
1597 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
1598 .processor_queue_remove = sched_clutch_processor_queue_remove,
1599 .processor_queue_empty = sched_clutch_processor_queue_empty,
1600 .priority_is_urgent = priority_is_urgent,
1601 .processor_csw_check = sched_clutch_processor_csw_check,
1602 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
1603 .initial_quantum_size = sched_clutch_initial_quantum_size,
1604 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
1605 .can_update_priority = can_update_priority,
1606 .update_priority = update_priority,
1607 .lightweight_update_priority = lightweight_update_priority,
1608 .quantum_expire = sched_default_quantum_expire,
1609 .processor_runq_count = sched_clutch_runq_count,
1610 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
1611 .processor_bound_count = sched_clutch_processor_bound_count,
1612 .thread_update_scan = sched_clutch_thread_update_scan,
1613 .multiple_psets_enabled = TRUE,
1614 .sched_groups_enabled = FALSE,
1615 .avoid_processor_enabled = TRUE,
1616 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
1617 .processor_balance = sched_SMT_balance,
1618
1619 .rt_runq = sched_rtglobal_runq,
1620 .rt_init = sched_rtglobal_init,
1621 .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
1622 .rt_runq_scan = sched_rtglobal_runq_scan,
1623 .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
1624
1625 .qos_max_parallelism = sched_qos_max_parallelism,
1626 .check_spill = sched_check_spill,
1627 .ipi_policy = sched_ipi_policy,
1628 .thread_should_yield = sched_thread_should_yield,
1629 .run_count_incr = sched_clutch_run_incr,
1630 .run_count_decr = sched_clutch_run_decr,
1631 .update_thread_bucket = sched_clutch_update_thread_bucket,
1632 .pset_made_schedulable = sched_pset_made_schedulable,
1633};
1634
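/* Accessors for the per-processor bound runq and the pset-wide clutch root hierarchy */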
1635__attribute__((always_inline))
1636static inline run_queue_t
1637sched_clutch_bound_runq(processor_t processor)
1638{
1639 return &processor->runq;
1640}
1641
1642__attribute__((always_inline))
1643static inline sched_clutch_root_t
1644sched_clutch_processor_root_clutch(processor_t processor)
1645{
1646 return &processor->processor_set->pset_clutch_root;
1647}
1648
1649__attribute__((always_inline))
1650static inline run_queue_t
1651sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
1652{
1653 assert(thread->bound_processor == processor);
1654 return sched_clutch_bound_runq(processor);
1655}
1656
1657static uint32_t
1658sched_clutch_initial_quantum_size(thread_t thread)
1659{
1660 if (thread == THREAD_NULL) {
1661 return std_quantum;
1662 }
1663 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
1664 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
1665}
1666
1667static sched_mode_t
1668sched_clutch_initial_thread_sched_mode(task_t parent_task)
1669{
1670 if (parent_task == kernel_task) {
1671 return TH_MODE_FIXED;
1672 } else {
1673 return TH_MODE_TIMESHARE;
1674 }
1675}
1676
1677static void
1678sched_clutch_processor_init(processor_t processor)
1679{
1680 run_queue_init(&processor->runq);
1681}
1682
1683static void
1684sched_clutch_pset_init(processor_set_t pset)
1685{
1686 sched_clutch_root_init(&pset->pset_clutch_root, pset);
1687}
1688
1689static void
1690sched_clutch_init(void)
1691{
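	/* Allow the clutch bucket interactive priority and priority boost to be tuned via boot-args; fall back to the compile-time defaults otherwise */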
1692 if (!PE_parse_boot_argn("sched_clutch_bucket_interactive_pri", &sched_clutch_bucket_interactive_pri, sizeof(sched_clutch_bucket_interactive_pri))) {
1693 sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
1694 }
1695 if (!PE_parse_boot_argn("sched_clutch_bucket_pri_boost", &sched_clutch_bucket_pri_boost, sizeof(sched_clutch_bucket_pri_boost))) {
1696 sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
1697 }
1698 sched_timeshare_init();
1699}
1700
1701static void
1702sched_clutch_timebase_init(void)
1703{
1704 sched_timeshare_timebase_init();
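	/* Convert the per-root-bucket WCEL/warp tables and the per-bucket thread quanta from microseconds to mach absolute time now that the timebase is known */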
1705 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
1706 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
1707 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
1708 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS,
1709 NSEC_PER_USEC, &sched_clutch_bucket_adjust_threshold);
1710
1711 uint32_t interactivity_delta = 0;
1712 if (!PE_parse_boot_argn("sched_clutch_bucket_interactivity_delta_usecs", &interactivity_delta, sizeof(interactivity_delta))) {
1713 interactivity_delta = SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT;
1714 }
1715 clock_interval_to_absolutetime_interval(interactivity_delta, NSEC_PER_USEC, &sched_clutch_bucket_interactivity_delta);
1716}
1717
1718static thread_t
1719sched_clutch_choose_thread(
1720 processor_t processor,
1721 int priority,
1722 __unused ast_t reason)
1723{
1724 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1725 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
1726 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1727 boolean_t choose_from_boundq = false;
1728
1729 if (bound_runq->highq < priority &&
1730 clutch_pri < priority) {
1731 return THREAD_NULL;
1732 }
1733
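	/* When both the bound runq and the clutch hierarchy have runnable threads, prefer the bound runq on a priority tie */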
1734 if (bound_runq->count && clutch_count) {
1735 if (bound_runq->highq >= clutch_pri) {
1736 choose_from_boundq = true;
1737 }
1738 } else if (bound_runq->count) {
1739 choose_from_boundq = true;
1740 } else if (clutch_count) {
1741 choose_from_boundq = false;
1742 } else {
1743 return THREAD_NULL;
1744 }
1745
1746 thread_t thread = THREAD_NULL;
1747 if (choose_from_boundq == false) {
1748 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1749 thread = sched_clutch_thread_highest(pset_clutch_root);
1750 } else {
1751 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
1752 }
1753 return thread;
1754}
1755
1756static boolean_t
1757sched_clutch_processor_enqueue(
1758 processor_t processor,
1759 thread_t thread,
1760 sched_options_t options)
1761{
1762 boolean_t result;
1763
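	/* Clutch-eligible threads are inserted into the pset-wide clutch hierarchy; other threads (e.g. processor-bound ones) go on the per-processor bound runq */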
1764 thread->runq = processor;
1765 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1766 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1767 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
1768 } else {
1769 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
1770 result = run_queue_enqueue(rq, thread, options);
1771 }
1772 return result;
1773}
1774
1775static boolean_t
1776sched_clutch_processor_queue_empty(processor_t processor)
1777{
1778 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
1779 sched_clutch_bound_runq(processor)->count == 0;
1780}
1781
1782static ast_t
1783sched_clutch_processor_csw_check(processor_t processor)
1784{
1785 boolean_t has_higher;
1786 int pri;
1787
1788 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
1789 return AST_PREEMPT | AST_URGENT;
1790 }
1791
1792 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1793 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1794
1795 assert(processor->active_thread != NULL);
1796
1797 pri = MAX(clutch_pri, bound_runq->highq);
1798
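	/* During the first timeslice only a strictly higher priority preempts; after that an equal priority is sufficient */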
1799 if (processor->first_timeslice) {
1800 has_higher = (pri > processor->current_pri);
1801 } else {
1802 has_higher = (pri >= processor->current_pri);
1803 }
1804
1805 if (has_higher) {
1806 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
1807 return AST_PREEMPT | AST_URGENT;
1808 }
1809
1810 if (bound_runq->urgency > 0) {
1811 return AST_PREEMPT | AST_URGENT;
1812 }
1813
1814 return AST_PREEMPT;
1815 }
1816
1817 return AST_NONE;
1818}
1819
1820static boolean_t
1821sched_clutch_processor_queue_has_priority(processor_t processor,
1822 int priority,
1823 boolean_t gte)
1824{
1825 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1826
1827 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
1828
1829 if (gte) {
1830 return qpri >= priority;
1831 } else {
1832 return qpri > priority;
1833 }
1834}
1835
1836static int
1837sched_clutch_runq_count(processor_t processor)
1838{
1839 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
1840}
1841
1842static uint64_t
1843sched_clutch_runq_stats_count_sum(processor_t processor)
1844{
1845 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
1846
1847 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
1848 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
1849 } else {
1850 return bound_sum;
1851 }
1852}

1853static int
1854sched_clutch_processor_bound_count(processor_t processor)
1855{
1856 return sched_clutch_bound_runq(processor)->count;
1857}
1858
1859static void
1860sched_clutch_processor_queue_shutdown(processor_t processor)
1861{
1862 processor_set_t pset = processor->processor_set;
1863 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1864 thread_t thread;
1865 queue_head_t tqueue;
1866
1867 /* We only need to migrate threads if this is the last active processor in the pset */
1868 if (pset->online_processor_count > 0) {
1869 pset_unlock(pset);
1870 return;
1871 }
1872
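	/* Drain the clutch hierarchy into a local queue; the threads are re-dispatched via thread_setrun() once the pset lock is dropped */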
1873 queue_init(&tqueue);
1874 while (sched_clutch_root_count(pset_clutch_root) > 0) {
1875 thread = sched_clutch_thread_highest(pset_clutch_root);
1876 enqueue_tail(&tqueue, &thread->runq_links);
1877 }
1878
1879 pset_unlock(pset);
1880
1881 qe_foreach_element_safe(thread, &tqueue, runq_links) {
1882 remqueue(&thread->runq_links);
1883
1884 thread_lock(thread);
1885
1886 thread_setrun(thread, SCHED_TAILQ);
1887
1888 thread_unlock(thread);
1889 }
1890}
1891
1892static boolean_t
1893sched_clutch_processor_queue_remove(
1894 processor_t processor,
1895 thread_t thread)
1896{
1897 run_queue_t rq;
1898 processor_set_t pset = processor->processor_set;
1899
1900 pset_lock(pset);
1901
1902 if (processor == thread->runq) {
1903 /*
1904 * Thread is on a run queue and we have a lock on
1905 * that run queue.
1906 */
1907 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1908 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1909 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time());
1910 } else {
1911 rq = sched_clutch_thread_bound_runq(processor, thread);
1912 run_queue_remove(rq, thread);
1913 }
1914 } else {
1915 /*
1916 * The thread left the run queue before we could
1917 * lock the run queue.
1918 */
1919 assert(thread->runq == PROCESSOR_NULL);
1920 processor = PROCESSOR_NULL;
1921 }
1922
1923 pset_unlock(pset);
1924
1925 return processor != PROCESSOR_NULL;
1926}
1927
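/*
 * sched_clutch_steal_thread()
 *
 * Called with the pset lock held. Walks the psets via next_pset() looking
 * for a runnable thread in any clutch hierarchy; returns with all pset
 * locks dropped.
 */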
1928static thread_t
1929sched_clutch_steal_thread(processor_set_t pset)
1930{
1931 processor_set_t nset, cset = pset;
1932 thread_t thread;
1933
1934 do {
1935 sched_clutch_root_t pset_clutch_root = &cset->pset_clutch_root;
1936 if (sched_clutch_root_count(pset_clutch_root) > 0) {
1937 thread = sched_clutch_thread_highest(pset_clutch_root);
1938 pset_unlock(cset);
1939 return thread;
1940 }
1941
1942 nset = next_pset(cset);
1943
1944 if (nset != pset) {
1945 pset_unlock(cset);
1946
1947 cset = nset;
1948 pset_lock(cset);
1949 }
1950 } while (nset != pset);
1951
1952 pset_unlock(cset);
1953
1954 return THREAD_NULL;
1955}
1956
1957static void
1958sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
1959{
1960 boolean_t restart_needed = FALSE;
1961 processor_t processor = processor_list;
1962 processor_set_t pset;
1963 thread_t thread;
1964 spl_t s;
1965
1966 /*
1967 * We update the threads associated with each processor (bound and idle threads)
1968 * and then update the threads in each pset runqueue.
1969 */
1970
1971 do {
1972 do {
1973 pset = processor->processor_set;
1974
1975 s = splsched();
1976 pset_lock(pset);
1977
1978 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
1979
1980 pset_unlock(pset);
1981 splx(s);
1982
1983 if (restart_needed) {
1984 break;
1985 }
1986
1987 thread = processor->idle_thread;
1988 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1989 if (thread_update_add_thread(thread) == FALSE) {
1990 restart_needed = TRUE;
1991 break;
1992 }
1993 }
1994 } while ((processor = processor->processor_list) != NULL);
1995
1996 /* Ok, we now have a collection of candidates -- fix them. */
1997 thread_update_process_threads();
1998 } while (restart_needed);
1999
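	/* Second pass: walk each pset's clutch hierarchy, refreshing every clutch bucket's timeshare state and scanning its runq for threads that need a priority update */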
2000 pset = &pset0;
2001
2002 do {
2003 do {
2004 s = splsched();
2005 pset_lock(pset);
2006
2007 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
2008 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
2009 sched_clutch_bucket_t clutch_bucket;
2010 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
2011 sched_clutch_bucket_timeshare_update(clutch_bucket);
2012 restart_needed = runq_scan(&clutch_bucket->scb_runq, scan_context);
2013 if (restart_needed) {
2014 break;
2015 }
2016 }
2017 }
2018
2019 pset_unlock(pset);
2020 splx(s);
2021 if (restart_needed) {
2022 break;
2023 }
2024 } while ((pset = pset->pset_list) != NULL);
2025
2026 /* Ok, we now have a collection of candidates -- fix them. */
2027 thread_update_process_threads();
2028 } while (restart_needed);
2029}
2030
2031extern int sched_allow_rt_smt;
2032
2033/* Return true if this thread should not continue running on this processor */
2034static bool
2035sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
2036{
2037 if (processor->processor_primary != processor) {
2038 /*
2039 * This is a secondary SMT processor. If the primary is running
2040 * a realtime thread, only allow realtime threads on the secondary.
2041 */
2042 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
2043 return true;
2044 }
2045 }
2046
2047 return false;
2048}
2049
2050/*
2051 * For the clutch scheduler, the run counts are maintained in the clutch
2052 * buckets (i.e. the thread group scheduling structure).
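 * These routines also keep the global TH_BUCKET_RUN run count in sync.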
2053 */
2054static uint32_t
2055sched_clutch_run_incr(thread_t thread)
2056{
2057 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2058 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2059 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
2060 return new_count;
2061}
2062
2063static uint32_t
2064sched_clutch_run_decr(thread_t thread)
2065{
2066 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
2067 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2068 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
2069 return new_count;
2070}
2071
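/*
 * Map a priority to a timeshare bucket:
 *   pri > BASEPRI_USER_INITIATED -> TH_BUCKET_SHARE_FG
 *   pri > BASEPRI_DEFAULT        -> TH_BUCKET_SHARE_IN
 *   pri > BASEPRI_UTILITY        -> TH_BUCKET_SHARE_DF
 *   pri > MAXPRI_THROTTLE        -> TH_BUCKET_SHARE_UT
 *   otherwise                    -> TH_BUCKET_SHARE_BG
 */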
2072static sched_bucket_t
2073sched_convert_pri_to_bucket(uint8_t priority)
2074{
2075 sched_bucket_t bucket = TH_BUCKET_RUN;
2076
2077 if (priority > BASEPRI_USER_INITIATED) {
2078 bucket = TH_BUCKET_SHARE_FG;
2079 } else if (priority > BASEPRI_DEFAULT) {
2080 bucket = TH_BUCKET_SHARE_IN;
2081 } else if (priority > BASEPRI_UTILITY) {
2082 bucket = TH_BUCKET_SHARE_DF;
2083 } else if (priority > MAXPRI_THROTTLE) {
2084 bucket = TH_BUCKET_SHARE_UT;
2085 } else {
2086 bucket = TH_BUCKET_SHARE_BG;
2087 }
2088 return bucket;
2089}
2090
2091/*
2092 * For threads that have changed sched_pri without changing the
2093 * base_pri for any reason other than decay, use the sched_pri
2094 * as the bucketizing priority instead of base_pri. All such
2095 * changes are typically due to boosts or demotions applied by
2096 * kernel locking primitives.
2097 */
2098static boolean_t
2099sched_thread_sched_pri_promoted(thread_t thread)
2100{
2101 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
2102 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
2103 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
2104 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
2105 (thread->kern_promotion_schedpri != 0);
2106}
2107
2108/*
2109 * Routine to update the scheduling bucket for the thread.
2110 *
2111 * In the clutch scheduler implementation, the thread's bucket
2112 * is based on sched_pri if it was promoted due to a kernel
2113 * primitive; otherwise it is based on the thread's base_pri. This
2114 * enhancement allows promoted threads to reach a higher priority
2115 * bucket and potentially get selected sooner for scheduling.
2116 *
2117 * Also, the clutch scheduler does not honor fixed priority below
2118 * FG priority. It simply puts those threads in the corresponding
2119 * timeshare bucket. This is done because it is
2120 * extremely hard to define the scheduling properties of such threads
2121 * and they typically lead to performance issues.
2122 */
2123
2124static void
2125sched_clutch_update_thread_bucket(thread_t thread)
2126{
2127 sched_bucket_t old_bucket = thread->th_sched_bucket;
2128 sched_bucket_t new_bucket = TH_BUCKET_RUN;
2129 assert(thread->runq == PROCESSOR_NULL);
2130
2131 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
2132
2133 switch (thread->sched_mode) {
2134 case TH_MODE_FIXED:
2135 if (pri >= BASEPRI_FOREGROUND) {
2136 new_bucket = TH_BUCKET_FIXPRI;
2137 } else {
2138 new_bucket = sched_convert_pri_to_bucket(pri);
2139 }
2140 break;
2141
2142 case TH_MODE_REALTIME:
2143 new_bucket = TH_BUCKET_FIXPRI;
2144 break;
2145
2146 case TH_MODE_TIMESHARE:
2147 new_bucket = sched_convert_pri_to_bucket(pri);
2148 break;
2149
2150 default:
2151 panic("unexpected mode: %d", thread->sched_mode);
2152 break;
2153 }
2154
2155 if (old_bucket == new_bucket) {
2156 return;
2157 }
2158
2159 thread->th_sched_bucket = new_bucket;
2160 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
2161
2162 /*
2163 * Since this is called after the thread has been removed from the runq,
2164 * only the run counts need to be updated. The re-insert into the runq
2165 * would put the thread into the correct new bucket's runq.
2166 */
2167 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
2168 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
2169 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
2170 }
2171}
2172
2173
2174#endif /* CONFIG_SCHED_CLUTCH */