1 /*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <mach/mach_types.h>
30 #include <mach/machine.h>
31 #include <machine/machine_routines.h>
32 #include <machine/sched_param.h>
33 #include <machine/machine_cpu.h>
34 #include <kern/kern_types.h>
35 #include <kern/debug.h>
36 #include <kern/machine.h>
37 #include <kern/misc_protos.h>
38 #include <kern/processor.h>
39 #include <kern/queue.h>
40 #include <kern/sched.h>
41 #include <kern/sched_prim.h>
42 #include <kern/task.h>
43 #include <kern/thread.h>
44 #include <kern/sched_clutch.h>
45 #include <machine/atomic.h>
47 #include <sys/kdebug.h>
48
49 #if __AMP__
50 #include <kern/sched_amp_common.h>
51 #endif /* __AMP__ */
52
53 #if CONFIG_SCHED_CLUTCH
54
55 /* Forward declarations of static routines */
56
57 /* Root level hierarchy management */
58 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t);
60 static void sched_clutch_root_pri_update(sched_clutch_root_t);
61 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t);
62 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
63 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
64
65 /* Root bucket level hierarchy management */
66 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
67 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
68 static int sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t);
69
70 /* Options for clutch bucket ordering in the runq */
71 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
72 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
73 /* Round robin clutch bucket on thread removal */
74 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
75 /* Insert clutch bucket at head (for thread preemption) */
76 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
77 /* Insert clutch bucket at tail (default) */
78 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
79 });
80
81 /* Clutch bucket level hierarchy management */
82 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
83 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
84 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
85 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
86 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
87
88 static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t);
89 static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t);
90 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
91 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
92
93 /* Clutch timeshare properties updates */
94 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
95 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
96 static void sched_clutch_bucket_cpu_adjust(sched_clutch_bucket_t);
97 static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t);
98 static boolean_t sched_thread_sched_pri_promoted(thread_t);
99 /* Clutch membership management */
100 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
101 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
102 static thread_t sched_clutch_thread_highest(sched_clutch_root_t);
103
104 /* Clutch properties updates */
105 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
106 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
107 static int sched_clutch_root_priority(sched_clutch_root_t);
108
109 #if __AMP__
110 /* System based routines */
111 static bool sched_clutch_pset_available(processor_set_t);
112 #endif /* __AMP__ */
113
114 /* Helper debugging routines */
115 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
116
117
118
119 /*
120 * Global priority queue comparator routine for root buckets. The
121 * routine implements the priority queue as a minimum deadline queue
122 * to achieve EDF scheduling.
123 */
124 priority_queue_compare_fn_t sched_clutch_root_bucket_compare;
125
126
127 /*
128 * Special markers for buckets that have invalid WCELs/quantums etc.
129 */
130 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
131 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
132
133 /*
134 * Root level bucket WCELs
135 *
136 * The root level bucket selection algorithm is an Earliest Deadline
137 * First (EDF) algorithm where the deadline for a bucket is defined
138 * by the worst-case-execution-latency and the make runnable timestamp
139 * for the bucket.
140 *
141 */
142 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
143 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
144 0, /* FG */
145 37500, /* IN (37.5ms) */
146 75000, /* DF (75ms) */
147 150000, /* UT (150ms) */
148 250000 /* BG (250ms) */
149 };
150 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
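
/*
 * Illustrative note (derived from the table above, not additional policy): if a DF
 * clutch bucket and a UT clutch bucket both become runnable at time T, their EDF
 * deadlines are approximately T + 75ms and T + 150ms respectively, so the DF root
 * bucket is selected first (ignoring AboveUI and warp, described below).
 */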
151
152 /*
153 * Root level bucket warp
154 *
155 * Each root level bucket has a warp value associated with it as well.
156 * The warp value allows the root bucket to effectively warp ahead of
157 * lower priority buckets for a limited time even if it has a later
158 * deadline. The warping behavior provides extra (but limited)
159 * opportunity for high priority buckets to remain responsive.
160 */
161
162 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
163 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
164
165 /* Warp window durations for various tiers */
166 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
167 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
168 8000, /* FG (8ms) */
169 4000, /* IN (4ms) */
170 2000, /* DF (2ms) */
171 1000, /* UT (1ms) */
172 0 /* BG (0ms) */
173 };
174 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
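
/*
 * For illustration (using the default values above): if the UT root bucket currently
 * holds the earliest deadline but the FG root bucket is also runnable with warp
 * available, FG may still be chosen ahead of UT for up to its 8ms warp window before
 * selection reverts to pure deadline order. This is a descriptive note only.
 */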
175
176 /*
177 * Thread level quantum
178 *
179 * The algorithm defines quantums for threads at various buckets. This
180 * (combined with the root level bucket quantums) restricts how much
181 * the lower priority levels can preempt the higher priority threads.
182 */
183 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
184 10000, /* FIXPRI (10ms) */
185 10000, /* FG (10ms) */
186 8000, /* IN (8ms) */
187 6000, /* DF (6ms) */
188 4000, /* UT (4ms) */
189 2000 /* BG (2ms) */
190 };
191 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
192
193 enum sched_clutch_state {
194 SCHED_CLUTCH_STATE_EMPTY = 0,
195 SCHED_CLUTCH_STATE_RUNNABLE,
196 };
197
198 /*
199 * sched_clutch_us_to_abstime()
200 *
201 * Initializer for converting all durations in usec to abstime
202 */
203 static void
204 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
205 {
206 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
207 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
208 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
209 } else {
210 clock_interval_to_absolutetime_interval(us_vals[i],
211 NSEC_PER_USEC, &abstime_vals[i]);
212 }
213 }
214 }
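
/*
 * Expected usage (sketch; the exact init call site is not shown in this section):
 * the scheduler init path converts each table once, e.g.
 *   sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *   sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *   sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */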
215
216 #if DEVELOPMENT || DEBUG
217
218 /*
219 * sched_clutch_hierarchy_locked_assert()
220 *
221 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
222 * for the hierarchy depends on where the hierarchy is hooked. The current
223 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
224 * using the pset lock.
225 */
226 static inline void
227 sched_clutch_hierarchy_locked_assert(
228 sched_clutch_root_t root_clutch)
229 {
230 pset_assert_locked(root_clutch->scr_pset);
231 }
232
233 #else /* DEVELOPMENT || DEBUG */
234
235 static inline void
236 sched_clutch_hierarchy_locked_assert(
237 __unused sched_clutch_root_t root_clutch)
238 {
239 }
240
241 #endif /* DEVELOPMENT || DEBUG */
242
243 /*
244 * sched_clutch_thr_count_inc()
245 *
246 * Increment thread count at a hierarchy level with overflow checks.
247 */
248 static void
249 sched_clutch_thr_count_inc(
250 uint16_t *thr_count)
251 {
252 if (__improbable(os_inc_overflow(thr_count))) {
253 panic("sched_clutch thread count overflowed!");
254 }
255 }
256
257 /*
258 * sched_clutch_thr_count_dec()
259 *
260 * Decrement thread count at a hierarchy level with underflow checks.
261 */
262 static void
263 sched_clutch_thr_count_dec(
264 uint16_t *thr_count)
265 {
266 if (__improbable(os_dec_overflow(thr_count))) {
267 panic("sched_clutch thread count underflowed!");
268 }
269 }
270
271 #if __AMP__
272
273 /*
274 * sched_clutch_pset_available()
275 *
276 * Routine to determine if a pset is available for scheduling.
277 */
278 static bool
279 sched_clutch_pset_available(processor_set_t pset)
280 {
281 /* Check if cluster has none of the CPUs available */
282 if (pset->online_processor_count == 0) {
283 return false;
284 }
285
286 /* Check if the cluster is not recommended by CLPC */
287 if (!pset_is_recommended(pset)) {
288 return false;
289 }
290
291 return true;
292 }
293
294 #endif /* __AMP__ */
295
296 /*
297 * sched_clutch_root_init()
298 *
299 * Routine to initialize the scheduler hierarchy root.
300 */
301 static void
302 sched_clutch_root_init(
303 sched_clutch_root_t root_clutch,
304 processor_set_t pset)
305 {
306 root_clutch->scr_thr_count = 0;
307 root_clutch->scr_priority = NOPRI;
308 root_clutch->scr_urgency = 0;
309 root_clutch->scr_pset = pset;
310
311 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
312 queue_init(&root_clutch->scr_clutch_buckets);
313
314 /* Initialize the queue which maintains all runnable foreign clutch buckets */
315 queue_init(&root_clutch->scr_foreign_buckets);
316
317 /* Initialize the bitmap and priority queue of runnable root buckets */
318 sched_clutch_root_bucket_compare = priority_heap_make_comparator(a, b, struct sched_clutch_root_bucket, scrb_pqlink, {
319 return (a->scrb_deadline < b->scrb_deadline) ? 1 : ((a->scrb_deadline == b->scrb_deadline) ? 0 : -1);
320 });
321 priority_queue_init(&root_clutch->scr_root_buckets, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MIN_HEAP);
322 bitmap_zero(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX);
323 bitmap_zero(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
324
325 /* Initialize all the root buckets */
326 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
327 sched_clutch_root_bucket_init(&root_clutch->scr_buckets[i], i);
328 }
329 }
330
331 /*
332 * Clutch Bucket Runqueues
333 *
334 * The clutch buckets are maintained in a runq at the root bucket level. The
335 * runq organization allows clutch buckets to be ordered based on various
336 * factors such as:
337 *
338 * - Clutch buckets are round robin'ed at the same priority level when a
339 * thread is selected from a clutch bucket. This prevents a clutch bucket
340 * from starving out other clutch buckets at the same priority.
341 *
342 * - Clutch buckets are inserted at the head when they become runnable due to
343 * thread preemption. This allows threads that were preempted to maintain
344 * their order in the queue.
345 *
346 */
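
/*
 * Example of the ordering rules above (illustrative only): with clutch buckets A and
 * B runnable at the same priority, selecting a thread from A rotates A behind B
 * (SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR), while a preemption that makes A runnable
 * again re-inserts it at the head of its priority level
 * (SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) so it does not lose its place.
 */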
347
348 /*
349 * sched_clutch_bucket_runq_init()
350 *
351 * Initialize a clutch bucket runq.
352 */
353 static void
354 sched_clutch_bucket_runq_init(
355 sched_clutch_bucket_runq_t clutch_buckets_rq)
356 {
357 clutch_buckets_rq->scbrq_highq = NOPRI;
358 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
359 clutch_buckets_rq->scbrq_bitmap[i] = 0;
360 }
361 clutch_buckets_rq->scbrq_count = 0;
362 for (int i = 0; i < NRQS; i++) {
363 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
364 }
365 }
366
367 /*
368 * sched_clutch_bucket_runq_empty()
369 *
370 * Returns whether a clutch bucket runq is empty.
371 */
372 static boolean_t
373 sched_clutch_bucket_runq_empty(
374 sched_clutch_bucket_runq_t clutch_buckets_rq)
375 {
376 return clutch_buckets_rq->scbrq_count == 0;
377 }
378
379 /*
380 * sched_clutch_bucket_runq_peek()
381 *
382 * Returns the highest priority clutch bucket in the runq.
383 */
384 static sched_clutch_bucket_t
385 sched_clutch_bucket_runq_peek(
386 sched_clutch_bucket_runq_t clutch_buckets_rq)
387 {
388 if (clutch_buckets_rq->scbrq_count > 0) {
389 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
390 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
391 } else {
392 return NULL;
393 }
394 }
395
396 /*
397 * sched_clutch_bucket_runq_enqueue()
398 *
399 * Enqueue a clutch bucket into the runq based on the options passed in.
400 */
401 static void
402 sched_clutch_bucket_runq_enqueue(
403 sched_clutch_bucket_runq_t clutch_buckets_rq,
404 sched_clutch_bucket_t clutch_bucket,
405 sched_clutch_bucket_options_t options)
406 {
407 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
408 if (circle_queue_empty(queue)) {
409 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
410 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
411 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
412 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
413 }
414 } else {
415 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
416 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
417 } else {
418 /*
419 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
420 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
421 */
422 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
423 }
424 }
425 clutch_buckets_rq->scbrq_count++;
426 }
427
428 /*
429 * sched_clutch_bucket_runq_remove()
430 *
431 * Remove a clutch bucket from the runq.
432 */
433 static void
434 sched_clutch_bucket_runq_remove(
435 sched_clutch_bucket_runq_t clutch_buckets_rq,
436 sched_clutch_bucket_t clutch_bucket)
437 {
438 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
439 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
440 assert(clutch_buckets_rq->scbrq_count > 0);
441 clutch_buckets_rq->scbrq_count--;
442 if (circle_queue_empty(queue)) {
443 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
444 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
445 }
446 }
447
448 static void
449 sched_clutch_bucket_runq_rotate(
450 sched_clutch_bucket_runq_t clutch_buckets_rq,
451 sched_clutch_bucket_t clutch_bucket)
452 {
453 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
454 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
455 circle_queue_rotate_head_forward(queue);
456 }
457
458 /*
459 * sched_clutch_root_bucket_init()
460 *
461 * Routine to initialize root buckets.
462 */
463 static void
464 sched_clutch_root_bucket_init(
465 sched_clutch_root_bucket_t root_bucket,
466 sched_bucket_t bucket)
467 {
468 root_bucket->scrb_bucket = bucket;
469 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
470 priority_queue_entry_init(&root_bucket->scrb_pqlink);
471 root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64;
472 root_bucket->scrb_warped_deadline = 0;
473 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
474 }
475
476 /*
477 * sched_clutch_root_bucket_pri_compare()
478 *
479 * Routine to compare root buckets based on the highest runnable clutch
480 * bucket priorities in the root buckets.
481 */
482 static int
483 sched_clutch_root_bucket_pri_compare(
484 sched_clutch_root_bucket_t a,
485 sched_clutch_root_bucket_t b)
486 {
487 sched_clutch_bucket_t a_highest = sched_clutch_root_bucket_highest_clutch_bucket(a);
488 sched_clutch_bucket_t b_highest = sched_clutch_root_bucket_highest_clutch_bucket(b);
489 return (a_highest->scb_priority > b_highest->scb_priority) ?
490 1 : ((a_highest->scb_priority == b_highest->scb_priority) ? 0 : -1);
491 }
492
493 /*
494 * sched_clutch_root_select_aboveui()
495 *
496 * Special case scheduling for Above UI bucket.
497 *
498 * AboveUI threads are typically system critical threads that need low latency,
499 * which is why they are handled specially.
500 *
501 * Since the priority ranges for AboveUI and FG Timeshare buckets overlap, it is
502 * important to maintain some native priority order between those buckets. The policy
503 * implemented here is to compare the highest clutch buckets of both root buckets; if the
504 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
505 * deadline based scheduling which should pick up the timeshare buckets.
506 *
507 * The implementation allows extremely low latency CPU access for Above UI threads
508 * while supporting the use case of high priority timeshare threads contending with
509 * lower priority fixed priority threads.
510 */
511 static boolean_t
512 sched_clutch_root_select_aboveui(
513 sched_clutch_root_t root_clutch)
514 {
515 if (bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_FIXPRI)) {
516 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
517 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_buckets[TH_BUCKET_SHARE_FG];
518
519 if (!bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
520 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
521 return true;
522 }
523 if (sched_clutch_root_bucket_pri_compare(root_bucket_aboveui, root_bucket_sharefg) >= 0) {
524 /* If the aboveUI bucket has a higher native clutch bucket priority, schedule it */
525 return true;
526 }
527 }
528 return false;
529 }
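
/*
 * Worked example (illustrative): if the AboveUI root bucket's highest clutch bucket
 * priority is 60 and the timeshare FG root bucket's highest is 55, the AboveUI bucket
 * is selected immediately (ties also favor AboveUI, since the compare is >= 0). If the
 * FG clutch bucket were at 61 instead, this routine returns false and selection falls
 * through to the EDF/warp logic.
 */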
530
531
532 /*
533 * sched_clutch_root_highest_root_bucket()
534 *
535 * Main routine to find the highest runnable root level bucket.
536 * This routine is called from performance sensitive contexts; so it is
537 * crucial to keep this O(1).
538 *
539 */
540 static sched_clutch_root_bucket_t
541 sched_clutch_root_highest_root_bucket(
542 sched_clutch_root_t root_clutch,
543 uint64_t timestamp)
544 {
545 sched_clutch_hierarchy_locked_assert(root_clutch);
546 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
547 return NULL;
548 }
549
550 if (sched_clutch_root_select_aboveui(root_clutch)) {
551 return &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
552 }
553
554 /*
555 * Above UI bucket is not runnable or has a low priority clutch bucket; use the earliest deadline model
556 * to schedule threads. The idea is that as the timeshare buckets use CPU, they will drop their
557 * interactivity score and allow low priority AboveUI clutch buckets to be scheduled.
558 */
559
560 /* Find the earliest deadline bucket */
561 sched_clutch_root_bucket_t edf_bucket = priority_queue_min(&root_clutch->scr_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
562
563 sched_clutch_root_bucket_t warp_bucket = NULL;
564 int warp_bucket_index = -1;
565 evaluate_warp_buckets:
566 /* Check if any higher runnable buckets have warp available */
567 warp_bucket_index = bitmap_lsb_first(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
568
569 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
570 /* No higher buckets have warp available; choose the edf bucket and replenish its warp */
571 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
572 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
573 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
574 bitmap_set(root_clutch->scr_warp_available, edf_bucket->scrb_bucket);
575 return edf_bucket;
576 }
577
578 /*
579 * Looks like there is a root bucket which is higher in the natural priority
580 * order than edf_bucket and might have some warp remaining.
581 */
582 warp_bucket = &root_clutch->scr_buckets[warp_bucket_index];
583 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
584 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
585 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
586 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
587 return warp_bucket;
588 }
589 if (warp_bucket->scrb_warped_deadline > timestamp) {
590 /* Root bucket already has a warp window open with some warp remaining */
591 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
592 return warp_bucket;
593 }
594
595 /* For this bucket, warp window was opened sometime in the past but has now
596 * expired. Mark the bucket as not available for warp anymore and re-run the
597 * warp bucket selection logic.
598 */
599 warp_bucket->scrb_warp_remaining = 0;
600 bitmap_clear(root_clutch->scr_warp_available, warp_bucket->scrb_bucket);
601 goto evaluate_warp_buckets;
602 }
603
604 /*
605 * sched_clutch_root_bucket_deadline_calculate()
606 *
607 * Calculate the deadline for the bucket based on its WCEL
608 */
609 static uint64_t
610 sched_clutch_root_bucket_deadline_calculate(
611 sched_clutch_root_bucket_t root_bucket,
612 uint64_t timestamp)
613 {
614 /* For fixpri AboveUI bucket always return it as the earliest deadline */
615 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
616 return 0;
617 }
618
619 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
620 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
621 }
622
623 /*
624 * sched_clutch_root_bucket_deadline_update()
625 *
626 * Routine to update the deadline of the root bucket when it is selected.
627 * Updating the deadline also moves the root_bucket in the EDF priority
628 * queue.
629 */
630 static void
631 sched_clutch_root_bucket_deadline_update(
632 sched_clutch_root_bucket_t root_bucket,
633 sched_clutch_root_t root_clutch,
634 uint64_t timestamp)
635 {
636 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
637 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
638 return;
639 }
640 uint64_t old_deadline = root_bucket->scrb_deadline;
641 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
642 assert(old_deadline <= new_deadline);
643 if (old_deadline != new_deadline) {
644 root_bucket->scrb_deadline = new_deadline;
645 /* Since the priority queue is a min-heap, use the decrease routine even though the deadline has a larger value now */
646 priority_queue_entry_decrease(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
647 }
648 }
649
650 /*
651 * sched_clutch_root_bucket_runnable()
652 *
653 * Routine to insert a newly runnable root bucket into the hierarchy.
654 * Also updates the deadline and warp parameters as necessary.
655 */
656 static void
657 sched_clutch_root_bucket_runnable(
658 sched_clutch_root_bucket_t root_bucket,
659 sched_clutch_root_t root_clutch,
660 uint64_t timestamp)
661 {
662 /* Mark the root bucket as runnable */
663 bitmap_set(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
664 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
665 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, 0, 0, 0);
666
667 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
668 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
669 return;
670 }
671
672 root_bucket->scrb_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
673 priority_queue_insert(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
674
675 if (root_bucket->scrb_warp_remaining) {
676 /* Since the bucket has some warp remaining and it's now runnable, mark it as available for warp */
677 bitmap_set(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
678 }
679 }
680
681 /*
682 * sched_clutch_root_bucket_empty()
683 *
684 * Routine to remove an empty root bucket from the hierarchy.
685 * Also updates the deadline and warp parameters as necessary.
686 */
687 static void
688 sched_clutch_root_bucket_empty(
689 sched_clutch_root_bucket_t root_bucket,
690 sched_clutch_root_t root_clutch,
691 uint64_t timestamp)
692 {
693 bitmap_clear(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
694 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
695 root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0, 0);
696
697 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
698 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
699 return;
700 }
701
702 priority_queue_remove(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, sched_clutch_root_bucket_compare);
703
704 bitmap_clear(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
705 if (root_bucket->scrb_warped_deadline > timestamp) {
706 /*
707 * For root buckets that were using the warp, check if the warp
708 * deadline is in the future. If yes, remove the wall time the
709 * warp was active and update the warp remaining. This allows
710 * the root bucket to use the remaining warp the next time it
711 * becomes runnable.
712 */
713 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
714 } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
715 /*
716 * If the root bucket's warped deadline is in the past, it has used up
717 * all the warp it was assigned. Empty out its warp remaining.
718 */
719 root_bucket->scrb_warp_remaining = 0;
720 }
721 }
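
/*
 * For example (illustrative): a root bucket that opened an 8ms warp window and goes
 * empty 3ms into it keeps roughly 5ms of warp remaining for the next time it becomes
 * runnable; if it goes empty only after the window has expired, its remaining warp is 0.
 */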
722
723 /*
724 * sched_clutch_root_pri_update()
725 *
726 * The root level priority is used for thread selection and preemption
727 * logic.
728 */
729 static void
730 sched_clutch_root_pri_update(
731 sched_clutch_root_t root_clutch)
732 {
733 sched_clutch_hierarchy_locked_assert(root_clutch);
734 if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
735 /* No runnable root buckets */
736 root_clutch->scr_priority = NOPRI;
737 assert(root_clutch->scr_urgency == 0);
738 return;
739 }
740 sched_clutch_root_bucket_t root_bucket = NULL;
741 /* Special case for AboveUI (uses same logic as thread selection) */
742 if (sched_clutch_root_select_aboveui(root_clutch)) {
743 root_bucket = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
744 } else {
745 /*
746 * AboveUI bucket is not runnable or has a low clutch bucket priority,
747 * select the next runnable root bucket in natural priority order. This logic
748 * is slightly different from thread selection, because thread selection
749 * considers deadlines, warps etc. to decide the most optimal bucket at a
750 * given timestamp. Since the priority value is used for preemption decisions
751 * only, it needs to be based on the highest runnable thread available in
752 * the timeshare domain.
753 */
754 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
755 assert(root_bucket_index != -1);
756 root_bucket = &root_clutch->scr_buckets[root_bucket_index];
757 }
758 /* For the selected root bucket, find the highest priority clutch bucket */
759 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
760 root_clutch->scr_priority = priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
761 }
762
763 /*
764 * sched_clutch_root_urgency_inc()
765 *
766 * Routine to increment the urgency at the root level based on the thread
767 * priority that is being inserted into the hierarchy. The root urgency
768 * counter is updated based on the urgency of threads in any of the
769 * clutch buckets which are part of the hierarchy.
770 *
771 * Always called with the pset lock held.
772 */
773 static void
774 sched_clutch_root_urgency_inc(
775 sched_clutch_root_t root_clutch,
776 thread_t thread)
777 {
778 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
779 root_clutch->scr_urgency++;
780 }
781 }
782
783 /*
784 * sched_clutch_root_urgency_dec()
785 *
786 * Routine to decrement the urgency at the root level based on the thread
787 * priority that is being removed from the hierarchy. The root urgency
788 * counter is updated based on the urgency of threads in any of the
789 * clutch buckets which are part of the hierarchy.
790 *
791 * Always called with the pset lock held.
792 */
793 static void
794 sched_clutch_root_urgency_dec(
795 sched_clutch_root_t root_clutch,
796 thread_t thread)
797 {
798 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
799 root_clutch->scr_urgency--;
800 }
801 }
802
803 /*
804 * Clutch bucket level scheduling
805 *
806 * The second level of scheduling is the clutch bucket level scheduling
807 * which tries to schedule thread groups within root_buckets. Each
808 * clutch represents a thread group and a clutch_bucket represents
809 * threads at a particular sched_bucket within that thread group. The
810 * goal of this level of scheduling is to allow interactive thread
811 * groups low latency access to the CPU. It also provides slight
812 * scheduling preference for App and unrestricted thread groups.
813 *
814 * The clutch bucket scheduling algorithm measures an interactivity
815 * score for all clutch buckets. The interactivity score is based
816 * on the ratio of the CPU used and the voluntary blocking of threads
817 * within the clutch bucket. The algorithm is very close to the ULE
818 * scheduler on FreeBSD in terms of calculations. The interactivity
819 * score provides an interactivity boost in the range of
820 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
821 * thread groups to win over CPU spinners.
822 */
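
/*
 * With the default SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI of 8, the interactivity score
 * computed below falls in [0:16]: clutch buckets that block more than they use CPU
 * land above 8, while CPU-bound clutch buckets land at or below 8. (Descriptive
 * summary of sched_clutch_bucket_interactivity_score_calculate().)
 */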
823
824 /* Priority boost range for interactivity */
825 #define SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT (8)
826 uint8_t sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
827
828 /* Window to scale the cpu usage and blocked values (currently 500ms); it's the threshold of used+blocked */
829 uint64_t sched_clutch_bucket_adjust_threshold = 0;
830 #define SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS (500000)
831
832 /* The ratio to scale the cpu/blocked time per window */
833 #define SCHED_CLUTCH_BUCKET_ADJUST_RATIO (10)
834
835 /* Rate at which the interactivity score is recalculated. This keeps the score smooth even with extremely bursty behavior */
836 uint64_t sched_clutch_bucket_interactivity_delta = 0;
837 #define SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT (25000)
838
839 /*
840 * In order to allow App thread groups some preference over daemon thread
841 * groups, the App clutch_buckets get an 8 point boost. The boost value should
842 * be chosen such that badly behaved apps are still penalized over well
843 * behaved interactive daemon clutch_buckets.
844 */
845 #define SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT (8)
846 uint8_t sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
847
848 /* Initial value for voluntary blocking time for the clutch_bucket */
849 #define SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID (uint32_t)(~0)
850
851 /*
852 * sched_clutch_bucket_init()
853 *
854 * Initializer for clutch buckets.
855 */
856 static void
857 sched_clutch_bucket_init(
858 sched_clutch_bucket_t clutch_bucket,
859 sched_clutch_t clutch,
860 sched_bucket_t bucket)
861 {
862 bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));
863
864 clutch_bucket->scb_bucket = bucket;
865 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
866 clutch_bucket->scb_priority = 0;
867 /*
868 * All thread groups should be initialized to be interactive; this allows the newly launched
869 * thread groups to fairly compete with already running thread groups.
870 */
871 clutch_bucket->scb_interactivity_score = (sched_clutch_bucket_interactive_pri * 2);
872 clutch_bucket->scb_foreign = false;
873
874 os_atomic_store(&clutch_bucket->scb_timeshare_tick, 0, relaxed);
875 os_atomic_store(&clutch_bucket->scb_pri_shift, INT8_MAX, relaxed);
876
877 clutch_bucket->scb_interactivity_ts = 0;
878 clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID;
879 clutch_bucket->scb_clutch = clutch;
880 clutch_bucket->scb_root = NULL;
881 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
882 run_queue_init(&clutch_bucket->scb_runq);
883 }
884
885 /*
886 * sched_clutch_init_with_thread_group()
887 *
888 * Initialize the sched_clutch when the thread group is being created
889 */
890 void
891 sched_clutch_init_with_thread_group(
892 sched_clutch_t clutch,
893 struct thread_group *tg)
894 {
895 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
896
897 /* Initialize all the clutch buckets */
898 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
899 sched_clutch_bucket_init(&(clutch->sc_clutch_buckets[i]), clutch, i);
900 }
901
902 /* Grouping specific fields */
903 clutch->sc_tg = tg;
904 os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
905 }
906
907 /*
908 * sched_clutch_destroy()
909 *
910 * Destructor for clutch; called from thread group release code.
911 */
912 void
913 sched_clutch_destroy(
914 __unused sched_clutch_t clutch)
915 {
916 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
917 }
918
919 #if __AMP__
920
921 /*
922 * sched_clutch_bucket_foreign()
923 *
924 * Identifies if the clutch bucket is foreign, i.e. not recommended for this
925 * hierarchy. This is possible when the recommended hierarchy/pset is not
926 * currently available for scheduling.
927 */
928 static boolean_t
929 sched_clutch_bucket_foreign(sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket)
930 {
931 assert(clutch_bucket->scb_thr_count > 0);
932 if (!sched_clutch_pset_available(root_clutch->scr_pset)) {
933 /* Even though the pset was not available for scheduling, threads
934 * are being put in its runq (this might be due to the other pset
935 * being turned off and this being the master processor pset).
936 * Mark the clutch bucket as foreign so that when the other
937 * pset becomes available, it moves the clutch bucket accordingly.
938 */
939 return true;
940 }
941 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
942 pset_cluster_type_t pset_type = recommended_pset_type(thread);
943 return pset_type != root_clutch->scr_pset->pset_cluster_type;
944 }
945
946 #endif /* __AMP__ */
947
948 /*
949 * sched_clutch_bucket_hierarchy_insert()
950 *
951 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
952 */
953 static void
954 sched_clutch_bucket_hierarchy_insert(
955 sched_clutch_root_t root_clutch,
956 sched_clutch_bucket_t clutch_bucket,
957 sched_bucket_t bucket,
958 uint64_t timestamp,
959 sched_clutch_bucket_options_t options)
960 {
961 sched_clutch_hierarchy_locked_assert(root_clutch);
962 if (bucket > TH_BUCKET_FIXPRI) {
963 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
964 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
965 }
966 #if __AMP__
967 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
968 if (sched_clutch_bucket_foreign(root_clutch, clutch_bucket)) {
969 clutch_bucket->scb_foreign = true;
970 enqueue_tail(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
971 }
972 #endif /* __AMP__ */
973 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
974
975 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
976 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
977 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
978 }
979
980 /* Insert the clutch bucket into the root bucket run queue with order based on options */
981 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
982 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
983 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
984 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0);
985 }
986
987 /*
988 * sched_clutch_bucket_hierarchy_remove()
989 *
990 * Routine to remove an empty clutch bucket from the root hierarchy.
991 */
992 static void
993 sched_clutch_bucket_hierarchy_remove(
994 sched_clutch_root_t root_clutch,
995 sched_clutch_bucket_t clutch_bucket,
996 sched_bucket_t bucket,
997 uint64_t timestamp,
998 __unused sched_clutch_bucket_options_t options)
999 {
1000 sched_clutch_hierarchy_locked_assert(root_clutch);
1001 if (bucket > TH_BUCKET_FIXPRI) {
1002 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1003 remqueue(&clutch_bucket->scb_listlink);
1004 }
1005 #if __AMP__
1006 if (clutch_bucket->scb_foreign) {
1007 clutch_bucket->scb_foreign = false;
1008 remqueue(&clutch_bucket->scb_foreignlink);
1009 }
1010 #endif /* __AMP__ */
1011
1012 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
1013
1014 /* Remove the clutch bucket from the root bucket priority queue */
1015 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1016 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1017 clutch_bucket->scb_blocked_ts = timestamp;
1018 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
1019 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0);
1020
1021 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1022 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1023 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1024 }
1025 }
1026
1027 /*
1028 * sched_clutch_bucket_base_pri()
1029 *
1030 * Calculates the "base" priority of the clutch bucket. The base
1031 * priority of the clutch bucket is the max of the highest base_pri
1032 * and highest sched_pri in the clutch bucket, plus any grouping
1033 * specific (App/Daemon...) boost applicable to the
1034 * clutch_bucket.
1035 */
1036 static uint8_t
1037 sched_clutch_bucket_base_pri(
1038 sched_clutch_bucket_t clutch_bucket)
1039 {
1040 uint8_t clutch_boost = 0;
1041 assert(clutch_bucket->scb_runq.count != 0);
1042
1043 sched_clutch_t clutch = clutch_bucket->scb_clutch;
1044
1045 /*
1046 * Since the clutch bucket can contain threads that are members of the group due
1047 * to the sched_pri being promoted or due to their base pri, the base priority of
1048 * the entire clutch bucket should be based on the highest thread (promoted or base)
1049 * in the clutch bucket.
1050 */
1051 uint8_t max_pri = priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq) ? 0 : priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
1052
1053 /*
1054 * For all AboveUI clutch buckets and clutch buckets for thread groups that
1055 * haven't been specified as SCHED_CLUTCH_TG_PRI_LOW, give a priority boost
1056 */
1057 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
1058 (os_atomic_load(&clutch->sc_tg_priority, relaxed) != SCHED_CLUTCH_TG_PRI_LOW)) {
1059 clutch_boost = sched_clutch_bucket_pri_boost;
1060 }
1061 return max_pri + clutch_boost;
1062 }
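
/*
 * Example (illustrative): a clutch bucket whose highest thread priority is 37 and
 * whose thread group is not SCHED_CLUTCH_TG_PRI_LOW gets a base priority of
 * 37 + sched_clutch_bucket_pri_boost, i.e. 45 with the default boost of 8.
 */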
1063
1064 /*
1065 * sched_clutch_bucket_interactivity_score_calculate()
1066 *
1067 * Routine to calculate the interactivity score for the clutch bucket. The
1068 * interactivity score is based on the ratio of CPU used by all threads in
1069 * the bucket and the blocked time of the bucket as a whole.
1070 */
1071 static uint8_t
1072 sched_clutch_bucket_interactivity_score_calculate(
1073 sched_clutch_bucket_t clutch_bucket,
1074 uint64_t timestamp)
1075 {
1076 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
1077 /*
1078 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
1079 * priorities, make sure all AboveUI buckets are marked interactive.
1080 */
1081 assert(clutch_bucket->scb_interactivity_score == (2 * sched_clutch_bucket_interactive_pri));
1082 return clutch_bucket->scb_interactivity_score;
1083 }
1084
1085 if (clutch_bucket->scb_interactivity_ts == 0) {
1086 /*
1087 * This indicates a newly initialized clutch bucket; return the default interactivity score
1088 * and update timestamp.
1089 */
1090 clutch_bucket->scb_interactivity_ts = timestamp;
1091 return clutch_bucket->scb_interactivity_score;
1092 }
1093
1094 if (timestamp < (clutch_bucket->scb_interactivity_ts + sched_clutch_bucket_interactivity_delta)) {
1095 return clutch_bucket->scb_interactivity_score;
1096 }
1097
1098 /* Check if the clutch bucket accounting needs to be scaled */
1099 sched_clutch_bucket_cpu_adjust(clutch_bucket);
1100 clutch_bucket->scb_interactivity_ts = timestamp;
1101
1102 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1103 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
1104 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1105 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1106
1107 /*
1108 * In extremely CPU contended cases, it is possible that the clutch bucket has been runnable
1109 * for a long time but none of its threads have been picked up for execution. In that case, both
1110 * the CPU used and blocked would be 0.
1111 */
1112 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1113 return clutch_bucket->scb_interactivity_score;
1114 }
1115
1116 /*
1117 * For all timeshare buckets, calculate the interactivity score of the bucket
1118 * and add it to the base priority
1119 */
1120 uint8_t interactive_score = 0;
1121 if (cpu_blocked > cpu_used) {
1122 /* Interactive clutch_bucket case */
1123 interactive_score = sched_clutch_bucket_interactive_pri +
1124 ((sched_clutch_bucket_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1125 } else {
1126 /* Non-interactive clutch_bucket case */
1127 interactive_score = ((sched_clutch_bucket_interactive_pri * cpu_blocked) / cpu_used);
1128 }
1129 clutch_bucket->scb_interactivity_score = interactive_score;
1130 return interactive_score;
1131 }
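
/*
 * Worked example of the two branches above (illustrative, default interactive pri of 8,
 * cpu_used/cpu_blocked in the same time units):
 *  - cpu_blocked = 30, cpu_used = 10 -> score = 8 + (8 * 20) / 30 = 13 (interactive)
 *  - cpu_blocked = 10, cpu_used = 40 -> score = (8 * 10) / 40      = 2  (CPU bound)
 */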
1132
1133 /*
1134 * sched_clutch_bucket_pri_calculate()
1135 *
1136 * The priority calculation algorithm for the clutch_bucket is a slight
1137 * modification on the ULE interactivity score. It uses the base priority
1138 * of the clutch bucket and applies an interactivity score boost to the
1139 * highly responsive clutch buckets.
1140 */
1141
1142 static uint8_t
1143 sched_clutch_bucket_pri_calculate(
1144 sched_clutch_bucket_t clutch_bucket,
1145 uint64_t timestamp)
1146 {
1147 /* For empty clutch buckets, return priority 0 */
1148 if (clutch_bucket->scb_thr_count == 0) {
1149 return 0;
1150 }
1151
1152 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1153 uint8_t interactive_score = sched_clutch_bucket_interactivity_score_calculate(clutch_bucket, timestamp);
1154
1155 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1156 uint8_t pri = base_pri + interactive_score;
1157 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1158 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1159 return pri;
1160 }
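
/*
 * E.g. (illustrative, continuing the examples above): with base_pri = 45 and
 * interactive_score = 13, the clutch bucket priority used for ordering in the root
 * bucket runq is 58.
 */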
1161
1162 /*
1163 * sched_clutch_root_bucket_highest_clutch_bucket()
1164 *
1165 * Routine to find the highest priority clutch bucket
1166 * within the root bucket.
1167 */
1168 static sched_clutch_bucket_t
1169 sched_clutch_root_bucket_highest_clutch_bucket(
1170 sched_clutch_root_bucket_t root_bucket)
1171 {
1172 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1173 return NULL;
1174 }
1175 return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1176 }
1177
1178 /*
1179 * sched_clutch_bucket_runnable()
1180 *
1181 * Perform all operations needed when a new clutch bucket becomes runnable.
1182 * It involves inserting the clutch_bucket into the hierarchy and updating the
1183 * root priority appropriately.
1184 */
1185 static boolean_t
1186 sched_clutch_bucket_runnable(
1187 sched_clutch_bucket_t clutch_bucket,
1188 sched_clutch_root_t root_clutch,
1189 uint64_t timestamp,
1190 sched_clutch_bucket_options_t options)
1191 {
1192 sched_clutch_hierarchy_locked_assert(root_clutch);
1193 sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp);
1194 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1195 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1196 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1197 sched_clutch_bucket_timeshare_update(clutch_bucket);
1198 int16_t root_old_pri = root_clutch->scr_priority;
1199 sched_clutch_root_pri_update(root_clutch);
1200 return root_clutch->scr_priority > root_old_pri;
1201 }
1202
1203 /*
1204 * sched_clutch_bucket_update()
1205 *
1206 * Update the clutch_bucket's position in the hierarchy. This routine is
1207 * called when a new thread is inserted or removed from a runnable clutch
1208 * bucket. The options specify some properties about the clutch bucket
1209 * insertion order into the clutch bucket runq.
1210 */
1211 static boolean_t
1212 sched_clutch_bucket_update(
1213 sched_clutch_bucket_t clutch_bucket,
1214 sched_clutch_root_t root_clutch,
1215 uint64_t timestamp,
1216 sched_clutch_bucket_options_t options)
1217 {
1218 sched_clutch_hierarchy_locked_assert(root_clutch);
1219 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1220 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1221 if (new_pri == clutch_bucket->scb_priority) {
1222 /*
1223 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1224 * to the end of the runq. Typically used when a thread is selected for execution
1225 * from a clutch bucket.
1226 */
1227 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1228 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1229 }
1230 return false;
1231 }
1232 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1233 clutch_bucket->scb_priority = new_pri;
1234 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1235
1236 int16_t root_old_pri = root_clutch->scr_priority;
1237 sched_clutch_root_pri_update(root_clutch);
1238 return root_clutch->scr_priority > root_old_pri;
1239 }
1240
1241 /*
1242 * sched_clutch_bucket_empty()
1243 *
1244 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1245 * It involves removing the clutch bucket from the hierarchy and updating the root
1246 * priority appropriately.
1247 */
1248 static void
1249 sched_clutch_bucket_empty(
1250 sched_clutch_bucket_t clutch_bucket,
1251 sched_clutch_root_t root_clutch,
1252 uint64_t timestamp,
1253 sched_clutch_bucket_options_t options)
1254 {
1255 sched_clutch_hierarchy_locked_assert(root_clutch);
1256 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1257 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1258 sched_clutch_root_pri_update(root_clutch);
1259 }
1260
1261 /*
1262 * sched_clutch_cpu_usage_update()
1263 *
1264 * Routine to update CPU usage of the thread in the hierarchy.
1265 */
1266 void
1267 sched_clutch_cpu_usage_update(
1268 thread_t thread,
1269 uint64_t delta)
1270 {
1271 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1272 return;
1273 }
1274 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1275 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1276 sched_clutch_bucket_cpu_usage_update(clutch_bucket, delta);
1277 }
1278
1279 /*
1280 * sched_clutch_bucket_cpu_usage_update()
1281 *
1282 * Routine to update the CPU usage of the clutch_bucket.
1283 */
1284 static void
1285 sched_clutch_bucket_cpu_usage_update(
1286 sched_clutch_bucket_t clutch_bucket,
1287 uint64_t delta)
1288 {
1289 if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
1290 /* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1291 return;
1292 }
1293
1294 /*
1295 * The CPU usage should not overflow the clutch_cpu_data_t type. Since the usage is used to
1296 * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1297 */
1298 delta = MIN(delta, CLUTCH_CPU_DATA_MAX);
1299 os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1300 }
1301
1302 /*
1303 * sched_clutch_bucket_cpu_blocked_update()
1304 *
1305 * Routine to update CPU blocked time for clutch_bucket.
1306 */
1307 static void
1308 sched_clutch_bucket_cpu_blocked_update(
1309 sched_clutch_bucket_t clutch_bucket,
1310 uint64_t timestamp)
1311 {
1312 if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
1313 (clutch_bucket->scb_blocked_ts == SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID)) {
1314 /* For Above UI bucket and a newly initialized clutch bucket, nothing to do here */
1315 return;
1316 }
1317
1318 uint64_t blocked_time = timestamp - clutch_bucket->scb_blocked_ts;
1319 if (blocked_time > sched_clutch_bucket_adjust_threshold) {
1320 blocked_time = sched_clutch_bucket_adjust_threshold;
1321 }
1322
1323 /*
1324 * The CPU blocked time should not overflow the clutch_cpu_data_t type. Since the blocked time is used to
1325 * calculate the interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
1326 */
1327 blocked_time = MIN(blocked_time, CLUTCH_CPU_DATA_MAX);
1328 clutch_cpu_data_t __assert_only cpu_blocked_orig = os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
1329 /* The blocked time is scaled every so often, so it should never overflow */
1330 assert(blocked_time <= (CLUTCH_CPU_DATA_MAX - cpu_blocked_orig));
1331 }
1332
1333 /*
1334 * sched_clutch_bucket_cpu_adjust()
1335 *
1336 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1337 * than sched_clutch_bucket_adjust_threshold. Allows the values to remain
1338 * manageable and maintain the same ratio, while letting changes in clutch bucket
1339 * behavior be reflected in the interactivity score in a reasonable
1340 * amount of time.
1341 */
1342 static void
1343 sched_clutch_bucket_cpu_adjust(
1344 sched_clutch_bucket_t clutch_bucket)
1345 {
1346 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1347 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1348 do {
1349 old_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
1350 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1351 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1352 if ((cpu_used + cpu_blocked) < sched_clutch_bucket_adjust_threshold) {
1353 return;
1354 }
1355
1356 /*
1357 * The accumulation of CPU used and blocked is past the threshold; scale it
1358 * down to lose old history.
1359 */
1360 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1361 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
1362 } while (!os_atomic_cmpxchg(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed));
1363 }
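
/*
 * For example (illustrative, expressing the abstime values in ms): once
 * cpu_used + cpu_blocked crosses the ~500ms adjust threshold, say with
 * cpu_used = 400ms and cpu_blocked = 150ms, both are divided by
 * SCHED_CLUTCH_BUCKET_ADJUST_RATIO (10) to 40ms and 15ms, preserving the ratio
 * while aging out old history.
 */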
1364
1365 /*
1366 * Thread level scheduling algorithm
1367 *
1368 * The thread level scheduling algorithm uses the mach timeshare
1369 * decay based algorithm to achieve sharing between threads within the
1370 * same clutch bucket. The load/priority shifts etc. are all maintained
1371 * at the clutch bucket level and used for decay calculation of the
1372 * threads. The load sampling is still driven off the scheduler tick
1373 * for runnable clutch buckets (it does not use the new higher frequency
1374 * EWMA based load calculation). The idea is that the contention and load
1375 * within clutch_buckets should be limited enough that threads do not see heavy
1376 * decay and can timeshare effectively.
1377 */
1378
1379 /*
1380 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1381 *
1382 * Increment the run count for the clutch bucket associated with the
1383 * thread.
1384 */
1385 uint32_t
1386 sched_clutch_thread_run_bucket_incr(
1387 thread_t thread,
1388 sched_bucket_t bucket)
1389 {
1390 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1391 return 0;
1392 }
1393 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1394 return sched_clutch_run_bucket_incr(clutch, bucket);
1395 }
1396
1397 static uint32_t
1398 sched_clutch_run_bucket_incr(
1399 sched_clutch_t clutch,
1400 sched_bucket_t bucket)
1401 {
1402 assert(bucket != TH_BUCKET_RUN);
1403 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1404 uint32_t result = os_atomic_inc(&(clutch_bucket->scb_run_count), relaxed);
1405 return result;
1406 }
1407
1408 /*
1409 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1410 *
1411 * Decrement the run count for the clutch bucket associated with the
1412 * thread.
1413 */
1414 uint32_t
1415 sched_clutch_thread_run_bucket_decr(
1416 thread_t thread,
1417 sched_bucket_t bucket)
1418 {
1419 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1420 return 0;
1421 }
1422 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1423 return sched_clutch_run_bucket_decr(clutch, bucket);
1424 }
1425
1426 static uint32_t
1427 sched_clutch_run_bucket_decr(
1428 sched_clutch_t clutch,
1429 sched_bucket_t bucket)
1430 {
1431 assert(bucket != TH_BUCKET_RUN);
1432 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1433 uint32_t result = os_atomic_dec(&(clutch_bucket->scb_run_count), relaxed);
1434 return result;
1435 }
1436
1437 /*
1438 * sched_clutch_bucket_timeshare_update()
1439 *
1440 * Routine to update the load and priority shift for the clutch_bucket every
1441 * sched_tick. For runnable clutch_buckets, the sched tick handling code
1442 * iterates the clutch buckets and calls this routine. For all others, the
1443 * clutch_bucket maintains a "last updated schedtick" parameter. As threads
1444 * become runnable in the clutch bucket, if this value is outdated, the load
1445 * and shifts are updated.
1446 *
1447 * Possible optimization:
1448 * - The current algorithm samples the load every sched tick (125ms).
1449 * This is prone to spikes in runnable counts; if that turns out to be
1450 * a problem, a simple solution would be to do the EWMA trick to sample
1451 * load at every load_tick (30ms) and use the averaged value for the pri
1452 * shift calculation.
1453 */
1454 static void
1455 sched_clutch_bucket_timeshare_update(
1456 sched_clutch_bucket_t clutch_bucket)
1457 {
1458 if (clutch_bucket->scb_bucket < TH_BUCKET_SHARE_FG) {
1459 return;
1460 }
1461
1462 /*
1463 * Update the timeshare parameters for the clutch bucket if they haven't been updated
1464 * in this tick.
1465 */
1466 uint32_t bucket_sched_ts = os_atomic_load(&clutch_bucket->scb_timeshare_tick, relaxed);
1467 uint32_t current_sched_ts = sched_tick;
1468 if (bucket_sched_ts != current_sched_ts) {
1469 os_atomic_store(&clutch_bucket->scb_timeshare_tick, current_sched_ts, relaxed);
1470 uint32_t bucket_load = (os_atomic_load(&clutch_bucket->scb_run_count, relaxed) / processor_avail_count);
1471 bucket_load = MIN(bucket_load, NRQS - 1);
1472 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_load];
1473 os_atomic_store(&clutch_bucket->scb_pri_shift, pri_shift, relaxed);
1474 }
1475 }
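/*
 * A worked example of the computation above, with hypothetical numbers (the
 * real sched_load_shifts[] contents are populated by the timeshare init code
 * and may differ): with 24 runnable threads in the bucket and
 * processor_avail_count == 8, bucket_load == 3 and the resulting pri_shift is
 * sched_fixed_shift - sched_load_shifts[3], i.e. a smaller shift (and hence
 * faster decay) than for a bucket whose load rounds down to 0.
 */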
1476
1477 /*
1478 * sched_clutch_thread_clutch_update()
1479 *
1480 * Routine called when the thread changes its thread group. The current
1481 * implementation relies on the fact that the thread group is changed only
1482 * from the context of the thread itself. Due to this fact, the thread
1483 * group change causes only counter updates in the old & new clutch
1484 * buckets and no hierarchy changes. The routine also attributes the CPU
1485 * used so far to the old clutch.
1486 */
1487 void
1488 sched_clutch_thread_clutch_update(
1489 thread_t thread,
1490 sched_clutch_t old_clutch,
1491 sched_clutch_t new_clutch)
1492 {
1493 uint32_t cpu_delta;
1494 assert(current_thread() == thread);
1495
1496 if (old_clutch) {
1497 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1498 /*
1499 * Calculate the CPU used by this thread in the old bucket and
1500 * add it to the old clutch bucket. This uses the same CPU usage
1501 * logic as update_priority etc.
1502 */
1503 thread_timer_delta(thread, cpu_delta);
1504 if (thread->pri_shift < INT8_MAX) {
1505 thread->sched_usage += cpu_delta;
1506 }
1507 thread->cpu_delta += cpu_delta;
1508 sched_clutch_bucket_cpu_usage_update(&(old_clutch->sc_clutch_buckets[thread->th_sched_bucket]), cpu_delta);
1509 }
1510
1511 if (new_clutch) {
1512 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1513 }
1514 }
1515
1516 /* Thread Insertion/Removal/Selection routines */
1517
1518 /*
1519 * sched_clutch_thread_insert()
1520 *
1521 * Routine to insert a thread into the sched clutch hierarchy.
1522 * Update the counts at all levels of the hierarchy and insert the nodes
1523 * as they become runnable. Always called with the pset lock held.
1524 */
1525 static boolean_t
1526 sched_clutch_thread_insert(
1527 sched_clutch_root_t root_clutch,
1528 thread_t thread,
1529 integer_t options)
1530 {
1531 boolean_t result = FALSE;
1532
1533 sched_clutch_hierarchy_locked_assert(root_clutch);
1534 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1535 assert(thread->thread_group == clutch->sc_tg);
1536
1537 uint64_t current_timestamp = mach_absolute_time();
1538 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1539 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
1540
1541 /* Insert thread into the clutch_bucket runq using sched_pri */
1542 run_queue_enqueue(&clutch_bucket->scb_runq, thread, options);
1543 /* Increment the urgency counter for the root if necessary */
1544 sched_clutch_root_urgency_inc(root_clutch, thread);
1545
1546 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
1547 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1548 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri,
1549 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1550 os_atomic_inc(&clutch->sc_thr_count, relaxed);
1551 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1552 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0);
1553
1554 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
1555 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
1556 if (clutch_bucket->scb_thr_count == 0) {
1557 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1558 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1559 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
1560 } else {
1561 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
1562 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1563 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
1564 }
1565 return result;
1566 }
1567
1568 /*
1569 * sched_clutch_thread_remove()
1570 *
1571 * Routine to remove a thread from the sched clutch hierarchy.
1572 * Update the counts at all levels of the hierarchy and remove the nodes
1573 * as they become empty. Always called with the pset lock held.
1574 */
1575 static void
1576 sched_clutch_thread_remove(
1577 sched_clutch_root_t root_clutch,
1578 thread_t thread,
1579 uint64_t current_timestamp,
1580 sched_clutch_bucket_options_t options)
1581 {
1582 sched_clutch_hierarchy_locked_assert(root_clutch);
1583 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1584 assert(thread->thread_group == clutch->sc_tg);
1585 assert(thread->runq != PROCESSOR_NULL);
1586
1587 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
1588 assert(clutch_bucket->scb_root == root_clutch);
1589
1590 /* Decrement the urgency counter for the root if necessary */
1591 sched_clutch_root_urgency_dec(root_clutch, thread);
1592 /* Remove thread from the clutch_bucket */
1593 run_queue_remove(&clutch_bucket->scb_runq, thread);
1594
1595 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
1596 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1597 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
1598 thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_EMPTY, 0);
1599
1600 /* Update counts at various levels of the hierarchy */
1601 os_atomic_dec(&clutch->sc_thr_count, relaxed);
1602 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1603 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
1604
1605 /* Remove the clutch from hierarchy (if needed) and update properties */
1606 if (clutch_bucket->scb_thr_count == 0) {
1607 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
1608 } else {
1609 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
1610 }
1611 }
1612
1613 /*
1614 * sched_clutch_thread_highest()
1615 *
1616 * Routine to find and remove the highest priority thread
1617 * from the sched clutch hierarchy. The algorithm looks at the
1618 * hierarchy for the most eligible runnable thread and calls
1619 * sched_clutch_thread_remove(). Always called with the
1620 * pset lock held.
1621 */
1622 static thread_t
1623 sched_clutch_thread_highest(
1624 sched_clutch_root_t root_clutch)
1625 {
1626 sched_clutch_hierarchy_locked_assert(root_clutch);
1627 uint64_t current_timestamp = mach_absolute_time();
1628
1629 /* Select the highest priority root bucket */
1630 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp);
1631 if (root_bucket == NULL) {
1632 return THREAD_NULL;
1633 }
1634 /* Since a thread is being picked from this root bucket, update its deadline */
1635 sched_clutch_root_bucket_deadline_update(root_bucket, root_clutch, current_timestamp);
1636
1637 /* Find the highest priority clutch bucket in this root bucket */
1638 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
1639 assert(clutch_bucket != NULL);
1640
1641 /* Find the highest priority runnable thread in this clutch bucket */
1642 thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
1643 assert(thread != NULL);
1644
1645 /* Remove and return the thread from the hierarchy; also round robin the clutch bucket if the priority remains unchanged */
1646 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
1647 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
1648 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
1649 return thread;
1650 }
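/*
 * In outline, the selection above is a three level walk of the hierarchy (a
 * restatement of the control flow implemented above, not additional
 * behavior):
 *
 *     root_bucket   = highest root bucket (deadline/warp based pick across QoS levels)
 *     clutch_bucket = highest priority clutch bucket within that root bucket
 *     thread        = run_queue_peek(&clutch_bucket->scb_runq)
 *     sched_clutch_thread_remove(root, thread, now, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR)
 */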
1651
1652
1653 /* High level global accessor routines */
1654
1655 /*
1656 * sched_clutch_root_urgency()
1657 *
1658 * Routine to get the urgency of the highest runnable
1659 * thread in the hierarchy.
1660 */
1661 static uint32_t
1662 sched_clutch_root_urgency(
1663 sched_clutch_root_t root_clutch)
1664 {
1665 return root_clutch->scr_urgency;
1666 }
1667
1668 /*
1669 * sched_clutch_root_count_sum()
1670 *
1671 * The count_sum mechanism is used for scheduler runq
1672 * statistics calculation. It's only useful for debugging
1673 * purposes; since it requires a mach_absolute_time() call on
1674 * other scheduler implementations, it's better to avoid
1675 * populating this until absolutely necessary.
1676 */
1677 static uint32_t
1678 sched_clutch_root_count_sum(
1679 __unused sched_clutch_root_t root_clutch)
1680 {
1681 return 0;
1682 }
1683
1684 /*
1685 * sched_clutch_root_priority()
1686 *
1687 * Routine to get the priority of the highest runnable
1688 * thread in the hierarchy.
1689 */
1690 static int
1691 sched_clutch_root_priority(
1692 sched_clutch_root_t root_clutch)
1693 {
1694 return root_clutch->scr_priority;
1695 }
1696
1697 /*
1698 * sched_clutch_root_count()
1699 *
1700 * Returns total number of runnable threads in the hierarchy.
1701 */
1702 uint32_t
1703 sched_clutch_root_count(
1704 sched_clutch_root_t root_clutch)
1705 {
1706 return root_clutch->scr_thr_count;
1707 }
1708
1709 /*
1710 * sched_clutch_thread_pri_shift()
1711 *
1712 * Routine to get the priority shift value for a thread.
1713 * Since the timesharing is done at the clutch_bucket level,
1714 * this routine gets the clutch_bucket and retrieves the
1715 * values from there.
1716 */
1717 uint32_t
1718 sched_clutch_thread_pri_shift(
1719 thread_t thread,
1720 sched_bucket_t bucket)
1721 {
1722 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1723 return UINT8_MAX;
1724 }
1725 assert(bucket != TH_BUCKET_RUN);
1726 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1727 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
1728 return os_atomic_load(&clutch_bucket->scb_pri_shift, relaxed);
1729 }
1730
1731 #pragma mark -- Clutch Scheduler Algorithm
1732
1733 static void
1734 sched_clutch_init(void);
1735
1736 static void
1737 sched_clutch_timebase_init(void);
1738
1739 static thread_t
1740 sched_clutch_steal_thread(processor_set_t pset);
1741
1742 static void
1743 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
1744
1745 static boolean_t
1746 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
1747 sched_options_t options);
1748
1749 static boolean_t
1750 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
1751
1752 static ast_t
1753 sched_clutch_processor_csw_check(processor_t processor);
1754
1755 static boolean_t
1756 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
1757
1758 static int
1759 sched_clutch_runq_count(processor_t processor);
1760
1761 static boolean_t
1762 sched_clutch_processor_queue_empty(processor_t processor);
1763
1764 static uint64_t
1765 sched_clutch_runq_stats_count_sum(processor_t processor);
1766
1767 static int
1768 sched_clutch_processor_bound_count(processor_t processor);
1769
1770 static void
1771 sched_clutch_pset_init(processor_set_t pset);
1772
1773 static void
1774 sched_clutch_processor_init(processor_t processor);
1775
1776 static thread_t
1777 sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
1778
1779 static void
1780 sched_clutch_processor_queue_shutdown(processor_t processor);
1781
1782 static sched_mode_t
1783 sched_clutch_initial_thread_sched_mode(task_t parent_task);
1784
1785 static uint32_t
1786 sched_clutch_initial_quantum_size(thread_t thread);
1787
1788 static bool
1789 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
1790
1791 static uint32_t
1792 sched_clutch_run_incr(thread_t thread);
1793
1794 static uint32_t
1795 sched_clutch_run_decr(thread_t thread);
1796
1797 static void
1798 sched_clutch_update_thread_bucket(thread_t thread);
1799
1800 const struct sched_dispatch_table sched_clutch_dispatch = {
1801 .sched_name = "clutch",
1802 .init = sched_clutch_init,
1803 .timebase_init = sched_clutch_timebase_init,
1804 .processor_init = sched_clutch_processor_init,
1805 .pset_init = sched_clutch_pset_init,
1806 .maintenance_continuation = sched_timeshare_maintenance_continue,
1807 .choose_thread = sched_clutch_choose_thread,
1808 .steal_thread_enabled = sched_steal_thread_enabled,
1809 .steal_thread = sched_clutch_steal_thread,
1810 .compute_timeshare_priority = sched_compute_timeshare_priority,
1811 .choose_processor = choose_processor,
1812 .processor_enqueue = sched_clutch_processor_enqueue,
1813 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
1814 .processor_queue_remove = sched_clutch_processor_queue_remove,
1815 .processor_queue_empty = sched_clutch_processor_queue_empty,
1816 .priority_is_urgent = priority_is_urgent,
1817 .processor_csw_check = sched_clutch_processor_csw_check,
1818 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
1819 .initial_quantum_size = sched_clutch_initial_quantum_size,
1820 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
1821 .can_update_priority = can_update_priority,
1822 .update_priority = update_priority,
1823 .lightweight_update_priority = lightweight_update_priority,
1824 .quantum_expire = sched_default_quantum_expire,
1825 .processor_runq_count = sched_clutch_runq_count,
1826 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
1827 .processor_bound_count = sched_clutch_processor_bound_count,
1828 .thread_update_scan = sched_clutch_thread_update_scan,
1829 .multiple_psets_enabled = TRUE,
1830 .sched_groups_enabled = FALSE,
1831 .avoid_processor_enabled = TRUE,
1832 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
1833 .processor_balance = sched_SMT_balance,
1834
1835 .rt_runq = sched_rtglobal_runq,
1836 .rt_init = sched_rtglobal_init,
1837 .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
1838 .rt_runq_scan = sched_rtglobal_runq_scan,
1839 .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
1840
1841 .qos_max_parallelism = sched_qos_max_parallelism,
1842 .check_spill = sched_check_spill,
1843 .ipi_policy = sched_ipi_policy,
1844 .thread_should_yield = sched_thread_should_yield,
1845 .run_count_incr = sched_clutch_run_incr,
1846 .run_count_decr = sched_clutch_run_decr,
1847 .update_thread_bucket = sched_clutch_update_thread_bucket,
1848 .pset_made_schedulable = sched_pset_made_schedulable,
1849 };
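/*
 * Usage note (hedged; the selection logic lives in the common scheduler init
 * code, not in this file): dispatch tables are selected by their sched_name,
 * so a kernel built with CONFIG_SCHED_CLUTCH would typically be pointed at
 * this table with a boot-arg along the lines of:
 *
 *     sched=clutch
 *
 * after which the SCHED() indirection resolves to the sched_clutch_* routines
 * registered above.
 */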
1850
1851 __attribute__((always_inline))
1852 static inline run_queue_t
1853 sched_clutch_bound_runq(processor_t processor)
1854 {
1855 return &processor->runq;
1856 }
1857
1858 __attribute__((always_inline))
1859 static inline sched_clutch_root_t
1860 sched_clutch_processor_root_clutch(processor_t processor)
1861 {
1862 return &processor->processor_set->pset_clutch_root;
1863 }
1864
1865 __attribute__((always_inline))
1866 static inline run_queue_t
1867 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
1868 {
1869 assert(thread->bound_processor == processor);
1870 return sched_clutch_bound_runq(processor);
1871 }
1872
1873 static uint32_t
1874 sched_clutch_initial_quantum_size(thread_t thread)
1875 {
1876 if (thread == THREAD_NULL) {
1877 return std_quantum;
1878 }
1879 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
1880 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
1881 }
1882
1883 static sched_mode_t
1884 sched_clutch_initial_thread_sched_mode(task_t parent_task)
1885 {
1886 if (parent_task == kernel_task) {
1887 return TH_MODE_FIXED;
1888 } else {
1889 return TH_MODE_TIMESHARE;
1890 }
1891 }
1892
1893 static void
1894 sched_clutch_processor_init(processor_t processor)
1895 {
1896 run_queue_init(&processor->runq);
1897 }
1898
1899 static void
1900 sched_clutch_pset_init(processor_set_t pset)
1901 {
1902 sched_clutch_root_init(&pset->pset_clutch_root, pset);
1903 }
1904
1905 static void
1906 sched_clutch_init(void)
1907 {
1908 if (!PE_parse_boot_argn("sched_clutch_bucket_interactive_pri", &sched_clutch_bucket_interactive_pri, sizeof(sched_clutch_bucket_interactive_pri))) {
1909 sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
1910 }
1911 if (!PE_parse_boot_argn("sched_clutch_bucket_pri_boost", &sched_clutch_bucket_pri_boost, sizeof(sched_clutch_bucket_pri_boost))) {
1912 sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
1913 }
1914 sched_timeshare_init();
1915 }
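/*
 * Both tunables parsed above are plain integers, so they can be overridden
 * from the boot-args for experimentation, e.g. (hypothetical values, not
 * recommendations):
 *
 *     sched_clutch_bucket_interactive_pri=10 sched_clutch_bucket_pri_boost=12
 *
 * When a boot-arg is absent, the corresponding compile-time default is used.
 */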
1916
1917 static void
1918 sched_clutch_timebase_init(void)
1919 {
1920 sched_timeshare_timebase_init();
1921 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
1922 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
1923 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
1924 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS,
1925 NSEC_PER_USEC, &sched_clutch_bucket_adjust_threshold);
1926
1927 uint32_t interactivity_delta = 0;
1928 if (!PE_parse_boot_argn("sched_clutch_bucket_interactivity_delta_usecs", &interactivity_delta, sizeof(interactivity_delta))) {
1929 interactivity_delta = SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT;
1930 }
1931 clock_interval_to_absolutetime_interval(interactivity_delta, NSEC_PER_USEC, &sched_clutch_bucket_interactivity_delta);
1932 }
1933
1934 static thread_t
1935 sched_clutch_choose_thread(
1936 processor_t processor,
1937 int priority,
1938 __unused ast_t reason)
1939 {
1940 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
1941 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
1942 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
1943 boolean_t choose_from_boundq = false;
1944
1945 if (bound_runq->highq < priority &&
1946 clutch_pri < priority) {
1947 return THREAD_NULL;
1948 }
1949
1950 if (bound_runq->count && clutch_count) {
1951 if (bound_runq->highq >= clutch_pri) {
1952 choose_from_boundq = true;
1953 }
1954 } else if (bound_runq->count) {
1955 choose_from_boundq = true;
1956 } else if (clutch_count) {
1957 choose_from_boundq = false;
1958 } else {
1959 return THREAD_NULL;
1960 }
1961
1962 thread_t thread = THREAD_NULL;
1963 if (choose_from_boundq == false) {
1964 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1965 thread = sched_clutch_thread_highest(pset_clutch_root);
1966 } else {
1967 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
1968 }
1969 return thread;
1970 }
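/*
 * Example of the decision above (illustrative numbers): with a bound thread
 * at priority 80 and a clutch hierarchy whose root priority is 47, the bound
 * runq wins and the thread comes from processor->runq; if both fall below
 * the caller-supplied priority floor, THREAD_NULL is returned and the caller
 * keeps its current candidate.
 */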
1971
1972 static boolean_t
1973 sched_clutch_processor_enqueue(
1974 processor_t processor,
1975 thread_t thread,
1976 sched_options_t options)
1977 {
1978 boolean_t result;
1979
1980 thread->runq = processor;
1981 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1982 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
1983 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
1984 } else {
1985 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
1986 result = run_queue_enqueue(rq, thread, options);
1987 }
1988 return result;
1989 }
1990
1991 static boolean_t
1992 sched_clutch_processor_queue_empty(processor_t processor)
1993 {
1994 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
1995 sched_clutch_bound_runq(processor)->count == 0;
1996 }
1997
1998 static ast_t
1999 sched_clutch_processor_csw_check(processor_t processor)
2000 {
2001 boolean_t has_higher;
2002 int pri;
2003
2004 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
2005 return AST_PREEMPT | AST_URGENT;
2006 }
2007
2008 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2009 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2010
2011 assert(processor->active_thread != NULL);
2012
2013 pri = MAX(clutch_pri, bound_runq->highq);
2014
2015 if (processor->first_timeslice) {
2016 has_higher = (pri > processor->current_pri);
2017 } else {
2018 has_higher = (pri >= processor->current_pri);
2019 }
2020
2021 if (has_higher) {
2022 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2023 return AST_PREEMPT | AST_URGENT;
2024 }
2025
2026 if (bound_runq->urgency > 0) {
2027 return AST_PREEMPT | AST_URGENT;
2028 }
2029
2030 return AST_PREEMPT;
2031 }
2032
2033 return AST_NONE;
2034 }
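/*
 * Example of the check above (illustrative): a processor in its first
 * timeslice running at current_pri 31 while the hierarchy reports clutch_pri
 * 37 sees pri (37) > current_pri (31) and returns AST_PREEMPT, upgraded to
 * AST_PREEMPT | AST_URGENT only when the hierarchy or the bound runq reports
 * a non-zero urgency count.
 */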
2035
2036 static boolean_t
2037 sched_clutch_processor_queue_has_priority(processor_t processor,
2038 int priority,
2039 boolean_t gte)
2040 {
2041 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2042
2043 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2044
2045 if (gte) {
2046 return qpri >= priority;
2047 } else {
2048 return qpri > priority;
2049 }
2050 }
2051
2052 static int
2053 sched_clutch_runq_count(processor_t processor)
2054 {
2055 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
2056 }
2057
2058 static uint64_t
2059 sched_clutch_runq_stats_count_sum(processor_t processor)
2060 {
2061 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
2062
2063 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
2064 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
2065 } else {
2066 return bound_sum;
2067 }
2068 }
2069 static int
2070 sched_clutch_processor_bound_count(processor_t processor)
2071 {
2072 return sched_clutch_bound_runq(processor)->count;
2073 }
2074
2075 static void
2076 sched_clutch_processor_queue_shutdown(processor_t processor)
2077 {
2078 processor_set_t pset = processor->processor_set;
2079 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2080 thread_t thread;
2081 queue_head_t tqueue;
2082
2083 /* We only need to migrate threads if this is the last active processor in the pset */
2084 if (pset->online_processor_count > 0) {
2085 pset_unlock(pset);
2086 return;
2087 }
2088
2089 queue_init(&tqueue);
2090 while (sched_clutch_root_count(pset_clutch_root) > 0) {
2091 thread = sched_clutch_thread_highest(pset_clutch_root);
2092 enqueue_tail(&tqueue, &thread->runq_links);
2093 }
2094
2095 pset_unlock(pset);
2096
2097 qe_foreach_element_safe(thread, &tqueue, runq_links) {
2098 remqueue(&thread->runq_links);
2099
2100 thread_lock(thread);
2101
2102 thread_setrun(thread, SCHED_TAILQ);
2103
2104 thread_unlock(thread);
2105 }
2106 }
2107
2108 static boolean_t
2109 sched_clutch_processor_queue_remove(
2110 processor_t processor,
2111 thread_t thread)
2112 {
2113 run_queue_t rq;
2114 processor_set_t pset = processor->processor_set;
2115
2116 pset_lock(pset);
2117
2118 if (processor == thread->runq) {
2119 /*
2120 * Thread is on a run queue and we have a lock on
2121 * that run queue.
2122 */
2123 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2124 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2125 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2126 } else {
2127 rq = sched_clutch_thread_bound_runq(processor, thread);
2128 run_queue_remove(rq, thread);
2129 }
2130 } else {
2131 /*
2132 * The thread left the run queue before we could
2133 * lock the run queue.
2134 */
2135 assert(thread->runq == PROCESSOR_NULL);
2136 processor = PROCESSOR_NULL;
2137 }
2138
2139 pset_unlock(pset);
2140
2141 return processor != PROCESSOR_NULL;
2142 }
2143
2144 static thread_t
2145 sched_clutch_steal_thread(processor_set_t pset)
2146 {
2147 processor_set_t nset, cset = pset;
2148 thread_t thread;
2149
2150 do {
2151 sched_clutch_root_t pset_clutch_root = &cset->pset_clutch_root;
2152 if (sched_clutch_root_count(pset_clutch_root) > 0) {
2153 thread = sched_clutch_thread_highest(pset_clutch_root);
2154 pset_unlock(cset);
2155 return thread;
2156 }
2157
2158 nset = next_pset(cset);
2159
2160 if (nset != pset) {
2161 pset_unlock(cset);
2162
2163 cset = nset;
2164 pset_lock(cset);
2165 }
2166 } while (nset != pset);
2167
2168 pset_unlock(cset);
2169
2170 return THREAD_NULL;
2171 }
2172
2173 static void
2174 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
2175 {
2176 boolean_t restart_needed = FALSE;
2177 processor_t processor = processor_list;
2178 processor_set_t pset;
2179 thread_t thread;
2180 spl_t s;
2181
2182 /*
2183 * We update the threads associated with each processor (bound and idle threads)
2184 * and then update the threads in each pset runqueue.
2185 */
2186
2187 do {
2188 do {
2189 pset = processor->processor_set;
2190
2191 s = splsched();
2192 pset_lock(pset);
2193
2194 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
2195
2196 pset_unlock(pset);
2197 splx(s);
2198
2199 if (restart_needed) {
2200 break;
2201 }
2202
2203 thread = processor->idle_thread;
2204 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
2205 if (thread_update_add_thread(thread) == FALSE) {
2206 restart_needed = TRUE;
2207 break;
2208 }
2209 }
2210 } while ((processor = processor->processor_list) != NULL);
2211
2212 /* Ok, we now have a collection of candidates -- fix them. */
2213 thread_update_process_threads();
2214 } while (restart_needed);
2215
2216 pset = &pset0;
2217
2218 do {
2219 do {
2220 s = splsched();
2221 pset_lock(pset);
2222
2223 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
2224 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
2225 sched_clutch_bucket_t clutch_bucket;
2226 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
2227 sched_clutch_bucket_timeshare_update(clutch_bucket);
2228 restart_needed = runq_scan(&clutch_bucket->scb_runq, scan_context);
2229 if (restart_needed) {
2230 break;
2231 }
2232 }
2233 }
2234
2235 pset_unlock(pset);
2236 splx(s);
2237 if (restart_needed) {
2238 break;
2239 }
2240 } while ((pset = pset->pset_list) != NULL);
2241
2242 /* Ok, we now have a collection of candidates -- fix them. */
2243 thread_update_process_threads();
2244 } while (restart_needed);
2245 }
2246
2247 extern int sched_allow_rt_smt;
2248
2249 /* Return true if this thread should not continue running on this processor */
2250 static bool
2251 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
2252 {
2253 if (processor->processor_primary != processor) {
2254 /*
2255 * This is a secondary SMT processor. If the primary is running
2256 * a realtime thread, only allow realtime threads on the secondary.
2257 */
2258 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
2259 return true;
2260 }
2261 }
2262
2263 return false;
2264 }
2265
2266 /*
2267 * For the clutch scheduler, the run counts are maintained in the clutch
2268 * buckets (i.e. the thread group scheduling structure).
2269 */
2270 static uint32_t
2271 sched_clutch_run_incr(thread_t thread)
2272 {
2273 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2274 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2275 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
2276 return new_count;
2277 }
2278
2279 static uint32_t
2280 sched_clutch_run_decr(thread_t thread)
2281 {
2282 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
2283 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
2284 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
2285 return new_count;
2286 }
2287
2288 static sched_bucket_t
2289 sched_convert_pri_to_bucket(uint8_t priority)
2290 {
2291 sched_bucket_t bucket = TH_BUCKET_RUN;
2292
2293 if (priority > BASEPRI_USER_INITIATED) {
2294 bucket = TH_BUCKET_SHARE_FG;
2295 } else if (priority > BASEPRI_DEFAULT) {
2296 bucket = TH_BUCKET_SHARE_IN;
2297 } else if (priority > BASEPRI_UTILITY) {
2298 bucket = TH_BUCKET_SHARE_DF;
2299 } else if (priority > MAXPRI_THROTTLE) {
2300 bucket = TH_BUCKET_SHARE_UT;
2301 } else {
2302 bucket = TH_BUCKET_SHARE_BG;
2303 }
2304 return bucket;
2305 }
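/*
 * Worked examples of the mapping above (the BASEPRI_ and MAXPRI_ constants
 * come from the priority headers; the numeric values shown are the
 * conventional ones and are illustrative only):
 *
 *     priority 44 (> BASEPRI_USER_INITIATED ~ 37) -> TH_BUCKET_SHARE_FG
 *     priority 37 (> BASEPRI_DEFAULT        ~ 31) -> TH_BUCKET_SHARE_IN
 *     priority 31 (> BASEPRI_UTILITY        ~ 20) -> TH_BUCKET_SHARE_DF
 *     priority 20 (> MAXPRI_THROTTLE        ~  4) -> TH_BUCKET_SHARE_UT
 *     priority  4                                 -> TH_BUCKET_SHARE_BG
 */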
2306
2307 /*
2308 * For threads that have changed sched_pri without changing the
2309 * base_pri for any reason other than decay, use the sched_pri
2310 * as the bucketizing priority instead of base_pri. All such
2311 * changes are typically due to kernel locking primitives boosts
2312 * or demotions.
2313 */
2314 static boolean_t
2315 sched_thread_sched_pri_promoted(thread_t thread)
2316 {
2317 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
2318 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
2319 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
2320 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
2321 (thread->kern_promotion_schedpri != 0);
2322 }
2323
2324 /*
2325 * Routine to update the scheduling bucket for the thread.
2326 *
2327 * In the clutch scheduler implementation, the thread's bucket
2328 * is based on sched_pri if it was promoted due to a kernel
2329 * primitive; otherwise it's based on the thread's base_pri. This
2330 * enhancement allows promoted threads to reach a higher priority
2331 * bucket and potentially get selected sooner for scheduling.
2332 *
2333 * Also, the clutch scheduler does not honor fixed priority below
2334 * FG priority. It simply puts those threads in the corresponding
2335 * timeshare bucket. The reason for doing that is that it is
2336 * extremely hard to define the scheduling properties of such threads,
2337 * and they typically lead to performance issues.
2338 */
2339
2340 void
2341 sched_clutch_update_thread_bucket(thread_t thread)
2342 {
2343 sched_bucket_t old_bucket = thread->th_sched_bucket;
2344 sched_bucket_t new_bucket = TH_BUCKET_RUN;
2345 assert(thread->runq == PROCESSOR_NULL);
2346
2347 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
2348
2349 switch (thread->sched_mode) {
2350 case TH_MODE_FIXED:
2351 if (pri >= BASEPRI_FOREGROUND) {
2352 new_bucket = TH_BUCKET_FIXPRI;
2353 } else {
2354 new_bucket = sched_convert_pri_to_bucket(pri);
2355 }
2356 break;
2357
2358 case TH_MODE_REALTIME:
2359 new_bucket = TH_BUCKET_FIXPRI;
2360 break;
2361
2362 case TH_MODE_TIMESHARE:
2363 new_bucket = sched_convert_pri_to_bucket(pri);
2364 break;
2365
2366 default:
2367 panic("unexpected mode: %d", thread->sched_mode);
2368 break;
2369 }
2370
2371 if (old_bucket == new_bucket) {
2372 return;
2373 }
2374
2375 thread->th_sched_bucket = new_bucket;
2376 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
2377
2378 /*
2379 * Since this is called after the thread has been removed from the runq,
2380 * only the run counts need to be updated. The re-insert into the runq
2381 * would put the thread into the correct new bucket's runq.
2382 */
2383 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
2384 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
2385 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
2386 }
2387 }
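/*
 * Example of the policy above (illustrative): a TH_MODE_FIXED thread whose
 * bucketizing priority is below BASEPRI_FOREGROUND is placed in the
 * timeshare bucket chosen by sched_convert_pri_to_bucket(), while the same
 * thread at BASEPRI_FOREGROUND or above goes to TH_BUCKET_FIXPRI. Similarly,
 * a kernel promotion switches the bucketizing priority to sched_pri, which
 * can temporarily move a background thread into a higher bucket until the
 * promotion is dropped.
 */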
2388
2389 #if __AMP__
2390
2391 /* Implementation of the AMP version of the clutch scheduler */
2392
2393 static thread_t
2394 sched_clutch_amp_steal_thread(processor_set_t pset);
2395
2396 static ast_t
2397 sched_clutch_amp_processor_csw_check(processor_t processor);
2398
2399 static boolean_t
2400 sched_clutch_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2401
2402 static boolean_t
2403 sched_clutch_amp_processor_queue_empty(processor_t processor);
2404
2405 static thread_t
2406 sched_clutch_amp_choose_thread(processor_t processor, int priority, ast_t reason);
2407
2408 static void
2409 sched_clutch_amp_processor_queue_shutdown(processor_t processor);
2410
2411 static processor_t
2412 sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
2413
2414 static bool
2415 sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread);
2416
2417 static bool
2418 sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread);
2419
2420 static void
2421 sched_clutch_migrate_foreign_buckets(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
2422
2423 static void
2424 sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
2425
2426 const struct sched_dispatch_table sched_clutch_amp_dispatch = {
2427 .sched_name = "clutch_amp",
2428 .init = sched_amp_init,
2429 .timebase_init = sched_clutch_timebase_init,
2430 .processor_init = sched_clutch_processor_init,
2431 .pset_init = sched_clutch_pset_init,
2432 .maintenance_continuation = sched_timeshare_maintenance_continue,
2433 .choose_thread = sched_clutch_amp_choose_thread,
2434 .steal_thread_enabled = sched_amp_steal_thread_enabled,
2435 .steal_thread = sched_clutch_amp_steal_thread,
2436 .compute_timeshare_priority = sched_compute_timeshare_priority,
2437 .choose_processor = sched_clutch_amp_choose_processor,
2438 .processor_enqueue = sched_clutch_processor_enqueue,
2439 .processor_queue_shutdown = sched_clutch_amp_processor_queue_shutdown,
2440 .processor_queue_remove = sched_clutch_processor_queue_remove,
2441 .processor_queue_empty = sched_clutch_amp_processor_queue_empty,
2442 .priority_is_urgent = priority_is_urgent,
2443 .processor_csw_check = sched_clutch_amp_processor_csw_check,
2444 .processor_queue_has_priority = sched_clutch_amp_processor_queue_has_priority,
2445 .initial_quantum_size = sched_clutch_initial_quantum_size,
2446 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2447 .can_update_priority = can_update_priority,
2448 .update_priority = update_priority,
2449 .lightweight_update_priority = lightweight_update_priority,
2450 .quantum_expire = sched_default_quantum_expire,
2451 .processor_runq_count = sched_clutch_runq_count,
2452 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2453 .processor_bound_count = sched_clutch_processor_bound_count,
2454 .thread_update_scan = sched_clutch_thread_update_scan,
2455 .multiple_psets_enabled = TRUE,
2456 .sched_groups_enabled = FALSE,
2457 .avoid_processor_enabled = TRUE,
2458 .thread_avoid_processor = sched_clutch_amp_thread_avoid_processor,
2459 .processor_balance = sched_amp_balance,
2460
2461 .rt_runq = sched_amp_rt_runq,
2462 .rt_init = sched_amp_rt_init,
2463 .rt_queue_shutdown = sched_amp_rt_queue_shutdown,
2464 .rt_runq_scan = sched_amp_rt_runq_scan,
2465 .rt_runq_count_sum = sched_amp_rt_runq_count_sum,
2466
2467 .qos_max_parallelism = sched_amp_qos_max_parallelism,
2468 .check_spill = sched_amp_check_spill,
2469 .ipi_policy = sched_amp_ipi_policy,
2470 .thread_should_yield = sched_clutch_amp_thread_should_yield,
2471 .run_count_incr = sched_clutch_run_incr,
2472 .run_count_decr = sched_clutch_run_decr,
2473 .update_thread_bucket = sched_clutch_update_thread_bucket,
2474 .pset_made_schedulable = sched_clutch_migrate_foreign_buckets,
2475 .thread_group_recommendation_change = sched_clutch_amp_thread_group_recommendation_change,
2476 };
2477
2478 extern processor_set_t ecore_set;
2479 extern processor_set_t pcore_set;
2480
2481 static thread_t
2482 sched_clutch_amp_choose_thread(
2483 processor_t processor,
2484 int priority,
2485 __unused ast_t reason)
2486 {
2487 processor_set_t pset = processor->processor_set;
2488 bool spill_pending = false;
2489 int spill_pri = -1;
2490
2491 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2492 spill_pending = true;
2493 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2494 }
2495
2496 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2497 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2498 boolean_t choose_from_boundq = false;
2499
2500 if ((bound_runq->highq < priority) &&
2501 (clutch_pri < priority) &&
2502 (spill_pri < priority)) {
2503 return THREAD_NULL;
2504 }
2505
2506 if ((spill_pri > bound_runq->highq) &&
2507 (spill_pri > clutch_pri)) {
2508 /*
2509 * There is a higher priority thread on the P-core runq,
2510 * so returning THREAD_NULL here will cause thread_select()
2511 * to call sched_clutch_amp_steal_thread() to try to get it.
2512 */
2513 return THREAD_NULL;
2514 }
2515
2516 if (bound_runq->highq >= clutch_pri) {
2517 choose_from_boundq = true;
2518 }
2519
2520 thread_t thread = THREAD_NULL;
2521 if (choose_from_boundq == false) {
2522 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2523 thread = sched_clutch_thread_highest(pset_clutch_root);
2524 } else {
2525 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2526 }
2527 return thread;
2528 }
2529
2530 static boolean_t
2531 sched_clutch_amp_processor_queue_empty(processor_t processor)
2532 {
2533 processor_set_t pset = processor->processor_set;
2534 bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
2535
2536 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
2537 (sched_clutch_bound_runq(processor)->count == 0) &&
2538 !spill_pending;
2539 }
2540
2541 static bool
2542 sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread)
2543 {
2544 if (!sched_clutch_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
2545 return true;
2546 }
2547
2548 if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) {
2549 return sched_clutch_root_count(&pcore_set->pset_clutch_root) > 0;
2550 }
2551
2552 return false;
2553 }
2554
2555 static ast_t
2556 sched_clutch_amp_processor_csw_check(processor_t processor)
2557 {
2558 boolean_t has_higher;
2559 int pri;
2560
2561 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2562 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2563
2564 assert(processor->active_thread != NULL);
2565
2566 processor_set_t pset = processor->processor_set;
2567 bool spill_pending = false;
2568 int spill_pri = -1;
2569 int spill_urgency = 0;
2570
2571 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2572 spill_pending = true;
2573 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2574 spill_urgency = sched_clutch_root_urgency(&pcore_set->pset_clutch_root);
2575 }
2576
2577 pri = MAX(clutch_pri, bound_runq->highq);
2578 if (spill_pending) {
2579 pri = MAX(pri, spill_pri);
2580 }
2581
2582 if (processor->first_timeslice) {
2583 has_higher = (pri > processor->current_pri);
2584 } else {
2585 has_higher = (pri >= processor->current_pri);
2586 }
2587
2588 if (has_higher) {
2589 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2590 return AST_PREEMPT | AST_URGENT;
2591 }
2592
2593 if (bound_runq->urgency > 0) {
2594 return AST_PREEMPT | AST_URGENT;
2595 }
2596
2597 if (spill_urgency > 0) {
2598 return AST_PREEMPT | AST_URGENT;
2599 }
2600
2601 return AST_PREEMPT;
2602 }
2603
2604 return AST_NONE;
2605 }
2606
2607 static boolean_t
2608 sched_clutch_amp_processor_queue_has_priority(processor_t processor,
2609 int priority,
2610 boolean_t gte)
2611 {
2612 bool spill_pending = false;
2613 int spill_pri = -1;
2614 processor_set_t pset = processor->processor_set;
2615
2616 if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
2617 spill_pending = true;
2618 spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
2619 }
2620 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2621
2622 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2623 if (spill_pending) {
2624 qpri = MAX(qpri, spill_pri);
2625 }
2626
2627 if (gte) {
2628 return qpri >= priority;
2629 } else {
2630 return qpri > priority;
2631 }
2632 }
2633
2634 /*
2635 * sched_clutch_hierarchy_thread_pset()
2636 *
2637 * Routine to determine where a thread should be enqueued: based on its
2638 * recommendation if it is the first runnable thread in the clutch_bucket,
2639 * or on its clutch bucket's existing hierarchy membership otherwise.
2640 */
2641 static processor_set_t
2642 sched_clutch_hierarchy_thread_pset(thread_t thread)
2643 {
2644 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread) == false) {
2645 return (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_set : ecore_set;
2646 }
2647
2648 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2649 sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
2650 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2651 if (scb_root) {
2652 /* Clutch bucket is already runnable, return the pset hierarchy it's part of */
2653 return scb_root->scr_pset;
2654 }
2655 return (recommended_pset_type(thread) == PSET_AMP_E) ? ecore_set : pcore_set;
2656 }
2657
2658 /*
2659 * sched_clutch_thread_pset_recommended()
2660 *
2661 * Routine to determine if the thread should be placed on the provided pset.
2662 * The routine first makes sure the cluster is available for scheduling. If
2663 * it is available, it looks at the thread's recommendation. Called
2664 * with the pset lock held.
2665 */
2666 static bool
2667 sched_clutch_thread_pset_recommended(thread_t thread, processor_set_t pset)
2668 {
2669 if (!sched_clutch_pset_available(pset)) {
2670 return false;
2671 }
2672
2673 /* At this point, all clusters should be available and recommended */
2674 if (sched_clutch_hierarchy_thread_pset(thread) != pset) {
2675 return false;
2676 }
2677
2678 return true;
2679 }
2680
2681
2682 static void
2683 sched_clutch_amp_processor_queue_shutdown(processor_t processor)
2684 {
2685 processor_set_t pset = processor->processor_set;
2686 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2687 thread_t thread;
2688 queue_head_t tqueue;
2689
2690 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
2691 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
2692 pset_unlock(pset);
2693 return;
2694 }
2695
2696 queue_init(&tqueue);
2697 while (sched_clutch_root_count(pset_clutch_root) > 0) {
2698 thread = sched_clutch_thread_highest(pset_clutch_root);
2699 enqueue_tail(&tqueue, &thread->runq_links);
2700 }
2701 pset_unlock(pset);
2702
2703 qe_foreach_element_safe(thread, &tqueue, runq_links) {
2704 remqueue(&thread->runq_links);
2705 thread_lock(thread);
2706 thread_setrun(thread, SCHED_TAILQ);
2707 thread_unlock(thread);
2708 }
2709 }
2710
2711 static thread_t
2712 sched_clutch_amp_steal_thread(processor_set_t pset)
2713 {
2714 thread_t thread = THREAD_NULL;
2715 processor_set_t nset = pset;
2716
2717 if (pcore_set->online_processor_count == 0) {
2718 /* Nothing to steal from */
2719 goto out;
2720 }
2721
2722 if (pset->pset_cluster_type == PSET_AMP_P) {
2723 /* P cores don't steal from E cores */
2724 goto out;
2725 }
2726
2727 processor_t processor = current_processor();
2728 assert(pset == processor->processor_set);
2729
2730 bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
2731 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
2732
2733 nset = pcore_set;
2734
2735 assert(nset != pset);
2736
2737 if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) {
2738 pset_unlock(pset);
2739
2740 pset = nset;
2741
2742 pset_lock(pset);
2743
2744 /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */
2745 if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) &&
2746 ((int)sched_clutch_root_count(&pset->pset_clutch_root) > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) &&
2747 (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) {
2748 thread = sched_clutch_thread_highest(&pset->pset_clutch_root);
2749 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0);
2750 sched_update_pset_load_average(pset);
2751 }
2752 }
2753
2754 out:
2755 pset_unlock(pset);
2756 return thread;
2757 }
2758
2759 /* Return true if this thread should not continue running on this processor */
2760 static bool
2761 sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread)
2762 {
2763 if (processor->processor_set->pset_cluster_type == PSET_AMP_E) {
2764 if (sched_clutch_thread_pset_recommended(thread, pcore_set)) {
2765 return true;
2766 }
2767 } else if (processor->processor_set->pset_cluster_type == PSET_AMP_P) {
2768 if (!sched_clutch_thread_pset_recommended(thread, pcore_set)) {
2769 return true;
2770 }
2771 }
2772
2773 return false;
2774 }
2775
2776 static processor_t
2777 sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
2778 {
2779 /* Bound threads don't call this function */
2780 assert(thread->bound_processor == PROCESSOR_NULL);
2781
2782 processor_set_t nset;
2783 processor_t chosen_processor = PROCESSOR_NULL;
2784
2785 select_pset:
2786 nset = (pset == ecore_set) ? pcore_set : ecore_set;
2787 if (!sched_clutch_pset_available(pset)) {
2788 /* If the current pset is not available for scheduling, just use the other pset */
2789 pset_unlock(pset);
2790 pset_lock(nset);
2791 goto select_processor;
2792 }
2793
2794 /* Check if the thread is recommended to run on this pset */
2795 if (sched_clutch_thread_pset_recommended(thread, pset)) {
2796 nset = pset;
2797 goto select_processor;
2798 } else {
2799 /* pset not recommended; try the other pset */
2800 pset_unlock(pset);
2801 pset_lock(nset);
2802 pset = nset;
2803 goto select_pset;
2804 }
2805
2806 select_processor:
2807 if (!sched_clutch_pset_available(nset)) {
2808 /*
2809 * It looks like both psets are not available due to some
2810 * reason. In that case, just use the master processor's
2811 * pset for scheduling.
2812 */
2813 if (master_processor->processor_set != nset) {
2814 pset_unlock(nset);
2815 nset = master_processor->processor_set;
2816 pset_lock(nset);
2817 }
2818 }
2819 chosen_processor = choose_processor(nset, processor, thread);
2820 assert(chosen_processor->processor_set == nset);
2821 return chosen_processor;
2822 }
2823
2824 /*
2825 * AMP Clutch Scheduler Thread Migration
2826 *
2827 * For the AMP version of the clutch scheduler the thread is always scheduled via its
2828 * thread group. So it is important to make sure that the thread group is part of the
2829 * correct processor set hierarchy. In order to do that, the clutch scheduler moves
2830 * all eligible clutch buckets to the correct hierarchy when the recommendation of a
2831 * thread group is changed by CLPC.
2832 */
2833
2834 /*
2835 * sched_clutch_recommended_pset()
2836 *
2837 * Routine to decide which hierarchy the thread group should be in based on the
2838 * recommendation and other thread group and system properties. This routine is
2839 * used to determine if thread group migration is necessary and should mimic the
2840 * logic in sched_clutch_thread_pset_recommended() & recommended_pset_type().
2841 */
2842 static processor_set_t
2843 sched_clutch_recommended_pset(sched_clutch_t sched_clutch, cluster_type_t recommendation)
2844 {
2845 if (!sched_clutch_pset_available(pcore_set)) {
2846 return ecore_set;
2847 }
2848
2849 if (!sched_clutch_pset_available(ecore_set)) {
2850 return pcore_set;
2851 }
2852
2853 /*
2854 * If all clusters are available and recommended, use the recommendation
2855 * to decide which cluster to use.
2856 */
2857 pset_cluster_type_t type = thread_group_pset_recommendation(sched_clutch->sc_tg, recommendation);
2858 return (type == PSET_AMP_E) ? ecore_set : pcore_set;
2859 }
2860
2861 static void
2862 sched_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
2863 {
2864 uint16_t thread_count = clutch_bucket->scb_thr_count;
2865 thread_t thread;
2866 uint64_t current_timestamp = mach_approximate_time();
2867 while (thread_count > 0) {
2868 thread = run_queue_peek(&clutch_bucket->scb_runq);
2869 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2870 enqueue_tail(clutch_threads, &thread->runq_links);
2871 thread_count--;
2872 }
2873
2874 /*
2875 * This operation should have drained the clutch bucket and pulled it out of the
2876 * hierarchy.
2877 */
2878 assert(clutch_bucket->scb_thr_count == 0);
2879 assert(clutch_bucket->scb_root == NULL);
2880 }
2881
2882 /*
2883 * sched_clutch_migrate_thread_group()
2884 *
2885 * Routine to implement the migration of threads when the thread group
2886 * recommendation is updated. The migration works using a 2-phase
2887 * algorithm.
2888 *
2889 * Phase 1: With the source pset (determined by sched_clutch_recommended_pset)
2890 * locked, drain all the runnable threads into a local queue and update the TG
2891 * recommendation.
2892 *
2893 * Phase 2: Call thread_setrun() on all the drained threads. Since the TG recommendation
2894 * has been updated, these should all end up in the right hierarchy.
2895 */
2896 static void
2897 sched_clutch_migrate_thread_group(sched_clutch_t sched_clutch, cluster_type_t new_recommendation)
2898 {
2899 thread_t thread;
2900
2901 /* If the thread group is empty, just update the recommendation */
2902 if (os_atomic_load(&sched_clutch->sc_thr_count, relaxed) == 0) {
2903 thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
2904 return;
2905 }
2906
2907 processor_set_t dst_pset = sched_clutch_recommended_pset(sched_clutch, new_recommendation);
2908 processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
2909
2910 queue_head_t clutch_threads;
2911 queue_init(&clutch_threads);
2912
2913 /* Interrupts need to be disabled to make sure threads won't become runnable during the
2914 * migration and attempt to grab the pset/thread locks.
2915 */
2916 spl_t s = splsched();
2917
2918 pset_lock(src_pset);
2919 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
2920 sched_clutch_bucket_t clutch_bucket = &(sched_clutch->sc_clutch_buckets[bucket]);
2921 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
2922 if ((scb_root == NULL) || (scb_root->scr_pset == dst_pset)) {
2923 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
2924 continue;
2925 }
2926 assert(scb_root->scr_pset == src_pset);
2927 /* Now remove all the threads from the runq so that thread->runq is set correctly */
2928 sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
2929 }
2930
2931 /*
2932 * Now that all the clutch buckets have been drained, update the TG recommendation.
2933 * This operation needs to be done with the pset lock held to make sure that anyone
2934 * coming in before the migration started would get the original pset as the root
2935 * of this sched_clutch and attempt to hold the src_pset lock. Once the TG changes,
2936 * all threads that are becoming runnable would find the clutch bucket empty and
2937 * the TG recommendation would coax them to enqueue it in the new recommended
2938 * hierarchy. This effectively synchronizes with other threads calling
2939 * thread_setrun() and trying to decide which pset the thread/clutch_bucket
2940 * belongs in.
2941 */
2942 thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
2943 pset_unlock(src_pset);
2944
2945 /* Now setrun all the threads in the local queue */
2946 qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
2947 remqueue(&thread->runq_links);
2948 thread_lock(thread);
2949 thread_setrun(thread, SCHED_TAILQ);
2950 thread_unlock(thread);
2951 }
2952
2953 splx(s);
2954 }
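/*
 * The drain-and-requeue above is deliberately two-phase; a minimal sketch of
 * the shape (using only the primitives already used above):
 *
 *     queue_init(&local);
 *     pset_lock(src_pset);
 *     ...drain runnable threads into &local, publish the new TG state...
 *     pset_unlock(src_pset);
 *     qe_foreach_element_safe(thread, &local, runq_links) {
 *         remqueue(&thread->runq_links);
 *         thread_lock(thread);
 *         thread_setrun(thread, SCHED_TAILQ);   // re-enqueue under the new recommendation
 *         thread_unlock(thread);
 *     }
 *
 * Calling thread_setrun() only after dropping the source pset lock avoids
 * holding two pset locks at once, since thread_setrun() takes the lock of
 * whichever pset the thread ends up enqueued on.
 */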
2955
2956 static void
2957 sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation)
2958 {
2959 /*
2960 * For the clutch scheduler, the change in recommendation moves the thread group
2961 * to the right hierarchy. sched_clutch_migrate_thread_group() is also responsible
2962 * for updating the recommendation of the thread group.
2963 */
2964 sched_clutch_migrate_thread_group(&tg->tg_sched_clutch, new_recommendation);
2965
2966 if (new_recommendation != CLUSTER_TYPE_P) {
2967 return;
2968 }
2969
2970 sched_amp_bounce_thread_group_from_ecores(ecore_set, tg);
2971 }
2972
2973 /*
2974 * sched_clutch_migrate_foreign_buckets()
2975 *
2976 * Routine to migrate all the clutch buckets which are not in their recommended
2977 * pset hierarchy now that a new pset has become runnable. The algorithm is
2978 * similar to sched_clutch_migrate_thread_group().
2979 *
2980 * Invoked with the newly recommended pset lock held and interrupts disabled.
2981 */
2982 static void
2983 sched_clutch_migrate_foreign_buckets(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
2984 {
2985 thread_t thread;
2986 processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
2987
2988 if (!sched_clutch_pset_available(dst_pset)) {
2989 /*
2990 * It is possible that some state about the pset changed,
2991 * but it's still not available for scheduling. Nothing to
2992 * do here in that case.
2993 */
2994 if (drop_lock) {
2995 pset_unlock(dst_pset);
2996 }
2997 return;
2998 }
2999 pset_unlock(dst_pset);
3000
3001 queue_head_t clutch_threads;
3002 queue_init(&clutch_threads);
3003 sched_clutch_root_t src_root = &src_pset->pset_clutch_root;
3004
3005 pset_lock(src_pset);
3006 queue_t clutch_bucket_list = &src_pset->pset_clutch_root.scr_foreign_buckets;
3007
3008 if (sched_clutch_root_count(src_root) == 0) {
3009 /* No threads present in this hierarchy */
3010 pset_unlock(src_pset);
3011 goto migration_complete;
3012 }
3013
3014 sched_clutch_bucket_t clutch_bucket;
3015 qe_foreach_element_safe(clutch_bucket, clutch_bucket_list, scb_foreignlink) {
3016 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
3017 assert(scb_root->scr_pset == src_pset);
3018 /* Now remove all the threads from the runq so that thread->runq is set correctly */
3019 sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
3020 assert(clutch_bucket->scb_foreign == false);
3021 }
3022 pset_unlock(src_pset);
3023
3024 /* Now setrun all the threads in the local queue */
3025 qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
3026 remqueue(&thread->runq_links);
3027 thread_lock(thread);
3028 thread_setrun(thread, SCHED_TAILQ);
3029 thread_unlock(thread);
3030 }
3031
3032 migration_complete:
3033 if (!drop_lock) {
3034 pset_lock(dst_pset);
3035 }
3036 }
3037
3038 #endif /* __AMP__ */
3039
3040 #endif /* CONFIG_SCHED_CLUTCH */