/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */

#if CONFIG_SCHED_CLUTCH
/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE          = 0,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY  = 1,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL           = 2,
});

static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */
/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
	SCHED_CLUTCH_BUCKET_OPTIONS_NONE       = 0x0,
	/* Round robin clutch bucket on thread removal */
	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
	/* Insert clutch bucket at head (for thread preemption) */
	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ      = 0x2,
	/* Insert clutch bucket at tail (default) */
	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ      = 0x4,
});

/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
static boolean_t sched_thread_sched_pri_promoted(thread_t);

#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];
/*
 * Special markers for buckets that have invalid WCELs/quantums etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadlines for buckets are defined
 * by the worst-case-execution-latency and the make runnable timestamp
 * for the bucket.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
	0,                                              /* FG */
	37500,                                          /* IN (37.5ms) */
	75000,                                          /* DF (75ms) */
	150000,                                         /* UT (150ms) */
	250000                                          /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
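
/*
 * Rough illustration of the EDF selection driven by the WCELs above (timestamps
 * are only examples): if the DF root bucket became runnable at time T and the UT
 * root bucket at time (T - 100ms), their deadlines are (T + 75ms) and
 * (T - 100ms + 150ms) = (T + 50ms) respectively, so the UT bucket is picked first
 * despite being a lower QoS tier. The WCELs therefore bound how long a runnable
 * bucket can be bypassed by higher tiers.
 */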
/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
	8000,                                           /* FG (8ms) */
	4000,                                           /* IN (4ms) */
	2000,                                           /* DF (2ms) */
	1000,                                           /* UT (1ms) */
	0                                               /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
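
/*
 * Rough illustration of warp (values are only examples): even if the FG root
 * bucket currently has a later deadline than the UT root bucket, FG may still be
 * chosen ahead of UT as long as it has warp remaining. Once FG has run with a
 * warp window open past that window's expiry, its warp is exhausted and selection
 * falls back to pure EDF until the warp is replenished the next time FG is picked
 * in the natural (EDF ordered) way.
 */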
/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * the lower priority levels can preempt the higher priority threads.
 */
#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000,                                          /* FIXPRI (10ms) */
	10000,                                          /* FG (10ms) */
	10000,                                          /* IN (10ms) */
	10000,                                          /* DF (10ms) */
	10000,                                          /* UT (10ms) */
	10000                                           /* BG (10ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000,                                          /* FIXPRI (10ms) */
	10000,                                          /* FG (10ms) */
	8000,                                           /* IN (8ms) */
	6000,                                           /* DF (6ms) */
	4000,                                           /* UT (4ms) */
	2000                                            /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
		} else {
			clock_interval_to_absolutetime_interval(us_vals[i],
			    NSEC_PER_USEC, &abstime_vals[i]);
		}
	}
}
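
/*
 * The timeshare tables above are expected to be converted once during scheduler
 * initialization, roughly along these lines (illustrative sketch only):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */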
/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))

#if DEVELOPMENT || DEBUG

/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
	sched_clutch_root_t root_clutch)
{
	pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
	__unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
	uint16_t *thr_count)
{
	if (__improbable(os_inc_overflow(thr_count))) {
		panic("sched_clutch thread count overflowed!");
	}
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
	uint16_t *thr_count)
{
	if (__improbable(os_dec_overflow(thr_count))) {
		panic("sched_clutch thread count underflowed!");
	}
}
/*
 * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to keep the load as a global counter
 * as well.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
	sched_clutch_root_t root_clutch,
	processor_set_t pset)
{
	root_clutch->scr_thr_count = 0;
	root_clutch->scr_priority = NOPRI;
	root_clutch->scr_urgency = 0;
	root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
	root_clutch->scr_cluster_id = pset->pset_cluster_id;
#else /* CONFIG_SCHED_EDGE */
	root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
	queue_init(&root_clutch->scr_clutch_buckets);

	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
	priority_queue_init(&root_clutch->scr_foreign_buckets);
	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
	priority_queue_init(&root_clutch->scr_bound_root_buckets);

	/* Initialize the bitmap and priority queue of runnable root buckets */
	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

	/* Initialize all the root buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
	}
}
/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round robin'ed at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */
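
/*
 * For example, with two clutch buckets A and B runnable at the same priority:
 * when a thread is selected from A, A is rotated to the tail of its queue
 * (SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) so B is picked next; if a thread in
 * A is preempted, A is re-inserted at the head (SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ)
 * so it does not lose its turn.
 */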
/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	clutch_buckets_rq->scbrq_highq = NOPRI;
	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
		clutch_buckets_rq->scbrq_bitmap[i] = 0;
	}
	clutch_buckets_rq->scbrq_count = 0;
	for (int i = 0; i < NRQS; i++) {
		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
	}
}

/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns if a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	return clutch_buckets_rq->scbrq_count == 0;
}

/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	if (clutch_buckets_rq->scbrq_count > 0) {
		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
	} else {
		return NULL;
	}
}
/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_options_t options)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
		}
	} else {
		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
		} else {
			/*
			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
			 */
			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		}
	}
	clutch_buckets_rq->scbrq_count++;
}

/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
	assert(clutch_buckets_rq->scbrq_count > 0);
	clutch_buckets_rq->scbrq_count--;
	if (circle_queue_empty(queue)) {
		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
	}
}

static void
sched_clutch_bucket_runq_rotate(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
	circle_queue_rotate_head_forward(queue);
}
/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
	sched_clutch_root_bucket_t root_bucket,
	sched_bucket_t bucket,
	bool bound_root_bucket)
{
	root_bucket->scrb_bucket = bucket;
	if (bound_root_bucket) {
		/* For bound root buckets, initialize the bound thread runq. */
		root_bucket->scrb_bound = true;
		run_queue_init(&root_bucket->scrb_bound_thread_runq);
	} else {
		/*
		 * The unbounded root buckets contain a runq of runnable clutch buckets
		 * which then hold the runnable threads.
		 */
		root_bucket->scrb_bound = false;
		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
	}
	priority_queue_entry_init(&root_bucket->scrb_pqlink);
	root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
	root_bucket->scrb_warped_deadline = 0;
	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
	root_bucket->scrb_starvation_avoidance = false;
	root_bucket->scrb_starvation_ts = 0;
}
/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency
 * which is why they are handled specially.
 *
 * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */
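
/*
 * For example (priorities are illustrative): if the highest AboveUI clutch
 * bucket sits at priority 47 and the highest timeshare FG clutch bucket at
 * priority 52, the AboveUI bucket is not chosen outright and selection falls
 * through to the EDF logic; if the FG bucket were at priority 45 instead, the
 * AboveUI bucket would be scheduled immediately.
 */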
/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_unbound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
		sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
		if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
			/* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
			return true;
		}
		sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
		if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
			return true;
		}
	}
	return false;
}

/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_bound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
	if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
		return false;
	}
	return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
}
/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts; so it is
 * crucial to keep this O(1). The options parameter determines if
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting next thread for execution on current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	sched_clutch_highest_root_bucket_type_t type)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int highest_runnable_bucket = -1;
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	} else {
		int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
	}

	if (highest_runnable_bucket == -1) {
		return NULL;
	}

	/* Above UI root bucket selection (see comment above for more details on this special case handling) */
	bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		if (unbound_aboveui) {
			return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		}
		/* Fall through to selecting a timeshare root bucket */
	} else {
		bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
		sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];

		if (unbound_aboveui && bound_aboveui) {
			/*
			 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
			 * highest runnable priority in both the buckets.
			 */
			int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
			sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
			int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
			return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
		}
		if (unbound_aboveui) {
			return unbound_aboveui_root_bucket;
		}
		if (bound_aboveui) {
			return bound_aboveui_root_bucket;
		}
		/* Fall through to selecting a timeshare root bucket */
	}

	/*
	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
	 * earliest deadline model to schedule threads. The idea is that as the timeshare
	 * buckets use CPU, they will drop their interactivity score/sched priority and
	 * allow the low priority AboveUI buckets to be scheduled.
	 */

	/* Find the earliest deadline bucket */
	sched_clutch_root_bucket_t edf_bucket = NULL;
	sched_clutch_root_bucket_t warp_bucket = NULL;
	int warp_bucket_index = -1;

evaluate_root_buckets:
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
	} else {
		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		if (bound_bucket && unbound_bucket) {
			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
		} else {
			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
		}
	}

	/*
	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
	 * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
	 * makes sense between root buckets of the same type since it's effectively a scheduling advantage over a lower
	 * priority root bucket.
	 */
	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
	warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

	if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
		/* No higher buckets have warp left; best choice is the EDF based bucket */
		if (edf_bucket->scrb_starvation_avoidance) {
			/*
			 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
			 * starvation avoidance window is still open and return this bucket if it is.
			 *
			 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
			 * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
			 * avoidance across all CPUs.
			 */
			uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / root_clutch->scr_pset->online_processor_count;
			if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
				return edf_bucket;
			} else {
				/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
				edf_bucket->scrb_starvation_avoidance = false;
				edf_bucket->scrb_starvation_ts = 0;
				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			}
			goto evaluate_root_buckets;
		}

		/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
		if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
			/* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
			edf_bucket->scrb_starvation_avoidance = true;
			edf_bucket->scrb_starvation_ts = timestamp;
		} else {
			/* EDF bucket is being selected in the natural order; update deadline and reset warp */
			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
			edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
			if (edf_bucket->scrb_bound) {
				bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
			} else {
				bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
			}
		}
		return edf_bucket;
	}

	/*
	 * Looks like there is a root bucket which is higher in the natural priority
	 * order than edf_bucket and might have some warp remaining.
	 */
	warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
		warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}
	if (warp_bucket->scrb_warped_deadline > timestamp) {
		/* Root bucket already has a warp window open with some warp remaining */
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}

	/* For this bucket, warp window was opened sometime in the past but has now
	 * expired. Mark the bucket as not available for warp anymore and re-run the
	 * warp bucket selection logic.
	 */
	warp_bucket->scrb_warp_remaining = 0;
	if (warp_bucket->scrb_bound) {
		bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
	} else {
		bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
	}
	goto evaluate_root_buckets;
}
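
/*
 * Rough illustration of the starvation avoidance window used above: with a
 * 10ms thread quantum at the EDF bucket and a 4-CPU cluster, the window is
 * 10ms / 4 = 2.5ms, i.e. roughly one quantum worth of starvation avoidance
 * spread across the CPUs of the cluster before the deadline is refreshed and
 * EDF is re-evaluated.
 */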
/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
	sched_clutch_root_bucket_t root_bucket,
	uint64_t timestamp)
{
	/* For the fixpri AboveUI bucket, always return it as the earliest deadline */
	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
		return 0;
	}

	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}
/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority
 * queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
		return;
	}

	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	if (__improbable(old_deadline > new_deadline)) {
		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
	}
	if (old_deadline != new_deadline) {
		root_bucket->scrb_pqlink.deadline = new_deadline;
		struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
		priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
	}
}
/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	/* Mark the root bucket as runnable */
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	if (root_bucket->scrb_starvation_avoidance == false) {
		/*
		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
		 * that and fix it up.
		 */
		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	}
	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
	if (root_bucket->scrb_warp_remaining) {
		/* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */
		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
	}
}
/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_warped_deadline > timestamp) {
		/*
		 * For root buckets that were using the warp, check if the warp
		 * deadline is in the future. If yes, remove the wall time the
		 * warp was active and update the warp remaining. This allows
		 * the root bucket to use the remaining warp the next time it
		 * becomes runnable.
		 */
		root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
	} else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/*
		 * If the root bucket's warped deadline is in the past, it has used up
		 * all the warp it was assigned. Empty out its warp remaining.
		 */
		root_bucket->scrb_warp_remaining = 0;
	}
}
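
/*
 * For example (durations are illustrative): a root bucket with an 8ms warp
 * allowance that opened its warp window 3ms before going empty leaves with
 * scrb_warp_remaining = 5ms, which it can use the next time it becomes
 * runnable; if the window had already expired, it leaves with no warp at all.
 */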
static int
sched_clutch_global_bucket_load_get(
	sched_bucket_t bucket)
{
	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_root_pri_update()
 *
 * The root level priority is used for thread selection and preemption
 * logic.
 *
 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
 * used for priority calculation, the logic is slightly different from thread
 * selection, because thread selection considers deadlines, warps etc. to
 * decide the most optimal bucket at a given timestamp. Since the priority
 * value is used for preemption decisions only, it needs to be based on the
 * highest runnable thread available in the timeshare domain. This logic can
 * be made more sophisticated if there are cases of unnecessary preemption
 * being seen in workloads.
 */
static void
sched_clutch_root_pri_update(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int16_t root_bound_pri = NOPRI;
	int16_t root_unbound_pri = NOPRI;

	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_unbound;
	}
	sched_clutch_root_bucket_t root_bucket_bound = NULL;
	if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
		root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
	}
	root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;

root_pri_update_unbound:
	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_complete;
	}
	sched_clutch_root_bucket_t root_bucket_unbound = NULL;
	if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
	}
	/* For the selected root bucket, find the highest priority clutch bucket */
	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);

root_pri_update_complete:
	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
}
/*
 * sched_clutch_root_urgency_inc()
 *
 * Routine to increment the urgency at the root level based on the thread
 * priority that is being inserted into the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_inc(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency++;
	}
}

/*
 * sched_clutch_root_urgency_dec()
 *
 * Routine to decrement the urgency at the root level based on the thread
 * priority that is being removed from the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_dec(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency--;
	}
}
/*
 * Clutch bucket level scheduling
 *
 * The second level of scheduling is the clutch bucket level scheduling
 * which tries to schedule thread groups within root_buckets. Each
 * clutch represents a thread group and a clutch_bucket_group represents
 * threads at a particular sched_bucket within that thread group. The
 * clutch_bucket_group contains a clutch_bucket per cluster on the system
 * where it holds the runnable threads destined for execution on that
 * cluster.
 *
 * The goal of this level of scheduling is to allow interactive thread
 * groups low latency access to the CPU. It also provides slight
 * scheduling preference for App and unrestricted thread groups.
 *
 * The clutch bucket scheduling algorithm measures an interactivity
 * score for all clutch bucket groups. The interactivity score is based
 * on the ratio of the CPU used and the voluntary blocking of threads
 * within the clutch bucket group. The algorithm is very close to the ULE
 * scheduler on FreeBSD in terms of calculations. The interactivity
 * score provides an interactivity boost in the range of
 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
 * thread groups to win over CPU spinners.
 *
 * The interactivity score of the clutch bucket group is combined with the
 * highest base/promoted priority of threads in the clutch bucket to form
 * the overall priority of the clutch bucket.
 */
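
/*
 * Putting the pieces together (numbers are illustrative): a clutch bucket whose
 * highest thread has priority 31, belonging to a high tier thread group
 * (boost 4), with an interactivity score of 13 ends up at clutch bucket
 * priority 31 + 4 + 13 = 48 in its root bucket runq.
 */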
/* Priority boost range for interactivity */
#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT      (8)
uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;

/* Window to scale the cpu usage and blocked values (currently 500ms). It's the threshold of used+blocked */
uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS        (500000)

/* The ratio to scale the cpu/blocked time per window */
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                  (10)

/*
 * In order to allow App thread groups some preference over daemon thread
 * groups, the App clutch_buckets get a priority boost. The boost value should
 * be chosen such that badly behaved apps are still penalized over well
 * behaved interactive daemons.
 */
static uint8_t sched_clutch_bucket_group_pri_boost[SCHED_CLUTCH_TG_PRI_MAX] = {
	[SCHED_CLUTCH_TG_PRI_LOW]  = 0,
	[SCHED_CLUTCH_TG_PRI_MED]  = 2,
	[SCHED_CLUTCH_TG_PRI_HIGH] = 4,
};

/* Initial value for voluntary blocking time for the clutch_bucket */
#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID    (uint64_t)(~0)
/* Value indicating the clutch bucket is not pending execution */
#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID       ((uint64_t)(~0))
/*
 * Thread group CPU starvation avoidance
 *
 * In heavily CPU contended scenarios, it is possible that some thread groups
 * which have a low interactivity score do not get CPU time at all. In order to
 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
 * bucket group when it has been pending execution for a certain time as defined
 * by the sched_clutch_bucket_group_pending_delta_us values below.
 *
 * The values chosen here are very close to the WCEL values for each sched bucket.
 * These values are multiplied by the load average of the relevant root bucket to
 * provide an estimate of the actual clutch bucket load.
 */
static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32,   /* FIXPRI */
	10000,                          /* FG */
	37500,                          /* IN */
	75000,                          /* DF */
	150000,                         /* UT */
	250000,                         /* BG */
};
static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
/*
 * sched_clutch_bucket_init()
 *
 * Initializer for clutch buckets.
 */
static void
sched_clutch_bucket_init(
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));

	clutch_bucket->scb_bucket = bucket;
	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
	clutch_bucket->scb_priority = 0;
#if CONFIG_SCHED_EDGE
	clutch_bucket->scb_foreign = false;
	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
#endif /* CONFIG_SCHED_EDGE */
	clutch_bucket->scb_group = clutch_bucket_group;
	clutch_bucket->scb_root = NULL;
	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
	priority_queue_init(&clutch_bucket->scb_thread_runq);
	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
}
/*
 * sched_clutch_bucket_group_init()
 *
 * Initializer for clutch bucket groups.
 */
static void
sched_clutch_bucket_group_init(
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_clutch_t clutch,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
	clutch_bucket_group->scbg_bucket = bucket;
	clutch_bucket_group->scbg_clutch = clutch;
	for (int i = 0; i < MAX_PSETS; i++) {
		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
	}
	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
	/*
	 * All thread groups should be initialized to be interactive; this allows the newly launched
	 * thread groups to fairly compete with already running thread groups.
	 */
	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
#if !__LP64__
	lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
#endif /* !__LP64__ */
	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
}
/*
 * sched_clutch_init_with_thread_group()
 *
 * Initialize the sched_clutch when the thread group is being created
 */
void
sched_clutch_init_with_thread_group(
	sched_clutch_t clutch,
	struct thread_group *tg)
{
	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);

	/* Initialize all the clutch buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
	}

	/* Grouping specific fields */
	clutch->sc_tg = tg;
	os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
}
/*
 * sched_clutch_destroy()
 *
 * Destructor for clutch; called from thread group release code.
 */
void
sched_clutch_destroy(
	__unused sched_clutch_t clutch)
{
	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
}
#if CONFIG_SCHED_EDGE

/*
 * The current edge scheduler still relies on globals for E & P clusters. It uses these
 * globals for the following operations:
 *
 * - Sysctl support for configuring edges
 * - Edge scheduler initialization
 *
 * These should be removed for multi-cluster platforms once a clear policy for the above
 * operations is defined.
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t ecore_cluster_id = 0;
static uint32_t pcore_cluster_id = 1;
/*
 * Edge Scheduler Preferred Cluster Mechanism
 *
 * In order to have better control over various QoS buckets within a thread group, the Edge
 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
 * preferences are stored at the sched_clutch_bucket_group level since that represents all
 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
 */

static uint32_t
sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
{
	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
}

static uint32_t
sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
{
	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
}

uint32_t
sched_edge_thread_preferred_cluster(thread_t thread)
{
	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/* For threads bound to a specific cluster, return the bound cluster id */
		return sched_edge_thread_bound_cluster_id(thread);
	}

	sched_clutch_t clutch = sched_clutch_for_thread(thread);
	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
}
/*
 * Edge Scheduler Foreign Bucket Support
 *
 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
 * threads that are not native to the cluster. A clutch bucket is considered native if its
 * preferred cluster has the same type as the cluster it's enqueued in. The foreign clutch
 * bucket priority queue is used for rebalance operations to get threads back to their native
 * cluster quickly.
 *
 * It is possible to make this policy even more aggressive by considering all clusters that
 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
 * migrations which might have performance implications.
 */

static void
sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	if (clutch_bucket->scb_foreign) {
		clutch_bucket->scb_foreign = false;
		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
	}
}

static void
sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	if (!clutch_bucket->scb_foreign) {
		clutch_bucket->scb_foreign = true;
		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
	}
}
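
/*
 * For example, a clutch bucket whose preferred cluster is a P-cluster but which
 * is currently enqueued in an E-cluster hierarchy is marked foreign and becomes
 * a candidate for the rebalance path; once it is enqueued on a cluster of its
 * preferred type again it is marked native and leaves the foreign priority queue.
 */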
/*
 * Edge Scheduler Cumulative Load Average
 *
 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
 * making thread migration decisions. The per-bucket load is maintained as a
 * cumulative count since higher scheduling buckets impact load on lower buckets
 * for thread migration decisions.
 */

static void
sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
	switch (bucket) {
	case TH_BUCKET_FIXPRI:   os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
	default:
		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
	}
}

static void
sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
	switch (bucket) {
	case TH_BUCKET_FIXPRI:   os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
	default:
		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
	}
}

uint16_t
sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
}
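
/*
 * For example, making a single DF thread runnable bumps the DF, UT and BG
 * counters (via the fallthrough above), so scr_cumulative_run_count[TH_BUCKET_SHARE_UT]
 * reflects every runnable thread at UT or any higher bucket. Migration logic can
 * therefore read a single counter to see the total load a bucket would contend with.
 */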
#endif /* CONFIG_SCHED_EDGE */
/*
 * sched_clutch_bucket_hierarchy_insert()
 *
 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
 */
static void
sched_clutch_bucket_hierarchy_insert(
	sched_clutch_root_t root_clutch,
	sched_clutch_bucket_t clutch_bucket,
	sched_bucket_t bucket,
	uint64_t timestamp,
	sched_clutch_bucket_options_t options)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	if (bucket > TH_BUCKET_FIXPRI) {
		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
	}
#if CONFIG_SCHED_EDGE
	/* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
	if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
		sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
	}
#endif /* CONFIG_SCHED_EDGE */
	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];

	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
	}

	/* Insert the clutch bucket into the root bucket run queue with order based on options */
	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
	os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_bucket_hierarchy_remove()
 *
 * Routine to remove an empty clutch bucket from the root hierarchy.
 */
static void
sched_clutch_bucket_hierarchy_remove(
	sched_clutch_root_t root_clutch,
	sched_clutch_bucket_t clutch_bucket,
	sched_bucket_t bucket,
	uint64_t timestamp,
	__unused sched_clutch_bucket_options_t options)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	if (bucket > TH_BUCKET_FIXPRI) {
		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
		remqueue(&clutch_bucket->scb_listlink);
	}
#if CONFIG_SCHED_EDGE
	sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
#endif /* CONFIG_SCHED_EDGE */

	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];

	/* Remove the clutch bucket from the root bucket priority queue */
	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);

	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
	}
	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_bucket_base_pri()
 *
 * Calculates the "base" priority of the clutch bucket. The base
 * priority of the clutch bucket is the sum of the max of highest
 * base_pri and highest sched_pri in the clutch bucket and any
 * grouping specific (App/Daemon...) boosts applicable to the
 * clutch_bucket.
 */
static uint8_t
sched_clutch_bucket_base_pri(
	sched_clutch_bucket_t clutch_bucket)
{
	uint8_t clutch_boost = 0;
	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);

	sched_clutch_t clutch = clutch_bucket->scb_group->scbg_clutch;

	/*
	 * Since the clutch bucket can contain threads that are members of the group due
	 * to the sched_pri being promoted or due to their base pri, the base priority of
	 * the entire clutch bucket should be based on the highest thread (promoted or base)
	 * in the clutch bucket.
	 */
	uint8_t max_pri = 0;
	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
	}

	sched_clutch_tg_priority_t tg_pri = os_atomic_load(&clutch->sc_tg_priority, relaxed);
	clutch_boost = sched_clutch_bucket_group_pri_boost[tg_pri];
	return max_pri + clutch_boost;
}
/*
 * sched_clutch_interactivity_from_cpu_data()
 *
 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
 */
static uint8_t
sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
{
	sched_clutch_bucket_cpu_data_t scb_cpu_data;
	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
	uint8_t interactive_score = 0;

	if ((cpu_blocked == 0) && (cpu_used == 0)) {
		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
	}
	/*
	 * For all timeshare buckets, calculate the interactivity score of the bucket
	 * and add it to the base priority
	 */
	if (cpu_blocked > cpu_used) {
		/* Interactive clutch_bucket case */
		interactive_score = sched_clutch_bucket_group_interactive_pri +
		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
	} else {
		/* Non-interactive clutch_bucket case */
		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
	}
	return interactive_score;
}
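
/*
 * Worked example (values are illustrative, in scaled cpu_data units, with the
 * default interactive pri of 8): with cpu_blocked = 300 and cpu_used = 100 the
 * bucket group is "interactive", so the score is 8 + (8 * (300 - 100)) / 300 =
 * 8 + 5 = 13; with the ratio reversed (cpu_blocked = 100, cpu_used = 300) the
 * score is (8 * 100) / 300 = 2.
 */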

/*
 * sched_clutch_bucket_pri_calculate()
 *
 * The priority calculation algorithm for the clutch_bucket is a slight
 * modification on the ULE interactivity score. It uses the base priority
 * of the clutch bucket and applies an interactivity score boost to the
 * highly responsive clutch buckets.
 */
static uint8_t
sched_clutch_bucket_pri_calculate(
    sched_clutch_bucket_t clutch_bucket,
    uint64_t timestamp)
{
    /* For empty clutch buckets, return priority 0 */
    if (clutch_bucket->scb_thr_count == 0) {
        return 0;
    }

    uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
    uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);

    assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
    uint8_t pri = base_pri + interactive_score;
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
        thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
    return pri;
}
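
/*
 * Worked example (hypothetical numbers, for illustration only): a clutch bucket
 * whose highest runnable thread sits at base priority 31 and whose thread group
 * currently earns an interactivity score of 13 is positioned in the hierarchy at
 * 31 + 13 = 44, so a responsive group outranks a CPU-bound group at the same QoS
 * whose score has decayed towards 0.
 */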

/*
 * sched_clutch_root_bucket_highest_clutch_bucket()
 *
 * Routine to find the highest priority clutch bucket
 * within the root bucket.
 */
static sched_clutch_bucket_t
sched_clutch_root_bucket_highest_clutch_bucket(
    sched_clutch_root_bucket_t root_bucket)
{
    if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
        return NULL;
    }
    return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
}

/*
 * sched_clutch_bucket_runnable()
 *
 * Perform all operations needed when a new clutch bucket becomes runnable.
 * It involves inserting the clutch_bucket into the hierarchy and updating the
 * root priority appropriately.
 */
static boolean_t
sched_clutch_bucket_runnable(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    /* Since the clutch bucket became newly runnable, update its pending timestamp */
    clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
    sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);

    /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
    sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
    int16_t root_old_pri = root_clutch->scr_priority;
    sched_clutch_root_pri_update(root_clutch);
    return root_clutch->scr_priority > root_old_pri;
}

/*
 * sched_clutch_bucket_update()
 *
 * Update the clutch_bucket's position in the hierarchy. This routine is
 * called when a new thread is inserted or removed from a runnable clutch
 * bucket. The options specify some properties about the clutch bucket
 * insertion order into the clutch bucket runq.
 */
static boolean_t
sched_clutch_bucket_update(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
    sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
    if (new_pri == clutch_bucket->scb_priority) {
        /*
         * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
         * to the end of the runq. Typically used when a thread is selected for execution
         * from a clutch bucket.
         */
        if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
            sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
        }
        return false;
    }
    sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
#if CONFIG_SCHED_EDGE
    if (clutch_bucket->scb_foreign) {
        priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
    }
#endif /* CONFIG_SCHED_EDGE */
    clutch_bucket->scb_priority = new_pri;
#if CONFIG_SCHED_EDGE
    if (clutch_bucket->scb_foreign) {
        priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
        priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);

    int16_t root_old_pri = root_clutch->scr_priority;
    sched_clutch_root_pri_update(root_clutch);
    return root_clutch->scr_priority > root_old_pri;
}

/*
 * sched_clutch_bucket_empty()
 *
 * Perform all the operations needed when a clutch_bucket is no longer runnable.
 * It involves removing the clutch bucket from the hierarchy and updating the root
 * priority appropriately.
 */
static void
sched_clutch_bucket_empty(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
    clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
    sched_clutch_root_pri_update(root_clutch);
}

/*
 * sched_clutch_cpu_usage_update()
 *
 * Routine to update CPU usage of the thread in the hierarchy.
 */
void
sched_clutch_cpu_usage_update(
    thread_t thread,
    uint64_t delta)
{
    if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
        return;
    }

    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
    sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
}

/*
 * sched_clutch_bucket_group_cpu_usage_update()
 *
 * Routine to update the CPU usage of the clutch_bucket.
 */
static void
sched_clutch_bucket_group_cpu_usage_update(
    sched_clutch_bucket_group_t clutch_bucket_group,
    uint64_t delta)
{
    if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
        /* Since Above UI bucket has maximum interactivity score always, nothing to do here */
        return;
    }
    delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
    os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
}

/*
 * sched_clutch_bucket_group_cpu_pending_adjust()
 *
 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
 * such that one "pending interval" provides one point improvement in interactivity score.
 */
static inline uint64_t
sched_clutch_bucket_group_cpu_pending_adjust(
    uint64_t cpu_used,
    uint64_t cpu_blocked,
    uint8_t pending_intervals)
{
    uint64_t cpu_used_adjusted = 0;
    if (cpu_blocked < cpu_used) {
        cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
        cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
    } else {
        uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
        cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
    }
    return cpu_used_adjusted;
}
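
/*
 * Worked example (hypothetical numbers, for illustration only): with
 * sched_clutch_bucket_group_interactive_pri = 8, cpu_used = 400, cpu_blocked = 800
 * and one pending interval, the cpu_blocked >= cpu_used branch computes
 * adjust_factor = (800 * 1) / 8 = 100 and returns 400 - 100 = 300. Re-running the
 * interactivity calculation with cpu_used = 300 then gives 8 + (8 * 500) / 800 = 13
 * instead of 8 + (8 * 400) / 800 = 12, i.e. one pending interval buys roughly one
 * point of interactivity score, matching the intent described above.
 */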

/*
 * sched_clutch_bucket_group_cpu_adjust()
 *
 * Routine to scale the cpu usage and blocked time once the sum gets bigger
 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
 * manageable and maintain the same ratio while allowing clutch buckets to
 * adjust behavior and reflect in the interactivity score in a reasonable
 * amount of time. Also adjusts the CPU usage based on pending_intervals
 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
 */
static void
sched_clutch_bucket_group_cpu_adjust(
    sched_clutch_bucket_group_t clutch_bucket_group,
    uint8_t pending_intervals)
{
    sched_clutch_bucket_cpu_data_t old_cpu_data = {};
    sched_clutch_bucket_cpu_data_t new_cpu_data = {};
    os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
        clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
        clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;

        if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
            /* No changes to the CPU used and blocked values */
            os_atomic_rmw_loop_give_up();
        }
        if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
            /* Only keep the recent CPU history to better indicate how this TG has been behaving */
            cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
            cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
        }
        /* Use the shift passed in to ageout the CPU usage */
        cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
        new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
        new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
    });
}
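
/*
 * Illustration (hypothetical numbers, not from the original source): if the
 * adjust threshold were 10000 with SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO = 10,
 * a group that has accumulated cpu_used = 8000 and cpu_blocked = 4000 (sum 12000,
 * above the threshold) would be rescaled to 800 and 400. The 2:1 used-to-blocked
 * ratio, and hence the interactivity score, is preserved, but newer samples start
 * to dominate the history much sooner.
 */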

/*
 * Thread level scheduling algorithm
 *
 * The thread level scheduling algorithm uses the mach timeshare
 * decay based algorithm to achieve sharing between threads within the
 * same clutch bucket. The load/priority shifts etc. are all maintained
 * at the clutch bucket level and used for decay calculation of the
 * threads. The load sampling is still driven off the scheduler tick
 * for runnable clutch buckets (it does not use the new higher frequency
 * EWMA based load calculation). The idea is that the contention and load
 * within clutch_buckets should be limited enough to not see heavy decay
 * and timeshare effectively.
 */

/*
 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
 *
 * Increment the run count for the clutch bucket associated with the
 * thread.
 */
uint32_t
sched_clutch_thread_run_bucket_incr(
    thread_t thread,
    sched_bucket_t bucket)
{
    if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
        return 0;
    }
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    return sched_clutch_run_bucket_incr(clutch, bucket);
}

static uint32_t
sched_clutch_run_bucket_incr(
    sched_clutch_t clutch,
    sched_bucket_t bucket)
{
    assert(bucket != TH_BUCKET_RUN);
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
    return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
}

/*
 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
 *
 * Decrement the run count for the clutch bucket associated with the
 * thread.
 */
uint32_t
sched_clutch_thread_run_bucket_decr(
    thread_t thread,
    sched_bucket_t bucket)
{
    if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
        return 0;
    }
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    return sched_clutch_run_bucket_decr(clutch, bucket);
}

static uint32_t
sched_clutch_run_bucket_decr(
    sched_clutch_t clutch,
    sched_bucket_t bucket)
{
    assert(bucket != TH_BUCKET_RUN);
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
    return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
}

/*
 * sched_clutch_bucket_group_timeshare_update()
 *
 * Routine to update the load and priority shift for the clutch_bucket_group
 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
 * clutch buckets with runnable threads in them. So it is important to maintain
 * the timesharing information at the clutch_bucket_group level instead of
 * individual clutch buckets (because the algorithm is trying to timeshare all
 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
 *
 * The routine is called from the sched tick handling code to make sure this value
 * is updated at least once every sched tick. For clutch bucket groups which have
 * not been runnable for very long, the clutch_bucket_group maintains a "last
 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
 * if this value is outdated, the load and shifts are updated.
 *
 * Possible optimization:
 * - The current algorithm samples the load every sched tick (125ms).
 *   This is prone to spikes in runnable counts; if that turns out to be
 *   a problem, a simple solution would be to do the EWMA trick to sample
 *   load at every load_tick (30ms) and use the averaged value for the pri
 *   shift calculation.
 */
static void
sched_clutch_bucket_group_timeshare_update(
    sched_clutch_bucket_group_t clutch_bucket_group,
    sched_clutch_bucket_t clutch_bucket,
    uint64_t ctime)
{
    if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
        /* No timesharing needed for fixed priority Above UI threads */
        return;
    }

    /*
     * Update the timeshare parameters for the clutch bucket group
     * if they haven't been updated in this tick.
     */
    uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
    uint32_t current_sched_ts = sched_tick;
    if (sched_ts < current_sched_ts) {
        os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
        /* NCPU wide workloads should not experience decay */
        uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
        uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
        bucket_group_load = MIN(bucket_group_load, NRQS - 1);
        uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
        /* Ensure that the pri_shift value is reasonable */
        pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
        os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
    }

    /*
     * Update the clutch bucket priority; this allows clutch buckets that have been pending
     * for a long time to get an updated interactivity score.
     */
    sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
}
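
/*
 * Illustration (hypothetical numbers, for a rough sense of the scaling): on a
 * system with processor_avail_count = 8, a clutch bucket group with 24 runnable
 * threads yields bucket_group_load = 23 / 8 = 2, while a group with 8 runnable
 * threads yields 7 / 8 = 0. A larger load index selects a larger entry in
 * sched_load_shifts, shrinking pri_shift and making each thread's accumulated CPU
 * usage weigh more heavily against its timeshare priority, which is how heavily
 * loaded groups end up timesharing more aggressively.
 */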

/*
 * sched_clutch_thread_clutch_update()
 *
 * Routine called when the thread changes its thread group. The current
 * implementation relies on the fact that the thread group is changed only
 * from the context of the thread itself. Due to this fact, the thread
 * group change causes only counter updates in the old & new clutch
 * buckets and no hierarchy changes. The routine also attributes the CPU
 * used so far to the old clutch.
 */
void
sched_clutch_thread_clutch_update(
    thread_t thread,
    sched_clutch_t old_clutch,
    sched_clutch_t new_clutch)
{
    uint32_t cpu_delta;

    if (old_clutch) {
        assert(current_thread() == thread);

        sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
        /*
         * Calculate the CPU used by this thread in the old bucket and
         * add it to the old clutch bucket. This uses the same CPU usage
         * logic as update_priority etc.
         */
        thread_timer_delta(thread, cpu_delta);
        if (thread->pri_shift < INT8_MAX) {
            thread->sched_usage += cpu_delta;
        }
        thread->cpu_delta += cpu_delta;
        if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
            sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
            sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
        }
    }

    if (new_clutch) {
        sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
    }
}

/* Thread Insertion/Removal/Selection routines */

#if CONFIG_SCHED_EDGE

/*
 * Edge Scheduler Bound Thread Support
 *
 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
 * maintains a separate runq on the clutch root to hold these bound threads. These
 * bound threads count towards the root priority and thread count, but are ignored
 * for thread migration/steal decisions. Bound threads that are enqueued in the
 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
 * removal.
 *
 * Bound Threads Timesharing
 * The bound threads share the timesharing properties of the clutch bucket group they are
 * part of. They contribute to the load and use priority shifts/decay values from the
 * clutch bucket group.
 */

static boolean_t
sched_edge_bound_thread_insert(
    sched_clutch_root_t root_clutch,
    thread_t thread,
    integer_t options)
{
    /* Update the clutch runnable count and priority */
    sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
    sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
    if (root_bucket->scrb_bound_thread_runq.count == 0) {
        sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
    }

    assert((thread->th_bound_cluster_enqueued) == false);
    run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
    thread->th_bound_cluster_enqueued = true;

    int16_t root_old_pri = root_clutch->scr_priority;
    sched_clutch_root_pri_update(root_clutch);
    return root_clutch->scr_priority > root_old_pri;
}

static void
sched_edge_bound_thread_remove(
    sched_clutch_root_t root_clutch,
    thread_t thread)
{
    sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
    assert((thread->th_bound_cluster_enqueued) == true);
    run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
    thread->th_bound_cluster_enqueued = false;

    /* Update the clutch runnable count and priority */
    sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
    if (root_bucket->scrb_bound_thread_runq.count == 0) {
        sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
    }
    sched_clutch_root_pri_update(root_clutch);
}

#endif /* CONFIG_SCHED_EDGE */

/*
 * sched_clutch_thread_bound_lookup()
 *
 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
 */
static thread_t
sched_clutch_thread_bound_lookup(
    __unused sched_clutch_root_t root_clutch,
    sched_clutch_root_bucket_t root_bucket)
{
    return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
}

/*
 * Clutch Bucket Group Thread Counts and Pending time calculation
 *
 * The pending time on the clutch_bucket_group allows the scheduler to track if it
 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
 * pending time is advanced to the time of thread selection.
 *
 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
 * simultaneously, it is important that the updates to thread counts and pending timestamps
 * happen atomically. The implementation relies on the following aspects to make that work:
 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
 *   is held when threads are added/removed and pending timestamps are updated
 * - The edge scheduler would have support for double wide 128 bit atomics which allow the
 *   thread count and pending timestamp to be updated atomically.
 *
 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
 * above to atomically update the interactivity score for a clutch bucket group.
 */
1915 #if CONFIG_SCHED_EDGE
1918 sched_clutch_bucket_group_thr_count_inc(
1919 sched_clutch_bucket_group_t clutch_bucket_group
,
1922 sched_clutch_counter_time_t old_pending_data
;
1923 sched_clutch_counter_time_t new_pending_data
;
1924 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, {
1925 new_pending_data
.scct_count
= old_pending_data
.scct_count
+ 1;
1926 new_pending_data
.scct_timestamp
= old_pending_data
.scct_timestamp
;
1927 if (old_pending_data
.scct_count
== 0) {
1928 new_pending_data
.scct_timestamp
= timestamp
;
1934 sched_clutch_bucket_group_thr_count_dec(
1935 sched_clutch_bucket_group_t clutch_bucket_group
,
1938 sched_clutch_counter_time_t old_pending_data
;
1939 sched_clutch_counter_time_t new_pending_data
;
1940 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, {
1941 new_pending_data
.scct_count
= old_pending_data
.scct_count
- 1;
1942 if (new_pending_data
.scct_count
== 0) {
1943 new_pending_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
;
1945 new_pending_data
.scct_timestamp
= timestamp
;
1951 sched_clutch_bucket_group_pending_ageout(
1952 sched_clutch_bucket_group_t clutch_bucket_group
,
1955 int bucket_load
= sched_clutch_global_bucket_load_get(clutch_bucket_group
->scbg_bucket
);
1956 sched_clutch_counter_time_t old_pending_data
;
1957 sched_clutch_counter_time_t new_pending_data
;
1958 uint8_t cpu_usage_shift
= 0;
1960 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, {
1961 cpu_usage_shift
= 0;
1962 uint64_t old_pending_ts
= old_pending_data
.scct_timestamp
;
1963 bool old_update
= (old_pending_ts
>= timestamp
);
1964 bool no_pending_time
= (old_pending_ts
== SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
);
1965 bool no_bucket_load
= (bucket_load
== 0);
1966 if (old_update
|| no_pending_time
|| no_bucket_load
) {
1967 os_atomic_rmw_loop_give_up();
1970 /* Calculate the time the clutch bucket group has been pending */
1971 uint64_t pending_delta
= timestamp
- old_pending_ts
;
1972 uint64_t interactivity_delta
= sched_clutch_bucket_group_pending_delta
[clutch_bucket_group
->scbg_bucket
] * bucket_load
;
1973 if (pending_delta
< interactivity_delta
) {
1974 os_atomic_rmw_loop_give_up();
1976 cpu_usage_shift
= (pending_delta
/ interactivity_delta
);
1977 new_pending_data
.scct_timestamp
= old_pending_ts
+ (cpu_usage_shift
* interactivity_delta
);
1978 new_pending_data
.scct_count
= old_pending_data
.scct_count
;
1980 return cpu_usage_shift
;
1984 sched_clutch_bucket_group_interactivity_score_calculate(
1985 sched_clutch_bucket_group_t clutch_bucket_group
,
1988 if (clutch_bucket_group
->scbg_bucket
== TH_BUCKET_FIXPRI
) {
1990 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
1991 * priorities, make sure all AboveUI buckets are marked interactive.
1993 assert(clutch_bucket_group
->scbg_interactivity_data
.scct_count
== (2 * sched_clutch_bucket_group_interactive_pri
));
1994 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
;
1996 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
1997 uint8_t pending_intervals
= sched_clutch_bucket_group_pending_ageout(clutch_bucket_group
, timestamp
);
1998 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
1999 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group
, pending_intervals
);
2000 uint8_t interactivity_score
= sched_clutch_interactivity_from_cpu_data(clutch_bucket_group
);
2001 sched_clutch_counter_time_t old_interactivity_data
;
2002 sched_clutch_counter_time_t new_interactivity_data
;
2004 bool score_updated
= os_atomic_rmw_loop(&clutch_bucket_group
->scbg_interactivity_data
.scct_packed
, old_interactivity_data
.scct_packed
, new_interactivity_data
.scct_packed
, relaxed
, {
2005 if (old_interactivity_data
.scct_timestamp
>= timestamp
) {
2006 os_atomic_rmw_loop_give_up();
2008 new_interactivity_data
.scct_timestamp
= timestamp
;
2009 if (old_interactivity_data
.scct_timestamp
!= 0) {
2010 new_interactivity_data
.scct_count
= interactivity_score
;
2013 if (score_updated
) {
2014 return (uint8_t)new_interactivity_data
.scct_count
;
2016 return (uint8_t)old_interactivity_data
.scct_count
;
2020 #else /* CONFIG_SCHED_EDGE */
2023 * For the clutch scheduler, atomicity is ensured by making sure all operations
2024 * are happening under the pset lock of the only cluster present on the platform.
2027 sched_clutch_bucket_group_thr_count_inc(
2028 sched_clutch_bucket_group_t clutch_bucket_group
,
2031 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2032 if (clutch_bucket_group
->scbg_pending_data
.scct_count
== 0) {
2033 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= timestamp
;
2035 clutch_bucket_group
->scbg_pending_data
.scct_count
++;
2039 sched_clutch_bucket_group_thr_count_dec(
2040 sched_clutch_bucket_group_t clutch_bucket_group
,
2043 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2044 clutch_bucket_group
->scbg_pending_data
.scct_count
--;
2045 if (clutch_bucket_group
->scbg_pending_data
.scct_count
== 0) {
2046 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
;
2048 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= timestamp
;
2053 sched_clutch_bucket_group_pending_ageout(
2054 sched_clutch_bucket_group_t clutch_bucket_group
,
2057 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2058 int bucket_load
= sched_clutch_global_bucket_load_get(clutch_bucket_group
->scbg_bucket
);
2059 uint64_t old_pending_ts
= clutch_bucket_group
->scbg_pending_data
.scct_timestamp
;
2060 bool old_update
= (old_pending_ts
>= timestamp
);
2061 bool no_pending_time
= (old_pending_ts
== SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
);
2062 bool no_bucket_load
= (bucket_load
== 0);
2063 if (old_update
|| no_pending_time
|| no_bucket_load
) {
2066 uint64_t pending_delta
= timestamp
- old_pending_ts
;
2067 uint64_t interactivity_delta
= sched_clutch_bucket_group_pending_delta
[clutch_bucket_group
->scbg_bucket
] * bucket_load
;
2068 if (pending_delta
< interactivity_delta
) {
2071 uint8_t cpu_usage_shift
= (pending_delta
/ interactivity_delta
);
2072 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= old_pending_ts
+ (cpu_usage_shift
* interactivity_delta
);
2073 return cpu_usage_shift
;
2077 sched_clutch_bucket_group_interactivity_score_calculate(
2078 sched_clutch_bucket_group_t clutch_bucket_group
,
2081 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2082 if (clutch_bucket_group
->scbg_bucket
== TH_BUCKET_FIXPRI
) {
2084 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2085 * priorities, make sure all AboveUI buckets are marked interactive.
2087 assert(clutch_bucket_group
->scbg_interactivity_data
.scct_count
== (2 * sched_clutch_bucket_group_interactive_pri
));
2088 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
;
2090 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2091 uint8_t pending_intervals
= sched_clutch_bucket_group_pending_ageout(clutch_bucket_group
, timestamp
);
2092 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2093 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group
, pending_intervals
);
2094 uint8_t interactivity_score
= sched_clutch_interactivity_from_cpu_data(clutch_bucket_group
);
2095 if (timestamp
> clutch_bucket_group
->scbg_interactivity_data
.scct_timestamp
) {
2096 clutch_bucket_group
->scbg_interactivity_data
.scct_count
= interactivity_score
;
2097 clutch_bucket_group
->scbg_interactivity_data
.scct_timestamp
= timestamp
;
2098 return interactivity_score
;
2100 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
;
2104 #endif /* CONFIG_SCHED_EDGE */

/*
 * Clutch Bucket Group Run Count and Blocked Time Accounting
 *
 * The clutch bucket group maintains the number of runnable/running threads in the group.
 * Since the blocked time of the clutch bucket group is based on this count, it is
 * important to make sure the blocking timestamp and the run count are updated atomically.
 *
 * Since the run count increments happen without any pset locks held, the scheduler makes
 * these updates atomic in the following way:
 * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp
 * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update
 */
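
/*
 * Illustration (hypothetical timeline, not from the original source): if the last
 * runnable thread of a group blocks at time T0, the run count drops to 0 and the
 * blocked-data timestamp is set to T0. When a thread of the group becomes runnable
 * again at T1, the increment path sees count 0, adds (T1 - T0), clamped to
 * sched_clutch_bucket_group_adjust_threshold, to scbcd_cpu_blocked, and invalidates
 * the timestamp. A group that spends most of its time blocked therefore accumulates
 * blocked time quickly and scores as interactive.
 */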
2122 sched_clutch_bucket_group_run_count_inc(
2123 sched_clutch_bucket_group_t clutch_bucket_group
)
2125 uint64_t blocked_time
= 0;
2126 uint64_t updated_run_count
= 0;
2128 lck_spin_lock(&clutch_bucket_group
->scbg_stats_lock
);
2129 if ((clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
!= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
) &&
2130 (clutch_bucket_group
->scbg_blocked_data
.scct_count
== 0)) {
2131 /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */
2132 blocked_time
= mach_absolute_time() - clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
;
2133 clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
;
2135 clutch_bucket_group
->scbg_blocked_data
.scct_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
+ 1;
2136 updated_run_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
;
2137 lck_spin_unlock(&clutch_bucket_group
->scbg_stats_lock
);
2139 blocked_time
= MIN(blocked_time
, sched_clutch_bucket_group_adjust_threshold
);
2140 os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_blocked
), (clutch_cpu_data_t
)blocked_time
, relaxed
);
2141 return (uint32_t)updated_run_count
;
2145 sched_clutch_bucket_group_run_count_dec(
2146 sched_clutch_bucket_group_t clutch_bucket_group
)
2148 uint64_t updated_run_count
= 0;
2150 lck_spin_lock(&clutch_bucket_group
->scbg_stats_lock
);
2151 clutch_bucket_group
->scbg_blocked_data
.scct_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
- 1;
2152 if (clutch_bucket_group
->scbg_blocked_data
.scct_count
== 0) {
2153 /* Run count is transitioning from 1 to 0; start the blocked timer */
2154 clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
= mach_absolute_time();
2156 updated_run_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
;
2157 lck_spin_unlock(&clutch_bucket_group
->scbg_stats_lock
);
2158 return (uint32_t)updated_run_count
;
2161 #else /* !__LP64__ */
2164 sched_clutch_bucket_group_run_count_inc(
2165 sched_clutch_bucket_group_t clutch_bucket_group
)
2167 sched_clutch_counter_time_t old_blocked_data
;
2168 sched_clutch_counter_time_t new_blocked_data
;
2170 bool update_blocked_time
= false;
2171 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_blocked_data
.scct_packed
, old_blocked_data
.scct_packed
, new_blocked_data
.scct_packed
, relaxed
, {
2172 new_blocked_data
.scct_count
= old_blocked_data
.scct_count
+ 1;
2173 new_blocked_data
.scct_timestamp
= old_blocked_data
.scct_timestamp
;
2174 update_blocked_time
= false;
2175 if (old_blocked_data
.scct_count
== 0) {
2176 new_blocked_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
;
2177 update_blocked_time
= true;
2180 if (update_blocked_time
&& (old_blocked_data
.scct_timestamp
!= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
)) {
2181 uint64_t ctime
= mach_absolute_time();
2182 if (ctime
> old_blocked_data
.scct_timestamp
) {
2183 uint64_t blocked_time
= ctime
- old_blocked_data
.scct_timestamp
;
2184 blocked_time
= MIN(blocked_time
, sched_clutch_bucket_group_adjust_threshold
);
2185 os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_blocked
), (clutch_cpu_data_t
)blocked_time
, relaxed
);
2188 return (uint32_t)new_blocked_data
.scct_count
;
2192 sched_clutch_bucket_group_run_count_dec(
2193 sched_clutch_bucket_group_t clutch_bucket_group
)
2195 sched_clutch_counter_time_t old_blocked_data
;
2196 sched_clutch_counter_time_t new_blocked_data
;
2198 uint64_t ctime
= mach_absolute_time();
2199 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_blocked_data
.scct_packed
, old_blocked_data
.scct_packed
, new_blocked_data
.scct_packed
, relaxed
, {
2200 new_blocked_data
.scct_count
= old_blocked_data
.scct_count
- 1;
2201 new_blocked_data
.scct_timestamp
= old_blocked_data
.scct_timestamp
;
2202 if (new_blocked_data
.scct_count
== 0) {
2203 new_blocked_data
.scct_timestamp
= ctime
;
2206 return (uint32_t)new_blocked_data
.scct_count
;
2209 #endif /* !__LP64__ */

/*
 * sched_clutch_thread_insert()
 *
 * Routine to insert a thread into the sched clutch hierarchy.
 * Update the counts at all levels of the hierarchy and insert the nodes
 * as they become runnable. Always called with the pset lock held.
 */
static boolean_t
sched_clutch_thread_insert(
    sched_clutch_root_t root_clutch,
    thread_t thread,
    integer_t options)
{
    boolean_t result = FALSE;

    sched_clutch_hierarchy_locked_assert(root_clutch);
#if CONFIG_SCHED_EDGE
    sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
    /*
     * Check if the thread is bound and is being enqueued in its desired bound cluster.
     * One scenario where a bound thread might not be getting enqueued in the bound cluster
     * hierarchy would be if the thread is "soft" bound and the bound cluster is
     * de-recommended. In that case, the thread should be treated as an unbound
     * thread.
     */
    if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
        return sched_edge_bound_thread_insert(root_clutch, thread, options);
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    assert(thread->thread_group == clutch->sc_tg);

    uint64_t current_timestamp = mach_absolute_time();
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
    sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
    assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));

    /*
     * Thread linkage in clutch_bucket
     *
     * A thread has a few linkages within the clutch bucket:
     * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
     * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
     * - A queue linkage used for timesharing operations of threads at the scheduler tick
     */

    /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
    thread->th_clutch_runq_link.stamp = current_timestamp;
    priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
        (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
    priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);

    /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
    priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
        sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
    priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);

    /* Insert thread into timesharing queue of the clutch bucket */
    enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);

    /* Increment the urgency counter for the root if necessary */
    sched_clutch_root_urgency_inc(root_clutch, thread);

    os_atomic_inc(&clutch->sc_thr_count, relaxed);
    sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);

    /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
    sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
    if (clutch_bucket->scb_thr_count == 0) {
        sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
        sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
        result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
    } else {
        sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
        sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
        result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
        root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
        SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
    return result;
}

/*
 * sched_clutch_thread_remove()
 *
 * Routine to remove a thread from the sched clutch hierarchy.
 * Update the counts at all levels of the hierarchy and remove the nodes
 * as they become empty. Always called with the pset lock held.
 */
static void
sched_clutch_thread_remove(
    sched_clutch_root_t root_clutch,
    thread_t thread,
    uint64_t current_timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
#if CONFIG_SCHED_EDGE
    sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
    if (thread->th_bound_cluster_enqueued) {
        sched_edge_bound_thread_remove(root_clutch, thread);
        return;
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    assert(thread->thread_group == clutch->sc_tg);
    assert(thread->runq != PROCESSOR_NULL);

    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
    sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
    assert(clutch_bucket->scb_root == root_clutch);

    /* Decrement the urgency counter for the root if necessary */
    sched_clutch_root_urgency_dec(root_clutch, thread);
    /* Remove thread from the clutch_bucket */
    priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
    remqueue(&thread->th_clutch_timeshare_link);
    thread->runq = PROCESSOR_NULL;

    priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);

    /* Update counts at various levels of the hierarchy */
    os_atomic_dec(&clutch->sc_thr_count, relaxed);
    sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
    sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
    sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);

    /* Remove the clutch from hierarchy (if needed) and update properties */
    if (clutch_bucket->scb_thr_count == 0) {
        sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
    } else {
        sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
        root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
        SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
}

/*
 * sched_clutch_thread_unbound_lookup()
 *
 * Routine to find the highest unbound thread in the root clutch.
 * Helps find threads easily for steal/migrate scenarios in the
 * scheduler.
 */
static thread_t
sched_clutch_thread_unbound_lookup(
    sched_clutch_root_t root_clutch,
    sched_clutch_root_bucket_t root_bucket)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);

    /* Find the highest priority clutch bucket in this root bucket */
    sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
    assert(clutch_bucket != NULL);

    /* Find the highest priority runnable thread in this clutch bucket */
    thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
        thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
    return thread;
}

/*
 * sched_clutch_thread_highest_remove()
 *
 * Routine to find and remove the highest priority thread
 * from the sched clutch hierarchy. The algorithm looks at the
 * hierarchy for the most eligible runnable thread and calls
 * sched_clutch_thread_remove(). Always called with the
 * pset lock held.
 */
static thread_t
sched_clutch_thread_highest_remove(
    sched_clutch_root_t root_clutch)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    uint64_t current_timestamp = mach_absolute_time();

    sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
    if (root_bucket == NULL) {
        return THREAD_NULL;
    }

    thread_t highest_thread = THREAD_NULL;
    if (root_bucket->scrb_bound) {
        highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
    } else {
        highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
    }
    sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
    return highest_thread;
}
2407 /* High level global accessor routines */
2410 * sched_clutch_root_urgency()
2412 * Routine to get the urgency of the highest runnable
2413 * thread in the hierarchy.
2416 sched_clutch_root_urgency(
2417 sched_clutch_root_t root_clutch
)
2419 return root_clutch
->scr_urgency
;
2423 * sched_clutch_root_count_sum()
2425 * The count_sum mechanism is used for scheduler runq
2426 * statistics calculation. Its only useful for debugging
2427 * purposes; since it takes a mach_absolute_time() on
2428 * other scheduler implementations, its better to avoid
2429 * populating this until absolutely necessary.
2432 sched_clutch_root_count_sum(
2433 __unused sched_clutch_root_t root_clutch
)
2439 * sched_clutch_root_priority()
2441 * Routine to get the priority of the highest runnable
2442 * thread in the hierarchy.
2445 sched_clutch_root_priority(
2446 sched_clutch_root_t root_clutch
)
2448 return root_clutch
->scr_priority
;
2452 * sched_clutch_root_count()
2454 * Returns total number of runnable threads in the hierarchy.
2457 sched_clutch_root_count(
2458 sched_clutch_root_t root_clutch
)
2460 return root_clutch
->scr_thr_count
;

#if CONFIG_SCHED_EDGE

/*
 * sched_clutch_root_foreign_empty()
 *
 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
 */
static boolean_t
sched_clutch_root_foreign_empty(
    sched_clutch_root_t root_clutch)
{
    return priority_queue_empty(&root_clutch->scr_foreign_buckets);
}

/*
 * sched_clutch_root_highest_foreign_thread_remove()
 *
 * Routine to return the thread in the highest priority clutch bucket in a cluster.
 * Must be called with the pset for the cluster locked.
 */
static thread_t
sched_clutch_root_highest_foreign_thread_remove(
    sched_clutch_root_t root_clutch)
{
    thread_t thread = THREAD_NULL;
    if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
        return thread;
    }
    sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
    thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
    sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
    return thread;
}

#endif /* CONFIG_SCHED_EDGE */
2500 * sched_clutch_thread_pri_shift()
2502 * Routine to get the priority shift value for a thread.
2503 * Since the timesharing is done at the clutch_bucket level,
2504 * this routine gets the clutch_bucket and retrieves the
2505 * values from there.
2508 sched_clutch_thread_pri_shift(
2510 sched_bucket_t bucket
)
2512 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) {
2515 assert(bucket
!= TH_BUCKET_RUN
);
2516 sched_clutch_t clutch
= sched_clutch_for_thread(thread
);
2517 sched_clutch_bucket_group_t clutch_bucket_group
= &(clutch
->sc_clutch_groups
[bucket
]);
2518 return os_atomic_load(&clutch_bucket_group
->scbg_pri_shift
, relaxed
);
2521 #pragma mark -- Clutch Scheduler Algorithm
2524 sched_clutch_init(void);
2527 sched_clutch_steal_thread(processor_set_t pset
);
2530 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context
);
2533 sched_clutch_processor_enqueue(processor_t processor
, thread_t thread
,
2534 sched_options_t options
);
2537 sched_clutch_processor_queue_remove(processor_t processor
, thread_t thread
);
2540 sched_clutch_processor_csw_check(processor_t processor
);
2543 sched_clutch_processor_queue_has_priority(processor_t processor
, int priority
, boolean_t gte
);
2546 sched_clutch_runq_count(processor_t processor
);
2549 sched_clutch_processor_queue_empty(processor_t processor
);
2552 sched_clutch_runq_stats_count_sum(processor_t processor
);
2555 sched_clutch_processor_bound_count(processor_t processor
);
2558 sched_clutch_pset_init(processor_set_t pset
);
2561 sched_clutch_processor_init(processor_t processor
);
2564 sched_clutch_choose_thread(processor_t processor
, int priority
, ast_t reason
);
2567 sched_clutch_processor_queue_shutdown(processor_t processor
);
2570 sched_clutch_initial_thread_sched_mode(task_t parent_task
);
2573 sched_clutch_initial_quantum_size(thread_t thread
);
2576 sched_clutch_thread_avoid_processor(processor_t processor
, thread_t thread
);
2579 sched_clutch_run_incr(thread_t thread
);
2582 sched_clutch_run_decr(thread_t thread
);
2585 sched_clutch_update_thread_bucket(thread_t thread
);
2587 const struct sched_dispatch_table sched_clutch_dispatch
= {
2588 .sched_name
= "clutch",
2589 .init
= sched_clutch_init
,
2590 .timebase_init
= sched_timeshare_timebase_init
,
2591 .processor_init
= sched_clutch_processor_init
,
2592 .pset_init
= sched_clutch_pset_init
,
2593 .maintenance_continuation
= sched_timeshare_maintenance_continue
,
2594 .choose_thread
= sched_clutch_choose_thread
,
2595 .steal_thread_enabled
= sched_steal_thread_enabled
,
2596 .steal_thread
= sched_clutch_steal_thread
,
2597 .compute_timeshare_priority
= sched_compute_timeshare_priority
,
2598 .choose_node
= sched_choose_node
,
2599 .choose_processor
= choose_processor
,
2600 .processor_enqueue
= sched_clutch_processor_enqueue
,
2601 .processor_queue_shutdown
= sched_clutch_processor_queue_shutdown
,
2602 .processor_queue_remove
= sched_clutch_processor_queue_remove
,
2603 .processor_queue_empty
= sched_clutch_processor_queue_empty
,
2604 .priority_is_urgent
= priority_is_urgent
,
2605 .processor_csw_check
= sched_clutch_processor_csw_check
,
2606 .processor_queue_has_priority
= sched_clutch_processor_queue_has_priority
,
2607 .initial_quantum_size
= sched_clutch_initial_quantum_size
,
2608 .initial_thread_sched_mode
= sched_clutch_initial_thread_sched_mode
,
2609 .can_update_priority
= can_update_priority
,
2610 .update_priority
= update_priority
,
2611 .lightweight_update_priority
= lightweight_update_priority
,
2612 .quantum_expire
= sched_default_quantum_expire
,
2613 .processor_runq_count
= sched_clutch_runq_count
,
2614 .processor_runq_stats_count_sum
= sched_clutch_runq_stats_count_sum
,
2615 .processor_bound_count
= sched_clutch_processor_bound_count
,
2616 .thread_update_scan
= sched_clutch_thread_update_scan
,
2617 .multiple_psets_enabled
= TRUE
,
2618 .sched_groups_enabled
= FALSE
,
2619 .avoid_processor_enabled
= TRUE
,
2620 .thread_avoid_processor
= sched_clutch_thread_avoid_processor
,
2621 .processor_balance
= sched_SMT_balance
,
2623 .rt_runq
= sched_rtlocal_runq
,
2624 .rt_init
= sched_rtlocal_init
,
2625 .rt_queue_shutdown
= sched_rtlocal_queue_shutdown
,
2626 .rt_runq_scan
= sched_rtlocal_runq_scan
,
2627 .rt_runq_count_sum
= sched_rtlocal_runq_count_sum
,
2629 .qos_max_parallelism
= sched_qos_max_parallelism
,
2630 .check_spill
= sched_check_spill
,
2631 .ipi_policy
= sched_ipi_policy
,
2632 .thread_should_yield
= sched_thread_should_yield
,
2633 .run_count_incr
= sched_clutch_run_incr
,
2634 .run_count_decr
= sched_clutch_run_decr
,
2635 .update_thread_bucket
= sched_clutch_update_thread_bucket
,
2636 .pset_made_schedulable
= sched_pset_made_schedulable
,
2639 __attribute__((always_inline
))
2640 static inline run_queue_t
2641 sched_clutch_bound_runq(processor_t processor
)
2643 return &processor
->runq
;
2646 __attribute__((always_inline
))
2647 static inline sched_clutch_root_t
2648 sched_clutch_processor_root_clutch(processor_t processor
)
2650 return &processor
->processor_set
->pset_clutch_root
;
2653 __attribute__((always_inline
))
2654 static inline run_queue_t
2655 sched_clutch_thread_bound_runq(processor_t processor
, __assert_only thread_t thread
)
2657 assert(thread
->bound_processor
== processor
);
2658 return sched_clutch_bound_runq(processor
);
2662 sched_clutch_initial_quantum_size(thread_t thread
)
2664 if (thread
== THREAD_NULL
) {
2667 assert(sched_clutch_thread_quantum
[thread
->th_sched_bucket
] <= UINT32_MAX
);
2668 return (uint32_t)sched_clutch_thread_quantum
[thread
->th_sched_bucket
];
2672 sched_clutch_initial_thread_sched_mode(task_t parent_task
)
2674 if (parent_task
== kernel_task
) {
2675 return TH_MODE_FIXED
;
2677 return TH_MODE_TIMESHARE
;
2682 sched_clutch_processor_init(processor_t processor
)
2684 run_queue_init(&processor
->runq
);
2688 sched_clutch_pset_init(processor_set_t pset
)
2690 sched_clutch_root_init(&pset
->pset_clutch_root
, pset
);
2694 sched_clutch_tunables_init(void)
2696 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us
, sched_clutch_root_bucket_wcel
);
2697 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us
, sched_clutch_root_bucket_warp
);
2698 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us
, sched_clutch_thread_quantum
);
2699 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS
,
2700 NSEC_PER_USEC
, &sched_clutch_bucket_group_adjust_threshold
);
2701 assert(sched_clutch_bucket_group_adjust_threshold
<= CLUTCH_CPU_DATA_MAX
);
2702 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us
, sched_clutch_bucket_group_pending_delta
);
2706 sched_clutch_init(void)
2708 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri
, sizeof(sched_clutch_bucket_group_interactive_pri
))) {
2709 sched_clutch_bucket_group_interactive_pri
= SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT
;
2711 sched_timeshare_init();
2712 sched_clutch_tunables_init();
2716 sched_clutch_choose_thread(
2717 processor_t processor
,
2719 __unused ast_t reason
)
2721 int clutch_pri
= sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor
));
2722 uint32_t clutch_count
= sched_clutch_root_count(sched_clutch_processor_root_clutch(processor
));
2723 run_queue_t bound_runq
= sched_clutch_bound_runq(processor
);
2724 boolean_t choose_from_boundq
= false;
2726 if (bound_runq
->highq
< priority
&&
2727 clutch_pri
< priority
) {
2731 if (bound_runq
->count
&& clutch_count
) {
2732 if (bound_runq
->highq
>= clutch_pri
) {
2733 choose_from_boundq
= true;
2735 } else if (bound_runq
->count
) {
2736 choose_from_boundq
= true;
2737 } else if (clutch_count
) {
2738 choose_from_boundq
= false;
2743 thread_t thread
= THREAD_NULL
;
2744 if (choose_from_boundq
== false) {
2745 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2746 thread
= sched_clutch_thread_highest_remove(pset_clutch_root
);
2748 thread
= run_queue_dequeue(bound_runq
, SCHED_HEADQ
);
2754 sched_clutch_processor_enqueue(
2755 processor_t processor
,
2757 sched_options_t options
)
2761 thread
->runq
= processor
;
2762 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) {
2763 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2764 result
= sched_clutch_thread_insert(pset_clutch_root
, thread
, options
);
2766 run_queue_t rq
= sched_clutch_thread_bound_runq(processor
, thread
);
2767 result
= run_queue_enqueue(rq
, thread
, options
);
2773 sched_clutch_processor_queue_empty(processor_t processor
)
2775 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor
)) == 0 &&
2776 sched_clutch_bound_runq(processor
)->count
== 0;
2780 sched_clutch_processor_csw_check(processor_t processor
)
2782 boolean_t has_higher
;
2785 if (sched_clutch_thread_avoid_processor(processor
, current_thread())) {
2786 return AST_PREEMPT
| AST_URGENT
;
2789 run_queue_t bound_runq
= sched_clutch_bound_runq(processor
);
2790 int clutch_pri
= sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor
));
2792 assert(processor
->active_thread
!= NULL
);
2794 pri
= MAX(clutch_pri
, bound_runq
->highq
);
2796 if (processor
->first_timeslice
) {
2797 has_higher
= (pri
> processor
->current_pri
);
2799 has_higher
= (pri
>= processor
->current_pri
);
2803 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor
)) > 0) {
2804 return AST_PREEMPT
| AST_URGENT
;
2807 if (bound_runq
->urgency
> 0) {
2808 return AST_PREEMPT
| AST_URGENT
;
2818 sched_clutch_processor_queue_has_priority(processor_t processor
,
2822 run_queue_t bound_runq
= sched_clutch_bound_runq(processor
);
2824 int qpri
= MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor
)), bound_runq
->highq
);
2827 return qpri
>= priority
;
2829 return qpri
> priority
;
2834 sched_clutch_runq_count(processor_t processor
)
2836 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor
)) + sched_clutch_bound_runq(processor
)->count
;
2840 sched_clutch_runq_stats_count_sum(processor_t processor
)
2842 uint64_t bound_sum
= sched_clutch_bound_runq(processor
)->runq_stats
.count_sum
;
2844 if (processor
->cpu_id
== processor
->processor_set
->cpu_set_low
) {
2845 return bound_sum
+ sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor
));
2851 sched_clutch_processor_bound_count(processor_t processor
)
2853 return sched_clutch_bound_runq(processor
)->count
;
2857 sched_clutch_processor_queue_shutdown(processor_t processor
)
2859 processor_set_t pset
= processor
->processor_set
;
2860 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2862 queue_head_t tqueue
;
2864 /* We only need to migrate threads if this is the last active processor in the pset */
2865 if (pset
->online_processor_count
> 0) {
2870 queue_init(&tqueue
);
2871 while (sched_clutch_root_count(pset_clutch_root
) > 0) {
2872 thread
= sched_clutch_thread_highest_remove(pset_clutch_root
);
2873 enqueue_tail(&tqueue
, &thread
->runq_links
);
2878 qe_foreach_element_safe(thread
, &tqueue
, runq_links
) {
2879 remqueue(&thread
->runq_links
);
2880 thread_lock(thread
);
2881 thread_setrun(thread
, SCHED_TAILQ
);
2882 thread_unlock(thread
);
2887 sched_clutch_processor_queue_remove(
2888 processor_t processor
,
2892 processor_set_t pset
= processor
->processor_set
;
2896 if (processor
== thread
->runq
) {
2898 * Thread is on a run queue and we have a lock on
2901 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) {
2902 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2903 sched_clutch_thread_remove(pset_clutch_root
, thread
, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE
);
2905 rq
= sched_clutch_thread_bound_runq(processor
, thread
);
2906 run_queue_remove(rq
, thread
);
2910 * The thread left the run queue before we could
2911 * lock the run queue.
2913 assert(thread
->runq
== PROCESSOR_NULL
);
2914 processor
= PROCESSOR_NULL
;
2919 return processor
!= PROCESSOR_NULL
;
2923 sched_clutch_steal_thread(__unused processor_set_t pset
)
2925 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
static void
sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
{
    boolean_t       restart_needed = FALSE;
    processor_t     processor = processor_list;
    processor_set_t pset;
    thread_t        thread;
    spl_t           s;

    /*
     * We update the threads associated with each processor (bound and idle threads)
     * and then update the threads in each pset runqueue.
     */

    do {
        do {
            pset = processor->processor_set;

            s = splsched();
            pset_lock(pset);

            restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);

            pset_unlock(pset);
            splx(s);

            if (restart_needed) {
                break;
            }

            thread = processor->idle_thread;
            if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
                if (thread_update_add_thread(thread) == FALSE) {
                    restart_needed = TRUE;
                    break;
                }
            }
        } while ((processor = processor->processor_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);

    pset_node_t node = &pset_node0;
    pset = node->psets;

    do {
        do {
            restart_needed = FALSE;
            while (pset != NULL) {
                s = splsched();
                pset_lock(pset);

                if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
                    for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
                        restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
                        if (restart_needed) {
                            break;
                        }
                    }

                    queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
                    sched_clutch_bucket_t clutch_bucket;
                    qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
                        sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
                        restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
                    }
                }

                pset_unlock(pset);
                splx(s);

                if (restart_needed) {
                    break;
                }
                pset = pset->pset_list;
            }

            if (restart_needed) {
                break;
            }
        } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);
}
extern int sched_allow_rt_smt;

/* Return true if this thread should not continue running on this processor */
static bool
sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
{
    if (processor->processor_primary != processor) {
        /*
         * This is a secondary SMT processor. If the primary is running
         * a realtime thread, only allow realtime threads on the secondary.
         */
        if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
            return true;
        }
    }

    return false;
}
/*
 * For the clutch scheduler, the run counts are maintained in the clutch
 * buckets (i.e. the thread group scheduling structure).
 */
static uint32_t
sched_clutch_run_incr(thread_t thread)
{
    assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
    uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
    return new_count;
}

static uint32_t
sched_clutch_run_decr(thread_t thread)
{
    assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
    uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
    return new_count;
}
static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
{
    sched_bucket_t bucket = TH_BUCKET_RUN;

    if (priority > BASEPRI_USER_INITIATED) {
        bucket = TH_BUCKET_SHARE_FG;
    } else if (priority > BASEPRI_DEFAULT) {
        bucket = TH_BUCKET_SHARE_IN;
    } else if (priority > BASEPRI_UTILITY) {
        bucket = TH_BUCKET_SHARE_DF;
    } else if (priority > MAXPRI_THROTTLE) {
        bucket = TH_BUCKET_SHARE_UT;
    } else {
        bucket = TH_BUCKET_SHARE_BG;
    }
    return bucket;
}
/*
 * For threads that have changed sched_pri without changing the
 * base_pri for any reason other than decay, use the sched_pri
 * as the bucketizing priority instead of base_pri. All such
 * changes are typically due to kernel locking primitives boosts
 * or demotions.
 */
static boolean_t
sched_thread_sched_pri_promoted(thread_t thread)
{
    return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
           (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
           (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
           (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
           (thread->kern_promotion_schedpri != 0);
}
/*
 * Routine to update the scheduling bucket for the thread.
 *
 * In the clutch scheduler implementation, the thread's bucket
 * is based on sched_pri if it was promoted due to a kernel
 * primitive; otherwise it is based on the thread base_pri. This
 * enhancement allows promoted threads to reach a higher priority
 * bucket and potentially get selected sooner for scheduling.
 *
 * Also, the clutch scheduler does not honor fixed priority below
 * FG priority. It simply puts those threads in the corresponding
 * timeshare bucket. The reason for doing that is that it is
 * extremely hard to define the scheduling properties of such threads
 * and they typically lead to performance issues.
 */
static void
sched_clutch_update_thread_bucket(thread_t thread)
{
    sched_bucket_t old_bucket = thread->th_sched_bucket;
    sched_bucket_t new_bucket = TH_BUCKET_RUN;
    assert(thread->runq == PROCESSOR_NULL);
    int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;

    switch (thread->sched_mode) {
    case TH_MODE_FIXED:
        if (pri >= BASEPRI_FOREGROUND) {
            new_bucket = TH_BUCKET_FIXPRI;
        } else {
            new_bucket = sched_convert_pri_to_bucket(pri);
        }
        break;

    case TH_MODE_REALTIME:
        new_bucket = TH_BUCKET_FIXPRI;
        break;

    case TH_MODE_TIMESHARE:
        new_bucket = sched_convert_pri_to_bucket(pri);
        break;

    default:
        panic("unexpected mode: %d", thread->sched_mode);
    }

    if (old_bucket == new_bucket) {
        return;
    }

    thread->th_sched_bucket = new_bucket;
    thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
    /*
     * Since this is called after the thread has been removed from the runq,
     * only the run counts need to be updated. The re-insert into the runq
     * would put the thread into the correct new bucket's runq.
     */
    if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
        sched_clutch_thread_run_bucket_decr(thread, old_bucket);
        sched_clutch_thread_run_bucket_incr(thread, new_bucket);
    }
}
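/*
 * Illustrative sketch (not compiled): the effect of the promotion check in
 * sched_clutch_update_thread_bucket(). A timeshare thread whose base_pri sits in the
 * utility range but which is promoted (e.g. while holding a kernel mutex) is
 * bucketized using its promoted sched_pri, so it lands in a higher bucket until the
 * promotion is dropped. The priority values below are hypothetical.
 */
#if 0
static sched_bucket_t
sched_clutch_example_promoted_bucket(boolean_t promoted)
{
    uint8_t base_pri = 20;   /* hypothetical utility-range base priority */
    uint8_t sched_pri = 31;  /* hypothetical promoted priority */
    int pri = promoted ? sched_pri : base_pri;
    /* promoted: TH_BUCKET_SHARE_DF (pri > BASEPRI_UTILITY); otherwise TH_BUCKET_SHARE_UT */
    return sched_convert_pri_to_bucket((uint8_t)pri);
}
#endif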
#if CONFIG_SCHED_EDGE

/* Implementation of the AMP version of the clutch scheduler */

static void
sched_edge_init(void);

static thread_t
sched_edge_processor_idle(processor_set_t pset);

static ast_t
sched_edge_processor_csw_check(processor_t processor);

static boolean_t
sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static boolean_t
sched_edge_processor_queue_empty(processor_t processor);

static thread_t
sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_edge_processor_queue_shutdown(processor_t processor);

static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);

static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);

static void
sched_edge_balance(processor_t cprocessor, processor_set_t cpset);

static void
sched_edge_check_spill(processor_set_t pset, thread_t thread);

static bool
sched_edge_thread_should_yield(processor_t processor, thread_t thread);

static void
sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);

static bool
sched_edge_steal_thread_enabled(processor_set_t pset);

static sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);

static uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options);
const struct sched_dispatch_table sched_edge_dispatch = {
    .sched_name                            = "edge",
    .init                                  = sched_edge_init,
    .timebase_init                         = sched_timeshare_timebase_init,
    .processor_init                        = sched_clutch_processor_init,
    .pset_init                             = sched_clutch_pset_init,
    .maintenance_continuation              = sched_timeshare_maintenance_continue,
    .choose_thread                         = sched_edge_choose_thread,
    .steal_thread_enabled                  = sched_edge_steal_thread_enabled,
    .steal_thread                          = sched_edge_processor_idle,
    .compute_timeshare_priority            = sched_compute_timeshare_priority,
    .choose_node                           = sched_choose_node,
    .choose_processor                      = sched_edge_choose_processor,
    .processor_enqueue                     = sched_clutch_processor_enqueue,
    .processor_queue_shutdown              = sched_edge_processor_queue_shutdown,
    .processor_queue_remove                = sched_clutch_processor_queue_remove,
    .processor_queue_empty                 = sched_edge_processor_queue_empty,
    .priority_is_urgent                    = priority_is_urgent,
    .processor_csw_check                   = sched_edge_processor_csw_check,
    .processor_queue_has_priority          = sched_edge_processor_queue_has_priority,
    .initial_quantum_size                  = sched_clutch_initial_quantum_size,
    .initial_thread_sched_mode             = sched_clutch_initial_thread_sched_mode,
    .can_update_priority                   = can_update_priority,
    .update_priority                       = update_priority,
    .lightweight_update_priority           = lightweight_update_priority,
    .quantum_expire                        = sched_default_quantum_expire,
    .processor_runq_count                  = sched_clutch_runq_count,
    .processor_runq_stats_count_sum        = sched_clutch_runq_stats_count_sum,
    .processor_bound_count                 = sched_clutch_processor_bound_count,
    .thread_update_scan                    = sched_clutch_thread_update_scan,
    .multiple_psets_enabled                = TRUE,
    .sched_groups_enabled                  = FALSE,
    .avoid_processor_enabled               = TRUE,
    .thread_avoid_processor                = sched_edge_thread_avoid_processor,
    .processor_balance                     = sched_edge_balance,
    .rt_runq                               = sched_amp_rt_runq,
    .rt_init                               = sched_amp_rt_init,
    .rt_queue_shutdown                     = sched_amp_rt_queue_shutdown,
    .rt_runq_scan                          = sched_amp_rt_runq_scan,
    .rt_runq_count_sum                     = sched_amp_rt_runq_count_sum,
    .qos_max_parallelism                   = sched_edge_qos_max_parallelism,
    .check_spill                           = sched_edge_check_spill,
    .ipi_policy                            = sched_edge_ipi_policy,
    .thread_should_yield                   = sched_edge_thread_should_yield,
    .run_count_incr                        = sched_clutch_run_incr,
    .run_count_decr                        = sched_clutch_run_decr,
    .update_thread_bucket                  = sched_clutch_update_thread_bucket,
    .pset_made_schedulable                 = sched_edge_pset_made_schedulable,
    .thread_group_recommendation_change    = NULL,
};
static struct processor_set pset1;
static struct pset_node pset_node1;
static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];

/*
 * sched_edge_pset_available()
 *
 * Routine to determine if a pset is available for scheduling.
 */
static bool
sched_edge_pset_available(processor_set_t pset)
{
    return bitmap_test(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
}

/*
 * sched_edge_thread_bound_cluster_id()
 *
 * Routine to determine which cluster a particular thread is bound to. Uses
 * the sched_flags on the thread to map back to a specific cluster id.
 *
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t
sched_edge_thread_bound_cluster_id(thread_t thread)
{
    assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
    if (thread->sched_flags & TH_SFLAG_ECORE_ONLY) {
        return (pset_array[0]->pset_type == CLUSTER_TYPE_E) ? 0 : 1;
    } else {
        return (pset_array[0]->pset_type == CLUSTER_TYPE_P) ? 0 : 1;
    }
}
/* Forward declarations for some thread migration routines */
static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset);
static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset);
static processor_set_t
sched_edge_steal_candidate(processor_set_t pset);
static processor_set_t
sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);

/*
 * sched_edge_config_set()
 *
 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
 * policies in the scheduler.
 */
static void
sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
{
    sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
    edge->sce_edge_packed = edge_config.sce_edge_packed;
}

/*
 * sched_edge_config_get()
 *
 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
 * if it needs to update edges.
 */
static sched_clutch_edge
sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
{
    return pset_array[src_cluster]->sched_edges[dst_cluster];
}
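/*
 * Illustrative sketch (not compiled): how a caller such as CLPC might use the
 * config_get/config_set pair above to tighten a migration edge. The field values
 * below are hypothetical and only demonstrate the shape of the API.
 */
#if 0
static void
sched_edge_example_tighten_edge(uint32_t src_cluster, uint32_t dst_cluster)
{
    sched_clutch_edge edge = sched_edge_config_get(src_cluster, dst_cluster);
    /* Raise the migration weight so only a heavily loaded source pushes work across this edge */
    edge.sce_migration_weight = 96;    /* hypothetical weight */
    edge.sce_migration_allowed = 1;
    edge.sce_steal_allowed = 1;
    sched_edge_config_set(src_cluster, dst_cluster, edge);
}
#endif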
#if DEVELOPMENT || DEBUG

/*
 * Helper Routines for edge scheduler sysctl configuration
 *
 * The current support is limited to dual cluster AMP platforms.
 * <Edge Multi-cluster Support Needed>
 */

kern_return_t
sched_edge_sysctl_configure_e_to_p(uint64_t edge_config)
{
    pset_array[ecore_cluster_id]->sched_edges[pcore_cluster_id].sce_edge_packed = edge_config;
    return KERN_SUCCESS;
}

kern_return_t
sched_edge_sysctl_configure_p_to_e(uint64_t edge_config)
{
    pset_array[pcore_cluster_id]->sched_edges[ecore_cluster_id].sce_edge_packed = edge_config;
    return KERN_SUCCESS;
}

sched_clutch_edge
sched_edge_e_to_p(void)
{
    return sched_edge_config_get(ecore_cluster_id, pcore_cluster_id);
}

sched_clutch_edge
sched_edge_p_to_e(void)
{
    return sched_edge_config_get(pcore_cluster_id, ecore_cluster_id);
}

#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_edge_matrix_set()
 *
 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
    uint32_t edge_index = 0;
    for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
        for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
            if (edge_changes_bitmap[edge_index]) {
                sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
            }
            edge_index++;
        }
    }
}

/*
 * sched_edge_matrix_get()
 *
 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
    uint32_t edge_index = 0;
    for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
        for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
            if (edge_request_bitmap[edge_index]) {
                edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
            }
            edge_index++;
        }
    }
}
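/*
 * Illustrative sketch (not compiled): the flattened indexing used by the matrix
 * routines above. For a matrix of order N, the edge from src_cluster s to
 * dst_cluster d lives at index (s * N) + d in both arrays.
 */
#if 0
static inline uint32_t
sched_edge_example_matrix_index(uint32_t src_cluster, uint32_t dst_cluster, uint64_t matrix_order)
{
    /* e.g. on a 2-cluster system (matrix_order == 2), the edge 1 -> 0 sits at index 2 */
    return (uint32_t)((src_cluster * matrix_order) + dst_cluster);
}
#endif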
/*
 * sched_edge_init()
 *
 * Routine to initialize the data structures for the Edge scheduler. The current implementation
 * relies on this being enabled for a dual cluster AMP system. Once a better design for MAX_PSETS,
 * edge config etc. is defined, it should be made more generic to handle multi-cluster
 * platforms.
 * <Edge Multi-cluster Support Needed>
 */
static void
sched_edge_init(void)
{
    processor_set_t ecore_set = &pset0;
    processor_set_t pcore_set = &pset1;

    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        /* If the device boots on a P-cluster, fixup the IDs etc. */
        bitmap_set(sched_edge_available_pset_bitmask, pcore_cluster_id);
    } else {
        bitmap_set(sched_edge_available_pset_bitmask, ecore_cluster_id);
    }

    ecore_set->pset_cluster_type = PSET_AMP_E;
    ecore_set->pset_cluster_id = ecore_cluster_id;

    pcore_set->pset_cluster_type = PSET_AMP_P;
    pcore_set->pset_cluster_id = pcore_cluster_id;

    pset_init(&pset1, &pset_node1);
    pset_node1.psets = &pset1;
    pset_node0.node_list = &pset_node1;

    pset_array[ecore_cluster_id] = ecore_set;
    pset_array[ecore_cluster_id]->pset_type = CLUSTER_TYPE_E;
    bitmap_set(pset_array[ecore_cluster_id]->foreign_psets, pcore_cluster_id);

    sched_edge_config_set(ecore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
    sched_edge_config_set(ecore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});

    pset_array[pcore_cluster_id] = pcore_set;
    pset_array[pcore_cluster_id]->pset_type = CLUSTER_TYPE_P;
    bitmap_set(pset_array[pcore_cluster_id]->foreign_psets, ecore_cluster_id);

    sched_edge_config_set(pcore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
    sched_edge_config_set(pcore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});

    sched_timeshare_init();
    sched_clutch_tunables_init();
}
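/*
 * Illustrative sketch (not compiled): one way the boot-cluster "fixup" mentioned in
 * sched_edge_init() could look. Whether the cluster id globals are swapped or the
 * pset pointers are re-pointed is an assumption here; only the intent (the boot
 * cluster becomes cluster 0) is taken from the comment above.
 */
#if 0
static void
sched_edge_example_boot_cluster_fixup(void)
{
    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        /* Hypothetical: cluster 0 is the P-cluster when the device boots on P-cores */
        pcore_cluster_id = 0;
        ecore_cluster_id = 1;
    }
}
#endif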
static thread_t
sched_edge_choose_thread(
    processor_t     processor,
    int             priority,
    __unused ast_t  reason)
{
    int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);
    boolean_t choose_from_boundq = false;

    if ((bound_runq->highq < priority) &&
        (clutch_pri < priority)) {
        return THREAD_NULL;
    }

    if (bound_runq->highq >= clutch_pri) {
        choose_from_boundq = true;
    }

    thread_t thread = THREAD_NULL;
    if (choose_from_boundq == false) {
        sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
        thread = sched_clutch_thread_highest_remove(pset_clutch_root);
    } else {
        thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
    }
    return thread;
}

static boolean_t
sched_edge_processor_queue_empty(processor_t processor)
{
    return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
           (sched_clutch_bound_runq(processor)->count == 0);
}

static void
sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
{
    assert(thread->bound_processor == PROCESSOR_NULL);
}
__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
    SCHED_EDGE_YIELD_RUNQ_NONEMPTY      = 0x0,
    SCHED_EDGE_YIELD_FOREIGN_RUNNABLE   = 0x1,
    SCHED_EDGE_YIELD_FOREIGN_RUNNING    = 0x2,
    SCHED_EDGE_YIELD_STEAL_POSSIBLE     = 0x3,
    SCHED_EDGE_YIELD_DISALLOW           = 0x4,
});

static bool
sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
{
    if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
        return true;
    }

    /*
     * The yield logic should follow the same logic that steal_thread() does. The
     * thread_should_yield() is effectively trying to quickly check whether, if the
     * current thread gave up the CPU, there is any other thread that would execute
     * on this CPU. So it needs to provide the same answer as the steal_thread()/
     * processor idle logic.
     */
    if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
        return true;
    }
    if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
        return true;
    }

    processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
    if (steal_candidate != NULL) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
        return true;
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
        0, SCHED_EDGE_YIELD_DISALLOW);
    return false;
}
static ast_t
sched_edge_processor_csw_check(processor_t processor)
{
    boolean_t       has_higher;
    int             pri;

    int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);

    assert(processor->active_thread != NULL);

    pri = MAX(clutch_pri, bound_runq->highq);

    if (processor->first_timeslice) {
        has_higher = (pri > processor->current_pri);
    } else {
        has_higher = (pri >= processor->current_pri);
    }

    if (has_higher) {
        if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        if (bound_runq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        return AST_PREEMPT;
    }

    return AST_NONE;
}

static boolean_t
sched_edge_processor_queue_has_priority(processor_t processor,
    int priority,
    boolean_t gte)
{
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);

    int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);

    if (gte) {
        return qpri >= priority;
    } else {
        return qpri > priority;
    }
}
static void
sched_edge_processor_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
    thread_t thread;
    queue_head_t tqueue;

    /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
    if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
        pset_unlock(pset);
        return;
    }

    bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);

    queue_init(&tqueue);
    while (sched_clutch_root_count(pset_clutch_root) > 0) {
        thread = sched_clutch_thread_highest_remove(pset_clutch_root);
        enqueue_tail(&tqueue, &thread->runq_links);
    }

    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }
}
/*
 * sched_edge_cluster_load_metric()
 *
 * The load metric for a cluster is a measure of the average scheduling latency
 * experienced by threads on that cluster. It is a product of the average number
 * of threads in the runqueue and the average execution time for threads. The metric
 * has special values in the following cases:
 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
 *   the maximum value to disallow any threads to migrate to this cluster.
 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
 *   to be spread across the platform quickly for ncpu wide workloads.
 */
static uint32_t
sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
{
    if (sched_edge_pset_available(pset) == false) {
        return UINT32_MAX;
    }
    return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
}
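/*
 * Illustrative sketch (not compiled): how the load metric above is typically consumed,
 * i.e. compared against an edge's migration weight to decide whether a source cluster
 * is loaded enough relative to a destination to push threads across that edge. The
 * weight value is hypothetical.
 */
#if 0
static bool
sched_edge_example_cluster_overloaded(processor_set_t src_pset, processor_set_t dst_pset, sched_bucket_t bucket, uint32_t migration_weight)
{
    uint32_t src_load = sched_edge_cluster_load_metric(src_pset, bucket);
    uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, bucket);
    /* e.g. src_load 100, dst_load 20, weight 64: delta 80 >= 64, so migration makes sense */
    return (src_load > dst_load) && ((src_load - dst_load) >= migration_weight);
}
#endif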
/*
 * Edge Scheduler Steal/Rebalance logic
 *
 * = Generic scheduler logic =
 *
 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
 * find any thread for execution in its runqueue. The aim of the steal operation
 * is to find other threads running/runnable in other clusters which should be
 * executed on this cluster instead.
 *
 * If the steal callout does not return a thread, the thread_select() logic calls
 * the SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
 * threads and idle out the current CPU.
 *
 * = SCHED(steal_thread) for Edge Scheduler =
 *
 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
 * routine tries to do the following operations in order:
 * (1) Find foreign runnable threads in non-native cluster
 *     runqueues (sched_edge_foreign_runnable_thread_remove())
 * (2) Check if foreign threads are running on the non-native
 *     clusters (sched_edge_foreign_running_thread_available())
 *     - If yes, return THREAD_NULL for the steal callout and
 *       perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
 * (3) Steal a thread from another cluster based on edge
 *     weights (sched_edge_steal_thread())
 *
 * = SCHED(processor_balance) for Edge Scheduler =
 *
 * If steal_thread did not return a thread for the processor, use
 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
 *
 * = Clutch Bucket Preferred Cluster Overrides =
 *
 * Since these operations (just like thread migrations on enqueue)
 * move threads across clusters, they need support for handling clutch
 * bucket group level preferred cluster recommendations.
 * For (1), a clutch bucket will be in the foreign runnable queue based
 * on the clutch bucket group preferred cluster.
 * For (2), the running thread will set the bit on the processor based
 * on its preferred cluster type.
 * For (3), the edge configuration would prevent threads from being stolen
 * in the wrong direction.
 *
 * = SCHED(thread_should_yield) =
 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
 * since that is expecting the same answer as if thread_select() was called on a core
 * with an empty runqueue.
 */
static bool
sched_edge_steal_thread_enabled(__unused processor_set_t pset)
{
    /*
     * For the edge scheduler, the gating for steal is done by sched_edge_steal_candidate()
     */
    return true;
}

static processor_set_t
sched_edge_steal_candidate(processor_set_t pset)
{
    /*
     * Edge Scheduler Optimization
     *
     * Investigate a better policy for stealing. The current implementation looks
     * at all the incoming weights for the pset that just became idle and sees which
     * clusters have loads > edge weights. It is effectively trying to simulate an
     * overload migration as if a thread had become runnable on the candidate cluster.
     *
     * The logic today bails as soon as it finds a cluster where the cluster load is
     * greater than the edge weight. This helps the check to be quick which is useful
     * for sched_edge_thread_should_yield() which uses this. Maybe it should have a
     * more advanced version for the actual steal operation which looks for the
     * maximum delta etc.
     */
    processor_set_t target_pset = NULL;
    uint32_t dst_cluster_id = pset->pset_cluster_id;

    for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t candidate_pset = pset_array[cluster_id];

        if (candidate_pset == pset) {
            continue;
        }

        sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
        if (incoming_edge->sce_steal_allowed == false) {
            continue;
        }

        uint32_t incoming_weight = incoming_edge->sce_migration_weight;
        int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        if (highest_runnable_bucket == -1) {
            /* Candidate cluster runq is empty */
            continue;
        }

        /* Use the load metric for the highest runnable bucket since that would be stolen next */
        uint32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
        if (candidate_load > incoming_weight) {
            /* Only steal from the candidate if its load is higher than the incoming edge weight and it has runnable threads */
            target_pset = candidate_pset;
            break;
        }
    }

    return target_pset;
}
static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset)
{
    /* Find all the clusters that are foreign for this cluster */
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /*
         * For each cluster, see if there are any runnable foreign threads.
         * This check is currently being done without the pset lock to make it cheap for
         * the idle thread.
         */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
            return true;
        }
    }
    return false;
}
static thread_t
sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
{
    thread_t thread = THREAD_NULL;

    /* Find all the clusters that are foreign for this cluster */
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /*
         * For each cluster, see if there are any runnable foreign threads.
         * This check is currently being done without the pset lock to make it cheap for
         * the idle thread.
         */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
            continue;
        }

        /*
         * Looks like there are runnable foreign threads in the hierarchy; lock the pset
         * and get the highest priority thread.
         */
        pset_lock(target_pset);
        if (sched_edge_pset_available(target_pset)) {
            thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
            sched_update_pset_load_average(target_pset, ctime);
        }
        pset_unlock(target_pset);

        /*
         * Edge Scheduler Optimization
         *
         * The current implementation immediately returns as soon as it finds a foreign
         * runnable thread. This could be enhanced to look at highest priority threads
         * from all foreign clusters and pick the highest amongst them. That would need
         * some form of global state across psets to make that kind of a check cheap.
         */
        if (thread != THREAD_NULL) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
            break;
        }
        /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
    }

    return thread;
}
static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset)
{
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /* Skip the pset if it's not schedulable */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
        if (lsb_first(running_foreign_bitmap) != -1) {
            /* Found a non-native CPU running a foreign thread; rebalance is needed */
            return true;
        }
    }
    return false;
}
static thread_t
sched_edge_steal_thread(processor_set_t pset)
{
    thread_t thread = THREAD_NULL;
    processor_set_t steal_from_pset = sched_edge_steal_candidate(pset);
    if (steal_from_pset) {
        /*
         * sched_edge_steal_candidate() has found a pset which is ideal to steal from.
         * Lock the pset and select the highest thread in that runqueue.
         */
        pset_lock(steal_from_pset);
        if (bitmap_first(steal_from_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1) {
            uint64_t current_timestamp = mach_absolute_time();
            sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
            thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
            sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
            sched_update_pset_load_average(steal_from_pset, current_timestamp);
        }
        /*
         * Edge Scheduler Optimization
         * Maybe this needs to circle around if the steal candidate did not have any threads by
         * the time the pset lock was taken.
         */
        pset_unlock(steal_from_pset);
    }
    return thread;
}
/*
 * sched_edge_processor_idle()
 *
 * The routine is the implementation of steal_thread() for the Edge scheduler.
 */
static thread_t
sched_edge_processor_idle(processor_set_t pset)
{
    thread_t thread = THREAD_NULL;

    uint64_t ctime = mach_absolute_time();

    /* Each of the operations acquires the lock for the pset it targets */
    pset_unlock(pset);

    /* Find highest priority runnable thread on all non-native clusters */
    thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
    if (thread != THREAD_NULL) {
        return thread;
    }

    /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
    boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
    if (rebalance_needed) {
        return THREAD_NULL;
    }

    /* No foreign threads found; find a thread to steal from a pset based on weights/loads etc. */
    thread = sched_edge_steal_thread(pset);
    return thread;
}
/* Return true if this thread should not continue running on this processor */
static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
{
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    /*
     * For long running parallel workloads, it is important to rebalance threads across
     * E/P clusters so that they make equal forward progress. This is achieved through
     * threads expiring their quantum on the non-preferred cluster type and explicitly
     * rebalancing to the preferred cluster runqueue.
     *
     * <Edge Multi-Cluster Support Needed>
     * For multi-cluster platforms, it might be useful to move the thread in case its
     * preferred pset is idle now.
     */
    if (processor->processor_set->pset_type != preferred_pset->pset_type) {
        return true;
    }
    /* If the thread is already running on the preferred cluster, do not avoid */
    if (processor->processor_set == preferred_pset) {
        return false;
    }
    /*
     * The thread is running on a processor that is of the same type as the
     * preferred pset, but is not the actual preferred pset. In that case
     * look at edge weights to see if this thread should continue execution
     * here or go back to its preferred cluster.
     *
     * <Edge Multi-Cluster Support Needed>
     * This logic needs to ensure that the current thread is not counted against
     * the load average for the current pset otherwise it would always end up avoiding
     * the current cluster.
     */
    processor_set_t chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, processor->processor_set, false);
    return chosen_pset != processor->processor_set;
}
static void
sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
{
    assert(cprocessor == current_processor());
    pset_unlock(cpset);

    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /* Skip the pset if it's not schedulable */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        pset_lock(target_pset);
        uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
        for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
            processor_t target_cpu = processor_array[cpuid];
            ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[cpuid] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, cpuid);
            }
        }
        pset_unlock(target_pset);
    }

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
    }
}
/*
 * sched_edge_migrate_edges_evaluate()
 *
 * Routine to find the candidate for thread migration based on edge weights.
 *
 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 * return preferred_pset if it is the most ideal destination for this thread.
 */
static processor_set_t
sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
{
    processor_set_t selected_pset = preferred_pset;
    uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
    cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);

    /* Look at edge deltas with other clusters to find the ideal migration candidate */
    sched_clutch_edge *edge = preferred_pset->sched_edges;
    uint32_t max_edge_delta = 0;

    /*
     * Edge Scheduler Optimization
     *
     * For really large cluster count systems, it might make sense to optimize the
     * clusters iterated by using bitmaps and skipping over clusters that are not
     * available for scheduling or have migration disabled from this cluster.
     */
    for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t dst_pset = pset_array[cluster_id];
        if (cluster_id == preferred_cluster_id) {
            continue;
        }

        if (edge[cluster_id].sce_migration_allowed == false) {
            continue;
        }

        uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
        if (dst_load > preferred_cluster_load) {
            continue;
        }

        /*
         * Fast path for idle dst cluster
         *
         * For extremely parallel workloads, it is important to load up
         * all clusters as quickly as possible. This short-circuit allows
         * that to happen.
         * <Edge Multi-cluster Support Needed>
         *
         * For multi-cluster platforms, the loop should start with the homogeneous
         * clusters first.
         */
        if (dst_load == 0) {
            selected_pset = dst_pset;
            break;
        }

        uint32_t edge_delta = preferred_cluster_load - dst_load;
        if (edge_delta < edge[cluster_id].sce_migration_weight) {
            continue;
        }

        if (edge_delta < max_edge_delta) {
            continue;
        }

        if (edge_delta == max_edge_delta) {
            /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
            boolean_t selected_homogeneous = (pset_type_for_id(selected_pset->pset_cluster_id) == preferred_cluster_type);
            boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
            if (selected_homogeneous || !candidate_homogeneous) {
                continue;
            }
        }

        /* dst_pset seems to be the best candidate for migration */
        max_edge_delta = edge_delta;
        selected_pset = dst_pset;
    }
    return selected_pset;
}
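/*
 * Illustrative sketch (not compiled): a worked example of the edge-delta check in
 * sched_edge_migrate_edges_evaluate(). The loads and weight below are hypothetical
 * numbers, chosen only to show when a destination cluster wins.
 */
#if 0
static void
sched_edge_example_edge_delta(void)
{
    uint32_t preferred_cluster_load = 90;   /* hypothetical load on the preferred cluster */
    uint32_t dst_load = 20;                 /* hypothetical load on a candidate cluster */
    uint32_t migration_weight = 64;         /* hypothetical sce_migration_weight on the edge */

    uint32_t edge_delta = preferred_cluster_load - dst_load;   /* 70 */
    bool candidate_wins = (edge_delta >= migration_weight);    /* 70 >= 64, so the candidate is selected */
    (void)candidate_wins;
}
#endif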
/*
 * sched_edge_candidate_alternative()
 *
 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
 * selected_pset is not available for execution. The logic tries to prefer homogeneous
 * clusters over heterogeneous clusters since this is typically used in thread
 * placement decisions.
 */
_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
static processor_set_t
sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
{
    /*
     * It looks like the most ideal pset is not available for scheduling currently.
     * Try to find a homogeneous cluster that is still available.
     */
    bitmap_t *foreign_clusters = selected_pset->foreign_psets;
    uint64_t available_native_clusters = ~(foreign_clusters[0]) & candidate_cluster_bitmap;
    int available_cluster_id = lsb_first(available_native_clusters);
    if (available_cluster_id == -1) {
        /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
        available_cluster_id = bit_first(candidate_cluster_bitmap);
    }
    assert(available_cluster_id != -1);
    return pset_array[available_cluster_id];
}
/*
 * sched_edge_switch_pset_lock()
 *
 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
 * switch_pset_locks.
 * Returns the newly locked pset after the switch.
 */
static processor_set_t
sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
{
    if (!switch_pset_locks) {
        return locked_pset;
    }
    if (selected_pset != locked_pset) {
        pset_unlock(locked_pset);
        pset_lock(selected_pset);
        return selected_pset;
    } else {
        return locked_pset;
    }
}
/*
 * sched_edge_migrate_candidate()
 *
 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
 * the thread and the preferred cluster to determine the best available pset for scheduling.
 *
 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns the best effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
 *
 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
 * resultant pset lock is held.
 */
static processor_set_t
sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
{
    __kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
    processor_set_t selected_pset = preferred_pset;

    if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
        /* For bound threads always recommend the cluster it is bound to */
        selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
        locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
        if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
            /*
             * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
             * fall through to the regular cluster selection logic which handles unavailable clusters
             * appropriately. If the thread is hard bound, then return the bound cluster always.
             */
            return selected_pset;
        }
    }

    uint64_t candidate_cluster_bitmap = mask(MAX_PSETS);
    if (thread->sched_pri >= BASEPRI_RTQUEUES) {
        /* For realtime threads, try and schedule them on the preferred pset always */
        goto migrate_candidate_available_check;
    }

    /*
     * If a thread is being rebalanced for achieving equal progress of parallel workloads,
     * it needs to end up on the preferred runqueue.
     */
    uint32_t preferred_cluster_load = sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
    boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
    if ((preferred_cluster_load == 0) || amp_rebalance) {
        goto migrate_candidate_available_check;
    }

    /* Look at edge weights to decide the most ideal migration candidate for this thread */
    selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);

migrate_candidate_available_check:
    locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
    if (sched_edge_pset_available(selected_pset) == true) {
        if (selected_pset != preferred_pset) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
        }
        return selected_pset;
    }
    /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
    bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
    if (__improbable(bitmap_first(&candidate_cluster_bitmap, MAX_PSETS) == -1)) {
        /*
         * None of the clusters are available for scheduling; this situation should be rare but if it happens,
         * simply return the boot cluster.
         */
        selected_pset = &pset0;
        locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
        if (selected_pset != preferred_pset) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
        }
        return selected_pset;
    }
    /* Try and find an alternative for the selected pset */
    selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
    goto migrate_candidate_available_check;
}
static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
{
    /* Bound threads don't call this function */
    assert(thread->bound_processor == PROCESSOR_NULL);
    processor_t chosen_processor = PROCESSOR_NULL;

    /*
     * sched_edge_preferred_pset() returns the preferred pset for a given thread.
     * It should take the passed in "pset" as a hint which represents the recency metric for
     * pset selection logic.
     */
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    processor_set_t chosen_pset = preferred_pset;
    /*
     * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
     * threads to. sched_edge_migrate_candidate() returns the preferred pset
     * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
     *
     * <Edge Multi-cluster Support Needed>
     * It might be useful to build a recency metric for the thread for multiple clusters and
     * factor that into the migration decisions.
     */
    chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
    chosen_processor = choose_processor(chosen_pset, processor, thread);
    assert(chosen_processor->processor_set == chosen_pset);
    return chosen_processor;
}
/*
 * sched_edge_clutch_bucket_threads_drain()
 *
 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
 * bucket overrides etc.) into a local thread queue.
 */
static void
sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
{
    thread_t thread = THREAD_NULL;
    uint64_t current_timestamp = mach_approximate_time();
    qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
        sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
        enqueue_tail(clutch_threads, &thread->runq_links);
    }
}

/*
 * sched_edge_run_drained_threads()
 *
 * Makes all drained threads in a local queue runnable.
 */
static void
sched_edge_run_drained_threads(queue_t clutch_threads)
{
    thread_t thread;
    /* Now setrun all the threads in the local queue */
    qe_foreach_element_safe(thread, clutch_threads, runq_links) {
        remqueue(&thread->runq_links);
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }
}
/*
 * sched_edge_update_preferred_cluster()
 *
 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
 */
static void
sched_edge_update_preferred_cluster(
    sched_clutch_t sched_clutch,
    bitmap_t *clutch_bucket_modify_bitmap,
    uint32_t *tg_bucket_preferred_cluster)
{
    for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
        os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
    }
}
/*
 * sched_edge_migrate_thread_group_runnable_threads()
 *
 * Routine to implement the migration of threads on a cluster when the thread group
 * recommendation is updated. The migration works using a 2-phase approach.
 *
 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
 * appropriate.
 *
 * Phase 2: After unlocking the pset, drain all the threads from the local thread
 * queue and mark them runnable which should land them in the right hierarchy.
 *
 * The routine assumes that the preferences for the clutch buckets/clutch bucket
 * groups have already been updated by the caller.
 *
 * - Called with the pset locked and interrupts disabled.
 * - Returns with the pset unlocked.
 */
static void
sched_edge_migrate_thread_group_runnable_threads(
    sched_clutch_t sched_clutch,
    sched_clutch_root_t root_clutch,
    bitmap_t *clutch_bucket_modify_bitmap,
    __unused uint32_t *tg_bucket_preferred_cluster,
    bool migrate_immediately)
{
    /* Queue to hold threads that have been drained from clutch buckets to be migrated */
    queue_head_t clutch_threads;
    queue_init(&clutch_threads);

    for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
        /* Get the clutch bucket for this cluster and sched bucket */
        sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
        sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
        sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
        if (scb_root == NULL) {
            /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
            assert(clutch_bucket->scb_thr_count == 0);
            continue;
        }
        assert(scb_root == root_clutch);
        uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);

        if (migrate_immediately) {
            /*
             * For transitions where threads need to be migrated immediately, drain the threads into a
             * local queue unless we are looking at the clutch buckets for the newly recommended
             * cluster.
             */
            if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
                sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
            } else {
                sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
            }
        } else {
            /* Check if this cluster is the same type as the newly recommended cluster */
            boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
            /*
             * If threads do not have to be migrated immediately, just change the native/foreign
             * flag on the clutch bucket.
             */
            if (homogeneous_cluster) {
                sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
            } else {
                sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
            }
        }
    }

    pset_unlock(root_clutch->scr_pset);
    sched_edge_run_drained_threads(&clutch_threads);
}
/*
 * sched_edge_migrate_thread_group_running_threads()
 *
 * Routine to find all running threads of a thread group on a specific cluster
 * and IPI them if they need to be moved immediately.
 */
static void
sched_edge_migrate_thread_group_running_threads(
    sched_clutch_t sched_clutch,
    sched_clutch_root_t root_clutch,
    __unused bitmap_t *clutch_bucket_modify_bitmap,
    uint32_t *tg_bucket_preferred_cluster,
    bool migrate_immediately)
{
    if (migrate_immediately == false) {
        /* If CLPC has recommended not to move threads immediately, nothing to do here */
        return;
    }

    /*
     * Edge Scheduler Optimization
     *
     * When the system has a large number of clusters and cores, it might be useful to
     * narrow down the iteration by using a thread running bitmap per clutch.
     */
    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
    /*
     * Iterate all CPUs and look for the ones running threads from this thread group and are
     * not restricted to the specific cluster (due to overrides etc.)
     */
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor_t src_processor = processor_array[cpuid];
        boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
        sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
        boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;

        if (expected_tg && non_preferred_cluster) {
            ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[cpuid] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, cpuid);
            } else if (src_processor == current_processor()) {
                ast_on(AST_PREEMPT);
                bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
            }
        }
    }

    /* Perform all the IPIs */
    if (bit_first(ast_processor_map) != -1) {
        for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
            processor_t ast_processor = processor_array[cpuid];
            sched_ipi_perform(ast_processor, ipi_type[cpuid]);
        }
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
    }
}
/*
 * sched_edge_tg_preferred_cluster_change()
 *
 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
 * thread migrations based on the policy specified by CLPC.
 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
 * for each QoS within the thread group.
 */
void
sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
{
    sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
    /*
     * In order to optimize the processing, create a bitmap which represents all QoS buckets
     * for which the preferred cluster has changed.
     */
    bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
    for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
        uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
        uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
        if (old_preferred_cluster != new_preferred_cluster) {
            bitmap_set(clutch_bucket_modify_bitmap, bucket);
        }
    }
    if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
        /* No changes in any clutch buckets; nothing to do here */
        return;
    }

    for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t pset = pset_array[cluster_id];
        spl_t s = splsched();
        pset_lock(pset);
        /*
         * The first operation is to update the preferred cluster for all QoS buckets within the
         * thread group so that any future threads becoming runnable would see the new preferred
         * cluster value.
         */
        sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
        /*
         * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
         * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
         *
         * <Edge Multi-cluster Support Needed>
         */
        /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
        sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
            tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
        /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
        sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
            tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
        /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
        splx(s);
    }
}
4472 * sched_edge_pset_made_schedulable()
4474 * Routine to migrate all the clutch buckets which are not in their recommended
4475 * pset hierarchy now that a new pset has become runnable. Its possible that this
4476 * routine is called when the pset is already marked schedulable.
4478 * Invoked with the pset lock held and interrupts disabled.
void
sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
{
	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
		/* Nothing to do here since pset is already marked schedulable */
		if (drop_lock) {
			pset_unlock(dst_pset);
		}
		return;
	}

	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);

	thread_t thread = sched_edge_processor_idle(dst_pset);
	if (thread != THREAD_NULL) {
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}

	if (!drop_lock) {
		pset_lock(dst_pset);
	}
}
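/*
 * Locking sketch (illustrative; the caller shown is hypothetical and this assumes
 * sched_edge_processor_idle() returns with the pset unlocked, which is why the lock
 * is re-acquired above when drop_lock is FALSE): with drop_lock == FALSE the routine
 * returns with the pset lock still held, so a caller that needs to keep operating on
 * the pset can do:
 *
 *	pset_lock(pset);
 *	sched_edge_pset_made_schedulable(processor, pset, FALSE);
 *	... pset lock is still held at this point ...
 *	pset_unlock(pset);
 *
 * With drop_lock == TRUE the routine returns with the pset unlocked.
 */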
extern int sched_amp_spill_deferred_ipi;
extern int sched_amp_pcores_preempt_immediate_ipi;
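/*
 * When sched_edge_migrate_ipi_immediate is set, preemption IPIs for non-RT threads are
 * upgraded from the deferred policy to immediate IPIs whenever the thread's preferred
 * pset is expected to stay busy for at least the deferred IPI timeout (see the average
 * execution time check in sched_edge_ipi_policy() below).
 */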
int sched_edge_migrate_ipi_immediate = 1;
sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For the spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_edge_migrate_ipi_immediate) {
				/*
				 * For workloads that are going wide, it might be useful to use an immediate IPI to
				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
				 * latency on the preferred pset as an estimate of busyness.
				 *
				 * <Edge Multi-cluster Support Needed>
				 */
				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
				}
			}
			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}

	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
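/*
 * Worked example with made-up numbers for the sched_edge_migrate_ipi_immediate path
 * above: for a non-RT preemption, if the average thread execution time for the thread's
 * scheduling bucket on its preferred pset is 100us, the policy compares
 * 100 * NSEC_PER_USEC = 100000ns against ml_cpu_signal_deferred_get_timer(). With a
 * deferred IPI timer of, say, 64us (64000ns), 100000 >= 64000, so an idle destination
 * gets SCHED_IPI_IDLE and a busy one gets SCHED_IPI_IMMEDIATE instead of a deferred IPI.
 */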
/*
 * sched_edge_qos_max_parallelism()
 */
uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = 0;
	uint32_t pcount = 0;

	for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
		processor_set_t pset = pset_array[cluster_id];
		if (pset_type_for_id(cluster_id) == CLUSTER_TYPE_P) {
			pcount += pset->cpu_set_count;
		} else {
			ecount += pset->cpu_set_count;
		}
	}

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, limit the width to just the P-cores
		 * since we do not spill/rebalance for RT threads.
		 */
		return pcount;
	}

	/*
	 * The Edge scheduler supports per-QoS recommendations for thread groups.
	 * This enables lower QoS buckets (such as UT) to be scheduled on all
	 * CPUs on the system.
	 *
	 * The only restriction is for BG/Maintenance QoS classes for which the
	 * performance controller would never recommend execution on the P-cores.
	 * If that policy changes in the future, this value should be changed.
	 */
	switch (qos) {
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		return ecount;
	default:
		return ecount + pcount;
	}
}
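/*
 * Worked example with a hypothetical topology of one 4-CPU E-cluster and one 2-CPU
 * P-cluster (ecount = 4, pcount = 2):
 *
 *	sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, 0)                         -> 6
 *	sched_edge_qos_max_parallelism(THREAD_QOS_BACKGROUND, 0)                             -> 4
 *	sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, QOS_PARALLELISM_REALTIME)  -> 2
 */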
#endif /* CONFIG_SCHED_EDGE */

#endif /* CONFIG_SCHED_CLUTCH */