/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <kern/sched_clutch.h>
#include <sys/kdebug.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */
#if CONFIG_SCHED_CLUTCH

/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
        SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE           = 0,
        SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY   = 1,
        SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL            = 2,
});

static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */
/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
        SCHED_CLUTCH_BUCKET_OPTIONS_NONE        = 0x0,
        /* Round robin clutch bucket on thread removal */
        SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR  = 0x1,
        /* Insert clutch bucket at head (for thread preemption) */
        SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ       = 0x2,
        /* Insert clutch bucket at tail (default) */
        SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ       = 0x4,
});
/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
static boolean_t sched_thread_sched_pri_promoted(thread_t);
#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];
/*
 * Special markers for buckets that have invalid WCELs/quantums etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for a bucket is defined by
 * its worst-case-execution-latency (WCEL) and its make-runnable timestamp.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
        SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
        0,                                              /* FG */
        37500,                                          /* IN (37.5ms) */
        75000,                                          /* DF (75ms) */
        150000,                                         /* UT (150ms) */
        250000                                          /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
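/*
 * Illustrative example of the WCEL-based deadlines (a sketch, not additional
 * policy): if the DF and UT root buckets both become runnable at time T, their
 * EDF deadlines are roughly T + 75ms and T + 150ms respectively, so the DF
 * bucket is preferred until its deadline is pushed out by being selected.
 */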
/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
        SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
        8000,                                           /* FG (8ms) */
        4000,                                           /* IN (4ms) */
        2000,                                           /* DF (2ms) */
        1000,                                           /* UT (1ms) */
        0                                               /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
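/*
 * Illustrative example: if the FG root bucket still has warp remaining, it can
 * be selected ahead of a lower-QoS bucket that currently has an earlier EDF
 * deadline; a warp window is opened at that point and the preference lasts
 * only until the window expires (see sched_clutch_root_highest_root_bucket()).
 */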
/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * the lower priority levels can preempt the higher priority threads.
 */
#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
        10000,                                          /* FIXPRI (10ms) */
        10000,                                          /* FG (10ms) */
        10000,                                          /* IN (10ms) */
        10000,                                          /* DF (10ms) */
        10000,                                          /* UT (10ms) */
        10000                                           /* BG (10ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
        10000,                                          /* FIXPRI (10ms) */
        10000,                                          /* FG (10ms) */
        8000,                                           /* IN (8ms) */
        6000,                                           /* DF (6ms) */
        4000,                                           /* UT (4ms) */
        2000                                            /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
        for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
                if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
                        abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
                } else {
                        clock_interval_to_absolutetime_interval(us_vals[i],
                            NSEC_PER_USEC, &abstime_vals[i]);
                }
        }
}
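/*
 * Illustrative usage (a sketch; the actual call sites are in the scheduler
 * init path elsewhere in this file):
 *
 *     sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us,
 *         sched_clutch_root_bucket_wcel);
 *     sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us,
 *         sched_clutch_thread_quantum);
 */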
/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c)        ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
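/*
 * For example, SCHED_CLUTCH_DBG_THR_COUNT_PACK(1, 2, 3) packs the three
 * (assumed 16-bit) count values into 0x0000000100020003, i.e. a in bits
 * 32..47, b in bits 16..31 and c in bits 0..15 of a single tracepoint
 * argument.
 */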
#if DEVELOPMENT || DEBUG
/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
        sched_clutch_root_t root_clutch)
{
        pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
        __unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
        uint16_t *thr_count)
{
        if (__improbable(os_inc_overflow(thr_count))) {
                panic("sched_clutch thread count overflowed!");
        }
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
        uint16_t *thr_count)
{
        if (__improbable(os_dec_overflow(thr_count))) {
                panic("sched_clutch thread count underflowed!");
        }
}
/*
 * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to keep the load as a global counter
 * as well.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
        sched_clutch_root_t root_clutch,
        processor_set_t pset)
{
        root_clutch->scr_thr_count = 0;
        root_clutch->scr_priority = NOPRI;
        root_clutch->scr_urgency = 0;
        root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
        root_clutch->scr_cluster_id = pset->pset_cluster_id;
#else /* CONFIG_SCHED_EDGE */
        root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

        /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
        queue_init(&root_clutch->scr_clutch_buckets);

        /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
        priority_queue_init(&root_clutch->scr_foreign_buckets);
        bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
        bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
        priority_queue_init(&root_clutch->scr_bound_root_buckets);

        /* Initialize the bitmap and priority queue of runnable root buckets */
        priority_queue_init(&root_clutch->scr_unbound_root_buckets);
        bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

        /* Initialize all the root buckets */
        for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
                sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
                sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
        }
}
/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round robin'ed at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */
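/*
 * Illustrative use of the ordering options (a sketch, not additional policy):
 * re-enqueueing a clutch bucket whose thread was preempted would use
 *
 *     sched_clutch_bucket_runq_enqueue(runq, clutch_bucket,
 *         SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ);
 *
 * while the default tail insertion corresponds to
 * SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ / SCHED_CLUTCH_BUCKET_OPTIONS_NONE.
 */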
/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
        sched_clutch_bucket_runq_t clutch_buckets_rq)
{
        clutch_buckets_rq->scbrq_highq = NOPRI;
        for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
                clutch_buckets_rq->scbrq_bitmap[i] = 0;
        }
        clutch_buckets_rq->scbrq_count = 0;
        for (int i = 0; i < NRQS; i++) {
                circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
        }
}
/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns whether a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
        sched_clutch_bucket_runq_t clutch_buckets_rq)
{
        return clutch_buckets_rq->scbrq_count == 0;
}
/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
        sched_clutch_bucket_runq_t clutch_buckets_rq)
{
        if (clutch_buckets_rq->scbrq_count > 0) {
                circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
                return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
        } else {
                return NULL;
        }
}
/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
        sched_clutch_bucket_runq_t clutch_buckets_rq,
        sched_clutch_bucket_t clutch_bucket,
        sched_clutch_bucket_options_t options)
{
        circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
        if (circle_queue_empty(queue)) {
                circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
                bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
                if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
                        clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
                }
        } else {
                if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
                        circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
                } else {
                        /*
                         * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
                         * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
                         */
                        circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
                }
        }
        clutch_buckets_rq->scbrq_count++;
}
/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
        sched_clutch_bucket_runq_t clutch_buckets_rq,
        sched_clutch_bucket_t clutch_bucket)
{
        circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
        circle_dequeue(queue, &clutch_bucket->scb_runqlink);
        assert(clutch_buckets_rq->scbrq_count > 0);
        clutch_buckets_rq->scbrq_count--;
        if (circle_queue_empty(queue)) {
                bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
                clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
        }
}
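/*
 * sched_clutch_bucket_runq_rotate()
 *
 * Rotate the clutch bucket at the head of its priority level queue to the
 * tail, so that clutch buckets at the same priority are round robin'ed.
 */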
static void
sched_clutch_bucket_runq_rotate(
        sched_clutch_bucket_runq_t clutch_buckets_rq,
        sched_clutch_bucket_t clutch_bucket)
{
        circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
        assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
        circle_queue_rotate_head_forward(queue);
}
/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
        sched_clutch_root_bucket_t root_bucket,
        sched_bucket_t bucket,
        bool bound_root_bucket)
{
        root_bucket->scrb_bucket = bucket;
        if (bound_root_bucket) {
                /* For bound root buckets, initialize the bound thread runq. */
                root_bucket->scrb_bound = true;
                run_queue_init(&root_bucket->scrb_bound_thread_runq);
        } else {
                /*
                 * The unbounded root buckets contain a runq of runnable clutch buckets
                 * which then hold the runnable threads.
                 */
                root_bucket->scrb_bound = false;
                sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
        }
        priority_queue_entry_init(&root_bucket->scrb_pqlink);
        root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
        root_bucket->scrb_warped_deadline = 0;
        root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
        root_bucket->scrb_starvation_avoidance = false;
        root_bucket->scrb_starvation_ts = 0;
}
/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency
 * which is why they are handled specially.
 *
 * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */
/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_unbound_select_aboveui(
        sched_clutch_root_t root_clutch)
{
        if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
                sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
                sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
                if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
                        /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
                        return true;
                }
                sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
                sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
                if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
                        return true;
                }
        }
        return false;
}
/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_bound_select_aboveui(
        sched_clutch_root_t root_clutch)
{
        sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
        sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
        if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
                return false;
        }
        return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
}
/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts; so it is
 * crucial to keep this O(1). The options parameter determines if
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting next thread for execution on current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
        sched_clutch_root_t root_clutch,
        uint64_t timestamp,
        sched_clutch_highest_root_bucket_type_t type)
{
        sched_clutch_hierarchy_locked_assert(root_clutch);
        int highest_runnable_bucket = -1;
        if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
                highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        } else {
                int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
                int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
                highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
        }

        if (highest_runnable_bucket == -1) {
                /* Nothing is runnable in the hierarchy */
                return NULL;
        }
 594         bool unbound_aboveui 
= sched_clutch_root_unbound_select_aboveui(root_clutch
); 
 595         if (type 
== SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY
) { 
 596                 if (unbound_aboveui
) { 
 597                         return &root_clutch
->scr_unbound_buckets
[TH_BUCKET_FIXPRI
]; 
 599                 /* Fall through to selecting a timeshare root bucket */ 
 601                 bool bound_aboveui 
= sched_clutch_root_bound_select_aboveui(root_clutch
); 
 602                 sched_clutch_root_bucket_t unbound_aboveui_root_bucket 
= &root_clutch
->scr_unbound_buckets
[TH_BUCKET_FIXPRI
]; 
 603                 sched_clutch_root_bucket_t bound_aboveui_root_bucket 
= &root_clutch
->scr_bound_buckets
[TH_BUCKET_FIXPRI
]; 
 605                 if (unbound_aboveui 
&& bound_aboveui
) { 
 607                          * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the 
 608                          * highest runnable priority in both the buckets. 
 610                         int bound_aboveui_pri 
= root_clutch
->scr_bound_buckets
[TH_BUCKET_FIXPRI
].scrb_bound_thread_runq
.highq
; 
 611                         sched_clutch_bucket_t clutch_bucket 
= sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket
); 
 612                         int unbound_aboveui_pri 
= priority_queue_max_sched_pri(&clutch_bucket
->scb_clutchpri_prioq
); 
 613                         return (bound_aboveui_pri 
>= unbound_aboveui_pri
) ? bound_aboveui_root_bucket 
: unbound_aboveui_root_bucket
; 
 615                 if (unbound_aboveui
) { 
 616                         return unbound_aboveui_root_bucket
; 
 619                         return bound_aboveui_root_bucket
; 
 621                 /* Fall through to selecting a timeshare root bucket */ 
 625          * Above UI bucket is not runnable or has a low priority runnable thread; use the 
 626          * earliest deadline model to schedule threads. The idea is that as the timeshare 
 627          * buckets use CPU, they will drop their interactivity score/sched priority and 
 628          * allow the low priority AboveUI buckets to be scheduled. 
        /* Find the earliest deadline bucket */
        sched_clutch_root_bucket_t edf_bucket = NULL;
        sched_clutch_root_bucket_t warp_bucket = NULL;
        int warp_bucket_index = -1;

evaluate_root_buckets:
        if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
                edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
        } else {
                sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
                sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
                if (bound_bucket && unbound_bucket) {
                        /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
                        edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
                } else {
                        edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
                }
        }

        /*
         * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
         * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
         * makes sense between root buckets of the same type, since it is effectively a scheduling advantage over a lower
         * priority root bucket.
         */
        bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
        warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

        if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
                /* No higher buckets have warp left; best choice is the EDF based bucket */
                if (edf_bucket->scrb_starvation_avoidance) {
                        /*
                         * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
                         * starvation avoidance window is still open and return this bucket if it is.
                         *
                         * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
                         * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
                         * avoidance across all CPUs.
                         */
                        uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / root_clutch->scr_pset->online_processor_count;
                        if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
                                return edf_bucket;
                        } else {
                                /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
                                edf_bucket->scrb_starvation_avoidance = false;
                                edf_bucket->scrb_starvation_ts = 0;
                                sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
                        }
                        goto evaluate_root_buckets;
                }

                /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
                if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
                        /* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
                        edf_bucket->scrb_starvation_avoidance = true;
                        edf_bucket->scrb_starvation_ts = timestamp;
                } else {
                        /* EDF bucket is being selected in the natural order; update deadline and reset warp */
                        sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
                        edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
                        edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
                        if (edf_bucket->scrb_bound) {
                                bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
                        } else {
                                bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
                        }
                }
                return edf_bucket;
        }

        /*
         * Looks like there is a root bucket which is higher in the natural priority
         * order than edf_bucket and might have some warp remaining.
         */
        warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
        if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
                /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
                warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
                sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
                return warp_bucket;
        }
        if (warp_bucket->scrb_warped_deadline > timestamp) {
                /* Root bucket already has a warp window open with some warp remaining */
                sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
                return warp_bucket;
        }

        /*
         * For this bucket, the warp window was opened sometime in the past but has now
         * expired. Mark the bucket as not available for warp anymore and re-run the
         * warp bucket selection logic.
         */
        warp_bucket->scrb_warp_remaining = 0;
        if (warp_bucket->scrb_bound) {
                bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
        } else {
                bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
        }
        goto evaluate_root_buckets;
}
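/*
 * Worked example for the starvation avoidance window above (illustrative
 * numbers): with a 10ms thread quantum for the bucket and 4 online CPUs in
 * the cluster, the window is 10ms / 4 = 2.5ms, i.e. roughly one quantum of
 * starvation avoidance spread across all CPUs of the cluster.
 */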
/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
        sched_clutch_root_bucket_t root_bucket,
        uint64_t timestamp)
{
        /* For the fixpri AboveUI bucket, always return it as the earliest deadline */
        if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
                return 0;
        }

        /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
        return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}
/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
        sched_clutch_root_bucket_t root_bucket,
        sched_clutch_root_t root_clutch,
        uint64_t timestamp)
{
        if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
                /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
                return;
        }

        uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
        uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
        if (__improbable(old_deadline > new_deadline)) {
                panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
        }
        if (old_deadline != new_deadline) {
                root_bucket->scrb_pqlink.deadline = new_deadline;
                struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
                priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
        }
}
/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
        sched_clutch_root_bucket_t root_bucket,
        sched_clutch_root_t root_clutch,
        uint64_t timestamp)
{
        /* Mark the root bucket as runnable */
        bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
        bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

        if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
                /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
                return;
        }

        if (root_bucket->scrb_starvation_avoidance == false) {
                /*
                 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
                 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
                 * that and fix it up.
                 */
                root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
        }
        struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
        priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
        if (root_bucket->scrb_warp_remaining) {
                /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
                bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
                bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
        }
}
/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
        sched_clutch_root_bucket_t root_bucket,
        sched_clutch_root_t root_clutch,
        uint64_t timestamp)
{
        bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
        bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

        if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
                /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
                return;
        }

        struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
        priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

        bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
        bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

        if (root_bucket->scrb_warped_deadline > timestamp) {
                /*
                 * For root buckets that were using the warp, check if the warp
                 * deadline is in the future. If yes, remove the wall time the
                 * warp was active and update the warp remaining. This allows
                 * the root bucket to use the remaining warp the next time it
                 * becomes runnable.
                 */
                root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
        } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
                /*
                 * If the root bucket's warped deadline is in the past, it has used up
                 * all the warp it was assigned. Empty out its warp remaining.
                 */
                root_bucket->scrb_warp_remaining = 0;
        }
}
static int
sched_clutch_global_bucket_load_get(
        sched_bucket_t bucket)
{
        return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_root_pri_update()
 *
 * The root level priority is used for thread selection and preemption
 * logic.
 *
 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
 * used for priority calculation, the logic is slightly different from thread
 * selection, because thread selection considers deadlines, warps etc. to
 * decide the most optimal bucket at a given timestamp. Since the priority
 * value is used for preemption decisions only, it needs to be based on the
 * highest runnable thread available in the timeshare domain. This logic can
 * be made more sophisticated if there are cases of unnecessary preemption
 * being seen in workloads.
 */
static void
sched_clutch_root_pri_update(
        sched_clutch_root_t root_clutch)
{
        sched_clutch_hierarchy_locked_assert(root_clutch);
        int16_t root_bound_pri = NOPRI;
        int16_t root_unbound_pri = NOPRI;

        if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
                goto root_pri_update_unbound;
        }
        sched_clutch_root_bucket_t root_bucket_bound = NULL;
        if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
                root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
        } else {
                int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
                assert(root_bucket_index != -1);
                root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
        }
        root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;

root_pri_update_unbound:
        if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
                goto root_pri_update_complete;
        }
        sched_clutch_root_bucket_t root_bucket_unbound = NULL;
        if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
                root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
        } else {
                int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
                assert(root_bucket_index != -1);
                root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
        }
        /* For the selected root bucket, find the highest priority clutch bucket */
        sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
        root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);

root_pri_update_complete:
        root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
}
/*
 * sched_clutch_root_urgency_inc()
 *
 * Routine to increment the urgency at the root level based on the thread
 * priority that is being inserted into the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_inc(
        sched_clutch_root_t root_clutch,
        thread_t thread)
{
        if (SCHED(priority_is_urgent)(thread->sched_pri)) {
                root_clutch->scr_urgency++;
        }
}

/*
 * sched_clutch_root_urgency_dec()
 *
 * Routine to decrement the urgency at the root level based on the thread
 * priority that is being removed from the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_dec(
        sched_clutch_root_t root_clutch,
        thread_t thread)
{
        if (SCHED(priority_is_urgent)(thread->sched_pri)) {
                root_clutch->scr_urgency--;
        }
}
/*
 * Clutch bucket level scheduling
 *
 * The second level of scheduling is the clutch bucket level scheduling
 * which tries to schedule thread groups within root_buckets. Each
 * clutch represents a thread group and a clutch_bucket_group represents
 * threads at a particular sched_bucket within that thread group. The
 * clutch_bucket_group contains a clutch_bucket per cluster on the system
 * where it holds the runnable threads destined for execution on that
 * cluster.
 *
 * The goal of this level of scheduling is to allow interactive thread
 * groups low latency access to the CPU. It also provides slight
 * scheduling preference for App and unrestricted thread groups.
 *
 * The clutch bucket scheduling algorithm measures an interactivity
 * score for all clutch bucket groups. The interactivity score is based
 * on the ratio of the CPU used and the voluntary blocking of threads
 * within the clutch bucket group. The algorithm is very close to the ULE
 * scheduler on FreeBSD in terms of calculations. The interactivity
 * score provides an interactivity boost in the range of
 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
 * thread groups to win over CPU spinners.
 *
 * The interactivity score of the clutch bucket group is combined with the
 * highest base/promoted priority of threads in the clutch bucket to form
 * the overall priority of the clutch bucket.
 */
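/*
 * Illustrative example of the combination described above (a sketch, not the
 * exact arithmetic; see sched_clutch_bucket_pri_calculate()): a clutch bucket
 * whose highest runnable thread has base priority 31 and whose group carries
 * an interactivity score of 8 would be ordered in the root bucket runq around
 * priority 39, ahead of a CPU-bound group at the same base priority whose
 * score has decayed toward 0.
 */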
/* Priority boost range for interactivity */
#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT     (8)
uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;

/* Window to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used + blocked time. */
uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS      (500000)

/* The ratio to scale the cpu/blocked time per window */
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                (10)
/*
 * In order to allow App thread groups some preference over daemon thread
 * groups, the App clutch_buckets get a priority boost. The boost value should
 * be chosen such that badly behaved apps are still penalized over well
 * behaved interactive daemons.
 */
static uint8_t sched_clutch_bucket_group_pri_boost[SCHED_CLUTCH_TG_PRI_MAX] = {
        [SCHED_CLUTCH_TG_PRI_LOW]  = 0,
        [SCHED_CLUTCH_TG_PRI_MED]  = 2,
        [SCHED_CLUTCH_TG_PRI_HIGH] = 4,
};
/* Initial value for voluntary blocking time for the clutch_bucket */
#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID          (uint64_t)(~0)

/* Value indicating the clutch bucket is not pending execution */
#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID             ((uint64_t)(~0))
/*
 * Thread group CPU starvation avoidance
 *
 * In heavily CPU contended scenarios, it is possible that some thread groups
 * which have a low interactivity score do not get CPU time at all. In order to
 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
 * bucket group when it has been pending execution for a certain time as defined
 * by the sched_clutch_bucket_group_pending_delta_us values below.
 *
 * The values chosen here are very close to the WCEL values for each sched bucket.
 * These values are multiplied by the load average of the relevant root bucket to
 * provide an estimate of the actual clutch bucket load.
 */
static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
        SCHED_CLUTCH_INVALID_TIME_32,           /* FIXPRI */
        10000,                                  /* FG */
        37500,                                  /* IN */
        75000,                                  /* DF */
        150000,                                 /* UT */
        250000,                                 /* BG */
};
static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
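/*
 * Illustrative example of the estimate described above: a pending delta of
 * 75ms multiplied by a root bucket load average of 4 means the clutch bucket
 * group's CPU usage starts getting aged out after it has been pending for
 * roughly 300ms.
 */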
/*
 * sched_clutch_bucket_init()
 *
 * Initializer for clutch buckets.
 */
static void
sched_clutch_bucket_init(
        sched_clutch_bucket_t clutch_bucket,
        sched_clutch_bucket_group_t clutch_bucket_group,
        sched_bucket_t bucket)
{
        bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));

        clutch_bucket->scb_bucket = bucket;
        /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
        clutch_bucket->scb_priority = 0;
#if CONFIG_SCHED_EDGE
        clutch_bucket->scb_foreign = false;
        priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
#endif /* CONFIG_SCHED_EDGE */
        clutch_bucket->scb_group = clutch_bucket_group;
        clutch_bucket->scb_root = NULL;
        priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
        priority_queue_init(&clutch_bucket->scb_thread_runq);
        queue_init(&clutch_bucket->scb_thread_timeshare_queue);
}
/*
 * sched_clutch_bucket_group_init()
 *
 * Initializer for clutch bucket groups.
 */
static void
sched_clutch_bucket_group_init(
        sched_clutch_bucket_group_t clutch_bucket_group,
        sched_clutch_t clutch,
        sched_bucket_t bucket)
{
        bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
        clutch_bucket_group->scbg_bucket = bucket;
        clutch_bucket_group->scbg_clutch = clutch;
        for (int i = 0; i < MAX_PSETS; i++) {
                sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
        }
        os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
        os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
        os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
        /*
         * All thread groups should be initialized to be interactive; this allows the newly launched
         * thread groups to fairly compete with already running thread groups.
         */
        clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
        clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
        os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
#if !__LP64__
        lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
#endif /* !__LP64__ */
        clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
        clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
}
/*
 * sched_clutch_init_with_thread_group()
 *
 * Initialize the sched_clutch when the thread group is being created
 */
void
sched_clutch_init_with_thread_group(
        sched_clutch_t clutch,
        struct thread_group *tg)
{
        os_atomic_store(&clutch->sc_thr_count, 0, relaxed);

        /* Initialize all the clutch buckets */
        for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
                sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
        }

        /* Grouping specific fields */
        clutch->sc_tg = tg;
        os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
}

/*
 * sched_clutch_destroy()
 *
 * Destructor for clutch; called from thread group release code.
 */
void
sched_clutch_destroy(
        __unused sched_clutch_t clutch)
{
        assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
}
#if CONFIG_SCHED_EDGE

/*
 * The current edge scheduler still relies on globals for E & P clusters. It uses these
 * globals for the following operations:
 * - Sysctl support for configuring edges
 * - Edge scheduler initialization
 *
 * These should be removed for multi-cluster platforms once a clear policy for the above
 * operations is defined.
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t ecore_cluster_id = 0;
static uint32_t pcore_cluster_id = 1;
/*
 * Edge Scheduler Preferred Cluster Mechanism
 *
 * In order to have better control over various QoS buckets within a thread group, the Edge
 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
 * preferences are stored at the sched_clutch_bucket_group level since that represents all
 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
 */
static uint32_t
sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
{
        return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
}

static uint32_t
sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
{
        return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
}
uint32_t
sched_edge_thread_preferred_cluster(thread_t thread)
{
        if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
                /* For threads bound to a specific cluster, return the bound cluster id */
                return sched_edge_thread_bound_cluster_id(thread);
        }

        sched_clutch_t clutch = sched_clutch_for_thread(thread);
        sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
        return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
}
/*
 * Edge Scheduler Foreign Bucket Support
 *
 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
 * threads that are not native to the cluster. A clutch bucket is considered native if its
 * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
 * bucket priority queue is used for rebalance operations to get threads back to their native
 * cluster.
 *
 * It is possible to make this policy even more aggressive by considering all clusters that
 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
 * migrations which might have performance implications.
 */
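/*
 * For example (illustrative): a clutch bucket whose preferred cluster is a
 * P-cluster but which is currently enqueued in an E-cluster hierarchy has a
 * mismatched cluster type, so it is marked foreign and inserted into that
 * root's scr_foreign_buckets queue for later rebalancing.
 */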
static void
sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
        if (clutch_bucket->scb_foreign) {
                clutch_bucket->scb_foreign = false;
                priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
        }
}

static void
sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
        if (!clutch_bucket->scb_foreign) {
                clutch_bucket->scb_foreign = true;
                priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
                priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
        }
}
/*
 * Edge Scheduler Cumulative Load Average
 *
 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
 * making thread migration decisions. The per-bucket load is maintained as a
 * cumulative count, since higher scheduling buckets also impact the load seen
 * by lower buckets.
 */

static void
sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
        switch (bucket) {
        case TH_BUCKET_FIXPRI:    os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_FG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_IN:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_DF:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_UT:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_BG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
        default:
                panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
        }
}

static void
sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
        switch (bucket) {
        case TH_BUCKET_FIXPRI:    os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_FG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_IN:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_DF:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_UT:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
        case TH_BUCKET_SHARE_BG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
        default:
                panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
        }
}

uint16_t
sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
        return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
}
1270 #endif /* CONFIG_SCHED_EDGE */ 
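/*
 * Illustrative sketch (not part of XNU): the fallthrough pattern in the
 * increment/decrement routines above means that a runnable thread at bucket B
 * bumps the counter for B and for every lower-importance bucket, so
 * scr_cumulative_run_count[X] always holds the number of runnable threads at
 * level X or any higher-importance level. A plain-C equivalent using a loop
 * instead of the switch fallthrough (bucket indices and array size are
 * assumptions for illustration):
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

#define EXAMPLE_BUCKET_COUNT 6 /* FIXPRI, SHARE_FG, SHARE_IN, SHARE_DF, SHARE_UT, SHARE_BG */

static void
example_cumulative_count_incr(uint32_t counts[EXAMPLE_BUCKET_COUNT], uint32_t bucket)
{
        /* Load at `bucket` is visible at its own level and at all less-important levels */
        for (uint32_t b = bucket; b < EXAMPLE_BUCKET_COUNT; b++) {
                counts[b]++;
        }
}
#endif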
1273  * sched_clutch_bucket_hierarchy_insert() 
1275  * Routine to insert a newly runnable clutch_bucket into the root hierarchy. 
1278 sched_clutch_bucket_hierarchy_insert( 
1279         sched_clutch_root_t root_clutch
, 
1280         sched_clutch_bucket_t clutch_bucket
, 
1281         sched_bucket_t bucket
, 
1283         sched_clutch_bucket_options_t options
) 
1285         sched_clutch_hierarchy_locked_assert(root_clutch
); 
1286         if (bucket 
> TH_BUCKET_FIXPRI
) { 
1287                 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */ 
1288                 enqueue_tail(&root_clutch
->scr_clutch_buckets
, &clutch_bucket
->scb_listlink
); 
1290 #if CONFIG_SCHED_EDGE 
1291         /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */ 
1292         uint32_t preferred_cluster 
= sched_clutch_bucket_preferred_cluster(clutch_bucket
); 
1293         if (pset_type_for_id(preferred_cluster
) != pset_type_for_id(root_clutch
->scr_cluster_id
)) { 
1294                 sched_clutch_bucket_mark_foreign(clutch_bucket
, root_clutch
); 
1296 #endif /* CONFIG_SCHED_EDGE */ 
1297         sched_clutch_root_bucket_t root_bucket 
= &root_clutch
->scr_unbound_buckets
[bucket
]; 
1299         /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */ 
1300         if (sched_clutch_bucket_runq_empty(&root_bucket
->scrb_clutch_buckets
)) { 
1301                 sched_clutch_root_bucket_runnable(root_bucket
, root_clutch
, timestamp
); 
1304         /* Insert the clutch bucket into the root bucket run queue with order based on options */ 
1305         sched_clutch_bucket_runq_enqueue(&root_bucket
->scrb_clutch_buckets
, clutch_bucket
, options
); 
1306         os_atomic_store(&clutch_bucket
->scb_root
, root_clutch
, relaxed
); 
1307         os_atomic_inc(&sched_clutch_global_bucket_load
[bucket
], relaxed
); 
1311  * sched_clutch_bucket_hierarchy_remove() 
1313  * Routine to remove an empty clutch bucket from the root hierarchy. 
1316 sched_clutch_bucket_hierarchy_remove( 
1317         sched_clutch_root_t root_clutch
, 
1318         sched_clutch_bucket_t clutch_bucket
, 
1319         sched_bucket_t bucket
, 
1321         __unused sched_clutch_bucket_options_t options
) 
1323         sched_clutch_hierarchy_locked_assert(root_clutch
); 
1324         if (bucket 
> TH_BUCKET_FIXPRI
) { 
1325                 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */ 
1326                 remqueue(&clutch_bucket
->scb_listlink
); 
1328 #if CONFIG_SCHED_EDGE 
1329         sched_clutch_bucket_mark_native(clutch_bucket
, root_clutch
); 
1330 #endif /* CONFIG_SCHED_EDGE */ 
1332         sched_clutch_root_bucket_t root_bucket 
= &root_clutch
->scr_unbound_buckets
[bucket
]; 
1334         /* Remove the clutch bucket from the root bucket priority queue */ 
1335         sched_clutch_bucket_runq_remove(&root_bucket
->scrb_clutch_buckets
, clutch_bucket
); 
1336         os_atomic_store(&clutch_bucket
->scb_root
, NULL
, relaxed
); 
1338         /* If the root bucket priority queue is now empty, remove it from the root priority queue */ 
1339         if (sched_clutch_bucket_runq_empty(&root_bucket
->scrb_clutch_buckets
)) { 
1340                 sched_clutch_root_bucket_empty(root_bucket
, root_clutch
, timestamp
); 
1342         os_atomic_dec(&sched_clutch_global_bucket_load
[bucket
], relaxed
); 
1346  * sched_clutch_bucket_base_pri() 
1348  * Calculates the "base" priority of the clutch bucket. The base 
1349  * priority of the clutch bucket is the maximum of the base_pri / 
1350  * promoted sched_pri values of its member threads, plus any 
1351  * grouping specific (App/Daemon...) boost applicable to the 
1352  * clutch bucket. 
1355 sched_clutch_bucket_base_pri( 
1356         sched_clutch_bucket_t clutch_bucket
) 
1358         uint8_t clutch_boost 
= 0; 
1359         assert(priority_queue_empty(&clutch_bucket
->scb_thread_runq
) == false); 
1361         sched_clutch_t clutch 
= clutch_bucket
->scb_group
->scbg_clutch
; 
1364          * Since the clutch bucket can contain threads that are members of the group due 
1365          * to the sched_pri being promoted or due to their base pri, the base priority of 
1366          * the entire clutch bucket should be based on the highest thread (promoted or base) 
1367          * in the clutch bucket. 
1369         uint8_t max_pri 
= 0; 
1370         if (!priority_queue_empty(&clutch_bucket
->scb_clutchpri_prioq
)) { 
1371                 max_pri 
= priority_queue_max_sched_pri(&clutch_bucket
->scb_clutchpri_prioq
); 
1374         sched_clutch_tg_priority_t tg_pri 
= os_atomic_load(&clutch
->sc_tg_priority
, relaxed
); 
1375         clutch_boost 
= sched_clutch_bucket_group_pri_boost
[tg_pri
]; 
1376         return max_pri 
+ clutch_boost
; 
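/*
 * Illustrative sketch (not part of XNU): the clutch bucket competes in its
 * root bucket at "best member thread priority + thread group boost". With a
 * hypothetical highest member priority of 31 and a hypothetical boost of 8,
 * the clutch bucket's base priority would be 39 before the interactivity
 * score is added on top (see sched_clutch_bucket_pri_calculate() below).
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

static uint8_t
example_clutch_bucket_base_pri(uint8_t highest_member_pri, uint8_t thread_group_boost)
{
        return (uint8_t)(highest_member_pri + thread_group_boost);
}
#endif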
1380  * sched_clutch_interactivity_from_cpu_data() 
1382  * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage 
1385 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group
) 
1387         sched_clutch_bucket_cpu_data_t scb_cpu_data
; 
1388         scb_cpu_data
.scbcd_cpu_data_packed 
= os_atomic_load_wide(&clutch_bucket_group
->scbg_cpu_data
.scbcd_cpu_data_packed
, relaxed
); 
1389         clutch_cpu_data_t cpu_used 
= scb_cpu_data
.cpu_data
.scbcd_cpu_used
; 
1390         clutch_cpu_data_t cpu_blocked 
= scb_cpu_data
.cpu_data
.scbcd_cpu_blocked
; 
1391         uint8_t interactive_score 
= 0; 
1393         if ((cpu_blocked 
== 0) && (cpu_used 
== 0)) { 
1394                 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
; 
1397          * For all timeshare buckets, calculate the interactivity score of the bucket 
1398          * and add it to the base priority 
1400         if (cpu_blocked 
> cpu_used
) { 
1401                 /* Interactive clutch_bucket case */ 
1402                 interactive_score 
= sched_clutch_bucket_group_interactive_pri 
+ 
1403                     ((sched_clutch_bucket_group_interactive_pri 
* (cpu_blocked 
- cpu_used
)) / cpu_blocked
); 
1405                 /* Non-interactive clutch_bucket case */ 
1406                 interactive_score 
= ((sched_clutch_bucket_group_interactive_pri 
* cpu_blocked
) / cpu_used
); 
1408         return interactive_score
; 
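/*
 * Illustrative sketch (not part of XNU): the score computed above maps the
 * blocked:used CPU ratio into [0, 2 * interactive_pri]. Groups that block
 * more than they run land in the upper half; CPU-bound groups land in the
 * lower half. A standalone rendering of the same arithmetic (the zero/zero
 * fallback to the previously stored score is replaced with a plain guard):
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

static uint8_t
example_interactivity_score(uint64_t cpu_used, uint64_t cpu_blocked, uint8_t interactive_pri)
{
        if (cpu_used == 0 && cpu_blocked == 0) {
                return 0; /* the real routine reuses the previously computed score here */
        }
        if (cpu_blocked > cpu_used) {
                /* Mostly-blocked (interactive) group: score lands above interactive_pri */
                return (uint8_t)(interactive_pri +
                           ((interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked));
        }
        /* Mostly-running group: score lands in [0, interactive_pri] */
        return (uint8_t)((interactive_pri * cpu_blocked) / cpu_used);
}
#endif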
1412  * sched_clutch_bucket_pri_calculate() 
1414  * The priority calculation algorithm for the clutch_bucket is a slight 
1415  * modification on the ULE interactivity score. It uses the base priority 
1416  * of the clutch bucket and applies an interactivity score boost to the 
1417  * highly responsive clutch buckets. 
1420 sched_clutch_bucket_pri_calculate( 
1421         sched_clutch_bucket_t clutch_bucket
, 
1424         /* For empty clutch buckets, return priority 0 */ 
1425         if (clutch_bucket
->scb_thr_count 
== 0) { 
1429         uint8_t base_pri 
= sched_clutch_bucket_base_pri(clutch_bucket
); 
1430         uint8_t interactive_score 
= sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket
->scb_group
, timestamp
); 
1432         assert(((uint64_t)base_pri 
+ interactive_score
) <= UINT8_MAX
); 
1433         uint8_t pri 
= base_pri 
+ interactive_score
; 
1434         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH
, MACH_SCHED_CLUTCH_TG_BUCKET_PRI
) | DBG_FUNC_NONE
, 
1435             thread_group_get_id(clutch_bucket
->scb_group
->scbg_clutch
->sc_tg
), clutch_bucket
->scb_bucket
, pri
, interactive_score
, 0); 
1440  * sched_clutch_root_bucket_highest_clutch_bucket() 
1442  * Routine to find the highest priority clutch bucket 
1443  * within the root bucket. 
1445 static sched_clutch_bucket_t
 
1446 sched_clutch_root_bucket_highest_clutch_bucket( 
1447         sched_clutch_root_bucket_t root_bucket
) 
1449         if (sched_clutch_bucket_runq_empty(&root_bucket
->scrb_clutch_buckets
)) { 
1452         return sched_clutch_bucket_runq_peek(&root_bucket
->scrb_clutch_buckets
); 
1456  * sched_clutch_bucket_runnable() 
1458  * Perform all operations needed when a new clutch bucket becomes runnable. 
1459  * It involves inserting the clutch_bucket into the hierarchy and updating the 
1460  * root priority appropriately. 
1463 sched_clutch_bucket_runnable( 
1464         sched_clutch_bucket_t clutch_bucket
, 
1465         sched_clutch_root_t root_clutch
, 
1467         sched_clutch_bucket_options_t options
) 
1469         sched_clutch_hierarchy_locked_assert(root_clutch
); 
1470         /* Since the clutch bucket became newly runnable, update its pending timestamp */ 
1471         clutch_bucket
->scb_priority 
= sched_clutch_bucket_pri_calculate(clutch_bucket
, timestamp
); 
1472         sched_clutch_bucket_hierarchy_insert(root_clutch
, clutch_bucket
, clutch_bucket
->scb_bucket
, timestamp
, options
); 
1474         /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */ 
1475         sched_clutch_bucket_group_timeshare_update(clutch_bucket
->scb_group
, clutch_bucket
, timestamp
); 
1476         int16_t root_old_pri 
= root_clutch
->scr_priority
; 
1477         sched_clutch_root_pri_update(root_clutch
); 
1478         return root_clutch
->scr_priority 
> root_old_pri
; 
1482  * sched_clutch_bucket_update() 
1484  * Update the clutch_bucket's position in the hierarchy. This routine is 
1485  * called when a thread is inserted into or removed from a runnable clutch 
1486  * bucket. The options specify some properties about the clutch bucket 
1487  * insertion order into the clutch bucket runq. 
1490 sched_clutch_bucket_update( 
1491         sched_clutch_bucket_t clutch_bucket
, 
1492         sched_clutch_root_t root_clutch
, 
1494         sched_clutch_bucket_options_t options
) 
1496         sched_clutch_hierarchy_locked_assert(root_clutch
); 
1497         uint64_t new_pri 
= sched_clutch_bucket_pri_calculate(clutch_bucket
, timestamp
); 
1498         sched_clutch_bucket_runq_t bucket_runq 
= &root_clutch
->scr_unbound_buckets
[clutch_bucket
->scb_bucket
].scrb_clutch_buckets
; 
1499         if (new_pri 
== clutch_bucket
->scb_priority
) { 
1501                  * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket 
1502                  * to the end of the runq. Typically used when a thread is selected for execution 
1503                  * from a clutch bucket. 
1505                 if (options 
& SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR
) { 
1506                         sched_clutch_bucket_runq_rotate(bucket_runq
, clutch_bucket
); 
1510         sched_clutch_bucket_runq_remove(bucket_runq
, clutch_bucket
); 
1511 #if CONFIG_SCHED_EDGE 
1512         if (clutch_bucket
->scb_foreign
) { 
1513                 priority_queue_remove(&root_clutch
->scr_foreign_buckets
, &clutch_bucket
->scb_foreignlink
); 
1515 #endif /* CONFIG_SCHED_EDGE */ 
1516         clutch_bucket
->scb_priority 
= new_pri
; 
1517 #if CONFIG_SCHED_EDGE 
1518         if (clutch_bucket
->scb_foreign
) { 
1519                 priority_queue_entry_set_sched_pri(&root_clutch
->scr_foreign_buckets
, &clutch_bucket
->scb_foreignlink
, clutch_bucket
->scb_priority
, 0); 
1520                 priority_queue_insert(&root_clutch
->scr_foreign_buckets
, &clutch_bucket
->scb_foreignlink
); 
1522 #endif /* CONFIG_SCHED_EDGE */ 
1523         sched_clutch_bucket_runq_enqueue(bucket_runq
, clutch_bucket
, options
); 
1525         int16_t root_old_pri 
= root_clutch
->scr_priority
; 
1526         sched_clutch_root_pri_update(root_clutch
); 
1527         return root_clutch
->scr_priority 
> root_old_pri
; 
1531  * sched_clutch_bucket_empty() 
1533  * Perform all the operations needed when a clutch_bucket is no longer runnable. 
1534  * It involves removing the clutch bucket from the hierarchy and updating the root 
1535  * priority appropriately. 
1538 sched_clutch_bucket_empty( 
1539         sched_clutch_bucket_t clutch_bucket
, 
1540         sched_clutch_root_t root_clutch
, 
1542         sched_clutch_bucket_options_t options
) 
1544         sched_clutch_hierarchy_locked_assert(root_clutch
); 
1545         sched_clutch_bucket_hierarchy_remove(root_clutch
, clutch_bucket
, clutch_bucket
->scb_bucket
, timestamp
, options
); 
1546         clutch_bucket
->scb_priority 
= sched_clutch_bucket_pri_calculate(clutch_bucket
, timestamp
); 
1547         sched_clutch_root_pri_update(root_clutch
); 
1551  * sched_clutch_cpu_usage_update() 
1553  * Routine to update CPU usage of the thread in the hierarchy. 
1556 sched_clutch_cpu_usage_update( 
1560         if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread
) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread
)) { 
1564         sched_clutch_t clutch 
= sched_clutch_for_thread(thread
); 
1565         sched_clutch_bucket_group_t clutch_bucket_group 
= &(clutch
->sc_clutch_groups
[thread
->th_sched_bucket
]); 
1566         sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group
, delta
); 
1570  * sched_clutch_bucket_group_cpu_usage_update() 
1572  * Routine to update the CPU usage of the clutch bucket group. 
1575 sched_clutch_bucket_group_cpu_usage_update( 
1576         sched_clutch_bucket_group_t clutch_bucket_group
, 
1579         if (clutch_bucket_group
->scbg_bucket 
== TH_BUCKET_FIXPRI
) { 
1580                 /* Since Above UI bucket has maximum interactivity score always, nothing to do here */ 
1583         delta 
= MIN(delta
, sched_clutch_bucket_group_adjust_threshold
); 
1584         os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_used
), (clutch_cpu_data_t
)delta
, relaxed
); 
1588  * sched_clutch_bucket_group_cpu_pending_adjust() 
1590  * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done 
1591  * such that one "pending interval" provides one point improvement in interactivity score. 
1593 static inline uint64_t 
1594 sched_clutch_bucket_group_cpu_pending_adjust( 
1596         uint64_t cpu_blocked
, 
1597         uint8_t pending_intervals
) 
1599         uint64_t cpu_used_adjusted 
= 0; 
1600         if (cpu_blocked 
< cpu_used
) { 
1601                 cpu_used_adjusted 
= (sched_clutch_bucket_group_interactive_pri 
* cpu_blocked 
* cpu_used
); 
1602                 cpu_used_adjusted 
= cpu_used_adjusted 
/ ((sched_clutch_bucket_group_interactive_pri 
* cpu_blocked
) + (cpu_used 
* pending_intervals
)); 
1604                 uint64_t adjust_factor 
= (cpu_blocked 
* pending_intervals
) / sched_clutch_bucket_group_interactive_pri
; 
1605                 cpu_used_adjusted 
= (adjust_factor 
> cpu_used
) ? 0 : (cpu_used 
- adjust_factor
); 
1607         return cpu_used_adjusted
; 
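/*
 * Illustrative sketch (not part of XNU): with hypothetical values
 * interactive_pri = 8, cpu_used = 100, cpu_blocked = 200 and a single pending
 * interval, the ageout removes (200 * 1) / 8 = 25 units of usage, leaving
 * cpu_used = 75. Feeding the before/after values into the interactivity score
 * calculation above yields 12 and 13 respectively, i.e. one pending interval
 * buys roughly one point of interactivity, matching the comment on this
 * routine. A standalone rendering of the same arithmetic (assuming a non-zero
 * interactive_pri):
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

static uint64_t
example_cpu_pending_adjust(uint64_t cpu_used, uint64_t cpu_blocked,
    uint8_t pending_intervals, uint64_t interactive_pri)
{
        if (cpu_blocked < cpu_used) {
                /* CPU-heavy group: shrink usage multiplicatively */
                uint64_t denom = (interactive_pri * cpu_blocked) + (cpu_used * pending_intervals);
                return denom ? (interactive_pri * cpu_blocked * cpu_used) / denom : cpu_used;
        }
        /* Mostly-blocked group: subtract a linear ageout term, clamped at zero */
        uint64_t adjust = (cpu_blocked * pending_intervals) / interactive_pri;
        return (adjust > cpu_used) ? 0 : (cpu_used - adjust);
}
#endif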
1611  * sched_clutch_bucket_group_cpu_adjust() 
1613  * Routine to scale the cpu usage and blocked time once the sum gets bigger 
1614  * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain 
1615  * manageable and maintain the same ratio while allowing changes in clutch bucket 
1616  * behavior to be reflected in the interactivity score within a reasonable 
1617  * amount of time. It also adjusts the CPU usage based on pending_intervals, 
1618  * which ages out CPU usage to avoid starvation in highly contended scenarios. 
1621 sched_clutch_bucket_group_cpu_adjust( 
1622         sched_clutch_bucket_group_t clutch_bucket_group
, 
1623         uint8_t pending_intervals
) 
1625         sched_clutch_bucket_cpu_data_t old_cpu_data 
= {}; 
1626         sched_clutch_bucket_cpu_data_t new_cpu_data 
= {}; 
1627         os_atomic_rmw_loop(&clutch_bucket_group
->scbg_cpu_data
.scbcd_cpu_data_packed
, old_cpu_data
.scbcd_cpu_data_packed
, new_cpu_data
.scbcd_cpu_data_packed
, relaxed
, { 
1628                 clutch_cpu_data_t cpu_used 
= old_cpu_data
.cpu_data
.scbcd_cpu_used
; 
1629                 clutch_cpu_data_t cpu_blocked 
= old_cpu_data
.cpu_data
.scbcd_cpu_blocked
; 
1631                 if ((pending_intervals 
== 0) && (cpu_used 
+ cpu_blocked
) < sched_clutch_bucket_group_adjust_threshold
) { 
1632                         /* No changes to the CPU used and blocked values */ 
1633                         os_atomic_rmw_loop_give_up(); 
1635                 if ((cpu_used 
+ cpu_blocked
) >= sched_clutch_bucket_group_adjust_threshold
) { 
1636                         /* Only keep the recent CPU history to better indicate how this TG has been behaving */ 
1637                         cpu_used 
= cpu_used 
/ SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO
; 
1638                         cpu_blocked 
= cpu_blocked 
/ SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO
; 
1640                 /* Use the shift passed in to ageout the CPU usage */ 
1641                 cpu_used 
= (clutch_cpu_data_t
)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used
, cpu_blocked
, pending_intervals
); 
1642                 new_cpu_data
.cpu_data
.scbcd_cpu_used 
= cpu_used
; 
1643                 new_cpu_data
.cpu_data
.scbcd_cpu_blocked 
= cpu_blocked
; 
1648  * Thread level scheduling algorithm 
1650  * The thread level scheduling algorithm uses the mach timeshare 
1651  * decay based algorithm to achieve sharing between threads within the 
1652  * same clutch bucket. The load/priority shifts etc. are all maintained 
1653  * at the clutch bucket level and used for decay calculation of the 
1654  * threads. The load sampling is still driven off the scheduler tick 
1655  * for runnable clutch buckets (it does not use the new higher frequency 
1656  * EWMA based load calculation). The idea is that the contention and load 
1657  * within clutch_buckets should be limited enough to not see heavy decay 
1658  * and timeshare effectively. 
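/*
 * Illustrative sketch (not part of XNU): the per-thread decay referred to
 * above follows the classic Mach timeshare shape, where accumulated CPU usage
 * depresses a thread's priority and the load-derived pri_shift (see
 * sched_clutch_bucket_group_timeshare_update() below) controls how quickly
 * that happens. Roughly, and ignoring the clamping/aging details of the real
 * implementation:
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

static int
example_timeshare_priority(int base_pri, uint64_t sched_usage, uint32_t pri_shift, int min_pri)
{
        /* Heavier load => smaller pri_shift => usage depresses priority more aggressively */
        int pri = base_pri - (int)(sched_usage >> pri_shift);
        return (pri < min_pri) ? min_pri : pri;
}
#endif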
1662  * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr() 
1664  * Increment the run count for the clutch bucket associated with the 
1668 sched_clutch_thread_run_bucket_incr( 
1670         sched_bucket_t bucket
) 
1672         if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) { 
1675         sched_clutch_t clutch 
= sched_clutch_for_thread(thread
); 
1676         return sched_clutch_run_bucket_incr(clutch
, bucket
); 
1680 sched_clutch_run_bucket_incr( 
1681         sched_clutch_t clutch
, 
1682         sched_bucket_t bucket
) 
1684         assert(bucket 
!= TH_BUCKET_RUN
); 
1685         sched_clutch_bucket_group_t clutch_bucket_group 
= &(clutch
->sc_clutch_groups
[bucket
]); 
1686         return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group
); 
1690  * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr() 
1692  * Decrement the run count for the clutch bucket associated with the 
1696 sched_clutch_thread_run_bucket_decr( 
1698         sched_bucket_t bucket
) 
1700         if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) { 
1703         sched_clutch_t clutch 
= sched_clutch_for_thread(thread
); 
1704         return sched_clutch_run_bucket_decr(clutch
, bucket
); 
1708 sched_clutch_run_bucket_decr( 
1709         sched_clutch_t clutch
, 
1710         sched_bucket_t bucket
) 
1712         assert(bucket 
!= TH_BUCKET_RUN
); 
1713         sched_clutch_bucket_group_t clutch_bucket_group 
= &(clutch
->sc_clutch_groups
[bucket
]); 
1714         return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group
); 
1718  * sched_clutch_bucket_group_timeshare_update() 
1720  * Routine to update the load and priority shift for the clutch_bucket_group 
1721  * every sched_tick. For multi-cluster platforms, each QoS level will have multiple 
1722  * clutch buckets with runnable threads in them. So it is important to maintain 
1723  * the timesharing information at the clutch_bucket_group level instead of 
1724  * individual clutch buckets (because the algorithm is trying to timeshare all 
1725  * threads at the same QoS irrespective of which hierarchy they are enqueued in). 
1727  * The routine is called from the sched tick handling code to make sure this value 
1728  * is updated at least once every sched tick. For clutch bucket groups which have 
1729  * not been runnable for very long, the clutch_bucket_group maintains a "last 
1730  * updated schedtick" parameter. As threads become runnable in the clutch bucket group, 
1731  * if this value is outdated, the load and shifts are updated. 
1733  * Possible optimization: 
1734  * - The current algorithm samples the load every sched tick (125ms). 
1735  *   This is prone to spikes in runnable counts; if that turns out to be 
1736  *   a problem, a simple solution would be to do the EWMA trick to sample 
1737  *   load at every load_tick (30ms) and use the averaged value for the pri 
1738  *   shift calculation. 
1741 sched_clutch_bucket_group_timeshare_update( 
1742         sched_clutch_bucket_group_t clutch_bucket_group
, 
1743         sched_clutch_bucket_t clutch_bucket
, 
1746         if (clutch_bucket_group
->scbg_bucket 
< TH_BUCKET_SHARE_FG
) { 
1747                 /* No timesharing needed for fixed priority Above UI threads */ 
1752          * Update the timeshare parameters for the clutch bucket group 
1753  * if they haven't been updated in this tick. 
1755         uint32_t sched_ts 
= os_atomic_load(&clutch_bucket_group
->scbg_timeshare_tick
, relaxed
); 
1756         uint32_t current_sched_ts 
= sched_tick
; 
1757         if (sched_ts 
< current_sched_ts
) { 
1758                 os_atomic_store(&clutch_bucket_group
->scbg_timeshare_tick
, current_sched_ts
, relaxed
); 
1759                 /* NCPU wide workloads should not experience decay */ 
1760                 uint64_t bucket_group_run_count 
= os_atomic_load_wide(&clutch_bucket_group
->scbg_blocked_data
.scct_count
, relaxed
) - 1; 
1761                 uint32_t bucket_group_load 
= (uint32_t)(bucket_group_run_count 
/ processor_avail_count
); 
1762                 bucket_group_load 
= MIN(bucket_group_load
, NRQS 
- 1); 
1763                 uint32_t pri_shift 
= sched_fixed_shift 
- sched_load_shifts
[bucket_group_load
]; 
1764                 /* Ensure that the pri_shift value is reasonable */ 
1765                 pri_shift 
= (pri_shift 
> SCHED_PRI_SHIFT_MAX
) ? INT8_MAX 
: pri_shift
; 
1766                 os_atomic_store(&clutch_bucket_group
->scbg_pri_shift
, pri_shift
, relaxed
); 
1770          * Update the clutch bucket priority; this allows clutch buckets that have been pending 
1771          * for a long time to get an updated interactivity score. 
1773         sched_clutch_bucket_update(clutch_bucket
, clutch_bucket
->scb_root
, ctime
, SCHED_CLUTCH_BUCKET_OPTIONS_NONE
); 
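/*
 * Illustrative sketch (not part of XNU): the "possible optimization" noted in
 * the block comment above -- sampling load more frequently and smoothing it
 * before deriving pri_shift -- could be a simple exponentially weighted
 * moving average. The 3/4 weighting below is an arbitrary assumption for
 * illustration, not a value the clutch scheduler defines.
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

static uint32_t
example_ewma_load_update(uint32_t averaged_load, uint32_t sampled_load)
{
        /* new = 3/4 * old + 1/4 * sample */
        return ((averaged_load * 3) + sampled_load) / 4;
}
#endif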
1777  * sched_clutch_thread_clutch_update() 
1779  * Routine called when the thread changes its thread group. The current 
1780  * implementation relies on the fact that the thread group is changed only 
1781  * from the context of the thread itself. Due to this fact, the thread 
1782  * group change causes only counter updates in the old & new clutch 
1783  * buckets and no hierarchy changes. The routine also attributes the CPU 
1784  * used so far to the old clutch. 
1787 sched_clutch_thread_clutch_update( 
1789         sched_clutch_t old_clutch
, 
1790         sched_clutch_t new_clutch
) 
1793         assert(current_thread() == thread
); 
1796                 sched_clutch_run_bucket_decr(old_clutch
, thread
->th_sched_bucket
); 
1798                  * Calculate the CPU used by this thread in the old bucket and 
1799                  * add it to the old clutch bucket. This uses the same CPU usage 
1800                  * logic as update_priority etc. 
1802                 thread_timer_delta(thread
, cpu_delta
); 
1803                 if (thread
->pri_shift 
< INT8_MAX
) { 
1804                         thread
->sched_usage 
+= cpu_delta
; 
1806                 thread
->cpu_delta 
+= cpu_delta
; 
1807                 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread
)) { 
1808                         sched_clutch_bucket_group_t clutch_bucket_group 
= &(old_clutch
->sc_clutch_groups
[thread
->th_sched_bucket
]); 
1809                         sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group
, cpu_delta
); 
1814                 sched_clutch_run_bucket_incr(new_clutch
, thread
->th_sched_bucket
); 
1818 /* Thread Insertion/Removal/Selection routines */ 
1820 #if CONFIG_SCHED_EDGE 
1823  * Edge Scheduler Bound Thread Support 
1825  * The edge scheduler allows threads to be bound to specific clusters. The scheduler 
1826  * maintains a separate runq on the clutch root to hold these bound threads. These 
1827  * bound threads count towards the root priority and thread count, but are ignored 
1828  * for thread migration/steal decisions. Bound threads that are enqueued in the 
1829  * separate runq have the th_bound_cluster_enqueued flag set to allow easy 
1830  * identification and removal from the bound runq. 
1832  * Bound Threads Timesharing 
1833  * The bound threads share the timesharing properties of the clutch bucket group they are 
1834  * part of. They contribute to the load and use priority shifts/decay values from the 
1835  * clutch bucket group. 
1839 sched_edge_bound_thread_insert( 
1840         sched_clutch_root_t root_clutch
, 
1844         /* Update the clutch runnable count and priority */ 
1845         sched_clutch_thr_count_inc(&root_clutch
->scr_thr_count
); 
1846         sched_clutch_root_bucket_t root_bucket 
= &root_clutch
->scr_bound_buckets
[thread
->th_sched_bucket
]; 
1847         if (root_bucket
->scrb_bound_thread_runq
.count 
== 0) { 
1848                 sched_clutch_root_bucket_runnable(root_bucket
, root_clutch
, mach_absolute_time()); 
1851         assert((thread
->th_bound_cluster_enqueued
) == false); 
1852         run_queue_enqueue(&root_bucket
->scrb_bound_thread_runq
, thread
, options
); 
1853         thread
->th_bound_cluster_enqueued 
= true; 
1855         int16_t root_old_pri 
= root_clutch
->scr_priority
; 
1856         sched_clutch_root_pri_update(root_clutch
); 
1857         return root_clutch
->scr_priority 
> root_old_pri
; 
1861 sched_edge_bound_thread_remove( 
1862         sched_clutch_root_t root_clutch
, 
1865         sched_clutch_root_bucket_t root_bucket 
= &root_clutch
->scr_bound_buckets
[thread
->th_sched_bucket
]; 
1866         assert((thread
->th_bound_cluster_enqueued
) == true); 
1867         run_queue_remove(&root_bucket
->scrb_bound_thread_runq
, thread
); 
1868         thread
->th_bound_cluster_enqueued 
= false; 
1870         /* Update the clutch runnable count and priority */ 
1871         sched_clutch_thr_count_dec(&root_clutch
->scr_thr_count
); 
1872         if (root_bucket
->scrb_bound_thread_runq
.count 
== 0) { 
1873                 sched_clutch_root_bucket_empty(root_bucket
, root_clutch
, mach_absolute_time()); 
1875         sched_clutch_root_pri_update(root_clutch
); 
1878 #endif /* CONFIG_SCHED_EDGE */ 
1881  * sched_clutch_thread_bound_lookup() 
1883  * Routine to lookup the highest priority runnable thread in a bounded root bucket. 
1886 sched_clutch_thread_bound_lookup( 
1887         __unused sched_clutch_root_t root_clutch
, 
1888         sched_clutch_root_bucket_t root_bucket
) 
1890         return run_queue_peek(&root_bucket
->scrb_bound_thread_runq
); 
1894  * Clutch Bucket Group Thread Counts and Pending time calculation 
1896  * The pending time on the clutch_bucket_group allows the scheduler to track if it 
1897  * needs to ageout the CPU usage because the clutch_bucket_group has been pending for 
1898  * a very long time. The pending time is set to the timestamp as soon as a thread becomes 
1899  * runnable. When a thread is picked up for execution from this clutch_bucket_group, the 
1900  * pending time is advanced to the time of thread selection. 
1902  * Since threads for a clutch bucket group can be added or removed from multiple CPUs 
1903  * simultaneously, it is important that the updates to thread counts and pending timestamps 
1904  * happen atomically. The implementation relies on the following aspects to make that work: 
1906  * - The clutch scheduler would be deployed on single cluster platforms where the pset lock 
1907  *   is held when threads are added/removed and pending timestamps are updated 
1908  * - The edge scheduler would have support for double wide 128 bit atomics which allow the 
1909  *   thread count and pending timestamp to be updated atomically. 
1911  * Clutch bucket group interactivity timestamp and score updates also rely on the properties 
1912  * above to atomically update the interactivity score for a clutch bucket group. 
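/*
 * Illustrative sketch (not part of XNU): the count + timestamp pair is kept
 * in a single atomically updated value so a reader can never observe a count
 * from one update paired with a timestamp from another. The kernel uses
 * double-wide (128-bit) atomics for its two 64-bit fields; the standalone C11
 * sketch below packs a 32-bit count and a 32-bit timestamp into one 64-bit
 * atomic purely to show the same compare-and-swap shape.
 */
#if 0 /* example only; never compiled */
#include <stdatomic.h>
#include <stdint.h>

static void
example_pending_count_inc(_Atomic uint64_t *packed, uint32_t now)
{
        uint64_t old_val = atomic_load_explicit(packed, memory_order_relaxed);
        uint64_t new_val;
        do {
                uint32_t count = (uint32_t)(old_val >> 32);
                uint32_t timestamp = (uint32_t)old_val;
                if (count == 0) {
                        timestamp = now; /* first runnable thread starts the pending clock */
                }
                new_val = ((uint64_t)(count + 1) << 32) | timestamp;
        } while (!atomic_compare_exchange_weak_explicit(packed, &old_val, new_val,
            memory_order_relaxed, memory_order_relaxed));
}
#endif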
1915 #if CONFIG_SCHED_EDGE 
1918 sched_clutch_bucket_group_thr_count_inc( 
1919         sched_clutch_bucket_group_t clutch_bucket_group
, 
1922         sched_clutch_counter_time_t old_pending_data
; 
1923         sched_clutch_counter_time_t new_pending_data
; 
1924         os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, { 
1925                 new_pending_data
.scct_count 
= old_pending_data
.scct_count 
+ 1; 
1926                 new_pending_data
.scct_timestamp 
= old_pending_data
.scct_timestamp
; 
1927                 if (old_pending_data
.scct_count 
== 0) { 
1928                         new_pending_data
.scct_timestamp 
= timestamp
; 
1934 sched_clutch_bucket_group_thr_count_dec( 
1935         sched_clutch_bucket_group_t clutch_bucket_group
, 
1938         sched_clutch_counter_time_t old_pending_data
; 
1939         sched_clutch_counter_time_t new_pending_data
; 
1940         os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, { 
1941                 new_pending_data
.scct_count 
= old_pending_data
.scct_count 
- 1; 
1942                 if (new_pending_data
.scct_count 
== 0) { 
1943                         new_pending_data
.scct_timestamp 
= SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
; 
1945                         new_pending_data
.scct_timestamp 
= timestamp
; 
1951 sched_clutch_bucket_group_pending_ageout( 
1952         sched_clutch_bucket_group_t clutch_bucket_group
, 
1955         int bucket_load 
= sched_clutch_global_bucket_load_get(clutch_bucket_group
->scbg_bucket
); 
1956         sched_clutch_counter_time_t old_pending_data
; 
1957         sched_clutch_counter_time_t new_pending_data
; 
1958         uint8_t cpu_usage_shift 
= 0; 
1960         os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, { 
1961                 cpu_usage_shift 
= 0; 
1962                 uint64_t old_pending_ts 
= old_pending_data
.scct_timestamp
; 
1963                 bool old_update 
= (old_pending_ts 
>= timestamp
); 
1964                 bool no_pending_time 
= (old_pending_ts 
== SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
); 
1965                 bool no_bucket_load 
= (bucket_load 
== 0); 
1966                 if (old_update 
|| no_pending_time 
|| no_bucket_load
) { 
1967                         os_atomic_rmw_loop_give_up(); 
1970                 /* Calculate the time the clutch bucket group has been pending */ 
1971                 uint64_t pending_delta 
= timestamp 
- old_pending_ts
; 
1972                 uint64_t interactivity_delta 
= sched_clutch_bucket_group_pending_delta
[clutch_bucket_group
->scbg_bucket
] * bucket_load
; 
1973                 if (pending_delta 
< interactivity_delta
) { 
1974                         os_atomic_rmw_loop_give_up(); 
1976                 cpu_usage_shift 
= (pending_delta 
/ interactivity_delta
); 
1977                 new_pending_data
.scct_timestamp 
= old_pending_ts 
+ (cpu_usage_shift 
* interactivity_delta
); 
1978                 new_pending_data
.scct_count 
= old_pending_data
.scct_count
; 
1980         return cpu_usage_shift
; 
1984 sched_clutch_bucket_group_interactivity_score_calculate( 
1985         sched_clutch_bucket_group_t clutch_bucket_group
, 
1988         if (clutch_bucket_group
->scbg_bucket 
== TH_BUCKET_FIXPRI
) { 
1990                  * Since the root bucket selection algorithm for Above UI looks at clutch bucket 
1991                  * priorities, make sure all AboveUI buckets are marked interactive. 
1993                 assert(clutch_bucket_group
->scbg_interactivity_data
.scct_count 
== (2 * sched_clutch_bucket_group_interactive_pri
)); 
1994                 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
; 
1996         /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */ 
1997         uint8_t pending_intervals 
= sched_clutch_bucket_group_pending_ageout(clutch_bucket_group
, timestamp
); 
1998         /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */ 
1999         sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group
, pending_intervals
); 
2000         uint8_t interactivity_score 
= sched_clutch_interactivity_from_cpu_data(clutch_bucket_group
); 
2001         sched_clutch_counter_time_t old_interactivity_data
; 
2002         sched_clutch_counter_time_t new_interactivity_data
; 
2004         bool score_updated 
= os_atomic_rmw_loop(&clutch_bucket_group
->scbg_interactivity_data
.scct_packed
, old_interactivity_data
.scct_packed
, new_interactivity_data
.scct_packed
, relaxed
, { 
2005                 if (old_interactivity_data
.scct_timestamp 
>= timestamp
) { 
2006                         os_atomic_rmw_loop_give_up(); 
2008                 new_interactivity_data
.scct_timestamp 
= timestamp
; 
2009                 if (old_interactivity_data
.scct_timestamp 
!= 0) { 
2010                         new_interactivity_data
.scct_count 
= interactivity_score
; 
2013         if (score_updated
) { 
2014                 return (uint8_t)new_interactivity_data
.scct_count
; 
2016                 return (uint8_t)old_interactivity_data
.scct_count
; 
2020 #else /* CONFIG_SCHED_EDGE */ 
2023  * For the clutch scheduler, atomicity is ensured by making sure all operations 
2024  * are happening under the pset lock of the only cluster present on the platform. 
2027 sched_clutch_bucket_group_thr_count_inc( 
2028         sched_clutch_bucket_group_t clutch_bucket_group
, 
2031         sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
); 
2032         if (clutch_bucket_group
->scbg_pending_data
.scct_count 
== 0) { 
2033                 clutch_bucket_group
->scbg_pending_data
.scct_timestamp 
= timestamp
; 
2035         clutch_bucket_group
->scbg_pending_data
.scct_count
++; 
2039 sched_clutch_bucket_group_thr_count_dec( 
2040         sched_clutch_bucket_group_t clutch_bucket_group
, 
2043         sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
); 
2044         clutch_bucket_group
->scbg_pending_data
.scct_count
--; 
2045         if (clutch_bucket_group
->scbg_pending_data
.scct_count 
== 0) { 
2046                 clutch_bucket_group
->scbg_pending_data
.scct_timestamp 
= SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
; 
2048                 clutch_bucket_group
->scbg_pending_data
.scct_timestamp 
= timestamp
; 
2053 sched_clutch_bucket_group_pending_ageout( 
2054         sched_clutch_bucket_group_t clutch_bucket_group
, 
2057         sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
); 
2058         int bucket_load 
= sched_clutch_global_bucket_load_get(clutch_bucket_group
->scbg_bucket
); 
2059         uint64_t old_pending_ts 
= clutch_bucket_group
->scbg_pending_data
.scct_timestamp
; 
2060         bool old_update 
= (old_pending_ts 
>= timestamp
); 
2061         bool no_pending_time 
= (old_pending_ts 
== SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
); 
2062         bool no_bucket_load 
= (bucket_load 
== 0); 
2063         if (old_update 
|| no_pending_time 
|| no_bucket_load
) { 
2066         uint64_t pending_delta 
= timestamp 
- old_pending_ts
; 
2067         uint64_t interactivity_delta 
= sched_clutch_bucket_group_pending_delta
[clutch_bucket_group
->scbg_bucket
] * bucket_load
; 
2068         if (pending_delta 
< interactivity_delta
) { 
2071         uint8_t cpu_usage_shift 
= (pending_delta 
/ interactivity_delta
); 
2072         clutch_bucket_group
->scbg_pending_data
.scct_timestamp 
= old_pending_ts 
+ (cpu_usage_shift 
* interactivity_delta
); 
2073         return cpu_usage_shift
; 
2077 sched_clutch_bucket_group_interactivity_score_calculate( 
2078         sched_clutch_bucket_group_t clutch_bucket_group
, 
2081         sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
); 
2082         if (clutch_bucket_group
->scbg_bucket 
== TH_BUCKET_FIXPRI
) { 
2084                  * Since the root bucket selection algorithm for Above UI looks at clutch bucket 
2085                  * priorities, make sure all AboveUI buckets are marked interactive. 
2087                 assert(clutch_bucket_group
->scbg_interactivity_data
.scct_count 
== (2 * sched_clutch_bucket_group_interactive_pri
)); 
2088                 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
; 
2090         /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */ 
2091         uint8_t pending_intervals 
= sched_clutch_bucket_group_pending_ageout(clutch_bucket_group
, timestamp
); 
2092         /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */ 
2093         sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group
, pending_intervals
); 
2094         uint8_t interactivity_score 
= sched_clutch_interactivity_from_cpu_data(clutch_bucket_group
); 
2095         if (timestamp 
> clutch_bucket_group
->scbg_interactivity_data
.scct_timestamp
) { 
2096                 clutch_bucket_group
->scbg_interactivity_data
.scct_count 
= interactivity_score
; 
2097                 clutch_bucket_group
->scbg_interactivity_data
.scct_timestamp 
= timestamp
; 
2098                 return interactivity_score
; 
2100                 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
; 
2104 #endif /* CONFIG_SCHED_EDGE */ 
2107  * Clutch Bucket Group Run Count and Blocked Time Accounting 
2109  * The clutch bucket group maintains the number of runnable/running threads in the group. 
2110  * Since the blocked time of the clutch bucket group is based on this count, it is 
2111  * important to make sure the blocking timestamp and the run count are updated atomically. 
2113  * Since the run count increments happen without any pset locks held, the scheduler makes 
2114  * these updates atomic in the following way: 
2115  * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp 
2116  * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update 
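/*
 * Illustrative sketch (not part of XNU): the blocked-time bookkeeping
 * described above only does real work on the 0 <-> 1 run-count transitions.
 * A simplified, single-threaded version of that state machine (locking and
 * atomics omitted; names are assumptions for illustration):
 */
#if 0 /* example only; never compiled */
#include <stdint.h>

#define EXAMPLE_BLOCKED_TS_INVALID UINT64_MAX

struct example_group_stats {
        uint64_t run_count;      /* number of runnable/running threads in the group */
        uint64_t blocked_since;  /* valid only while run_count == 0 */
        uint64_t cpu_blocked;    /* feeds the interactivity score calculation */
};

static void
example_run_count_inc(struct example_group_stats *g, uint64_t now)
{
        if (g->run_count == 0 && g->blocked_since != EXAMPLE_BLOCKED_TS_INVALID) {
                /* Group was fully blocked until now; credit the blocked interval */
                g->cpu_blocked += now - g->blocked_since;
                g->blocked_since = EXAMPLE_BLOCKED_TS_INVALID;
        }
        g->run_count++;
}

static void
example_run_count_dec(struct example_group_stats *g, uint64_t now)
{
        if (--g->run_count == 0) {
                /* Last runnable thread went away; start the blocked clock */
                g->blocked_since = now;
        }
}
#endif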
2122 sched_clutch_bucket_group_run_count_inc( 
2123         sched_clutch_bucket_group_t clutch_bucket_group
) 
2125         uint64_t blocked_time 
= 0; 
2126         uint64_t updated_run_count 
= 0; 
2128         lck_spin_lock(&clutch_bucket_group
->scbg_stats_lock
); 
2129         if ((clutch_bucket_group
->scbg_blocked_data
.scct_timestamp 
!= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
) && 
2130             (clutch_bucket_group
->scbg_blocked_data
.scct_count 
== 0)) { 
2131                 /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */ 
2132                 blocked_time 
= mach_absolute_time() - clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
; 
2133                 clutch_bucket_group
->scbg_blocked_data
.scct_timestamp 
= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
; 
2135         clutch_bucket_group
->scbg_blocked_data
.scct_count 
= clutch_bucket_group
->scbg_blocked_data
.scct_count 
+ 1; 
2136         updated_run_count 
= clutch_bucket_group
->scbg_blocked_data
.scct_count
; 
2137         lck_spin_unlock(&clutch_bucket_group
->scbg_stats_lock
); 
2139         blocked_time 
= MIN(blocked_time
, sched_clutch_bucket_group_adjust_threshold
); 
2140         os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_blocked
), (clutch_cpu_data_t
)blocked_time
, relaxed
); 
2141         return (uint32_t)updated_run_count
; 
2145 sched_clutch_bucket_group_run_count_dec( 
2146         sched_clutch_bucket_group_t clutch_bucket_group
) 
2148         uint64_t updated_run_count 
= 0; 
2150         lck_spin_lock(&clutch_bucket_group
->scbg_stats_lock
); 
2151         clutch_bucket_group
->scbg_blocked_data
.scct_count 
= clutch_bucket_group
->scbg_blocked_data
.scct_count 
- 1; 
2152         if (clutch_bucket_group
->scbg_blocked_data
.scct_count 
== 0) { 
2153                 /* Run count is transitioning from 1 to 0; start the blocked timer */ 
2154                 clutch_bucket_group
->scbg_blocked_data
.scct_timestamp 
= mach_absolute_time(); 
2156         updated_run_count 
= clutch_bucket_group
->scbg_blocked_data
.scct_count
; 
2157         lck_spin_unlock(&clutch_bucket_group
->scbg_stats_lock
); 
2158         return (uint32_t)updated_run_count
; 
2161 #else /* !__LP64__ */ 
2164 sched_clutch_bucket_group_run_count_inc( 
2165         sched_clutch_bucket_group_t clutch_bucket_group
) 
2167         sched_clutch_counter_time_t old_blocked_data
; 
2168         sched_clutch_counter_time_t new_blocked_data
; 
2170         bool update_blocked_time 
= false; 
2171         os_atomic_rmw_loop(&clutch_bucket_group
->scbg_blocked_data
.scct_packed
, old_blocked_data
.scct_packed
, new_blocked_data
.scct_packed
, relaxed
, { 
2172                 new_blocked_data
.scct_count 
= old_blocked_data
.scct_count 
+ 1; 
2173                 new_blocked_data
.scct_timestamp 
= old_blocked_data
.scct_timestamp
; 
2174                 update_blocked_time 
= false; 
2175                 if (old_blocked_data
.scct_count 
== 0) { 
2176                         new_blocked_data
.scct_timestamp 
= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
; 
2177                         update_blocked_time 
= true; 
2180         if (update_blocked_time 
&& (old_blocked_data
.scct_timestamp 
!= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
)) { 
2181                 uint64_t ctime 
= mach_absolute_time(); 
2182                 if (ctime 
> old_blocked_data
.scct_timestamp
) { 
2183                         uint64_t blocked_time 
= ctime 
- old_blocked_data
.scct_timestamp
; 
2184                         blocked_time 
= MIN(blocked_time
, sched_clutch_bucket_group_adjust_threshold
); 
2185                         os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_blocked
), (clutch_cpu_data_t
)blocked_time
, relaxed
); 
2188         return (uint32_t)new_blocked_data
.scct_count
; 
2192 sched_clutch_bucket_group_run_count_dec( 
2193         sched_clutch_bucket_group_t clutch_bucket_group
) 
2195         sched_clutch_counter_time_t old_blocked_data
; 
2196         sched_clutch_counter_time_t new_blocked_data
; 
2198         uint64_t ctime 
= mach_absolute_time(); 
2199         os_atomic_rmw_loop(&clutch_bucket_group
->scbg_blocked_data
.scct_packed
, old_blocked_data
.scct_packed
, new_blocked_data
.scct_packed
, relaxed
, { 
2200                 new_blocked_data
.scct_count 
= old_blocked_data
.scct_count 
- 1; 
2201                 new_blocked_data
.scct_timestamp 
= old_blocked_data
.scct_timestamp
; 
2202                 if (new_blocked_data
.scct_count 
== 0) { 
2203                         new_blocked_data
.scct_timestamp 
= ctime
; 
2206         return (uint32_t)new_blocked_data
.scct_count
; 
2209 #endif /* !__LP64__ */ 
2212  * sched_clutch_thread_insert() 
2214  * Routine to insert a thread into the sched clutch hierarchy. 
2215  * Update the counts at all levels of the hierarchy and insert the nodes 
2216  * as they become runnable. Always called with the pset lock held. 
2219 sched_clutch_thread_insert( 
2220         sched_clutch_root_t root_clutch
, 
2224         boolean_t result 
= FALSE
; 
2226         sched_clutch_hierarchy_locked_assert(root_clutch
); 
2227 #if CONFIG_SCHED_EDGE 
2228         sched_edge_cluster_cumulative_count_incr(root_clutch
, thread
->th_sched_bucket
); 
2230          * Check if the thread is bound and is being enqueued in its desired bound cluster. 
2231          * One scenario where a bound thread might not be getting enqueued in the bound cluster 
2232          * hierarchy would be if the thread is "soft" bound and the bound cluster is 
2233  * de-recommended. In that case, the thread should be treated as an unbound 
2234  * thread. 
2236         if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread
) && (sched_edge_thread_bound_cluster_id(thread
) == root_clutch
->scr_cluster_id
)) { 
2237                 return sched_edge_bound_thread_insert(root_clutch
, thread
, options
); 
2239 #endif /* CONFIG_SCHED_EDGE */ 
2240         sched_clutch_t clutch 
= sched_clutch_for_thread(thread
); 
2241         assert(thread
->thread_group 
== clutch
->sc_tg
); 
2243         uint64_t current_timestamp 
= mach_absolute_time(); 
2244         sched_clutch_bucket_group_t clutch_bucket_group 
= &(clutch
->sc_clutch_groups
[thread
->th_sched_bucket
]); 
2245         sched_clutch_bucket_t clutch_bucket 
= &(clutch_bucket_group
->scbg_clutch_buckets
[root_clutch
->scr_cluster_id
]); 
2246         assert((clutch_bucket
->scb_root 
== NULL
) || (clutch_bucket
->scb_root 
== root_clutch
)); 
2249          * Thread linkage in clutch_bucket 
2251          * A thread has a few linkages within the clutch bucket: 
2252          * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket 
2253          * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation) 
2254          * - A queue linkage used for timesharing operations of threads at the scheduler tick 
2257         /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */ 
2258         thread
->th_clutch_runq_link
.stamp 
= current_timestamp
; 
2259         priority_queue_entry_set_sched_pri(&clutch_bucket
->scb_thread_runq
, &thread
->th_clutch_runq_link
, thread
->sched_pri
, 
2260             (options 
& SCHED_TAILQ
) ? PRIORITY_QUEUE_ENTRY_NONE 
: PRIORITY_QUEUE_ENTRY_PREEMPTED
); 
2261         priority_queue_insert(&clutch_bucket
->scb_thread_runq
, &thread
->th_clutch_runq_link
); 
2263         /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */ 
2264         priority_queue_entry_set_sched_pri(&clutch_bucket
->scb_clutchpri_prioq
, &thread
->th_clutch_pri_link
, 
2265             sched_thread_sched_pri_promoted(thread
) ? thread
->sched_pri 
: thread
->base_pri
, false); 
2266         priority_queue_insert(&clutch_bucket
->scb_clutchpri_prioq
, &thread
->th_clutch_pri_link
); 
2268         /* Insert thread into timesharing queue of the clutch bucket */ 
2269         enqueue_tail(&clutch_bucket
->scb_thread_timeshare_queue
, &thread
->th_clutch_timeshare_link
); 
2271         /* Increment the urgency counter for the root if necessary */ 
2272         sched_clutch_root_urgency_inc(root_clutch
, thread
); 
2274         os_atomic_inc(&clutch
->sc_thr_count
, relaxed
); 
2275         sched_clutch_bucket_group_thr_count_inc(clutch_bucket
->scb_group
, current_timestamp
); 
2277         /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */ 
2278         sched_clutch_bucket_options_t scb_options 
= (options 
& SCHED_HEADQ
) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ 
: SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ
; 
2279         if (clutch_bucket
->scb_thr_count 
== 0) { 
2280                 sched_clutch_thr_count_inc(&clutch_bucket
->scb_thr_count
); 
2281                 sched_clutch_thr_count_inc(&root_clutch
->scr_thr_count
); 
2282                 result 
= sched_clutch_bucket_runnable(clutch_bucket
, root_clutch
, current_timestamp
, scb_options
); 
2284                 sched_clutch_thr_count_inc(&clutch_bucket
->scb_thr_count
); 
2285                 sched_clutch_thr_count_inc(&root_clutch
->scr_thr_count
); 
2286                 result 
= sched_clutch_bucket_update(clutch_bucket
, root_clutch
, current_timestamp
, scb_options
); 
2289         KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH
, MACH_SCHED_CLUTCH_THR_COUNT
) | DBG_FUNC_NONE
, 
2290             root_clutch
->scr_cluster_id
, thread_group_get_id(clutch_bucket
->scb_group
->scbg_clutch
->sc_tg
), clutch_bucket
->scb_bucket
, 
2291             SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch
->scr_thr_count
, os_atomic_load(&clutch
->sc_thr_count
, relaxed
), clutch_bucket
->scb_thr_count
)); 
2296  * sched_clutch_thread_remove() 
2298  * Routine to remove a thread from the sched clutch hierarchy. 
2299  * Update the counts at all levels of the hierarchy and remove the nodes 
2300  * as they become empty. Always called with the pset lock held. 
2303 sched_clutch_thread_remove( 
2304         sched_clutch_root_t root_clutch
, 
2306         uint64_t current_timestamp
, 
2307         sched_clutch_bucket_options_t options
) 
2309         sched_clutch_hierarchy_locked_assert(root_clutch
); 
2310 #if CONFIG_SCHED_EDGE 
2311         sched_edge_cluster_cumulative_count_decr(root_clutch
, thread
->th_sched_bucket
); 
2312         if (thread
->th_bound_cluster_enqueued
) { 
2313                 sched_edge_bound_thread_remove(root_clutch
, thread
); 
2316 #endif /* CONFIG_SCHED_EDGE */ 
2317         sched_clutch_t clutch 
= sched_clutch_for_thread(thread
); 
2318         assert(thread
->thread_group 
== clutch
->sc_tg
); 
2319         assert(thread
->runq 
!= PROCESSOR_NULL
); 
2321         sched_clutch_bucket_group_t clutch_bucket_group 
= &(clutch
->sc_clutch_groups
[thread
->th_sched_bucket
]); 
2322         sched_clutch_bucket_t clutch_bucket 
= &(clutch_bucket_group
->scbg_clutch_buckets
[root_clutch
->scr_cluster_id
]); 
2323         assert(clutch_bucket
->scb_root 
== root_clutch
); 
2325         /* Decrement the urgency counter for the root if necessary */ 
2326         sched_clutch_root_urgency_dec(root_clutch
, thread
); 
2327         /* Remove thread from the clutch_bucket */ 
2328         priority_queue_remove(&clutch_bucket
->scb_thread_runq
, &thread
->th_clutch_runq_link
); 
2329         remqueue(&thread
->th_clutch_timeshare_link
); 
2330         thread
->runq 
= PROCESSOR_NULL
; 
2332         priority_queue_remove(&clutch_bucket
->scb_clutchpri_prioq
, &thread
->th_clutch_pri_link
); 
2334         /* Update counts at various levels of the hierarchy */ 
2335         os_atomic_dec(&clutch
->sc_thr_count
, relaxed
); 
2336         sched_clutch_bucket_group_thr_count_dec(clutch_bucket
->scb_group
, current_timestamp
); 
2337         sched_clutch_thr_count_dec(&root_clutch
->scr_thr_count
); 
2338         sched_clutch_thr_count_dec(&clutch_bucket
->scb_thr_count
); 
2340         /* Remove the clutch from hierarchy (if needed) and update properties */ 
2341         if (clutch_bucket
->scb_thr_count 
== 0) { 
2342                 sched_clutch_bucket_empty(clutch_bucket
, root_clutch
, current_timestamp
, options
); 
2344                 sched_clutch_bucket_update(clutch_bucket
, root_clutch
, current_timestamp
, options
); 
2346         KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH
, MACH_SCHED_CLUTCH_THR_COUNT
) | DBG_FUNC_NONE
, 
2347             root_clutch
->scr_cluster_id
, thread_group_get_id(clutch_bucket
->scb_group
->scbg_clutch
->sc_tg
), clutch_bucket
->scb_bucket
, 
2348             SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch
->scr_thr_count
, os_atomic_load(&clutch
->sc_thr_count
, relaxed
), clutch_bucket
->scb_thr_count
)); 
2352  * sched_clutch_thread_unbound_lookup() 
2354  * Routine to find the highest unbound thread in the root clutch. 
2355  * Helps find threads easily for steal/migrate scenarios in the 
2359 sched_clutch_thread_unbound_lookup( 
2360         sched_clutch_root_t root_clutch
, 
2361         sched_clutch_root_bucket_t root_bucket
) 
2363         sched_clutch_hierarchy_locked_assert(root_clutch
); 
2365         /* Find the highest priority clutch bucket in this root bucket */ 
2366         sched_clutch_bucket_t clutch_bucket 
= sched_clutch_root_bucket_highest_clutch_bucket(root_bucket
); 
2367         assert(clutch_bucket 
!= NULL
); 
2369         /* Find the highest priority runnable thread in this clutch bucket */ 
2370         thread_t thread 
= priority_queue_max(&clutch_bucket
->scb_thread_runq
, struct thread
, th_clutch_runq_link
); 
2371         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH
, MACH_SCHED_CLUTCH_THREAD_SELECT
) | DBG_FUNC_NONE
, 
2372             thread_tid(thread
), thread_group_get_id(clutch_bucket
->scb_group
->scbg_clutch
->sc_tg
), clutch_bucket
->scb_bucket
, 0, 0); 
2377  * sched_clutch_thread_highest_remove() 
2379  * Routine to find and remove the highest priority thread 
2380  * from the sched clutch hierarchy. The algorithm looks at the 
2381  * hierarchy for the most eligible runnable thread and calls 
2382  * sched_clutch_thread_remove(). Always called with the 
2383  * pset lock held. 
2386 sched_clutch_thread_highest_remove( 
2387         sched_clutch_root_t root_clutch
) 
2389         sched_clutch_hierarchy_locked_assert(root_clutch
); 
2390         uint64_t current_timestamp 
= mach_absolute_time(); 
2392         sched_clutch_root_bucket_t root_bucket 
= sched_clutch_root_highest_root_bucket(root_clutch
, current_timestamp
, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL
); 
2393         if (root_bucket 
== NULL
) { 
2397         thread_t highest_thread 
= THREAD_NULL
; 
2398         if (root_bucket
->scrb_bound
) { 
2399                 highest_thread 
= sched_clutch_thread_bound_lookup(root_clutch
, root_bucket
); 
2401                 highest_thread 
= sched_clutch_thread_unbound_lookup(root_clutch
, root_bucket
); 
2403         sched_clutch_thread_remove(root_clutch
, highest_thread
, current_timestamp
, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR
); 
2404         return highest_thread
; 
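/*
 * Illustrative sketch (not part of XNU): thread selection walks the hierarchy
 * top-down, which is the shape of sched_clutch_thread_highest_remove() above.
 * In outline (the step names are shorthand, not the exact routines):
 *
 *   root_bucket   = highest root bucket (per-QoS selection with starvation avoidance)
 *   clutch_bucket = highest clutch bucket in that root bucket (interactivity-boosted priority)
 *   thread        = highest sched_pri thread in that clutch bucket
 *   remove the thread from the hierarchy and return it
 */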
2407 /* High level global accessor routines */ 
2410  * sched_clutch_root_urgency() 
2412  * Routine to get the urgency of the highest runnable 
2413  * thread in the hierarchy. 
2416 sched_clutch_root_urgency( 
2417         sched_clutch_root_t root_clutch
) 
2419         return root_clutch
->scr_urgency
; 
2423  * sched_clutch_root_count_sum() 
2425  * The count_sum mechanism is used for scheduler runq 
2426  * statistics calculation. It's only useful for debugging 
2427  * purposes; since it takes a mach_absolute_time() on 
2428  * other scheduler implementations, it's better to avoid 
2429  * populating this until absolutely necessary. 
2432 sched_clutch_root_count_sum( 
2433         __unused sched_clutch_root_t root_clutch
) 
2439  * sched_clutch_root_priority() 
2441  * Routine to get the priority of the highest runnable 
2442  * thread in the hierarchy. 
2445 sched_clutch_root_priority( 
2446         sched_clutch_root_t root_clutch
) 
2448         return root_clutch
->scr_priority
; 
2452  * sched_clutch_root_count() 
2454  * Returns total number of runnable threads in the hierarchy. 
2457 sched_clutch_root_count( 
2458         sched_clutch_root_t root_clutch
) 
2460         return root_clutch
->scr_thr_count
; 
2463 #if CONFIG_SCHED_EDGE 
2466  * sched_clutch_root_foreign_empty() 
2468  * Routine to check if the foreign clutch bucket priority list is empty for a cluster. 
2471 sched_clutch_root_foreign_empty( 
2472         sched_clutch_root_t root_clutch
) 
2474         return priority_queue_empty(&root_clutch
->scr_foreign_buckets
); 
2478  * sched_clutch_root_highest_foreign_thread_remove() 
2480  * Routine to return the thread in the highest priority clutch bucket in a cluster. 
2481  * Must be called with the pset for the cluster locked. 
2484 sched_clutch_root_highest_foreign_thread_remove( 
2485         sched_clutch_root_t root_clutch
) 
2487         thread_t thread 
= THREAD_NULL
; 
2488         if (priority_queue_empty(&root_clutch
->scr_foreign_buckets
)) { 
2491         sched_clutch_bucket_t clutch_bucket 
= priority_queue_max(&root_clutch
->scr_foreign_buckets
, struct sched_clutch_bucket
, scb_foreignlink
); 
2492         thread 
= priority_queue_max(&clutch_bucket
->scb_thread_runq
, struct thread
, th_clutch_runq_link
); 
2493         sched_clutch_thread_remove(root_clutch
, thread
, mach_absolute_time(), 0); 
2497 #endif /* CONFIG_SCHED_EDGE */ 
2500  * sched_clutch_thread_pri_shift() 
2502  * Routine to get the priority shift value for a thread. 
2503  * Since the timesharing is done at the clutch_bucket level, 
2504  * this routine gets the clutch_bucket and retrieves the 
2505  * values from there. 
2508 sched_clutch_thread_pri_shift( 
2510         sched_bucket_t bucket
) 
2512         if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) { 
2515         assert(bucket 
!= TH_BUCKET_RUN
); 
2516         sched_clutch_t clutch 
= sched_clutch_for_thread(thread
); 
2517         sched_clutch_bucket_group_t clutch_bucket_group 
= &(clutch
->sc_clutch_groups
[bucket
]); 
2518         return os_atomic_load(&clutch_bucket_group
->scbg_pri_shift
, relaxed
); 
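/*
 * Illustrative note (not a tunable defined here): the shift returned above feeds the
 * generic timeshare decay path, where a smaller scbg_pri_shift causes accumulated CPU
 * usage to depress the priority of threads in that clutch bucket group more
 * aggressively. Threads that are not eligible for clutch-level timesharing get
 * INT8_MAX, which effectively disables usage-based decay for them.
 */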
#pragma mark -- Clutch Scheduler Algorithm

static void
sched_clutch_init(void);

static thread_t
sched_clutch_steal_thread(processor_set_t pset);

static void
sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);

static boolean_t
sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
    sched_options_t options);

static boolean_t
sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);

static ast_t
sched_clutch_processor_csw_check(processor_t processor);

static boolean_t
sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static int
sched_clutch_runq_count(processor_t processor);

static boolean_t
sched_clutch_processor_queue_empty(processor_t processor);

static uint64_t
sched_clutch_runq_stats_count_sum(processor_t processor);

static int
sched_clutch_processor_bound_count(processor_t processor);

static void
sched_clutch_pset_init(processor_set_t pset);

static void
sched_clutch_processor_init(processor_t processor);

static thread_t
sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_clutch_processor_queue_shutdown(processor_t processor);

static sched_mode_t
sched_clutch_initial_thread_sched_mode(task_t parent_task);

static uint32_t
sched_clutch_initial_quantum_size(thread_t thread);

static bool
sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);

static uint32_t
sched_clutch_run_incr(thread_t thread);

static uint32_t
sched_clutch_run_decr(thread_t thread);

static void
sched_clutch_update_thread_bucket(thread_t thread);
const struct sched_dispatch_table sched_clutch_dispatch = {
	.sched_name                                     = "clutch",
	.init                                           = sched_clutch_init,
	.timebase_init                                  = sched_timeshare_timebase_init,
	.processor_init                                 = sched_clutch_processor_init,
	.pset_init                                      = sched_clutch_pset_init,
	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
	.choose_thread                                  = sched_clutch_choose_thread,
	.steal_thread_enabled                           = sched_steal_thread_enabled,
	.steal_thread                                   = sched_clutch_steal_thread,
	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
	.choose_node                                    = sched_choose_node,
	.choose_processor                               = choose_processor,
	.processor_enqueue                              = sched_clutch_processor_enqueue,
	.processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
	.processor_queue_empty                          = sched_clutch_processor_queue_empty,
	.priority_is_urgent                             = priority_is_urgent,
	.processor_csw_check                            = sched_clutch_processor_csw_check,
	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
	.can_update_priority                            = can_update_priority,
	.update_priority                                = update_priority,
	.lightweight_update_priority                    = lightweight_update_priority,
	.quantum_expire                                 = sched_default_quantum_expire,
	.processor_runq_count                           = sched_clutch_runq_count,
	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
	.processor_bound_count                          = sched_clutch_processor_bound_count,
	.thread_update_scan                             = sched_clutch_thread_update_scan,
	.multiple_psets_enabled                         = TRUE,
	.sched_groups_enabled                           = FALSE,
	.avoid_processor_enabled                        = TRUE,
	.thread_avoid_processor                         = sched_clutch_thread_avoid_processor,
	.processor_balance                              = sched_SMT_balance,

	.rt_runq                                        = sched_rtlocal_runq,
	.rt_init                                        = sched_rtlocal_init,
	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,

	.qos_max_parallelism                            = sched_qos_max_parallelism,
	.check_spill                                    = sched_check_spill,
	.ipi_policy                                     = sched_ipi_policy,
	.thread_should_yield                            = sched_thread_should_yield,
	.run_count_incr                                 = sched_clutch_run_incr,
	.run_count_decr                                 = sched_clutch_run_decr,
	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
	.pset_made_schedulable                          = sched_pset_made_schedulable,
};
__attribute__((always_inline))
static inline run_queue_t
sched_clutch_bound_runq(processor_t processor)
{
	return &processor->runq;
}

__attribute__((always_inline))
static inline sched_clutch_root_t
sched_clutch_processor_root_clutch(processor_t processor)
{
	return &processor->processor_set->pset_clutch_root;
}

__attribute__((always_inline))
static inline run_queue_t
sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
{
	assert(thread->bound_processor == processor);
	return sched_clutch_bound_runq(processor);
}

static uint32_t
sched_clutch_initial_quantum_size(thread_t thread)
{
	if (thread == THREAD_NULL) {
		return std_quantum;
	}
	assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
	return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
}

static sched_mode_t
sched_clutch_initial_thread_sched_mode(task_t parent_task)
{
	if (parent_task == kernel_task) {
		return TH_MODE_FIXED;
	} else {
		return TH_MODE_TIMESHARE;
	}
}

static void
sched_clutch_processor_init(processor_t processor)
{
	run_queue_init(&processor->runq);
}

static void
sched_clutch_pset_init(processor_set_t pset)
{
	sched_clutch_root_init(&pset->pset_clutch_root, pset);
}

static void
sched_clutch_tunables_init(void)
{
	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
	clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
	    NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
	assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
	sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
}

static void
sched_clutch_init(void)
{
	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
	}
	sched_timeshare_init();
	sched_clutch_tunables_init();
}
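/*
 * Example (hypothetical value): booting with the boot-arg
 * "sched_clutch_bucket_group_interactive_pri=25" overrides the interactivity priority
 * used for clutch bucket groups; when the boot-arg is absent, the default
 * SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT is used as above.
 */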
static thread_t
sched_clutch_choose_thread(
	processor_t      processor,
	int              priority,
	__unused ast_t            reason)
{
	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
	uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
	boolean_t choose_from_boundq = false;

	if (bound_runq->highq < priority &&
	    clutch_pri < priority) {
		return THREAD_NULL;
	}

	if (bound_runq->count && clutch_count) {
		if (bound_runq->highq >= clutch_pri) {
			choose_from_boundq = true;
		}
	} else if (bound_runq->count) {
		choose_from_boundq = true;
	} else if (clutch_count) {
		choose_from_boundq = false;
	} else {
		return THREAD_NULL;
	}

	thread_t thread = THREAD_NULL;
	if (choose_from_boundq == false) {
		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
	} else {
		thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
	}
	return thread;
}
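/*
 * Worked example (hypothetical priorities): with a bound run queue whose highq is 60
 * and a clutch hierarchy whose root priority is 47, the comparison above favors the
 * bound run queue and the thread is dequeued from it; if both queues are non-empty and
 * the clutch root priority is higher, the hierarchy is asked for its highest thread
 * instead.
 */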
static boolean_t
sched_clutch_processor_enqueue(
	processor_t       processor,
	thread_t          thread,
	sched_options_t   options)
{
	boolean_t       result;

	thread->runq = processor;
	if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
		result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
	} else {
		run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
		result = run_queue_enqueue(rq, thread, options);
	}
	return result;
}

static boolean_t
sched_clutch_processor_queue_empty(processor_t processor)
{
	return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
	       sched_clutch_bound_runq(processor)->count == 0;
}
static ast_t
sched_clutch_processor_csw_check(processor_t processor)
{
	boolean_t       has_higher;
	int             pri;

	if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
		return AST_PREEMPT | AST_URGENT;
	}

	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));

	assert(processor->active_thread != NULL);

	pri = MAX(clutch_pri, bound_runq->highq);

	if (processor->first_timeslice) {
		has_higher = (pri > processor->current_pri);
	} else {
		has_higher = (pri >= processor->current_pri);
	}

	if (has_higher) {
		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
			return AST_PREEMPT | AST_URGENT;
		}

		if (bound_runq->urgency > 0) {
			return AST_PREEMPT | AST_URGENT;
		}

		return AST_PREEMPT;
	}

	return AST_NONE;
}
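/*
 * Example of the preemption decision (hypothetical values): a processor in its first
 * timeslice running at current_pri 31 that sees priority 32 become runnable returns
 * AST_PREEMPT; if the urgency count of the root hierarchy or of the bound run queue is
 * non-zero, AST_PREEMPT | AST_URGENT is returned instead so the preemption is taken
 * immediately.
 */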
static boolean_t
sched_clutch_processor_queue_has_priority(processor_t    processor,
    int                priority,
    boolean_t          gte)
{
	run_queue_t bound_runq = sched_clutch_bound_runq(processor);

	int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);

	if (gte) {
		return qpri >= priority;
	} else {
		return qpri > priority;
	}
}

static int
sched_clutch_runq_count(processor_t processor)
{
	return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
}

static uint64_t
sched_clutch_runq_stats_count_sum(processor_t processor)
{
	uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;

	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
		return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
	} else {
		return bound_sum;
	}
}

static int
sched_clutch_processor_bound_count(processor_t processor)
{
	return sched_clutch_bound_runq(processor)->count;
}
static void
sched_clutch_processor_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
	thread_t        thread;
	queue_head_t    tqueue;

	/* We only need to migrate threads if this is the last active processor in the pset */
	if (pset->online_processor_count > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);
	while (sched_clutch_root_count(pset_clutch_root) > 0) {
		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
		enqueue_tail(&tqueue, &thread->runq_links);
	}

	pset_unlock(pset);

	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}
}
static boolean_t
sched_clutch_processor_queue_remove(
	processor_t processor,
	thread_t    thread)
{
	run_queue_t             rq;
	processor_set_t         pset = processor->processor_set;

	pset_lock(pset);

	if (processor == thread->runq) {
		/*
		 * Thread is on a run queue and we have a lock on
		 * that run queue.
		 */
		if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
			sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
			sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
		} else {
			rq = sched_clutch_thread_bound_runq(processor, thread);
			run_queue_remove(rq, thread);
		}
	} else {
		/*
		 * The thread left the run queue before we could
		 * lock the run queue.
		 */
		assert(thread->runq == PROCESSOR_NULL);
		processor = PROCESSOR_NULL;
	}

	pset_unlock(pset);

	return processor != PROCESSOR_NULL;
}
static thread_t
sched_clutch_steal_thread(__unused processor_set_t pset)
{
	/* Thread stealing is not enabled for single cluster clutch scheduler platforms */
	return THREAD_NULL;
}

static void
sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
{
	boolean_t               restart_needed = FALSE;
	processor_t             processor = processor_list;
	processor_set_t         pset;
	thread_t                thread;
	spl_t                   s;

	/*
	 *  We update the threads associated with each processor (bound and idle threads)
	 *  and then update the threads in each pset runqueue.
	 */

	do {
		do {
			pset = processor->processor_set;

			s = splsched();
			pset_lock(pset);

			restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);

			pset_unlock(pset);
			splx(s);

			if (restart_needed) {
				break;
			}

			thread = processor->idle_thread;
			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
				if (thread_update_add_thread(thread) == FALSE) {
					restart_needed = TRUE;
					break;
				}
			}
		} while ((processor = processor->processor_list) != NULL);

		/* Ok, we now have a collection of candidates -- fix them. */
		thread_update_process_threads();
	} while (restart_needed);

	pset_node_t node = &pset_node0;
	pset = node->psets;

	do {
		do {
			restart_needed = FALSE;
			while (pset != NULL) {
				s = splsched();
				pset_lock(pset);

				if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
					for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
						restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
						if (restart_needed) {
							break;
						}
					}
					queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
					sched_clutch_bucket_t clutch_bucket;
					qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
						sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
						restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
					}
				}

				pset_unlock(pset);
				splx(s);

				if (restart_needed) {
					break;
				}
				pset = pset->pset_list;
			}

			if (restart_needed) {
				break;
			}
		} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

		/* Ok, we now have a collection of candidates -- fix them. */
		thread_update_process_threads();
	} while (restart_needed);
}
extern int sched_allow_rt_smt;

/* Return true if this thread should not continue running on this processor */
static bool
sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
{
	if (processor->processor_primary != processor) {
		/*
		 * This is a secondary SMT processor.  If the primary is running
		 * a realtime thread, only allow realtime threads on the secondary.
		 */
		if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
			return true;
		}
	}

	return false;
}
/*
 * For the clutch scheduler, the run counts are maintained in the clutch
 * buckets (i.e. the thread group scheduling structure).
 */
static uint32_t
sched_clutch_run_incr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
	return new_count;
}

static uint32_t
sched_clutch_run_decr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
	return new_count;
}
static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
{
	sched_bucket_t bucket = TH_BUCKET_RUN;

	if (priority > BASEPRI_USER_INITIATED) {
		bucket = TH_BUCKET_SHARE_FG;
	} else if (priority > BASEPRI_DEFAULT) {
		bucket = TH_BUCKET_SHARE_IN;
	} else if (priority > BASEPRI_UTILITY) {
		bucket = TH_BUCKET_SHARE_DF;
	} else if (priority > MAXPRI_THROTTLE) {
		bucket = TH_BUCKET_SHARE_UT;
	} else {
		bucket = TH_BUCKET_SHARE_BG;
	}
	return bucket;
}
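/*
 * Example mapping (assuming the usual priority band values of 37/31/20/4 for
 * BASEPRI_USER_INITIATED, BASEPRI_DEFAULT, BASEPRI_UTILITY and MAXPRI_THROTTLE):
 * priority 46 -> TH_BUCKET_SHARE_FG, 37 -> TH_BUCKET_SHARE_IN, 31 -> TH_BUCKET_SHARE_DF,
 * 20 -> TH_BUCKET_SHARE_UT, 4 -> TH_BUCKET_SHARE_BG.
 */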
/*
 * For threads that have changed sched_pri without changing the
 * base_pri for any reason other than decay, use the sched_pri
 * as the bucketizing priority instead of base_pri. All such
 * changes are typically due to kernel locking primitive boosts
 * or demotions.
 */
static boolean_t
sched_thread_sched_pri_promoted(thread_t thread)
{
	return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
	       (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
	       (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
	       (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
	       (thread->kern_promotion_schedpri != 0);
}
/*
 * Routine to update the scheduling bucket for the thread.
 *
 * In the clutch scheduler implementation, the thread's bucket
 * is based on sched_pri if it was promoted due to a kernel
 * primitive; otherwise it is based on the thread base_pri. This
 * enhancement allows promoted threads to reach a higher priority
 * bucket and potentially get selected sooner for scheduling.
 *
 * Also, the clutch scheduler does not honor fixed priority below
 * FG priority. It simply puts those threads in the corresponding
 * timeshare bucket. The reason for doing that is that it is
 * extremely hard to define the scheduling properties of such threads
 * and they typically lead to performance issues.
 */
static void
sched_clutch_update_thread_bucket(thread_t thread)
{
	sched_bucket_t old_bucket = thread->th_sched_bucket;
	sched_bucket_t new_bucket = TH_BUCKET_RUN;
	assert(thread->runq == PROCESSOR_NULL);
	int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;

	switch (thread->sched_mode) {
	case TH_MODE_FIXED:
		if (pri >= BASEPRI_FOREGROUND) {
			new_bucket = TH_BUCKET_FIXPRI;
		} else {
			new_bucket = sched_convert_pri_to_bucket(pri);
		}
		break;

	case TH_MODE_REALTIME:
		new_bucket = TH_BUCKET_FIXPRI;
		break;

	case TH_MODE_TIMESHARE:
		new_bucket = sched_convert_pri_to_bucket(pri);
		break;

	default:
		panic("unexpected mode: %d", thread->sched_mode);
		break;
	}

	if (old_bucket == new_bucket) {
		return;
	}

	thread->th_sched_bucket = new_bucket;
	thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
	/*
	 * Since this is called after the thread has been removed from the runq,
	 * only the run counts need to be updated. The re-insert into the runq
	 * would put the thread into the correct new bucket's runq.
	 */
	if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
		sched_clutch_thread_run_bucket_decr(thread, old_bucket);
		sched_clutch_thread_run_bucket_incr(thread, new_bucket);
	}
}
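/*
 * Example (hypothetical thread state): a timeshare thread with base_pri 31 normally
 * lands in TH_BUCKET_SHARE_DF; if a kernel mutex promotion raises its sched_pri to 60,
 * sched_thread_sched_pri_promoted() is true and the bucketizing priority becomes 60,
 * moving it to TH_BUCKET_SHARE_FG until the promotion is dropped and the bucket is
 * re-evaluated.
 */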
#if CONFIG_SCHED_EDGE

/* Implementation of the AMP version of the clutch scheduler */

static void
sched_edge_init(void);

static thread_t
sched_edge_processor_idle(processor_set_t pset);

static ast_t
sched_edge_processor_csw_check(processor_t processor);

static boolean_t
sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static boolean_t
sched_edge_processor_queue_empty(processor_t processor);

static thread_t
sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_edge_processor_queue_shutdown(processor_t processor);

static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);

static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);

static void
sched_edge_balance(processor_t cprocessor, processor_set_t cpset);

static void
sched_edge_check_spill(processor_set_t pset, thread_t thread);

static bool
sched_edge_thread_should_yield(processor_t processor, thread_t thread);

static void
sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);

static bool
sched_edge_steal_thread_enabled(processor_set_t pset);

static sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);

static uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options);
const struct sched_dispatch_table sched_edge_dispatch = {
	.sched_name                                     = "edge",
	.init                                           = sched_edge_init,
	.timebase_init                                  = sched_timeshare_timebase_init,
	.processor_init                                 = sched_clutch_processor_init,
	.pset_init                                      = sched_clutch_pset_init,
	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
	.choose_thread                                  = sched_edge_choose_thread,
	.steal_thread_enabled                           = sched_edge_steal_thread_enabled,
	.steal_thread                                   = sched_edge_processor_idle,
	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
	.choose_node                                    = sched_choose_node,
	.choose_processor                               = sched_edge_choose_processor,
	.processor_enqueue                              = sched_clutch_processor_enqueue,
	.processor_queue_shutdown                       = sched_edge_processor_queue_shutdown,
	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
	.processor_queue_empty                          = sched_edge_processor_queue_empty,
	.priority_is_urgent                             = priority_is_urgent,
	.processor_csw_check                            = sched_edge_processor_csw_check,
	.processor_queue_has_priority                   = sched_edge_processor_queue_has_priority,
	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
	.can_update_priority                            = can_update_priority,
	.update_priority                                = update_priority,
	.lightweight_update_priority                    = lightweight_update_priority,
	.quantum_expire                                 = sched_default_quantum_expire,
	.processor_runq_count                           = sched_clutch_runq_count,
	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
	.processor_bound_count                          = sched_clutch_processor_bound_count,
	.thread_update_scan                             = sched_clutch_thread_update_scan,
	.multiple_psets_enabled                         = TRUE,
	.sched_groups_enabled                           = FALSE,
	.avoid_processor_enabled                        = TRUE,
	.thread_avoid_processor                         = sched_edge_thread_avoid_processor,
	.processor_balance                              = sched_edge_balance,

	.rt_runq                                        = sched_amp_rt_runq,
	.rt_init                                        = sched_amp_rt_init,
	.rt_queue_shutdown                              = sched_amp_rt_queue_shutdown,
	.rt_runq_scan                                   = sched_amp_rt_runq_scan,
	.rt_runq_count_sum                              = sched_amp_rt_runq_count_sum,

	.qos_max_parallelism                            = sched_edge_qos_max_parallelism,
	.check_spill                                    = sched_edge_check_spill,
	.ipi_policy                                     = sched_edge_ipi_policy,
	.thread_should_yield                            = sched_edge_thread_should_yield,
	.run_count_incr                                 = sched_clutch_run_incr,
	.run_count_decr                                 = sched_clutch_run_decr,
	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
	.pset_made_schedulable                          = sched_edge_pset_made_schedulable,
	.thread_group_recommendation_change             = NULL,
};
static struct processor_set pset1;
static struct pset_node pset_node1;
static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];

/*
 * sched_edge_pset_available()
 *
 * Routine to determine if a pset is available for scheduling.
 */
static bool
sched_edge_pset_available(processor_set_t pset)
{
	return bitmap_test(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
}

/*
 * sched_edge_thread_bound_cluster_id()
 *
 * Routine to determine which cluster a particular thread is bound to. Uses
 * the sched_flags on the thread to map back to a specific cluster id.
 *
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t
sched_edge_thread_bound_cluster_id(thread_t thread)
{
	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
	if (thread->sched_flags & TH_SFLAG_ECORE_ONLY) {
		return (pset_array[0]->pset_type == CLUSTER_TYPE_E) ? 0 : 1;
	} else {
		return (pset_array[0]->pset_type == CLUSTER_TYPE_P) ? 0 : 1;
	}
}
/* Forward declaration for some thread migration routines */
static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);

/*
 * sched_edge_config_set()
 *
 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
 * policies in the scheduler.
 */
static void
sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
{
	sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
	edge->sce_edge_packed = edge_config.sce_edge_packed;
}

/*
 * sched_edge_config_get()
 *
 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
 * if it needs to update edges.
 */
static sched_clutch_edge
sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
{
	return pset_array[src_cluster]->sched_edges[dst_cluster];
}
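/*
 * Illustrative usage (hypothetical values): CLPC could tighten the edge from cluster 0
 * to cluster 1 with
 *     sched_edge_config_set(0, 1, (sched_clutch_edge){ .sce_migration_weight = 32,
 *         .sce_migration_allowed = 1, .sce_steal_allowed = 0 });
 * and read the current setting back with sched_edge_config_get(0, 1).
 */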
#if DEVELOPMENT || DEBUG

/*
 * Helper Routines for edge scheduler sysctl configuration
 *
 * The current support is limited to dual cluster AMP platforms.
 * <Edge Multi-cluster Support Needed>
 */

kern_return_t
sched_edge_sysctl_configure_e_to_p(uint64_t edge_config)
{
	pset_array[ecore_cluster_id]->sched_edges[pcore_cluster_id].sce_edge_packed = edge_config;
	return KERN_SUCCESS;
}

kern_return_t
sched_edge_sysctl_configure_p_to_e(uint64_t edge_config)
{
	pset_array[pcore_cluster_id]->sched_edges[ecore_cluster_id].sce_edge_packed = edge_config;
	return KERN_SUCCESS;
}

sched_clutch_edge
sched_edge_e_to_p(void)
{
	return sched_edge_config_get(ecore_cluster_id, pcore_cluster_id);
}

sched_clutch_edge
sched_edge_p_to_e(void)
{
	return sched_edge_config_get(pcore_cluster_id, ecore_cluster_id);
}

#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_edge_matrix_set()
 *
 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
	uint32_t edge_index = 0;
	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
			if (edge_changes_bitmap[edge_index]) {
				sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
			}
			edge_index++;
		}
	}
}

/*
 * sched_edge_matrix_get()
 *
 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
	uint32_t edge_index = 0;
	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
			if (edge_request_bitmap[edge_index]) {
				edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
			}
			edge_index++;
		}
	}
}
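/*
 * Since both routines above walk the flattened matrices in row-major order, the entry
 * for the edge (src_cluster, dst_cluster) lives at index
 * src_cluster * matrix_order + dst_cluster in edge_matrix and in the corresponding
 * request/changes bitmap.
 */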
/*
 * sched_edge_init()
 *
 * Routine to initialize the data structures for the Edge scheduler. The current implementation
 * relies on this being enabled for a dual cluster AMP system. Once a better design for MAX_PSETS,
 * edge config etc. is defined, it should be made more generic to handle the multi-cluster
 * platforms.
 * <Edge Multi-cluster Support Needed>
 */
static void
sched_edge_init(void)
{
	processor_set_t ecore_set = &pset0;
	processor_set_t pcore_set = &pset1;

	if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
		/* If the device boots on a P-cluster, fixup the IDs etc. */
		pcore_set = &pset0;
		ecore_set = &pset1;
		bitmap_set(sched_edge_available_pset_bitmask, pcore_cluster_id);
	} else {
		bitmap_set(sched_edge_available_pset_bitmask, ecore_cluster_id);
	}

	ecore_set->pset_cluster_type = PSET_AMP_E;
	ecore_set->pset_cluster_id = ecore_cluster_id;

	pcore_set->pset_cluster_type = PSET_AMP_P;
	pcore_set->pset_cluster_id = pcore_cluster_id;

	pset_init(&pset1, &pset_node1);
	pset_node1.psets = &pset1;
	pset_node0.node_list = &pset_node1;

	pset_array[ecore_cluster_id] = ecore_set;
	pset_array[ecore_cluster_id]->pset_type = CLUSTER_TYPE_E;
	bitmap_set(pset_array[ecore_cluster_id]->foreign_psets, pcore_cluster_id);

	sched_edge_config_set(ecore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
	sched_edge_config_set(ecore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});

	pset_array[pcore_cluster_id] = pcore_set;
	pset_array[pcore_cluster_id]->pset_type = CLUSTER_TYPE_P;
	bitmap_set(pset_array[pcore_cluster_id]->foreign_psets, ecore_cluster_id);

	sched_edge_config_set(pcore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
	sched_edge_config_set(pcore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});

	sched_timeshare_init();
	sched_clutch_tunables_init();
}
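/*
 * With the boot-time defaults above, the edge from the P-cluster to the E-cluster has
 * migration and stealing enabled with a migration weight of 64, while the edge from the
 * E-cluster to the P-cluster leaves both disabled; CLPC can later rewrite either edge
 * via sched_edge_matrix_set().
 */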
static thread_t
sched_edge_choose_thread(
	processor_t      processor,
	int              priority,
	__unused ast_t            reason)
{
	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
	boolean_t choose_from_boundq = false;

	if ((bound_runq->highq < priority) &&
	    (clutch_pri < priority)) {
		return THREAD_NULL;
	}

	if (bound_runq->highq >= clutch_pri) {
		choose_from_boundq = true;
	}

	thread_t thread = THREAD_NULL;
	if (choose_from_boundq == false) {
		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
	} else {
		thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
	}
	return thread;
}

static boolean_t
sched_edge_processor_queue_empty(processor_t processor)
{
	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
	       (sched_clutch_bound_runq(processor)->count == 0);
}
static void
sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
{
	assert(thread->bound_processor == PROCESSOR_NULL);
}

__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
	SCHED_EDGE_YIELD_RUNQ_NONEMPTY       = 0x0,
	SCHED_EDGE_YIELD_FOREIGN_RUNNABLE    = 0x1,
	SCHED_EDGE_YIELD_FOREIGN_RUNNING     = 0x2,
	SCHED_EDGE_YIELD_STEAL_POSSIBLE      = 0x3,
	SCHED_EDGE_YIELD_DISALLOW            = 0x4,
});

static bool
sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
{
	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
		return true;
	}

	/*
	 * The yield logic should follow the same logic that steal_thread() does. The
	 * thread_should_yield() is effectively trying to quickly check whether, if the
	 * current thread gave up the CPU, there is any other thread that would execute
	 * on this CPU. So it needs to provide the same answer as the steal_thread()/
	 * processor idle logic.
	 */
	if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
		return true;
	}
	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
		return true;
	}

	processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
	if (steal_candidate != NULL) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
		return true;
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
	    0, SCHED_EDGE_YIELD_DISALLOW);
	return false;
}
static ast_t
sched_edge_processor_csw_check(processor_t processor)
{
	boolean_t       has_higher;
	int             pri;

	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
	run_queue_t bound_runq = sched_clutch_bound_runq(processor);

	assert(processor->active_thread != NULL);

	pri = MAX(clutch_pri, bound_runq->highq);

	if (processor->first_timeslice) {
		has_higher = (pri > processor->current_pri);
	} else {
		has_higher = (pri >= processor->current_pri);
	}

	if (has_higher) {
		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
			return AST_PREEMPT | AST_URGENT;
		}

		if (bound_runq->urgency > 0) {
			return AST_PREEMPT | AST_URGENT;
		}

		return AST_PREEMPT;
	}

	return AST_NONE;
}

static boolean_t
sched_edge_processor_queue_has_priority(processor_t    processor,
    int                priority,
    boolean_t          gte)
{
	run_queue_t bound_runq = sched_clutch_bound_runq(processor);

	int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);

	if (gte) {
		return qpri >= priority;
	} else {
		return qpri > priority;
	}
}
static void
sched_edge_processor_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
	thread_t        thread;
	queue_head_t    tqueue;

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
		pset_unlock(pset);
		return;
	}

	bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);

	queue_init(&tqueue);
	while (sched_clutch_root_count(pset_clutch_root) > 0) {
		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
		enqueue_tail(&tqueue, &thread->runq_links);
	}

	pset_unlock(pset);

	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}
}
/*
 * sched_edge_cluster_load_metric()
 *
 * The load metric for a cluster is a measure of the average scheduling latency
 * experienced by threads on that cluster. It is a product of the average number
 * of threads in the runqueue and the average execution time for threads. The metric
 * has special values in the following cases:
 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
 *   the maximum value to disallow any threads to migrate to this cluster.
 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
 *   to be spread across the platform quickly for ncpu wide workloads.
 */
static uint32_t
sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
{
	if (sched_edge_pset_available(pset) == false) {
		return UINT32_MAX;
	}
	return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
}
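/*
 * Example of how the special values are consumed (hypothetical scenario): a cluster
 * that has been made unavailable reports UINT32_MAX here, so it can never compare
 * favorably against an edge weight during steal or migration checks, while a cluster
 * with an idle CPU reports 0 and is immediately attractive for spreading ncpu-wide
 * workloads.
 */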
/*
 * Edge Scheduler Steal/Rebalance logic
 *
 * = Generic scheduler logic =
 *
 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
 * find any thread for execution in its runqueue. The aim of the steal operation
 * is to find other threads running/runnable in other clusters which should be
 * executed here.
 *
 * If the steal callout does not return a thread, the thread_select() logic calls
 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
 * threads and idle out the current CPU.
 *
 * = SCHED(steal_thread) for Edge Scheduler =
 *
 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
 * routine tries to do the following operations in order:
 * (1) Find foreign runnable threads in non-native cluster
 *     runqueues (sched_edge_foreign_runnable_thread_remove())
 * (2) Check if foreign threads are running on the non-native
 *     clusters (sched_edge_foreign_running_thread_available())
 *         - If yes, return THREAD_NULL for the steal callout and
 *         perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
 * (3) Steal a thread from another cluster based on edge
 *     weights (sched_edge_steal_thread())
 *
 * = SCHED(processor_balance) for Edge Scheduler =
 *
 * If steal_thread did not return a thread for the processor, use
 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
 *
 * = Clutch Bucket Preferred Cluster Overrides =
 *
 * Since these operations (just like thread migrations on enqueue)
 * move threads across clusters, they need support for handling clutch
 * bucket group level preferred cluster recommendations.
 * For (1), a clutch bucket will be in the foreign runnable queue based
 * on the clutch bucket group preferred cluster.
 * For (2), the running thread will set the bit on the processor based
 * on its preferred cluster type.
 * For (3), the edge configuration would prevent threads from being stolen
 * in the wrong direction.
 *
 * = SCHED(thread_should_yield) =
 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
 * since that is expecting the same answer as if thread_select() was called on a core
 * with an empty runqueue.
 */

static bool
sched_edge_steal_thread_enabled(__unused processor_set_t pset)
{
	/*
	 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
	 */
	return true;
}
static processor_set_t
sched_edge_steal_candidate(processor_set_t pset)
{
	/*
	 * Edge Scheduler Optimization
	 *
	 * Investigate a better policy for stealing. The current implementation looks
	 * at all the incoming weights for the pset that just became idle and sees which
	 * clusters have loads > edge weights. It is effectively trying to simulate an
	 * overload migration as if a thread had become runnable on the candidate cluster.
	 *
	 * The logic today bails as soon as it finds a cluster where the cluster load is
	 * greater than the edge weight. This helps the check to be quick which is useful
	 * for sched_edge_thread_should_yield() which uses this. Maybe it should have a
	 * more advanced version for the actual steal operation which looks for the
	 * maximum delta etc.
	 */
	processor_set_t target_pset = NULL;
	uint32_t dst_cluster_id = pset->pset_cluster_id;

	for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
		processor_set_t candidate_pset = pset_array[cluster_id];

		if (candidate_pset == pset) {
			continue;
		}

		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
		if (incoming_edge->sce_steal_allowed == false) {
			continue;
		}

		uint32_t incoming_weight = incoming_edge->sce_migration_weight;
		int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		if (highest_runnable_bucket == -1) {
			/* Candidate cluster runq is empty */
			continue;
		}

		/* Use the load metrics for highest runnable bucket since that would be stolen next */
		uint32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
		if (candidate_load > incoming_weight) {
			/* Only steal from the candidate if its load is higher than the incoming edge and it has runnable threads */
			target_pset = candidate_pset;
			break;
		}
	}

	return target_pset;
}
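/*
 * Worked example (hypothetical numbers): if the incoming edge from cluster 1 to this
 * cluster has sce_steal_allowed set and a migration weight of 64, and cluster 1's load
 * metric for its highest runnable bucket is 80, the loop above selects cluster 1 as the
 * steal target; with a load of 50 it would be skipped.
 */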
static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset)
{
	/* Find all the clusters that are foreign for this cluster */
	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/*
		 * For each cluster, see if there are any runnable foreign threads.
		 * This check is currently being done without the pset lock to make it cheap for
		 * the idle thread.
		 */
		processor_set_t target_pset = pset_array[cluster];
		if (sched_edge_pset_available(target_pset) == false) {
			continue;
		}

		if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
			return true;
		}
	}
	return false;
}
static thread_t
sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
{
	thread_t thread = THREAD_NULL;

	/* Find all the clusters that are foreign for this cluster */
	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/*
		 * For each cluster, see if there are any runnable foreign threads.
		 * This check is currently being done without the pset lock to make it cheap for
		 * the idle thread.
		 */
		processor_set_t target_pset = pset_array[cluster];
		if (sched_edge_pset_available(target_pset) == false) {
			continue;
		}

		if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
			continue;
		}
		/*
		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
		 * and get the highest priority thread.
		 */
		pset_lock(target_pset);
		if (sched_edge_pset_available(target_pset)) {
			thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
			sched_update_pset_load_average(target_pset, ctime);
		}
		pset_unlock(target_pset);

		/*
		 * Edge Scheduler Optimization
		 *
		 * The current implementation immediately returns as soon as it finds a foreign
		 * runnable thread. This could be enhanced to look at highest priority threads
		 * from all foreign clusters and pick the highest amongst them. That would need
		 * some form of global state across psets to make that kind of a check cheap.
		 */
		if (thread != THREAD_NULL) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
			break;
		}
		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
	}

	return thread;
}
static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset)
{
	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/* Skip the pset if it's not schedulable */
		processor_set_t target_pset = pset_array[cluster];
		if (sched_edge_pset_available(target_pset) == false) {
			continue;
		}

		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
		if (lsb_first(running_foreign_bitmap) != -1) {
			/* Found a non-native CPU running a foreign thread; rebalance is needed */
			return true;
		}
	}
	return false;
}
static thread_t
sched_edge_steal_thread(processor_set_t pset)
{
	thread_t thread = THREAD_NULL;
	processor_set_t steal_from_pset = sched_edge_steal_candidate(pset);
	if (steal_from_pset) {
		/*
		 * sched_edge_steal_candidate() has found a pset which is ideal to steal from.
		 * Lock the pset and select the highest thread in that runqueue.
		 */
		pset_lock(steal_from_pset);
		if (bitmap_first(steal_from_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1) {
			uint64_t current_timestamp = mach_absolute_time();
			sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
			thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
			sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
			sched_update_pset_load_average(steal_from_pset, current_timestamp);
		}
		/*
		 * Edge Scheduler Optimization
		 * Maybe this needs to circle around if the steal candidate did not have any threads by
		 * the time the pset lock was taken.
		 */
		pset_unlock(steal_from_pset);
	}
	return thread;
}
/*
 * sched_edge_processor_idle()
 *
 * The routine is the implementation for steal_thread() for the Edge scheduler.
 */
static thread_t
sched_edge_processor_idle(processor_set_t pset)
{
	thread_t thread = THREAD_NULL;

	uint64_t ctime = mach_absolute_time();

	/* Each of the operations acquire the lock for the pset they target */
	pset_unlock(pset);

	/* Find highest priority runnable thread on all non-native clusters */
	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
	if (thread != THREAD_NULL) {
		return thread;
	}

	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
	if (rebalance_needed) {
		return THREAD_NULL;
	}

	/* No foreign threads found; find a thread to steal from a pset based on weights/loads etc. */
	thread = sched_edge_steal_thread(pset);
	return thread;
}
/* Return true if this thread should not continue running on this processor */
static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
{
	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
	/*
	 * For long running parallel workloads, it is important to rebalance threads across
	 * E/P clusters so that they make equal forward progress. This is achieved through
	 * threads expiring their quantum on the non-preferred cluster type and explicitly
	 * rebalancing to the preferred cluster runqueue.
	 *
	 * <Edge Multi-Cluster Support Needed>
	 * For multi-cluster platforms, it might be useful to move the thread in case its
	 * preferred pset is idle now.
	 */
	if (processor->processor_set->pset_type != preferred_pset->pset_type) {
		return true;
	}

	/* If the thread is already running on the preferred cluster, do not avoid */
	if (processor->processor_set == preferred_pset) {
		return false;
	}

	/*
	 * The thread is running on a processor that is of the same type as the
	 * preferred pset, but is not the actual preferred pset. In that case
	 * look at edge weights to see if this thread should continue execution
	 * here or go back to its preferred cluster.
	 *
	 * <Edge Multi-Cluster Support Needed>
	 * This logic needs to ensure that the current thread is not counted against
	 * the load average for the current pset; otherwise it would always end up avoiding
	 * the current cluster.
	 */
	processor_set_t chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, processor->processor_set, false);
	return chosen_pset != processor->processor_set;
}
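/*
 * Illustrative case for sched_edge_thread_avoid_processor() above (editor's sketch,
 * not part of the original sources; the cluster types are hypothetical): a thread whose
 * preferred cluster is a P-cluster but which expires its quantum on an E-core sees
 * processor->processor_set->pset_type != preferred_pset->pset_type, so the routine
 * returns true and the thread is rebalanced back toward its preferred cluster runqueue,
 * which is the mechanism described in the comment at the top of the routine.
 */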
static void
sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());
	pset_unlock(cpset);

	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/* Skip the pset if it is not schedulable */
		processor_set_t target_pset = pset_array[cluster];
		if (sched_edge_pset_available(target_pset) == false) {
			continue;
		}

		pset_lock(target_pset);
		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
			processor_t target_cpu = processor_array[cpuid];
			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, false, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, cpuid);
			}
		}
		pset_unlock(target_pset);
	}

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
	}
}
/*
 * sched_edge_migrate_edges_evaluate()
 *
 * Routine to find the candidate for thread migration based on edge weights.
 *
 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 * return preferred_pset if it is the most ideal destination for this thread.
 */
static processor_set_t
sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
{
	processor_set_t selected_pset = preferred_pset;
	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);

	/* Look at edge deltas with other clusters to find the ideal migration candidate */
	sched_clutch_edge *edge = preferred_pset->sched_edges;
	uint32_t max_edge_delta = 0;

	/*
	 * Edge Scheduler Optimization
	 *
	 * For really large cluster count systems, it might make sense to optimize the
	 * clusters iterated by using bitmaps and skipping over clusters that are not
	 * available for scheduling or have migration disabled from this cluster.
	 */
	for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
		processor_set_t dst_pset = pset_array[cluster_id];
		if (cluster_id == preferred_cluster_id) {
			continue;
		}

		if (edge[cluster_id].sce_migration_allowed == false) {
			continue;
		}

		uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
		if (dst_load > preferred_cluster_load) {
			continue;
		}

		/*
		 * Fast path for idle dst cluster
		 *
		 * For extremely parallel workloads, it is important to load up
		 * all clusters as quickly as possible. This short-circuit allows
		 * that.
		 *
		 * <Edge Multi-cluster Support Needed>
		 * For multi-cluster platforms, the loop should start with the homogeneous
		 * clusters first.
		 */
		if (dst_load == 0) {
			selected_pset = dst_pset;
			break;
		}

		uint32_t edge_delta = preferred_cluster_load - dst_load;
		if (edge_delta < edge[cluster_id].sce_migration_weight) {
			continue;
		}

		if (edge_delta < max_edge_delta) {
			continue;
		}

		if (edge_delta == max_edge_delta) {
			/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
			boolean_t selected_homogeneous = (pset_type_for_id(selected_pset->pset_cluster_id) == preferred_cluster_type);
			boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
			if (selected_homogeneous || !candidate_homogeneous) {
				continue;
			}
		}

		/* dst_pset seems to be the best candidate for migration */
		max_edge_delta = edge_delta;
		selected_pset = dst_pset;
	}
	return selected_pset;
}
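/*
 * Worked example for sched_edge_migrate_edges_evaluate() above. The numbers are purely
 * illustrative (editor's sketch), not values from the original sources. Assume the
 * preferred cluster has load 8 and two candidate clusters:
 *
 *   dst cluster 1: load 5, sce_migration_weight 2 -> edge_delta = 8 - 5 = 3 >= 2, kept as candidate
 *   dst cluster 2: load 7, sce_migration_weight 4 -> edge_delta = 8 - 7 = 1 <  4, skipped
 *
 * Cluster 1 has the largest qualifying edge_delta, so it becomes selected_pset. If a
 * candidate cluster were completely idle (dst_load == 0), the fast path above would
 * pick it immediately and stop the search.
 */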
/*
 * sched_edge_candidate_alternative()
 *
 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
 * selected_pset is not available for execution. The logic tries to prefer homogeneous
 * clusters over heterogeneous clusters since this is typically used in thread
 * placement decisions.
 */
_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
static processor_set_t
sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
{
	/*
	 * It looks like the most ideal pset is not available for scheduling currently.
	 * Try to find a homogeneous cluster that is still available.
	 */
	bitmap_t *foreign_clusters = selected_pset->foreign_psets;
	uint64_t available_native_clusters = ~(foreign_clusters[0]) & candidate_cluster_bitmap;
	int available_cluster_id = lsb_first(available_native_clusters);
	if (available_cluster_id == -1) {
		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
		available_cluster_id = bit_first(candidate_cluster_bitmap);
	}
	assert(available_cluster_id != -1);
	return pset_array[available_cluster_id];
}
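/*
 * Bitmask example for sched_edge_candidate_alternative() above (editor's sketch with
 * hypothetical values): with selected_pset->foreign_psets[0] == 0b0010 (cluster 1 is
 * foreign to the selected cluster) and candidate_cluster_bitmap == 0b0110 (clusters 1
 * and 2 are still candidates), available_native_clusters == ~0b0010 & 0b0110 == 0b0100,
 * so lsb_first() returns 2 and the homogeneous cluster 2 is preferred over the foreign
 * cluster 1. Only if no homogeneous candidate remains does bit_first() fall back to any
 * remaining candidate cluster.
 */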
/*
 * sched_edge_switch_pset_lock()
 *
 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
 * switch_pset_locks.
 * Returns the newly locked pset after the switch.
 */
static processor_set_t
sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
{
	if (!switch_pset_locks) {
		return locked_pset;
	}
	if (selected_pset != locked_pset) {
		pset_unlock(locked_pset);
		pset_lock(selected_pset);
		return selected_pset;
	} else {
		return locked_pset;
	}
}
/*
 * sched_edge_migrate_candidate()
 *
 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
 * the thread and the preferred cluster to determine the best available pset for scheduling.
 *
 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns a best-effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
 *
 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
 * resultant pset lock is held.
 */
static processor_set_t
sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
{
	__kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
	processor_set_t selected_pset = preferred_pset;

	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/* For bound threads always recommend the cluster the thread is bound to */
		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
		locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
		if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
			/*
			 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
			 * fall through to the regular cluster selection logic which handles unavailable clusters
			 * appropriately. If the thread is hard bound, then return the bound cluster always.
			 */
			return selected_pset;
		}
	}

	uint64_t candidate_cluster_bitmap = mask(MAX_PSETS);
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* For realtime threads, try and schedule them on the preferred pset always */
		goto migrate_candidate_available_check;
	}

	/*
	 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
	 * it needs to end up on the preferred runqueue.
	 */
	uint32_t preferred_cluster_load = sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
	boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
	if ((preferred_cluster_load == 0) || amp_rebalance) {
		goto migrate_candidate_available_check;
	}

	/* Look at edge weights to decide the most ideal migration candidate for this thread */
	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);

migrate_candidate_available_check:
	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
	if (sched_edge_pset_available(selected_pset) == true) {
		if (selected_pset != preferred_pset) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
		}
		return selected_pset;
	}

	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
	if (__improbable(bitmap_first(&candidate_cluster_bitmap, MAX_PSETS) == -1)) {
		/*
		 * None of the clusters are available for scheduling; this situation should be rare but if it happens,
		 * simply return the boot cluster.
		 */
		selected_pset = &pset0;
		locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
		if (selected_pset != preferred_pset) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
		}
		return selected_pset;
	}

	/* Try and find an alternative for the selected pset */
	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
	goto migrate_candidate_available_check;
}
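/*
 * Usage note for sched_edge_migrate_candidate() (editor's summary of the call sites in
 * this file, not new logic): sched_edge_choose_processor() calls it with
 * switch_pset_locks == true so that the returned pset comes back locked and the
 * placement decision is synchronized with cluster recommendation changes, while
 * sched_edge_thread_avoid_processor() calls it with switch_pset_locks == false to get a
 * quick, best-effort answer on the fast path without switching any pset locks.
 */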
static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
{
	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);
	processor_t chosen_processor = PROCESSOR_NULL;

	/*
	 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
	 * It should take the passed in "pset" as a hint which represents the recency metric for
	 * pset selection logic.
	 */
	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
	processor_set_t chosen_pset = preferred_pset;
	/*
	 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
	 * threads to. sched_edge_migrate_candidate() returns the preferred pset
	 * if it has capacity; otherwise it finds the best candidate pset to migrate this thread to.
	 *
	 * <Edge Multi-cluster Support Needed>
	 * It might be useful to build a recency metric for the thread for multiple clusters and
	 * factor that into the migration decisions.
	 */
	chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
	chosen_processor = choose_processor(chosen_pset, processor, thread);
	assert(chosen_processor->processor_set == chosen_pset);
	return chosen_processor;
}
/*
 * sched_edge_clutch_bucket_threads_drain()
 *
 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
 * bucket overrides etc.) into a local thread queue.
 */
static void
sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
{
	thread_t thread = THREAD_NULL;
	uint64_t current_timestamp = mach_approximate_time();
	qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
		sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
		enqueue_tail(clutch_threads, &thread->runq_links);
	}
}
/*
 * sched_edge_run_drained_threads()
 *
 * Makes all drained threads in a local queue runnable.
 */
static void
sched_edge_run_drained_threads(queue_t clutch_threads)
{
	thread_t thread;
	/* Now setrun all the threads in the local queue */
	qe_foreach_element_safe(thread, clutch_threads, runq_links) {
		remqueue(&thread->runq_links);
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}
}
/*
 * sched_edge_update_preferred_cluster()
 *
 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
 */
static void
sched_edge_update_preferred_cluster(
	sched_clutch_t sched_clutch,
	bitmap_t *clutch_bucket_modify_bitmap,
	uint32_t *tg_bucket_preferred_cluster)
{
	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
		os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
	}
}
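/*
 * Example for sched_edge_update_preferred_cluster() above (editor's sketch with
 * hypothetical values): if only the TH_BUCKET_SHARE_UT bit is set in
 * clutch_bucket_modify_bitmap and tg_bucket_preferred_cluster[TH_BUCKET_SHARE_UT] == 1,
 * the loop performs a single relaxed atomic store that repoints the UT clutch bucket
 * group of this thread group at cluster 1; buckets whose bits are clear are left
 * untouched.
 */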
/*
 * sched_edge_migrate_thread_group_runnable_threads()
 *
 * Routine to implement the migration of threads on a cluster when the thread group
 * recommendation is updated. The migration works using a 2-phase
 * algorithm.
 *
 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
 * appropriate.
 *
 * Phase 2: After unlocking the pset, drain all the threads from the local thread
 * queue and mark them runnable which should land them in the right hierarchy.
 *
 * The routine assumes that the preferences for the clutch buckets/clutch bucket
 * groups have already been updated by the caller.
 *
 * - Called with the pset locked and interrupts disabled.
 * - Returns with the pset unlocked.
 */
static void
sched_edge_migrate_thread_group_runnable_threads(
	sched_clutch_t sched_clutch,
	sched_clutch_root_t root_clutch,
	bitmap_t *clutch_bucket_modify_bitmap,
	__unused uint32_t *tg_bucket_preferred_cluster,
	bool migrate_immediately)
{
	/* Queue to hold threads that have been drained from clutch buckets to be migrated */
	queue_head_t clutch_threads;
	queue_init(&clutch_threads);

	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
		/* Get the clutch bucket for this cluster and sched bucket */
		sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
		sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
		sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
		if (scb_root == NULL) {
			/* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
			assert(clutch_bucket->scb_thr_count == 0);
			continue;
		}
		assert(scb_root == root_clutch);
		uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);

		if (migrate_immediately) {
			/*
			 * For transitions where threads need to be migrated immediately, drain the threads into a
			 * local queue unless we are looking at the clutch buckets for the newly recommended
			 * cluster.
			 */
			if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
				sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
			} else {
				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
			}
		} else {
			/* Check if this cluster is the same type as the newly recommended cluster */
			boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
			/*
			 * If threads do not have to be migrated immediately, just change the native/foreign
			 * flag on the clutch bucket.
			 */
			if (homogeneous_cluster) {
				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
			} else {
				sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
			}
		}
	}

	pset_unlock(root_clutch->scr_pset);
	sched_edge_run_drained_threads(&clutch_threads);
}
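/*
 * Illustrative scenario for sched_edge_migrate_thread_group_runnable_threads() above
 * (editor's sketch, hypothetical cluster ids): suppose a thread group's UT bucket moves
 * its preference from cluster 0 to cluster 1 with migrate_immediately == true. On the
 * cluster 0 hierarchy the clutch bucket no longer matches the new preference, so phase 1
 * drains its threads into the local queue; once the pset is unlocked, phase 2 runs
 * sched_edge_run_drained_threads() and thread_setrun() lands the threads in the
 * cluster 1 hierarchy. With migrate_immediately == false, the clutch bucket is only
 * re-marked native or foreign and no threads are drained here.
 */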
/*
 * sched_edge_migrate_thread_group_running_threads()
 *
 * Routine to find all running threads of a thread group on a specific cluster
 * and IPI them if they need to be moved immediately.
 */
static void
sched_edge_migrate_thread_group_running_threads(
	sched_clutch_t sched_clutch,
	sched_clutch_root_t root_clutch,
	__unused bitmap_t *clutch_bucket_modify_bitmap,
	uint32_t *tg_bucket_preferred_cluster,
	bool migrate_immediately)
{
	if (migrate_immediately == false) {
		/* If CLPC has recommended not to move threads immediately, nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * When the system has a large number of clusters and cores, it might be useful to
	 * narrow down the iteration by using a thread running bitmap per clutch.
	 */
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
	/*
	 * Iterate all CPUs and look for the ones running threads from this thread group that are
	 * not restricted to the specific cluster (due to overrides etc.)
	 */
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t src_processor = processor_array[cpuid];
		boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
		sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
		boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;

		if (expected_tg && non_preferred_cluster) {
			ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, cpuid);
			} else if (src_processor == current_processor()) {
				ast_on(AST_PREEMPT);
				bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
			}
		}
	}

	/* Perform all the IPIs */
	if (bit_first(ast_processor_map) != -1) {
		for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
			processor_t ast_processor = processor_array[cpuid];
			sched_ipi_perform(ast_processor, ipi_type[cpuid]);
		}
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
	}
}
/*
 * sched_edge_tg_preferred_cluster_change()
 *
 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
 * thread migrations based on the policy specified by CLPC.
 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
 * for each QoS within the thread group.
 */
void
sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
{
	sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
	/*
	 * In order to optimize the processing, create a bitmap which represents all QoS buckets
	 * for which the preferred cluster has changed.
	 */
	bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
		uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
		uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
		if (old_preferred_cluster != new_preferred_cluster) {
			bitmap_set(clutch_bucket_modify_bitmap, bucket);
		}
	}
	if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		/* No changes in any clutch buckets; nothing to do here */
		return;
	}

	for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
		processor_set_t pset = pset_array[cluster_id];
		spl_t s = splsched();
		pset_lock(pset);
		/*
		 * The first operation is to update the preferred cluster for all QoS buckets within the
		 * thread group so that any future threads becoming runnable would see the new preferred
		 * cluster value.
		 */
		sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
		/*
		 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
		 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
		 *
		 * <Edge Multi-cluster Support Needed>
		 */
		/* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
		sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
		/* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
		sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
		/* sched_edge_migrate_thread_group_runnable_threads() returns with the pset unlocked */
		splx(s);
	}
}
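/*
 * Example for sched_edge_tg_preferred_cluster_change() above (editor's sketch with
 * hypothetical cluster ids): CLPC might pass tg_bucket_preferred_cluster =
 * { 0, 0, 0, 0, 1, 1 } for the TH_BUCKET_FIXPRI..TH_BUCKET_SHARE_BG buckets, i.e. keep
 * the higher QoS buckets on cluster 0 but move UT and BG work to cluster 1. Only the
 * buckets whose value actually changed end up in clutch_bucket_modify_bitmap, and the
 * migration of running/runnable threads is then gated by the MIGRATE_RUNNING and
 * MIGRATE_RUNNABLE bits in the options argument.
 */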
/*
 * sched_edge_pset_made_schedulable()
 *
 * Routine to migrate all the clutch buckets which are not in their recommended
 * pset hierarchy now that a new pset has become runnable. It is possible that this
 * routine is called when the pset is already marked schedulable.
 *
 * Invoked with the pset lock held and interrupts disabled.
 */
static void
sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
{
	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
		/* Nothing to do here since pset is already marked schedulable */
		if (drop_lock) {
			pset_unlock(dst_pset);
		}
		return;
	}

	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);

	thread_t thread = sched_edge_processor_idle(dst_pset);
	if (thread != THREAD_NULL) {
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}

	if (!drop_lock) {
		pset_lock(dst_pset);
	}
}
extern int sched_amp_spill_deferred_ipi;
extern int sched_amp_pcores_preempt_immediate_ipi;

int sched_edge_migrate_ipi_immediate = 1;

static sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For the Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_edge_migrate_ipi_immediate) {
				/*
				 * For workloads that are going wide, it might be useful to use an immediate IPI to
				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
				 * latency on the preferred pset as an estimate of busyness.
				 *
				 * <Edge Multi-cluster Support Needed>
				 */
				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
				}
			}
			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}

	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
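/*
 * Numeric example for the sched_edge_migrate_ipi_immediate check above (editor's
 * sketch; the timer value is hypothetical and platform dependent): if the preferred
 * pset's pset_avg_thread_execution_time for the thread's bucket is 50 usecs, the
 * comparison is 50 * NSEC_PER_USEC = 50,000 ns against ml_cpu_signal_deferred_get_timer().
 * If the deferred-IPI timer were, say, 40,000 ns, the preferred pset is expected to stay
 * busy past the deferred window, so an immediate (or idle) IPI is sent instead of a
 * deferred one.
 */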
/*
 * sched_edge_qos_max_parallelism()
 */
uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = 0;
	uint32_t pcount = 0;

	for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
		processor_set_t pset = pset_array[cluster_id];
		if (pset_type_for_id(cluster_id) == CLUSTER_TYPE_P) {
			pcount += pset->cpu_set_count;
		} else {
			ecount += pset->cpu_set_count;
		}
	}

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, we want to limit
		 * the width to just the P-cores since we do not
		 * spill/rebalance for RT threads.
		 */
		return pcount;
	}

	/*
	 * The Edge scheduler supports per-QoS recommendations for thread groups.
	 * This enables lower QoS buckets (such as UT) to be scheduled on all
	 * CPUs on the system.
	 *
	 * The only restriction is for BG/Maintenance QoS classes for which the
	 * performance controller would never recommend execution on the P-cores.
	 * If that policy changes in the future, this value should be changed.
	 */
	switch (qos) {
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		return ecount;
	default:
		return ecount + pcount;
	}
}
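/*
 * Example for sched_edge_qos_max_parallelism() above (editor's sketch with a
 * hypothetical 4 E-core + 4 P-core configuration, so ecount == 4 and pcount == 4):
 * realtime callers (QOS_PARALLELISM_REALTIME) get 4 (P-cores only), BG/Maintenance QoS
 * gets 4 (E-cores only), and every other QoS gets 8 (all CPUs on the system).
 */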
#endif /* CONFIG_SCHED_EDGE */

#endif /* CONFIG_SCHED_CLUTCH */