/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */

#if CONFIG_SCHED_CLUTCH
/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE          = 0,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY  = 1,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL           = 2,
});

static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */
/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
	SCHED_CLUTCH_BUCKET_OPTIONS_NONE       = 0x0,
	/* Round robin clutch bucket on thread removal */
	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
	/* Insert clutch bucket at head (for thread preemption) */
	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ      = 0x2,
	/* Insert clutch bucket at tail (default) */
	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ      = 0x4,
});

/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
static boolean_t sched_thread_sched_pri_promoted(thread_t);

#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];
/*
 * Special markers for buckets that have invalid WCELs/quantums etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadlines for buckets are defined
 * by the worst-case-execution-latency and the make runnable timestamp
 * for the bucket.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
	0,                                              /* FG */
	37500,                                          /* IN (37.5ms) */
	75000,                                          /* DF (75ms) */
	150000,                                         /* UT (150ms) */
	250000                                          /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
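
/*
 * Rough illustration of the EDF selection driven by the WCELs above (timestamps
 * are only examples): if the DF root bucket became runnable at time T and the UT
 * root bucket at time (T - 100ms), their deadlines are (T + 75ms) and
 * (T - 100ms + 150ms) = (T + 50ms) respectively, so the UT bucket is picked first
 * despite being a lower QoS tier. The WCELs therefore bound how long a runnable
 * bucket can be bypassed by higher tiers.
 */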
/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
	8000,                                           /* FG (8ms) */
	4000,                                           /* IN (4ms) */
	2000,                                           /* DF (2ms) */
	1000,                                           /* UT (1ms) */
	0                                               /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
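
/*
 * Rough illustration of warp (values are only examples): even if the FG root
 * bucket currently has a later deadline than the UT root bucket, FG may still be
 * chosen ahead of UT as long as it has warp remaining. Once FG has run with a
 * warp window open past that window's expiry, its warp is exhausted and selection
 * falls back to pure EDF until the warp is replenished the next time FG is picked
 * in the natural (EDF ordered) way.
 */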
/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * the lower priority levels can preempt the higher priority threads.
 */
#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000,                                          /* FIXPRI (10ms) */
	10000,                                          /* FG (10ms) */
	10000,                                          /* IN (10ms) */
	10000,                                          /* DF (10ms) */
	10000,                                          /* UT (10ms) */
	10000                                           /* BG (10ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000,                                          /* FIXPRI (10ms) */
	10000,                                          /* FG (10ms) */
	8000,                                           /* IN (8ms) */
	6000,                                           /* DF (6ms) */
	4000,                                           /* UT (4ms) */
	2000                                            /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
		} else {
			clock_interval_to_absolutetime_interval(us_vals[i],
			    NSEC_PER_USEC, &abstime_vals[i]);
		}
	}
}
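
/*
 * The timeshare tables above are expected to be converted once during scheduler
 * initialization, roughly along these lines (illustrative sketch only):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */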
/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))

#if DEVELOPMENT || DEBUG

/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
	sched_clutch_root_t root_clutch)
{
	pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
	__unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
	uint16_t *thr_count)
{
	if (__improbable(os_inc_overflow(thr_count))) {
		panic("sched_clutch thread count overflowed!");
	}
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
	uint16_t *thr_count)
{
	if (__improbable(os_dec_overflow(thr_count))) {
		panic("sched_clutch thread count underflowed!");
	}
}
/*
 * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to keep the load as a global counter
 * as well.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
	sched_clutch_root_t root_clutch,
	processor_set_t pset)
{
	root_clutch->scr_thr_count = 0;
	root_clutch->scr_priority = NOPRI;
	root_clutch->scr_urgency = 0;
	root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
	root_clutch->scr_cluster_id = pset->pset_cluster_id;
#else /* CONFIG_SCHED_EDGE */
	root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
	queue_init(&root_clutch->scr_clutch_buckets);

	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
	priority_queue_init(&root_clutch->scr_foreign_buckets);
	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
	priority_queue_init(&root_clutch->scr_bound_root_buckets);

	/* Initialize the bitmap and priority queue of runnable root buckets */
	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

	/* Initialize all the root buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
	}
}
/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round robin'ed at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */
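
/*
 * For example, with two clutch buckets A and B runnable at the same priority:
 * when a thread is selected from A, A is rotated to the tail of its queue
 * (SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) so B is picked next; if a thread in
 * A is preempted, A is re-inserted at the head (SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ)
 * so it does not lose its turn.
 */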
/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	clutch_buckets_rq->scbrq_highq = NOPRI;
	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
		clutch_buckets_rq->scbrq_bitmap[i] = 0;
	}
	clutch_buckets_rq->scbrq_count = 0;
	for (int i = 0; i < NRQS; i++) {
		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
	}
}

/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns if a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	return clutch_buckets_rq->scbrq_count == 0;
}

/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	if (clutch_buckets_rq->scbrq_count > 0) {
		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
	} else {
		return NULL;
	}
}
/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_options_t options)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
		}
	} else {
		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
		} else {
			/*
			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
			 */
			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		}
	}
	clutch_buckets_rq->scbrq_count++;
}

/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
	assert(clutch_buckets_rq->scbrq_count > 0);
	clutch_buckets_rq->scbrq_count--;
	if (circle_queue_empty(queue)) {
		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
	}
}

static void
sched_clutch_bucket_runq_rotate(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
	circle_queue_rotate_head_forward(queue);
}
/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
	sched_clutch_root_bucket_t root_bucket,
	sched_bucket_t bucket,
	bool bound_root_bucket)
{
	root_bucket->scrb_bucket = bucket;
	if (bound_root_bucket) {
		/* For bound root buckets, initialize the bound thread runq. */
		root_bucket->scrb_bound = true;
		run_queue_init(&root_bucket->scrb_bound_thread_runq);
	} else {
		/*
		 * The unbounded root buckets contain a runq of runnable clutch buckets
		 * which then hold the runnable threads.
		 */
		root_bucket->scrb_bound = false;
		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
	}
	priority_queue_entry_init(&root_bucket->scrb_pqlink);
	root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
	root_bucket->scrb_warped_deadline = 0;
	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
	root_bucket->scrb_starvation_avoidance = false;
	root_bucket->scrb_starvation_ts = 0;
}
/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency
 * which is why they are handled specially.
 *
 * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */
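
/*
 * For example (priorities are illustrative): if the highest AboveUI clutch
 * bucket sits at priority 47 and the highest timeshare FG clutch bucket at
 * priority 52, the AboveUI bucket is not chosen outright and selection falls
 * through to the EDF logic; if the FG bucket were at priority 45 instead, the
 * AboveUI bucket would be scheduled immediately.
 */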
/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_unbound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
		sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
		if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
			/* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
			return true;
		}
		sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
		if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
			return true;
		}
	}
	return false;
}

/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_bound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
	if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
		return false;
	}
	return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
}
/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts; so it is
 * crucial to keep this O(1). The options parameter determines if
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting next thread for execution on current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	sched_clutch_highest_root_bucket_type_t type)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int highest_runnable_bucket = -1;
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	} else {
		int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
	}

	if (highest_runnable_bucket == -1) {
		return NULL;
	}

	/* Above UI root bucket selection (see comment above for more details on this special case handling) */
	bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		if (unbound_aboveui) {
			return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		}
		/* Fall through to selecting a timeshare root bucket */
	} else {
		bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
		sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];

		if (unbound_aboveui && bound_aboveui) {
			/*
			 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
			 * highest runnable priority in both the buckets.
			 */
			int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
			sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
			int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
			return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
		}
		if (unbound_aboveui) {
			return unbound_aboveui_root_bucket;
		}
		if (bound_aboveui) {
			return bound_aboveui_root_bucket;
		}
		/* Fall through to selecting a timeshare root bucket */
	}

	/*
	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
	 * earliest deadline model to schedule threads. The idea is that as the timeshare
	 * buckets use CPU, they will drop their interactivity score/sched priority and
	 * allow the low priority AboveUI buckets to be scheduled.
	 */

	/* Find the earliest deadline bucket */
	sched_clutch_root_bucket_t edf_bucket = NULL;
	sched_clutch_root_bucket_t warp_bucket = NULL;
	int warp_bucket_index = -1;

evaluate_root_buckets:
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
	} else {
		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		if (bound_bucket && unbound_bucket) {
			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
		} else {
			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
		}
	}

	/*
	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
	 * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
	 * makes sense between root buckets of the same type since it's effectively a scheduling advantage over a lower
	 * priority root bucket.
	 */
	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
	warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

	if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
		/* No higher buckets have warp left; best choice is the EDF based bucket */
		if (edf_bucket->scrb_starvation_avoidance) {
			/*
			 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
			 * starvation avoidance window is still open and return this bucket if it is.
			 *
			 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
			 * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
			 * avoidance across all CPUs.
			 */
			uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / root_clutch->scr_pset->online_processor_count;
			if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
				return edf_bucket;
			} else {
				/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
				edf_bucket->scrb_starvation_avoidance = false;
				edf_bucket->scrb_starvation_ts = 0;
				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			}
			goto evaluate_root_buckets;
		}

		/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
		if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
			/* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
			edf_bucket->scrb_starvation_avoidance = true;
			edf_bucket->scrb_starvation_ts = timestamp;
		} else {
			/* EDF bucket is being selected in the natural order; update deadline and reset warp */
			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
			edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
			if (edf_bucket->scrb_bound) {
				bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
			} else {
				bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
			}
		}
		return edf_bucket;
	}

	/*
	 * Looks like there is a root bucket which is higher in the natural priority
	 * order than edf_bucket and might have some warp remaining.
	 */
	warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
		warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}
	if (warp_bucket->scrb_warped_deadline > timestamp) {
		/* Root bucket already has a warp window open with some warp remaining */
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}

	/* For this bucket, warp window was opened sometime in the past but has now
	 * expired. Mark the bucket as not available for warp anymore and re-run the
	 * warp bucket selection logic.
	 */
	warp_bucket->scrb_warp_remaining = 0;
	if (warp_bucket->scrb_bound) {
		bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
	} else {
		bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
	}
	goto evaluate_root_buckets;
}
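
/*
 * Rough illustration of the starvation avoidance window used above: with a
 * 10ms thread quantum at the EDF bucket and a 4-CPU cluster, the window is
 * 10ms / 4 = 2.5ms, i.e. roughly one quantum worth of starvation avoidance
 * spread across the CPUs of the cluster before the deadline is refreshed and
 * EDF is re-evaluated.
 */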
/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
	sched_clutch_root_bucket_t root_bucket,
	uint64_t timestamp)
{
	/* For the fixpri AboveUI bucket, always return it as the earliest deadline */
	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
		return 0;
	}

	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}
/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority
 * queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
		return;
	}

	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	if (__improbable(old_deadline > new_deadline)) {
		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
	}
	if (old_deadline != new_deadline) {
		root_bucket->scrb_pqlink.deadline = new_deadline;
		struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
		priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
	}
}
/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	/* Mark the root bucket as runnable */
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	if (root_bucket->scrb_starvation_avoidance == false) {
		/*
		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
		 * that and fix it up.
		 */
		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	}
	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
	if (root_bucket->scrb_warp_remaining) {
		/* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */
		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
	}
}
/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_warped_deadline > timestamp) {
		/*
		 * For root buckets that were using the warp, check if the warp
		 * deadline is in the future. If yes, remove the wall time the
		 * warp was active and update the warp remaining. This allows
		 * the root bucket to use the remaining warp the next time it
		 * becomes runnable.
		 */
		root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
	} else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/*
		 * If the root bucket's warped deadline is in the past, it has used up
		 * all the warp it was assigned. Empty out its warp remaining.
		 */
		root_bucket->scrb_warp_remaining = 0;
	}
}
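
/*
 * For example (durations are illustrative): a root bucket with an 8ms warp
 * allowance that opened its warp window 3ms before going empty leaves with
 * scrb_warp_remaining = 5ms, which it can use the next time it becomes
 * runnable; if the window had already expired, it leaves with no warp at all.
 */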
static int
sched_clutch_global_bucket_load_get(
	sched_bucket_t bucket)
{
	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_root_pri_update()
 *
 * The root level priority is used for thread selection and preemption
 * logic.
 *
 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
 * used for priority calculation, the logic is slightly different from thread
 * selection, because thread selection considers deadlines, warps etc. to
 * decide the most optimal bucket at a given timestamp. Since the priority
 * value is used for preemption decisions only, it needs to be based on the
 * highest runnable thread available in the timeshare domain. This logic can
 * be made more sophisticated if there are cases of unnecessary preemption
 * being seen in workloads.
 */
static void
sched_clutch_root_pri_update(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int16_t root_bound_pri = NOPRI;
	int16_t root_unbound_pri = NOPRI;

	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_unbound;
	}
	sched_clutch_root_bucket_t root_bucket_bound = NULL;
	if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
		root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
	}
	root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;

root_pri_update_unbound:
	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_complete;
	}
	sched_clutch_root_bucket_t root_bucket_unbound = NULL;
	if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
	}
	/* For the selected root bucket, find the highest priority clutch bucket */
	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);

root_pri_update_complete:
	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
}
/*
 * sched_clutch_root_urgency_inc()
 *
 * Routine to increment the urgency at the root level based on the thread
 * priority that is being inserted into the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_inc(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency++;
	}
}

/*
 * sched_clutch_root_urgency_dec()
 *
 * Routine to decrement the urgency at the root level based on the thread
 * priority that is being removed from the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_dec(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency--;
	}
}
/*
 * Clutch bucket level scheduling
 *
 * The second level of scheduling is the clutch bucket level scheduling
 * which tries to schedule thread groups within root_buckets. Each
 * clutch represents a thread group and a clutch_bucket_group represents
 * threads at a particular sched_bucket within that thread group. The
 * clutch_bucket_group contains a clutch_bucket per cluster on the system
 * where it holds the runnable threads destined for execution on that
 * cluster.
 *
 * The goal of this level of scheduling is to allow interactive thread
 * groups low latency access to the CPU. It also provides slight
 * scheduling preference for App and unrestricted thread groups.
 *
 * The clutch bucket scheduling algorithm measures an interactivity
 * score for all clutch bucket groups. The interactivity score is based
 * on the ratio of the CPU used and the voluntary blocking of threads
 * within the clutch bucket group. The algorithm is very close to the ULE
 * scheduler on FreeBSD in terms of calculations. The interactivity
 * score provides an interactivity boost in the range of
 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
 * thread groups to win over CPU spinners.
 *
 * The interactivity score of the clutch bucket group is combined with the
 * highest base/promoted priority of threads in the clutch bucket to form
 * the overall priority of the clutch bucket.
 */
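
/*
 * Putting the pieces together (numbers are illustrative): a clutch bucket whose
 * highest thread has priority 31, belonging to a high tier thread group
 * (boost 4), with an interactivity score of 13 ends up at clutch bucket
 * priority 31 + 4 + 13 = 48 in its root bucket runq.
 */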
/* Priority boost range for interactivity */
#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT      (8)
uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;

/* Window to scale the cpu usage and blocked values (currently 500ms). It's the threshold of used+blocked */
uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS        (500000)

/* The ratio to scale the cpu/blocked time per window */
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                  (10)

/*
 * In order to allow App thread groups some preference over daemon thread
 * groups, the App clutch_buckets get a priority boost. The boost value should
 * be chosen such that badly behaved apps are still penalized over well
 * behaved interactive daemons.
 */
static uint8_t sched_clutch_bucket_group_pri_boost[SCHED_CLUTCH_TG_PRI_MAX] = {
	[SCHED_CLUTCH_TG_PRI_LOW]  = 0,
	[SCHED_CLUTCH_TG_PRI_MED]  = 2,
	[SCHED_CLUTCH_TG_PRI_HIGH] = 4,
};

/* Initial value for voluntary blocking time for the clutch_bucket */
#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID    (uint64_t)(~0)
/* Value indicating the clutch bucket is not pending execution */
#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID       ((uint64_t)(~0))
/*
 * Thread group CPU starvation avoidance
 *
 * In heavily CPU contended scenarios, it is possible that some thread groups
 * which have a low interactivity score do not get CPU time at all. In order to
 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
 * bucket group when it has been pending execution for a certain time as defined
 * by the sched_clutch_bucket_group_pending_delta_us values below.
 *
 * The values chosen here are very close to the WCEL values for each sched bucket.
 * These values are multiplied by the load average of the relevant root bucket to
 * provide an estimate of the actual clutch bucket load.
 */
static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32,   /* FIXPRI */
	10000,                          /* FG */
	37500,                          /* IN */
	75000,                          /* DF */
	150000,                         /* UT */
	250000,                         /* BG */
};
static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
/*
 * sched_clutch_bucket_init()
 *
 * Initializer for clutch buckets.
 */
static void
sched_clutch_bucket_init(
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));

	clutch_bucket->scb_bucket = bucket;
	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
	clutch_bucket->scb_priority = 0;
#if CONFIG_SCHED_EDGE
	clutch_bucket->scb_foreign = false;
	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
#endif /* CONFIG_SCHED_EDGE */
	clutch_bucket->scb_group = clutch_bucket_group;
	clutch_bucket->scb_root = NULL;
	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
	priority_queue_init(&clutch_bucket->scb_thread_runq);
	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
}
/*
 * sched_clutch_bucket_group_init()
 *
 * Initializer for clutch bucket groups.
 */
static void
sched_clutch_bucket_group_init(
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_clutch_t clutch,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
	clutch_bucket_group->scbg_bucket = bucket;
	clutch_bucket_group->scbg_clutch = clutch;
	for (int i = 0; i < MAX_PSETS; i++) {
		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
	}
	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
	/*
	 * All thread groups should be initialized to be interactive; this allows the newly launched
	 * thread groups to fairly compete with already running thread groups.
	 */
	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
#if !__LP64__
	lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
#endif /* !__LP64__ */
	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
}
/*
 * sched_clutch_init_with_thread_group()
 *
 * Initialize the sched_clutch when the thread group is being created
 */
void
sched_clutch_init_with_thread_group(
	sched_clutch_t clutch,
	struct thread_group *tg)
{
	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);

	/* Initialize all the clutch buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
	}

	/* Grouping specific fields */
	clutch->sc_tg = tg;
	os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
}
/*
 * sched_clutch_destroy()
 *
 * Destructor for clutch; called from thread group release code.
 */
void
sched_clutch_destroy(
	__unused sched_clutch_t clutch)
{
	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
}
#if CONFIG_SCHED_EDGE

/*
 * The current edge scheduler still relies on globals for E & P clusters. It uses these
 * globals for the following operations:
 *
 * - Sysctl support for configuring edges
 * - Edge scheduler initialization
 *
 * These should be removed for multi-cluster platforms once a clear policy for the above
 * operations is defined.
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t ecore_cluster_id = 0;
static uint32_t pcore_cluster_id = 1;
/*
 * Edge Scheduler Preferred Cluster Mechanism
 *
 * In order to have better control over various QoS buckets within a thread group, the Edge
 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
 * preferences are stored at the sched_clutch_bucket_group level since that represents all
 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
 */

static uint32_t
sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
{
	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
}

static uint32_t
sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
{
	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
}

uint32_t
sched_edge_thread_preferred_cluster(thread_t thread)
{
	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/* For threads bound to a specific cluster, return the bound cluster id */
		return sched_edge_thread_bound_cluster_id(thread);
	}

	sched_clutch_t clutch = sched_clutch_for_thread(thread);
	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
}
/*
 * Edge Scheduler Foreign Bucket Support
 *
 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
 * threads that are not native to the cluster. A clutch bucket is considered native if its
 * preferred cluster has the same type as the cluster it's enqueued in. The foreign clutch
 * bucket priority queue is used for rebalance operations to get threads back to their native
 * cluster quickly.
 *
 * It is possible to make this policy even more aggressive by considering all clusters that
 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
 * migrations which might have performance implications.
 */

static void
sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	if (clutch_bucket->scb_foreign) {
		clutch_bucket->scb_foreign = false;
		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
	}
}

static void
sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	if (!clutch_bucket->scb_foreign) {
		clutch_bucket->scb_foreign = true;
		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
	}
}
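
/*
 * For example, a clutch bucket whose preferred cluster is a P-cluster but which
 * is currently enqueued in an E-cluster hierarchy is marked foreign and becomes
 * a candidate for the rebalance path; once it is enqueued on a cluster of its
 * preferred type again it is marked native and leaves the foreign priority queue.
 */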
/*
 * Edge Scheduler Cumulative Load Average
 *
 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
 * making thread migration decisions. The per-bucket load is maintained as a
 * cumulative count since higher scheduling buckets impact load on lower buckets
 * for thread migration decisions.
 */

static void
sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
	switch (bucket) {
	case TH_BUCKET_FIXPRI:   os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
	default:
		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
	}
}

static void
sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
	switch (bucket) {
	case TH_BUCKET_FIXPRI:   os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
	case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
	default:
		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
	}
}

uint16_t
sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
{
	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
}
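
/*
 * For example, making a single DF thread runnable bumps the DF, UT and BG
 * counters (via the fallthrough above), so scr_cumulative_run_count[TH_BUCKET_SHARE_UT]
 * reflects every runnable thread at UT or any higher bucket. Migration logic can
 * therefore read a single counter to see the total load a bucket would contend with.
 */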
#endif /* CONFIG_SCHED_EDGE */
/*
 * sched_clutch_bucket_hierarchy_insert()
 *
 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
 */
static void
sched_clutch_bucket_hierarchy_insert(
	sched_clutch_root_t root_clutch,
	sched_clutch_bucket_t clutch_bucket,
	sched_bucket_t bucket,
	uint64_t timestamp,
	sched_clutch_bucket_options_t options)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	if (bucket > TH_BUCKET_FIXPRI) {
		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
	}
#if CONFIG_SCHED_EDGE
	/* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
	if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
		sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
	}
#endif /* CONFIG_SCHED_EDGE */
	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];

	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
	}

	/* Insert the clutch bucket into the root bucket run queue with order based on options */
	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
	os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_bucket_hierarchy_remove()
 *
 * Routine to remove an empty clutch bucket from the root hierarchy.
 */
static void
sched_clutch_bucket_hierarchy_remove(
	sched_clutch_root_t root_clutch,
	sched_clutch_bucket_t clutch_bucket,
	sched_bucket_t bucket,
	uint64_t timestamp,
	__unused sched_clutch_bucket_options_t options)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	if (bucket > TH_BUCKET_FIXPRI) {
		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
		remqueue(&clutch_bucket->scb_listlink);
	}
#if CONFIG_SCHED_EDGE
	sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
#endif /* CONFIG_SCHED_EDGE */

	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];

	/* Remove the clutch bucket from the root bucket priority queue */
	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);

	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
	}
	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
}
/*
 * sched_clutch_bucket_base_pri()
 *
 * Calculates the "base" priority of the clutch bucket. The base
 * priority of the clutch bucket is the sum of the max of highest
 * base_pri and highest sched_pri in the clutch bucket and any
 * grouping specific (App/Daemon...) boosts applicable to the
 * clutch_bucket.
 */
static uint8_t
sched_clutch_bucket_base_pri(
	sched_clutch_bucket_t clutch_bucket)
{
	uint8_t clutch_boost = 0;
	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);

	sched_clutch_t clutch = clutch_bucket->scb_group->scbg_clutch;

	/*
	 * Since the clutch bucket can contain threads that are members of the group due
	 * to the sched_pri being promoted or due to their base pri, the base priority of
	 * the entire clutch bucket should be based on the highest thread (promoted or base)
	 * in the clutch bucket.
	 */
	uint8_t max_pri = 0;
	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
	}

	sched_clutch_tg_priority_t tg_pri = os_atomic_load(&clutch->sc_tg_priority, relaxed);
	clutch_boost = sched_clutch_bucket_group_pri_boost[tg_pri];
	return max_pri + clutch_boost;
}
/*
 * sched_clutch_interactivity_from_cpu_data()
 *
 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
 */
static uint8_t
sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
{
	sched_clutch_bucket_cpu_data_t scb_cpu_data;
	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
	uint8_t interactive_score = 0;

	if ((cpu_blocked == 0) && (cpu_used == 0)) {
		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
	}
	/*
	 * For all timeshare buckets, calculate the interactivity score of the bucket
	 * and add it to the base priority
	 */
	if (cpu_blocked > cpu_used) {
		/* Interactive clutch_bucket case */
		interactive_score = sched_clutch_bucket_group_interactive_pri +
		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
	} else {
		/* Non-interactive clutch_bucket case */
		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
	}
	return interactive_score;
}
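
/*
 * Worked example (values are illustrative, in scaled cpu_data units, with the
 * default interactive pri of 8): with cpu_blocked = 300 and cpu_used = 100 the
 * bucket group is "interactive", so the score is 8 + (8 * (300 - 100)) / 300 =
 * 8 + 5 = 13; with the ratio reversed (cpu_blocked = 100, cpu_used = 300) the
 * score is (8 * 100) / 300 = 2.
 */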

/*
 * sched_clutch_bucket_pri_calculate()
 *
 * The priority calculation algorithm for the clutch_bucket is a slight
 * modification on the ULE interactivity score. It uses the base priority
 * of the clutch bucket and applies an interactivity score boost to the
 * highly responsive clutch buckets.
 */
static uint8_t
sched_clutch_bucket_pri_calculate(
    sched_clutch_bucket_t clutch_bucket,
    uint64_t timestamp)
{
    /* For empty clutch buckets, return priority 0 */
    if (clutch_bucket->scb_thr_count == 0) {
        return 0;
    }

    uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
    uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);

    assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
    uint8_t pri = base_pri + interactive_score;
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
        thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
    return pri;
}
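
/*
 * Worked example (hypothetical numbers, for illustration only): a clutch bucket
 * whose highest runnable thread sits at base priority 31 and whose thread group
 * currently earns an interactivity score of 13 is positioned in the hierarchy at
 * 31 + 13 = 44, so a responsive group outranks a CPU-bound group at the same QoS
 * whose score has decayed towards 0.
 */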

/*
 * sched_clutch_root_bucket_highest_clutch_bucket()
 *
 * Routine to find the highest priority clutch bucket
 * within the root bucket.
 */
static sched_clutch_bucket_t
sched_clutch_root_bucket_highest_clutch_bucket(
    sched_clutch_root_bucket_t root_bucket)
{
    if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
        return NULL;
    }
    return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
}

/*
 * sched_clutch_bucket_runnable()
 *
 * Perform all operations needed when a new clutch bucket becomes runnable.
 * It involves inserting the clutch_bucket into the hierarchy and updating the
 * root priority appropriately.
 */
static boolean_t
sched_clutch_bucket_runnable(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    /* Since the clutch bucket became newly runnable, update its pending timestamp */
    clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
    sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);

    /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
    sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
    int16_t root_old_pri = root_clutch->scr_priority;
    sched_clutch_root_pri_update(root_clutch);
    return root_clutch->scr_priority > root_old_pri;
}

/*
 * sched_clutch_bucket_update()
 *
 * Update the clutch_bucket's position in the hierarchy. This routine is
 * called when a new thread is inserted or removed from a runnable clutch
 * bucket. The options specify some properties about the clutch bucket
 * insertion order into the clutch bucket runq.
 */
static boolean_t
sched_clutch_bucket_update(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
    sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
    if (new_pri == clutch_bucket->scb_priority) {
        /*
         * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
         * to the end of the runq. Typically used when a thread is selected for execution
         * from a clutch bucket.
         */
        if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
            sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
        }
        return false;
    }
    sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
#if CONFIG_SCHED_EDGE
    if (clutch_bucket->scb_foreign) {
        priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
    }
#endif /* CONFIG_SCHED_EDGE */
    clutch_bucket->scb_priority = new_pri;
#if CONFIG_SCHED_EDGE
    if (clutch_bucket->scb_foreign) {
        priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
        priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);

    int16_t root_old_pri = root_clutch->scr_priority;
    sched_clutch_root_pri_update(root_clutch);
    return root_clutch->scr_priority > root_old_pri;
}

/*
 * sched_clutch_bucket_empty()
 *
 * Perform all the operations needed when a clutch_bucket is no longer runnable.
 * It involves removing the clutch bucket from the hierarchy and updating the root
 * priority appropriately.
 */
static void
sched_clutch_bucket_empty(
    sched_clutch_bucket_t clutch_bucket,
    sched_clutch_root_t root_clutch,
    uint64_t timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
    clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
    sched_clutch_root_pri_update(root_clutch);
}

/*
 * sched_clutch_cpu_usage_update()
 *
 * Routine to update CPU usage of the thread in the hierarchy.
 */
void
sched_clutch_cpu_usage_update(
    thread_t thread,
    uint64_t delta)
{
    if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
        return;
    }

    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
    sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
}

/*
 * sched_clutch_bucket_group_cpu_usage_update()
 *
 * Routine to update the CPU usage of the clutch_bucket.
 */
static void
sched_clutch_bucket_group_cpu_usage_update(
    sched_clutch_bucket_group_t clutch_bucket_group,
    uint64_t delta)
{
    if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
        /* Since Above UI bucket has maximum interactivity score always, nothing to do here */
        return;
    }
    delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
    os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
}

/*
 * sched_clutch_bucket_group_cpu_pending_adjust()
 *
 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
 * such that one "pending interval" provides one point improvement in interactivity score.
 */
static inline uint64_t
sched_clutch_bucket_group_cpu_pending_adjust(
    uint64_t cpu_used,
    uint64_t cpu_blocked,
    uint8_t pending_intervals)
{
    uint64_t cpu_used_adjusted = 0;
    if (cpu_blocked < cpu_used) {
        cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
        cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
    } else {
        uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
        cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
    }
    return cpu_used_adjusted;
}
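
/*
 * Worked example (hypothetical numbers, for illustration only): with
 * sched_clutch_bucket_group_interactive_pri = 8, cpu_used = 400, cpu_blocked = 800
 * and one pending interval, the cpu_blocked >= cpu_used branch computes
 * adjust_factor = (800 * 1) / 8 = 100 and returns 400 - 100 = 300. Re-running the
 * interactivity calculation with cpu_used = 300 then gives 8 + (8 * 500) / 800 = 13
 * instead of 8 + (8 * 400) / 800 = 12, i.e. one pending interval buys roughly one
 * point of interactivity score, matching the intent described above.
 */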

/*
 * sched_clutch_bucket_group_cpu_adjust()
 *
 * Routine to scale the cpu usage and blocked time once the sum gets bigger
 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
 * manageable and maintain the same ratio while allowing clutch buckets to
 * adjust behavior and reflect in the interactivity score in a reasonable
 * amount of time. Also adjusts the CPU usage based on pending_intervals
 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
 */
static void
sched_clutch_bucket_group_cpu_adjust(
    sched_clutch_bucket_group_t clutch_bucket_group,
    uint8_t pending_intervals)
{
    sched_clutch_bucket_cpu_data_t old_cpu_data = {};
    sched_clutch_bucket_cpu_data_t new_cpu_data = {};
    os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
        clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
        clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;

        if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
            /* No changes to the CPU used and blocked values */
            os_atomic_rmw_loop_give_up();
        }
        if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
            /* Only keep the recent CPU history to better indicate how this TG has been behaving */
            cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
            cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
        }
        /* Use the shift passed in to ageout the CPU usage */
        cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
        new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
        new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
    });
}
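
/*
 * Illustration (hypothetical numbers, not from the original source): if the
 * adjust threshold were 10000 with SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO = 10,
 * a group that has accumulated cpu_used = 8000 and cpu_blocked = 4000 (sum 12000,
 * above the threshold) would be rescaled to 800 and 400. The 2:1 used-to-blocked
 * ratio, and hence the interactivity score, is preserved, but newer samples start
 * to dominate the history much sooner.
 */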

/*
 * Thread level scheduling algorithm
 *
 * The thread level scheduling algorithm uses the mach timeshare
 * decay based algorithm to achieve sharing between threads within the
 * same clutch bucket. The load/priority shifts etc. are all maintained
 * at the clutch bucket level and used for decay calculation of the
 * threads. The load sampling is still driven off the scheduler tick
 * for runnable clutch buckets (it does not use the new higher frequency
 * EWMA based load calculation). The idea is that the contention and load
 * within clutch_buckets should be limited enough to not see heavy decay
 * and timeshare effectively.
 */

/*
 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
 *
 * Increment the run count for the clutch bucket associated with the
 * thread.
 */
uint32_t
sched_clutch_thread_run_bucket_incr(
    thread_t thread,
    sched_bucket_t bucket)
{
    if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
        return 0;
    }
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    return sched_clutch_run_bucket_incr(clutch, bucket);
}

static uint32_t
sched_clutch_run_bucket_incr(
    sched_clutch_t clutch,
    sched_bucket_t bucket)
{
    assert(bucket != TH_BUCKET_RUN);
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
    return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
}

/*
 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
 *
 * Decrement the run count for the clutch bucket associated with the
 * thread.
 */
uint32_t
sched_clutch_thread_run_bucket_decr(
    thread_t thread,
    sched_bucket_t bucket)
{
    if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
        return 0;
    }
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    return sched_clutch_run_bucket_decr(clutch, bucket);
}

static uint32_t
sched_clutch_run_bucket_decr(
    sched_clutch_t clutch,
    sched_bucket_t bucket)
{
    assert(bucket != TH_BUCKET_RUN);
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
    return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
}

/*
 * sched_clutch_bucket_group_timeshare_update()
 *
 * Routine to update the load and priority shift for the clutch_bucket_group
 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
 * clutch buckets with runnable threads in them. So it is important to maintain
 * the timesharing information at the clutch_bucket_group level instead of
 * individual clutch buckets (because the algorithm is trying to timeshare all
 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
 *
 * The routine is called from the sched tick handling code to make sure this value
 * is updated at least once every sched tick. For clutch bucket groups which have
 * not been runnable for very long, the clutch_bucket_group maintains a "last
 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
 * if this value is outdated, the load and shifts are updated.
 *
 * Possible optimization:
 * - The current algorithm samples the load every sched tick (125ms).
 *   This is prone to spikes in runnable counts; if that turns out to be
 *   a problem, a simple solution would be to do the EWMA trick to sample
 *   load at every load_tick (30ms) and use the averaged value for the pri
 *   shift calculation.
 */
static void
sched_clutch_bucket_group_timeshare_update(
    sched_clutch_bucket_group_t clutch_bucket_group,
    sched_clutch_bucket_t clutch_bucket,
    uint64_t ctime)
{
    if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
        /* No timesharing needed for fixed priority Above UI threads */
        return;
    }

    /*
     * Update the timeshare parameters for the clutch bucket group
     * if they haven't been updated in this tick.
     */
    uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
    uint32_t current_sched_ts = sched_tick;
    if (sched_ts < current_sched_ts) {
        os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
        /* NCPU wide workloads should not experience decay */
        uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
        uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
        bucket_group_load = MIN(bucket_group_load, NRQS - 1);
        uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
        /* Ensure that the pri_shift value is reasonable */
        pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
        os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
    }

    /*
     * Update the clutch bucket priority; this allows clutch buckets that have been pending
     * for a long time to get an updated interactivity score.
     */
    sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
}
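
/*
 * Illustration (hypothetical numbers, for a rough sense of the scaling): on a
 * system with processor_avail_count = 8, a clutch bucket group with 24 runnable
 * threads yields bucket_group_load = 23 / 8 = 2, while a group with 8 runnable
 * threads yields 7 / 8 = 0. A larger load index selects a larger entry in
 * sched_load_shifts, shrinking pri_shift and making each thread's accumulated CPU
 * usage weigh more heavily against its timeshare priority, which is how heavily
 * loaded groups end up timesharing more aggressively.
 */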

/*
 * sched_clutch_thread_clutch_update()
 *
 * Routine called when the thread changes its thread group. The current
 * implementation relies on the fact that the thread group is changed only
 * from the context of the thread itself. Due to this fact, the thread
 * group change causes only counter updates in the old & new clutch
 * buckets and no hierarchy changes. The routine also attributes the CPU
 * used so far to the old clutch.
 */
void
sched_clutch_thread_clutch_update(
    thread_t thread,
    sched_clutch_t old_clutch,
    sched_clutch_t new_clutch)
{
    uint32_t cpu_delta;

    if (old_clutch) {
        assert(current_thread() == thread);

        sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
        /*
         * Calculate the CPU used by this thread in the old bucket and
         * add it to the old clutch bucket. This uses the same CPU usage
         * logic as update_priority etc.
         */
        thread_timer_delta(thread, cpu_delta);
        if (thread->pri_shift < INT8_MAX) {
            thread->sched_usage += cpu_delta;
        }
        thread->cpu_delta += cpu_delta;
        if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
            sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
            sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
        }
    }

    if (new_clutch) {
        sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
    }
}

/* Thread Insertion/Removal/Selection routines */

#if CONFIG_SCHED_EDGE

/*
 * Edge Scheduler Bound Thread Support
 *
 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
 * maintains a separate runq on the clutch root to hold these bound threads. These
 * bound threads count towards the root priority and thread count, but are ignored
 * for thread migration/steal decisions. Bound threads that are enqueued in the
 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
 * removal.
 *
 * Bound Threads Timesharing
 * The bound threads share the timesharing properties of the clutch bucket group they are
 * part of. They contribute to the load and use priority shifts/decay values from the
 * clutch bucket group.
 */

static boolean_t
sched_edge_bound_thread_insert(
    sched_clutch_root_t root_clutch,
    thread_t thread,
    integer_t options)
{
    /* Update the clutch runnable count and priority */
    sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
    sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
    if (root_bucket->scrb_bound_thread_runq.count == 0) {
        sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
    }

    assert((thread->th_bound_cluster_enqueued) == false);
    run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
    thread->th_bound_cluster_enqueued = true;

    int16_t root_old_pri = root_clutch->scr_priority;
    sched_clutch_root_pri_update(root_clutch);
    return root_clutch->scr_priority > root_old_pri;
}

static void
sched_edge_bound_thread_remove(
    sched_clutch_root_t root_clutch,
    thread_t thread)
{
    sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
    assert((thread->th_bound_cluster_enqueued) == true);
    run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
    thread->th_bound_cluster_enqueued = false;

    /* Update the clutch runnable count and priority */
    sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
    if (root_bucket->scrb_bound_thread_runq.count == 0) {
        sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
    }
    sched_clutch_root_pri_update(root_clutch);
}

#endif /* CONFIG_SCHED_EDGE */

/*
 * sched_clutch_thread_bound_lookup()
 *
 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
 */
static thread_t
sched_clutch_thread_bound_lookup(
    __unused sched_clutch_root_t root_clutch,
    sched_clutch_root_bucket_t root_bucket)
{
    return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
}

/*
 * Clutch Bucket Group Thread Counts and Pending time calculation
 *
 * The pending time on the clutch_bucket_group allows the scheduler to track if it
 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
 * pending time is advanced to the time of thread selection.
 *
 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
 * simultaneously, it is important that the updates to thread counts and pending timestamps
 * happen atomically. The implementation relies on the following aspects to make that work:
 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
 *   is held when threads are added/removed and pending timestamps are updated
 * - The edge scheduler would have support for double wide 128 bit atomics which allow the
 *   thread count and pending timestamp to be updated atomically.
 *
 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
 * above to atomically update the interactivity score for a clutch bucket group.
 */
1915 #if CONFIG_SCHED_EDGE
1918 sched_clutch_bucket_group_thr_count_inc(
1919 sched_clutch_bucket_group_t clutch_bucket_group
,
1922 sched_clutch_counter_time_t old_pending_data
;
1923 sched_clutch_counter_time_t new_pending_data
;
1924 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, {
1925 new_pending_data
.scct_count
= old_pending_data
.scct_count
+ 1;
1926 new_pending_data
.scct_timestamp
= old_pending_data
.scct_timestamp
;
1927 if (old_pending_data
.scct_count
== 0) {
1928 new_pending_data
.scct_timestamp
= timestamp
;
1934 sched_clutch_bucket_group_thr_count_dec(
1935 sched_clutch_bucket_group_t clutch_bucket_group
,
1938 sched_clutch_counter_time_t old_pending_data
;
1939 sched_clutch_counter_time_t new_pending_data
;
1940 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, {
1941 new_pending_data
.scct_count
= old_pending_data
.scct_count
- 1;
1942 if (new_pending_data
.scct_count
== 0) {
1943 new_pending_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
;
1945 new_pending_data
.scct_timestamp
= timestamp
;
1951 sched_clutch_bucket_group_pending_ageout(
1952 sched_clutch_bucket_group_t clutch_bucket_group
,
1955 int bucket_load
= sched_clutch_global_bucket_load_get(clutch_bucket_group
->scbg_bucket
);
1956 sched_clutch_counter_time_t old_pending_data
;
1957 sched_clutch_counter_time_t new_pending_data
;
1958 uint8_t cpu_usage_shift
= 0;
1960 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_pending_data
.scct_packed
, old_pending_data
.scct_packed
, new_pending_data
.scct_packed
, relaxed
, {
1961 cpu_usage_shift
= 0;
1962 uint64_t old_pending_ts
= old_pending_data
.scct_timestamp
;
1963 bool old_update
= (old_pending_ts
>= timestamp
);
1964 bool no_pending_time
= (old_pending_ts
== SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
);
1965 bool no_bucket_load
= (bucket_load
== 0);
1966 if (old_update
|| no_pending_time
|| no_bucket_load
) {
1967 os_atomic_rmw_loop_give_up();
1970 /* Calculate the time the clutch bucket group has been pending */
1971 uint64_t pending_delta
= timestamp
- old_pending_ts
;
1972 uint64_t interactivity_delta
= sched_clutch_bucket_group_pending_delta
[clutch_bucket_group
->scbg_bucket
] * bucket_load
;
1973 if (pending_delta
< interactivity_delta
) {
1974 os_atomic_rmw_loop_give_up();
1976 cpu_usage_shift
= (pending_delta
/ interactivity_delta
);
1977 new_pending_data
.scct_timestamp
= old_pending_ts
+ (cpu_usage_shift
* interactivity_delta
);
1978 new_pending_data
.scct_count
= old_pending_data
.scct_count
;
1980 return cpu_usage_shift
;
1984 sched_clutch_bucket_group_interactivity_score_calculate(
1985 sched_clutch_bucket_group_t clutch_bucket_group
,
1988 if (clutch_bucket_group
->scbg_bucket
== TH_BUCKET_FIXPRI
) {
1990 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
1991 * priorities, make sure all AboveUI buckets are marked interactive.
1993 assert(clutch_bucket_group
->scbg_interactivity_data
.scct_count
== (2 * sched_clutch_bucket_group_interactive_pri
));
1994 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
;
1996 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
1997 uint8_t pending_intervals
= sched_clutch_bucket_group_pending_ageout(clutch_bucket_group
, timestamp
);
1998 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
1999 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group
, pending_intervals
);
2000 uint8_t interactivity_score
= sched_clutch_interactivity_from_cpu_data(clutch_bucket_group
);
2001 sched_clutch_counter_time_t old_interactivity_data
;
2002 sched_clutch_counter_time_t new_interactivity_data
;
2004 bool score_updated
= os_atomic_rmw_loop(&clutch_bucket_group
->scbg_interactivity_data
.scct_packed
, old_interactivity_data
.scct_packed
, new_interactivity_data
.scct_packed
, relaxed
, {
2005 if (old_interactivity_data
.scct_timestamp
>= timestamp
) {
2006 os_atomic_rmw_loop_give_up();
2008 new_interactivity_data
.scct_timestamp
= timestamp
;
2009 if (old_interactivity_data
.scct_timestamp
!= 0) {
2010 new_interactivity_data
.scct_count
= interactivity_score
;
2013 if (score_updated
) {
2014 return (uint8_t)new_interactivity_data
.scct_count
;
2016 return (uint8_t)old_interactivity_data
.scct_count
;
2020 #else /* CONFIG_SCHED_EDGE */
2023 * For the clutch scheduler, atomicity is ensured by making sure all operations
2024 * are happening under the pset lock of the only cluster present on the platform.
2027 sched_clutch_bucket_group_thr_count_inc(
2028 sched_clutch_bucket_group_t clutch_bucket_group
,
2031 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2032 if (clutch_bucket_group
->scbg_pending_data
.scct_count
== 0) {
2033 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= timestamp
;
2035 clutch_bucket_group
->scbg_pending_data
.scct_count
++;
2039 sched_clutch_bucket_group_thr_count_dec(
2040 sched_clutch_bucket_group_t clutch_bucket_group
,
2043 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2044 clutch_bucket_group
->scbg_pending_data
.scct_count
--;
2045 if (clutch_bucket_group
->scbg_pending_data
.scct_count
== 0) {
2046 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
;
2048 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= timestamp
;
2053 sched_clutch_bucket_group_pending_ageout(
2054 sched_clutch_bucket_group_t clutch_bucket_group
,
2057 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2058 int bucket_load
= sched_clutch_global_bucket_load_get(clutch_bucket_group
->scbg_bucket
);
2059 uint64_t old_pending_ts
= clutch_bucket_group
->scbg_pending_data
.scct_timestamp
;
2060 bool old_update
= (old_pending_ts
>= timestamp
);
2061 bool no_pending_time
= (old_pending_ts
== SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID
);
2062 bool no_bucket_load
= (bucket_load
== 0);
2063 if (old_update
|| no_pending_time
|| no_bucket_load
) {
2066 uint64_t pending_delta
= timestamp
- old_pending_ts
;
2067 uint64_t interactivity_delta
= sched_clutch_bucket_group_pending_delta
[clutch_bucket_group
->scbg_bucket
] * bucket_load
;
2068 if (pending_delta
< interactivity_delta
) {
2071 uint8_t cpu_usage_shift
= (pending_delta
/ interactivity_delta
);
2072 clutch_bucket_group
->scbg_pending_data
.scct_timestamp
= old_pending_ts
+ (cpu_usage_shift
* interactivity_delta
);
2073 return cpu_usage_shift
;
2077 sched_clutch_bucket_group_interactivity_score_calculate(
2078 sched_clutch_bucket_group_t clutch_bucket_group
,
2081 sched_clutch_hierarchy_locked_assert(&pset0
.pset_clutch_root
);
2082 if (clutch_bucket_group
->scbg_bucket
== TH_BUCKET_FIXPRI
) {
2084 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2085 * priorities, make sure all AboveUI buckets are marked interactive.
2087 assert(clutch_bucket_group
->scbg_interactivity_data
.scct_count
== (2 * sched_clutch_bucket_group_interactive_pri
));
2088 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
;
2090 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2091 uint8_t pending_intervals
= sched_clutch_bucket_group_pending_ageout(clutch_bucket_group
, timestamp
);
2092 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2093 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group
, pending_intervals
);
2094 uint8_t interactivity_score
= sched_clutch_interactivity_from_cpu_data(clutch_bucket_group
);
2095 if (timestamp
> clutch_bucket_group
->scbg_interactivity_data
.scct_timestamp
) {
2096 clutch_bucket_group
->scbg_interactivity_data
.scct_count
= interactivity_score
;
2097 clutch_bucket_group
->scbg_interactivity_data
.scct_timestamp
= timestamp
;
2098 return interactivity_score
;
2100 return (uint8_t)clutch_bucket_group
->scbg_interactivity_data
.scct_count
;
2104 #endif /* CONFIG_SCHED_EDGE */

/*
 * Clutch Bucket Group Run Count and Blocked Time Accounting
 *
 * The clutch bucket group maintains the number of runnable/running threads in the group.
 * Since the blocked time of the clutch bucket group is based on this count, it is
 * important to make sure the blocking timestamp and the run count are updated atomically.
 *
 * Since the run count increments happen without any pset locks held, the scheduler makes
 * these updates atomic in the following way:
 * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp
 * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update
 */
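
/*
 * Illustration (hypothetical timeline, not from the original source): if the last
 * runnable thread of a group blocks at time T0, the run count drops to 0 and the
 * blocked-data timestamp is set to T0. When a thread of the group becomes runnable
 * again at T1, the increment path sees count 0, adds (T1 - T0), clamped to
 * sched_clutch_bucket_group_adjust_threshold, to scbcd_cpu_blocked, and invalidates
 * the timestamp. A group that spends most of its time blocked therefore accumulates
 * blocked time quickly and scores as interactive.
 */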
2122 sched_clutch_bucket_group_run_count_inc(
2123 sched_clutch_bucket_group_t clutch_bucket_group
)
2125 uint64_t blocked_time
= 0;
2126 uint64_t updated_run_count
= 0;
2128 lck_spin_lock(&clutch_bucket_group
->scbg_stats_lock
);
2129 if ((clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
!= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
) &&
2130 (clutch_bucket_group
->scbg_blocked_data
.scct_count
== 0)) {
2131 /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */
2132 blocked_time
= mach_absolute_time() - clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
;
2133 clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
;
2135 clutch_bucket_group
->scbg_blocked_data
.scct_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
+ 1;
2136 updated_run_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
;
2137 lck_spin_unlock(&clutch_bucket_group
->scbg_stats_lock
);
2139 blocked_time
= MIN(blocked_time
, sched_clutch_bucket_group_adjust_threshold
);
2140 os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_blocked
), (clutch_cpu_data_t
)blocked_time
, relaxed
);
2141 return (uint32_t)updated_run_count
;
2145 sched_clutch_bucket_group_run_count_dec(
2146 sched_clutch_bucket_group_t clutch_bucket_group
)
2148 uint64_t updated_run_count
= 0;
2150 lck_spin_lock(&clutch_bucket_group
->scbg_stats_lock
);
2151 clutch_bucket_group
->scbg_blocked_data
.scct_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
- 1;
2152 if (clutch_bucket_group
->scbg_blocked_data
.scct_count
== 0) {
2153 /* Run count is transitioning from 1 to 0; start the blocked timer */
2154 clutch_bucket_group
->scbg_blocked_data
.scct_timestamp
= mach_absolute_time();
2156 updated_run_count
= clutch_bucket_group
->scbg_blocked_data
.scct_count
;
2157 lck_spin_unlock(&clutch_bucket_group
->scbg_stats_lock
);
2158 return (uint32_t)updated_run_count
;
2161 #else /* !__LP64__ */
2164 sched_clutch_bucket_group_run_count_inc(
2165 sched_clutch_bucket_group_t clutch_bucket_group
)
2167 sched_clutch_counter_time_t old_blocked_data
;
2168 sched_clutch_counter_time_t new_blocked_data
;
2170 bool update_blocked_time
= false;
2171 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_blocked_data
.scct_packed
, old_blocked_data
.scct_packed
, new_blocked_data
.scct_packed
, relaxed
, {
2172 new_blocked_data
.scct_count
= old_blocked_data
.scct_count
+ 1;
2173 new_blocked_data
.scct_timestamp
= old_blocked_data
.scct_timestamp
;
2174 update_blocked_time
= false;
2175 if (old_blocked_data
.scct_count
== 0) {
2176 new_blocked_data
.scct_timestamp
= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
;
2177 update_blocked_time
= true;
2180 if (update_blocked_time
&& (old_blocked_data
.scct_timestamp
!= SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID
)) {
2181 uint64_t ctime
= mach_absolute_time();
2182 if (ctime
> old_blocked_data
.scct_timestamp
) {
2183 uint64_t blocked_time
= ctime
- old_blocked_data
.scct_timestamp
;
2184 blocked_time
= MIN(blocked_time
, sched_clutch_bucket_group_adjust_threshold
);
2185 os_atomic_add(&(clutch_bucket_group
->scbg_cpu_data
.cpu_data
.scbcd_cpu_blocked
), (clutch_cpu_data_t
)blocked_time
, relaxed
);
2188 return (uint32_t)new_blocked_data
.scct_count
;
2192 sched_clutch_bucket_group_run_count_dec(
2193 sched_clutch_bucket_group_t clutch_bucket_group
)
2195 sched_clutch_counter_time_t old_blocked_data
;
2196 sched_clutch_counter_time_t new_blocked_data
;
2198 uint64_t ctime
= mach_absolute_time();
2199 os_atomic_rmw_loop(&clutch_bucket_group
->scbg_blocked_data
.scct_packed
, old_blocked_data
.scct_packed
, new_blocked_data
.scct_packed
, relaxed
, {
2200 new_blocked_data
.scct_count
= old_blocked_data
.scct_count
- 1;
2201 new_blocked_data
.scct_timestamp
= old_blocked_data
.scct_timestamp
;
2202 if (new_blocked_data
.scct_count
== 0) {
2203 new_blocked_data
.scct_timestamp
= ctime
;
2206 return (uint32_t)new_blocked_data
.scct_count
;
2209 #endif /* !__LP64__ */

/*
 * sched_clutch_thread_insert()
 *
 * Routine to insert a thread into the sched clutch hierarchy.
 * Update the counts at all levels of the hierarchy and insert the nodes
 * as they become runnable. Always called with the pset lock held.
 */
static boolean_t
sched_clutch_thread_insert(
    sched_clutch_root_t root_clutch,
    thread_t thread,
    integer_t options)
{
    boolean_t result = FALSE;

    sched_clutch_hierarchy_locked_assert(root_clutch);
#if CONFIG_SCHED_EDGE
    sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
    /*
     * Check if the thread is bound and is being enqueued in its desired bound cluster.
     * One scenario where a bound thread might not be getting enqueued in the bound cluster
     * hierarchy would be if the thread is "soft" bound and the bound cluster is
     * de-recommended. In that case, the thread should be treated as an unbound
     * thread.
     */
    if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
        return sched_edge_bound_thread_insert(root_clutch, thread, options);
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    assert(thread->thread_group == clutch->sc_tg);

    uint64_t current_timestamp = mach_absolute_time();
    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
    sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
    assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));

    /*
     * Thread linkage in clutch_bucket
     *
     * A thread has a few linkages within the clutch bucket:
     * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
     * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
     * - A queue linkage used for timesharing operations of threads at the scheduler tick
     */

    /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
    thread->th_clutch_runq_link.stamp = current_timestamp;
    priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
        (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
    priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);

    /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
    priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
        sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
    priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);

    /* Insert thread into timesharing queue of the clutch bucket */
    enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);

    /* Increment the urgency counter for the root if necessary */
    sched_clutch_root_urgency_inc(root_clutch, thread);

    os_atomic_inc(&clutch->sc_thr_count, relaxed);
    sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);

    /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
    sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
    if (clutch_bucket->scb_thr_count == 0) {
        sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
        sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
        result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
    } else {
        sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
        sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
        result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
        root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
        SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
    return result;
}

/*
 * sched_clutch_thread_remove()
 *
 * Routine to remove a thread from the sched clutch hierarchy.
 * Update the counts at all levels of the hierarchy and remove the nodes
 * as they become empty. Always called with the pset lock held.
 */
static void
sched_clutch_thread_remove(
    sched_clutch_root_t root_clutch,
    thread_t thread,
    uint64_t current_timestamp,
    sched_clutch_bucket_options_t options)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
#if CONFIG_SCHED_EDGE
    sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
    if (thread->th_bound_cluster_enqueued) {
        sched_edge_bound_thread_remove(root_clutch, thread);
        return;
    }
#endif /* CONFIG_SCHED_EDGE */
    sched_clutch_t clutch = sched_clutch_for_thread(thread);
    assert(thread->thread_group == clutch->sc_tg);
    assert(thread->runq != PROCESSOR_NULL);

    sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
    sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
    assert(clutch_bucket->scb_root == root_clutch);

    /* Decrement the urgency counter for the root if necessary */
    sched_clutch_root_urgency_dec(root_clutch, thread);
    /* Remove thread from the clutch_bucket */
    priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
    remqueue(&thread->th_clutch_timeshare_link);
    thread->runq = PROCESSOR_NULL;

    priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);

    /* Update counts at various levels of the hierarchy */
    os_atomic_dec(&clutch->sc_thr_count, relaxed);
    sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
    sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
    sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);

    /* Remove the clutch from hierarchy (if needed) and update properties */
    if (clutch_bucket->scb_thr_count == 0) {
        sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
    } else {
        sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
        root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
        SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
}

/*
 * sched_clutch_thread_unbound_lookup()
 *
 * Routine to find the highest unbound thread in the root clutch.
 * Helps find threads easily for steal/migrate scenarios in the
 * scheduler.
 */
static thread_t
sched_clutch_thread_unbound_lookup(
    sched_clutch_root_t root_clutch,
    sched_clutch_root_bucket_t root_bucket)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);

    /* Find the highest priority clutch bucket in this root bucket */
    sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
    assert(clutch_bucket != NULL);

    /* Find the highest priority runnable thread in this clutch bucket */
    thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
        thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
    return thread;
}

/*
 * sched_clutch_thread_highest_remove()
 *
 * Routine to find and remove the highest priority thread
 * from the sched clutch hierarchy. The algorithm looks at the
 * hierarchy for the most eligible runnable thread and calls
 * sched_clutch_thread_remove(). Always called with the
 * pset lock held.
 */
static thread_t
sched_clutch_thread_highest_remove(
    sched_clutch_root_t root_clutch)
{
    sched_clutch_hierarchy_locked_assert(root_clutch);
    uint64_t current_timestamp = mach_absolute_time();

    sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
    if (root_bucket == NULL) {
        return THREAD_NULL;
    }

    thread_t highest_thread = THREAD_NULL;
    if (root_bucket->scrb_bound) {
        highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
    } else {
        highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
    }
    sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
    return highest_thread;
}
2407 /* High level global accessor routines */
2410 * sched_clutch_root_urgency()
2412 * Routine to get the urgency of the highest runnable
2413 * thread in the hierarchy.
2416 sched_clutch_root_urgency(
2417 sched_clutch_root_t root_clutch
)
2419 return root_clutch
->scr_urgency
;
2423 * sched_clutch_root_count_sum()
2425 * The count_sum mechanism is used for scheduler runq
2426 * statistics calculation. Its only useful for debugging
2427 * purposes; since it takes a mach_absolute_time() on
2428 * other scheduler implementations, its better to avoid
2429 * populating this until absolutely necessary.
2432 sched_clutch_root_count_sum(
2433 __unused sched_clutch_root_t root_clutch
)
2439 * sched_clutch_root_priority()
2441 * Routine to get the priority of the highest runnable
2442 * thread in the hierarchy.
2445 sched_clutch_root_priority(
2446 sched_clutch_root_t root_clutch
)
2448 return root_clutch
->scr_priority
;
2452 * sched_clutch_root_count()
2454 * Returns total number of runnable threads in the hierarchy.
2457 sched_clutch_root_count(
2458 sched_clutch_root_t root_clutch
)
2460 return root_clutch
->scr_thr_count
;

#if CONFIG_SCHED_EDGE

/*
 * sched_clutch_root_foreign_empty()
 *
 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
 */
static boolean_t
sched_clutch_root_foreign_empty(
    sched_clutch_root_t root_clutch)
{
    return priority_queue_empty(&root_clutch->scr_foreign_buckets);
}

/*
 * sched_clutch_root_highest_foreign_thread_remove()
 *
 * Routine to return the thread in the highest priority clutch bucket in a cluster.
 * Must be called with the pset for the cluster locked.
 */
static thread_t
sched_clutch_root_highest_foreign_thread_remove(
    sched_clutch_root_t root_clutch)
{
    thread_t thread = THREAD_NULL;
    if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
        return thread;
    }
    sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
    thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
    sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
    return thread;
}

#endif /* CONFIG_SCHED_EDGE */
2500 * sched_clutch_thread_pri_shift()
2502 * Routine to get the priority shift value for a thread.
2503 * Since the timesharing is done at the clutch_bucket level,
2504 * this routine gets the clutch_bucket and retrieves the
2505 * values from there.
2508 sched_clutch_thread_pri_shift(
2510 sched_bucket_t bucket
)
2512 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) {
2515 assert(bucket
!= TH_BUCKET_RUN
);
2516 sched_clutch_t clutch
= sched_clutch_for_thread(thread
);
2517 sched_clutch_bucket_group_t clutch_bucket_group
= &(clutch
->sc_clutch_groups
[bucket
]);
2518 return os_atomic_load(&clutch_bucket_group
->scbg_pri_shift
, relaxed
);
2521 #pragma mark -- Clutch Scheduler Algorithm
2524 sched_clutch_init(void);
2527 sched_clutch_steal_thread(processor_set_t pset
);
2530 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context
);
2533 sched_clutch_processor_enqueue(processor_t processor
, thread_t thread
,
2534 sched_options_t options
);
2537 sched_clutch_processor_queue_remove(processor_t processor
, thread_t thread
);
2540 sched_clutch_processor_csw_check(processor_t processor
);
2543 sched_clutch_processor_queue_has_priority(processor_t processor
, int priority
, boolean_t gte
);
2546 sched_clutch_runq_count(processor_t processor
);
2549 sched_clutch_processor_queue_empty(processor_t processor
);
2552 sched_clutch_runq_stats_count_sum(processor_t processor
);
2555 sched_clutch_processor_bound_count(processor_t processor
);
2558 sched_clutch_pset_init(processor_set_t pset
);
2561 sched_clutch_processor_init(processor_t processor
);
2564 sched_clutch_choose_thread(processor_t processor
, int priority
, ast_t reason
);
2567 sched_clutch_processor_queue_shutdown(processor_t processor
);
2570 sched_clutch_initial_thread_sched_mode(task_t parent_task
);
2573 sched_clutch_initial_quantum_size(thread_t thread
);
2576 sched_clutch_thread_avoid_processor(processor_t processor
, thread_t thread
);
2579 sched_clutch_run_incr(thread_t thread
);
2582 sched_clutch_run_decr(thread_t thread
);
2585 sched_clutch_update_thread_bucket(thread_t thread
);
2587 const struct sched_dispatch_table sched_clutch_dispatch
= {
2588 .sched_name
= "clutch",
2589 .init
= sched_clutch_init
,
2590 .timebase_init
= sched_timeshare_timebase_init
,
2591 .processor_init
= sched_clutch_processor_init
,
2592 .pset_init
= sched_clutch_pset_init
,
2593 .maintenance_continuation
= sched_timeshare_maintenance_continue
,
2594 .choose_thread
= sched_clutch_choose_thread
,
2595 .steal_thread_enabled
= sched_steal_thread_enabled
,
2596 .steal_thread
= sched_clutch_steal_thread
,
2597 .compute_timeshare_priority
= sched_compute_timeshare_priority
,
2598 .choose_node
= sched_choose_node
,
2599 .choose_processor
= choose_processor
,
2600 .processor_enqueue
= sched_clutch_processor_enqueue
,
2601 .processor_queue_shutdown
= sched_clutch_processor_queue_shutdown
,
2602 .processor_queue_remove
= sched_clutch_processor_queue_remove
,
2603 .processor_queue_empty
= sched_clutch_processor_queue_empty
,
2604 .priority_is_urgent
= priority_is_urgent
,
2605 .processor_csw_check
= sched_clutch_processor_csw_check
,
2606 .processor_queue_has_priority
= sched_clutch_processor_queue_has_priority
,
2607 .initial_quantum_size
= sched_clutch_initial_quantum_size
,
2608 .initial_thread_sched_mode
= sched_clutch_initial_thread_sched_mode
,
2609 .can_update_priority
= can_update_priority
,
2610 .update_priority
= update_priority
,
2611 .lightweight_update_priority
= lightweight_update_priority
,
2612 .quantum_expire
= sched_default_quantum_expire
,
2613 .processor_runq_count
= sched_clutch_runq_count
,
2614 .processor_runq_stats_count_sum
= sched_clutch_runq_stats_count_sum
,
2615 .processor_bound_count
= sched_clutch_processor_bound_count
,
2616 .thread_update_scan
= sched_clutch_thread_update_scan
,
2617 .multiple_psets_enabled
= TRUE
,
2618 .sched_groups_enabled
= FALSE
,
2619 .avoid_processor_enabled
= TRUE
,
2620 .thread_avoid_processor
= sched_clutch_thread_avoid_processor
,
2621 .processor_balance
= sched_SMT_balance
,
2623 .rt_runq
= sched_rtlocal_runq
,
2624 .rt_init
= sched_rtlocal_init
,
2625 .rt_queue_shutdown
= sched_rtlocal_queue_shutdown
,
2626 .rt_runq_scan
= sched_rtlocal_runq_scan
,
2627 .rt_runq_count_sum
= sched_rtlocal_runq_count_sum
,
2629 .qos_max_parallelism
= sched_qos_max_parallelism
,
2630 .check_spill
= sched_check_spill
,
2631 .ipi_policy
= sched_ipi_policy
,
2632 .thread_should_yield
= sched_thread_should_yield
,
2633 .run_count_incr
= sched_clutch_run_incr
,
2634 .run_count_decr
= sched_clutch_run_decr
,
2635 .update_thread_bucket
= sched_clutch_update_thread_bucket
,
2636 .pset_made_schedulable
= sched_pset_made_schedulable
,
2639 __attribute__((always_inline
))
2640 static inline run_queue_t
2641 sched_clutch_bound_runq(processor_t processor
)
2643 return &processor
->runq
;
2646 __attribute__((always_inline
))
2647 static inline sched_clutch_root_t
2648 sched_clutch_processor_root_clutch(processor_t processor
)
2650 return &processor
->processor_set
->pset_clutch_root
;
2653 __attribute__((always_inline
))
2654 static inline run_queue_t
2655 sched_clutch_thread_bound_runq(processor_t processor
, __assert_only thread_t thread
)
2657 assert(thread
->bound_processor
== processor
);
2658 return sched_clutch_bound_runq(processor
);
2662 sched_clutch_initial_quantum_size(thread_t thread
)
2664 if (thread
== THREAD_NULL
) {
2667 assert(sched_clutch_thread_quantum
[thread
->th_sched_bucket
] <= UINT32_MAX
);
2668 return (uint32_t)sched_clutch_thread_quantum
[thread
->th_sched_bucket
];
2672 sched_clutch_initial_thread_sched_mode(task_t parent_task
)
2674 if (parent_task
== kernel_task
) {
2675 return TH_MODE_FIXED
;
2677 return TH_MODE_TIMESHARE
;
2682 sched_clutch_processor_init(processor_t processor
)
2684 run_queue_init(&processor
->runq
);
2688 sched_clutch_pset_init(processor_set_t pset
)
2690 sched_clutch_root_init(&pset
->pset_clutch_root
, pset
);
2694 sched_clutch_tunables_init(void)
2696 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us
, sched_clutch_root_bucket_wcel
);
2697 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us
, sched_clutch_root_bucket_warp
);
2698 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us
, sched_clutch_thread_quantum
);
2699 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS
,
2700 NSEC_PER_USEC
, &sched_clutch_bucket_group_adjust_threshold
);
2701 assert(sched_clutch_bucket_group_adjust_threshold
<= CLUTCH_CPU_DATA_MAX
);
2702 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us
, sched_clutch_bucket_group_pending_delta
);
2706 sched_clutch_init(void)
2708 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri
, sizeof(sched_clutch_bucket_group_interactive_pri
))) {
2709 sched_clutch_bucket_group_interactive_pri
= SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT
;
2711 sched_timeshare_init();
2712 sched_clutch_tunables_init();
2716 sched_clutch_choose_thread(
2717 processor_t processor
,
2719 __unused ast_t reason
)
2721 int clutch_pri
= sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor
));
2722 uint32_t clutch_count
= sched_clutch_root_count(sched_clutch_processor_root_clutch(processor
));
2723 run_queue_t bound_runq
= sched_clutch_bound_runq(processor
);
2724 boolean_t choose_from_boundq
= false;
2726 if (bound_runq
->highq
< priority
&&
2727 clutch_pri
< priority
) {
2731 if (bound_runq
->count
&& clutch_count
) {
2732 if (bound_runq
->highq
>= clutch_pri
) {
2733 choose_from_boundq
= true;
2735 } else if (bound_runq
->count
) {
2736 choose_from_boundq
= true;
2737 } else if (clutch_count
) {
2738 choose_from_boundq
= false;
2743 thread_t thread
= THREAD_NULL
;
2744 if (choose_from_boundq
== false) {
2745 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2746 thread
= sched_clutch_thread_highest_remove(pset_clutch_root
);
2748 thread
= run_queue_dequeue(bound_runq
, SCHED_HEADQ
);
2754 sched_clutch_processor_enqueue(
2755 processor_t processor
,
2757 sched_options_t options
)
2761 thread
->runq
= processor
;
2762 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) {
2763 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2764 result
= sched_clutch_thread_insert(pset_clutch_root
, thread
, options
);
2766 run_queue_t rq
= sched_clutch_thread_bound_runq(processor
, thread
);
2767 result
= run_queue_enqueue(rq
, thread
, options
);
2773 sched_clutch_processor_queue_empty(processor_t processor
)
2775 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor
)) == 0 &&
2776 sched_clutch_bound_runq(processor
)->count
== 0;
2780 sched_clutch_processor_csw_check(processor_t processor
)
2782 boolean_t has_higher
;
2785 if (sched_clutch_thread_avoid_processor(processor
, current_thread())) {
2786 return AST_PREEMPT
| AST_URGENT
;
2789 run_queue_t bound_runq
= sched_clutch_bound_runq(processor
);
2790 int clutch_pri
= sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor
));
2792 assert(processor
->active_thread
!= NULL
);
2794 pri
= MAX(clutch_pri
, bound_runq
->highq
);
2796 if (processor
->first_timeslice
) {
2797 has_higher
= (pri
> processor
->current_pri
);
2799 has_higher
= (pri
>= processor
->current_pri
);
2803 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor
)) > 0) {
2804 return AST_PREEMPT
| AST_URGENT
;
2807 if (bound_runq
->urgency
> 0) {
2808 return AST_PREEMPT
| AST_URGENT
;
2818 sched_clutch_processor_queue_has_priority(processor_t processor
,
2822 run_queue_t bound_runq
= sched_clutch_bound_runq(processor
);
2824 int qpri
= MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor
)), bound_runq
->highq
);
2827 return qpri
>= priority
;
2829 return qpri
> priority
;
2834 sched_clutch_runq_count(processor_t processor
)
2836 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor
)) + sched_clutch_bound_runq(processor
)->count
;
2840 sched_clutch_runq_stats_count_sum(processor_t processor
)
2842 uint64_t bound_sum
= sched_clutch_bound_runq(processor
)->runq_stats
.count_sum
;
2844 if (processor
->cpu_id
== processor
->processor_set
->cpu_set_low
) {
2845 return bound_sum
+ sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor
));
2851 sched_clutch_processor_bound_count(processor_t processor
)
2853 return sched_clutch_bound_runq(processor
)->count
;
2857 sched_clutch_processor_queue_shutdown(processor_t processor
)
2859 processor_set_t pset
= processor
->processor_set
;
2860 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2862 queue_head_t tqueue
;
2864 /* We only need to migrate threads if this is the last active processor in the pset */
2865 if (pset
->online_processor_count
> 0) {
2870 queue_init(&tqueue
);
2871 while (sched_clutch_root_count(pset_clutch_root
) > 0) {
2872 thread
= sched_clutch_thread_highest_remove(pset_clutch_root
);
2873 enqueue_tail(&tqueue
, &thread
->runq_links
);
2878 qe_foreach_element_safe(thread
, &tqueue
, runq_links
) {
2879 remqueue(&thread
->runq_links
);
2880 thread_lock(thread
);
2881 thread_setrun(thread
, SCHED_TAILQ
);
2882 thread_unlock(thread
);
2887 sched_clutch_processor_queue_remove(
2888 processor_t processor
,
2892 processor_set_t pset
= processor
->processor_set
;
2896 if (processor
== thread
->runq
) {
2898 * Thread is on a run queue and we have a lock on
2901 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread
)) {
2902 sched_clutch_root_t pset_clutch_root
= sched_clutch_processor_root_clutch(processor
);
2903 sched_clutch_thread_remove(pset_clutch_root
, thread
, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE
);
2905 rq
= sched_clutch_thread_bound_runq(processor
, thread
);
2906 run_queue_remove(rq
, thread
);
2910 * The thread left the run queue before we could
2911 * lock the run queue.
2913 assert(thread
->runq
== PROCESSOR_NULL
);
2914 processor
= PROCESSOR_NULL
;
2919 return processor
!= PROCESSOR_NULL
;
2923 sched_clutch_steal_thread(__unused processor_set_t pset
)
2925 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
static void
sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
{
    boolean_t       restart_needed = FALSE;
    processor_t     processor = processor_list;
    processor_set_t pset;
    thread_t        thread;
    spl_t           s;

    /*
     * We update the threads associated with each processor (bound and idle threads)
     * and then update the threads in each pset runqueue.
     */

    do {
        do {
            pset = processor->processor_set;

            s = splsched();
            pset_lock(pset);

            restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);

            pset_unlock(pset);
            splx(s);

            if (restart_needed) {
                break;
            }

            thread = processor->idle_thread;
            if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
                if (thread_update_add_thread(thread) == FALSE) {
                    restart_needed = TRUE;
                    break;
                }
            }
        } while ((processor = processor->processor_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);

    pset_node_t node = &pset_node0;
    pset = node->psets;

    do {
        do {
            restart_needed = FALSE;
            while (pset != NULL) {
                s = splsched();
                pset_lock(pset);

                if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
                    for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
                        restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
                        if (restart_needed) {
                            break;
                        }
                    }

                    queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
                    sched_clutch_bucket_t clutch_bucket;
                    qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
                        sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
                        restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
                    }
                }

                pset_unlock(pset);
                splx(s);

                if (restart_needed) {
                    break;
                }
                pset = pset->pset_list;
            }

            if (restart_needed) {
                break;
            }
        } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);
}
extern int sched_allow_rt_smt;

/* Return true if this thread should not continue running on this processor */
static bool
sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
{
    if (processor->processor_primary != processor) {
        /*
         * This is a secondary SMT processor. If the primary is running
         * a realtime thread, only allow realtime threads on the secondary.
         */
        if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
            return true;
        }
    }

    return false;
}
/*
 * For the clutch scheduler, the run counts are maintained in the clutch
 * buckets (i.e. the thread group scheduling structure).
 */
static uint32_t
sched_clutch_run_incr(thread_t thread)
{
    assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
    uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
    return new_count;
}

static uint32_t
sched_clutch_run_decr(thread_t thread)
{
    assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
    uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
    return new_count;
}
static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
{
    sched_bucket_t bucket = TH_BUCKET_RUN;

    if (priority > BASEPRI_USER_INITIATED) {
        bucket = TH_BUCKET_SHARE_FG;
    } else if (priority > BASEPRI_DEFAULT) {
        bucket = TH_BUCKET_SHARE_IN;
    } else if (priority > BASEPRI_UTILITY) {
        bucket = TH_BUCKET_SHARE_DF;
    } else if (priority > MAXPRI_THROTTLE) {
        bucket = TH_BUCKET_SHARE_UT;
    } else {
        bucket = TH_BUCKET_SHARE_BG;
    }
    return bucket;
}
/*
 * For threads that have changed sched_pri without changing the
 * base_pri for any reason other than decay, use the sched_pri
 * as the bucketizing priority instead of base_pri. All such
 * changes are typically due to kernel locking primitives boosts
 * or demotions.
 */
static boolean_t
sched_thread_sched_pri_promoted(thread_t thread)
{
    return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
           (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
           (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
           (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
           (thread->kern_promotion_schedpri != 0);
}
/*
 * Routine to update the scheduling bucket for the thread.
 *
 * In the clutch scheduler implementation, the thread's bucket
 * is based on sched_pri if it was promoted due to a kernel
 * primitive; otherwise it is based on the thread base_pri. This
 * enhancement allows promoted threads to reach a higher priority
 * bucket and potentially get selected sooner for scheduling.
 *
 * Also, the clutch scheduler does not honor fixed priority below
 * FG priority. It simply puts those threads in the corresponding
 * timeshare bucket. The reason for doing that is that it is
 * extremely hard to define the scheduling properties of such threads
 * and they typically lead to performance issues.
 */
static void
sched_clutch_update_thread_bucket(thread_t thread)
{
    sched_bucket_t old_bucket = thread->th_sched_bucket;
    sched_bucket_t new_bucket = TH_BUCKET_RUN;
    assert(thread->runq == PROCESSOR_NULL);
    int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;

    switch (thread->sched_mode) {
    case TH_MODE_FIXED:
        if (pri >= BASEPRI_FOREGROUND) {
            new_bucket = TH_BUCKET_FIXPRI;
        } else {
            new_bucket = sched_convert_pri_to_bucket(pri);
        }
        break;

    case TH_MODE_REALTIME:
        new_bucket = TH_BUCKET_FIXPRI;
        break;

    case TH_MODE_TIMESHARE:
        new_bucket = sched_convert_pri_to_bucket(pri);
        break;

    default:
        panic("unexpected mode: %d", thread->sched_mode);
    }

    if (old_bucket == new_bucket) {
        return;
    }

    thread->th_sched_bucket = new_bucket;
    thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
    /*
     * Since this is called after the thread has been removed from the runq,
     * only the run counts need to be updated. The re-insert into the runq
     * would put the thread into the correct new bucket's runq.
     */
    if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
        sched_clutch_thread_run_bucket_decr(thread, old_bucket);
        sched_clutch_thread_run_bucket_incr(thread, new_bucket);
    }
}
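/*
 * Illustrative sketch (not compiled): the effect of the promotion check in
 * sched_clutch_update_thread_bucket(). A timeshare thread whose base_pri sits in the
 * utility range but which is promoted (e.g. while holding a kernel mutex) is
 * bucketized using its promoted sched_pri, so it lands in a higher bucket until the
 * promotion is dropped. The priority values below are hypothetical.
 */
#if 0
static sched_bucket_t
sched_clutch_example_promoted_bucket(boolean_t promoted)
{
    uint8_t base_pri = 20;   /* hypothetical utility-range base priority */
    uint8_t sched_pri = 31;  /* hypothetical promoted priority */
    int pri = promoted ? sched_pri : base_pri;
    /* promoted: TH_BUCKET_SHARE_DF (pri > BASEPRI_UTILITY); otherwise TH_BUCKET_SHARE_UT */
    return sched_convert_pri_to_bucket((uint8_t)pri);
}
#endif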
#if CONFIG_SCHED_EDGE

/* Implementation of the AMP version of the clutch scheduler */

static void
sched_edge_init(void);

static thread_t
sched_edge_processor_idle(processor_set_t pset);

static ast_t
sched_edge_processor_csw_check(processor_t processor);

static boolean_t
sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static boolean_t
sched_edge_processor_queue_empty(processor_t processor);

static thread_t
sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_edge_processor_queue_shutdown(processor_t processor);

static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);

static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);

static void
sched_edge_balance(processor_t cprocessor, processor_set_t cpset);

static void
sched_edge_check_spill(processor_set_t pset, thread_t thread);

static bool
sched_edge_thread_should_yield(processor_t processor, thread_t thread);

static void
sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);

static bool
sched_edge_steal_thread_enabled(processor_set_t pset);

static sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);

static uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options);
const struct sched_dispatch_table sched_edge_dispatch = {
    .sched_name                            = "edge",
    .init                                  = sched_edge_init,
    .timebase_init                         = sched_timeshare_timebase_init,
    .processor_init                        = sched_clutch_processor_init,
    .pset_init                             = sched_clutch_pset_init,
    .maintenance_continuation              = sched_timeshare_maintenance_continue,
    .choose_thread                         = sched_edge_choose_thread,
    .steal_thread_enabled                  = sched_edge_steal_thread_enabled,
    .steal_thread                          = sched_edge_processor_idle,
    .compute_timeshare_priority            = sched_compute_timeshare_priority,
    .choose_node                           = sched_choose_node,
    .choose_processor                      = sched_edge_choose_processor,
    .processor_enqueue                     = sched_clutch_processor_enqueue,
    .processor_queue_shutdown              = sched_edge_processor_queue_shutdown,
    .processor_queue_remove                = sched_clutch_processor_queue_remove,
    .processor_queue_empty                 = sched_edge_processor_queue_empty,
    .priority_is_urgent                    = priority_is_urgent,
    .processor_csw_check                   = sched_edge_processor_csw_check,
    .processor_queue_has_priority          = sched_edge_processor_queue_has_priority,
    .initial_quantum_size                  = sched_clutch_initial_quantum_size,
    .initial_thread_sched_mode             = sched_clutch_initial_thread_sched_mode,
    .can_update_priority                   = can_update_priority,
    .update_priority                       = update_priority,
    .lightweight_update_priority           = lightweight_update_priority,
    .quantum_expire                        = sched_default_quantum_expire,
    .processor_runq_count                  = sched_clutch_runq_count,
    .processor_runq_stats_count_sum        = sched_clutch_runq_stats_count_sum,
    .processor_bound_count                 = sched_clutch_processor_bound_count,
    .thread_update_scan                    = sched_clutch_thread_update_scan,
    .multiple_psets_enabled                = TRUE,
    .sched_groups_enabled                  = FALSE,
    .avoid_processor_enabled               = TRUE,
    .thread_avoid_processor                = sched_edge_thread_avoid_processor,
    .processor_balance                     = sched_edge_balance,
    .rt_runq                               = sched_amp_rt_runq,
    .rt_init                               = sched_amp_rt_init,
    .rt_queue_shutdown                     = sched_amp_rt_queue_shutdown,
    .rt_runq_scan                          = sched_amp_rt_runq_scan,
    .rt_runq_count_sum                     = sched_amp_rt_runq_count_sum,
    .qos_max_parallelism                   = sched_edge_qos_max_parallelism,
    .check_spill                           = sched_edge_check_spill,
    .ipi_policy                            = sched_edge_ipi_policy,
    .thread_should_yield                   = sched_edge_thread_should_yield,
    .run_count_incr                        = sched_clutch_run_incr,
    .run_count_decr                        = sched_clutch_run_decr,
    .update_thread_bucket                  = sched_clutch_update_thread_bucket,
    .pset_made_schedulable                 = sched_edge_pset_made_schedulable,
    .thread_group_recommendation_change    = NULL,
};
static struct processor_set pset1;
static struct pset_node pset_node1;
static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];

/*
 * sched_edge_pset_available()
 *
 * Routine to determine if a pset is available for scheduling.
 */
static bool
sched_edge_pset_available(processor_set_t pset)
{
    return bitmap_test(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
}

/*
 * sched_edge_thread_bound_cluster_id()
 *
 * Routine to determine which cluster a particular thread is bound to. Uses
 * the sched_flags on the thread to map back to a specific cluster id.
 *
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t
sched_edge_thread_bound_cluster_id(thread_t thread)
{
    assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
    if (thread->sched_flags & TH_SFLAG_ECORE_ONLY) {
        return (pset_array[0]->pset_type == CLUSTER_TYPE_E) ? 0 : 1;
    } else {
        return (pset_array[0]->pset_type == CLUSTER_TYPE_P) ? 0 : 1;
    }
}
/* Forward declarations for some thread migration routines */
static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset);
static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset);
static processor_set_t
sched_edge_steal_candidate(processor_set_t pset);
static processor_set_t
sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);

/*
 * sched_edge_config_set()
 *
 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
 * policies in the scheduler.
 */
static void
sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
{
    sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
    edge->sce_edge_packed = edge_config.sce_edge_packed;
}

/*
 * sched_edge_config_get()
 *
 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
 * if it needs to update edges.
 */
static sched_clutch_edge
sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
{
    return pset_array[src_cluster]->sched_edges[dst_cluster];
}
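/*
 * Illustrative sketch (not compiled): how a caller such as CLPC might use the
 * config_get/config_set pair above to tighten a migration edge. The field values
 * below are hypothetical and only demonstrate the shape of the API.
 */
#if 0
static void
sched_edge_example_tighten_edge(uint32_t src_cluster, uint32_t dst_cluster)
{
    sched_clutch_edge edge = sched_edge_config_get(src_cluster, dst_cluster);
    /* Raise the migration weight so only a heavily loaded source pushes work across this edge */
    edge.sce_migration_weight = 96;    /* hypothetical weight */
    edge.sce_migration_allowed = 1;
    edge.sce_steal_allowed = 1;
    sched_edge_config_set(src_cluster, dst_cluster, edge);
}
#endif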
#if DEVELOPMENT || DEBUG

/*
 * Helper Routines for edge scheduler sysctl configuration
 *
 * The current support is limited to dual cluster AMP platforms.
 * <Edge Multi-cluster Support Needed>
 */

kern_return_t
sched_edge_sysctl_configure_e_to_p(uint64_t edge_config)
{
    pset_array[ecore_cluster_id]->sched_edges[pcore_cluster_id].sce_edge_packed = edge_config;
    return KERN_SUCCESS;
}

kern_return_t
sched_edge_sysctl_configure_p_to_e(uint64_t edge_config)
{
    pset_array[pcore_cluster_id]->sched_edges[ecore_cluster_id].sce_edge_packed = edge_config;
    return KERN_SUCCESS;
}

sched_clutch_edge
sched_edge_e_to_p(void)
{
    return sched_edge_config_get(ecore_cluster_id, pcore_cluster_id);
}

sched_clutch_edge
sched_edge_p_to_e(void)
{
    return sched_edge_config_get(pcore_cluster_id, ecore_cluster_id);
}

#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_edge_matrix_set()
 *
 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
    uint32_t edge_index = 0;
    for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
        for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
            if (edge_changes_bitmap[edge_index]) {
                sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
            }
            edge_index++;
        }
    }
}

/*
 * sched_edge_matrix_get()
 *
 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
    uint32_t edge_index = 0;
    for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
        for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
            if (edge_request_bitmap[edge_index]) {
                edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
            }
            edge_index++;
        }
    }
}
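/*
 * Illustrative sketch (not compiled): the flattened indexing used by the matrix
 * routines above. For a matrix of order N, the edge from src_cluster s to
 * dst_cluster d lives at index (s * N) + d in both arrays.
 */
#if 0
static inline uint32_t
sched_edge_example_matrix_index(uint32_t src_cluster, uint32_t dst_cluster, uint64_t matrix_order)
{
    /* e.g. on a 2-cluster system (matrix_order == 2), the edge 1 -> 0 sits at index 2 */
    return (uint32_t)((src_cluster * matrix_order) + dst_cluster);
}
#endif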
/*
 * sched_edge_init()
 *
 * Routine to initialize the data structures for the Edge scheduler. The current implementation
 * relies on this being enabled for a dual cluster AMP system. Once a better design for MAX_PSETS,
 * edge config etc. is defined, it should be made more generic to handle multi-cluster
 * platforms.
 * <Edge Multi-cluster Support Needed>
 */
static void
sched_edge_init(void)
{
    processor_set_t ecore_set = &pset0;
    processor_set_t pcore_set = &pset1;

    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        /* If the device boots on a P-cluster, fixup the IDs etc. */
        bitmap_set(sched_edge_available_pset_bitmask, pcore_cluster_id);
    } else {
        bitmap_set(sched_edge_available_pset_bitmask, ecore_cluster_id);
    }

    ecore_set->pset_cluster_type = PSET_AMP_E;
    ecore_set->pset_cluster_id = ecore_cluster_id;

    pcore_set->pset_cluster_type = PSET_AMP_P;
    pcore_set->pset_cluster_id = pcore_cluster_id;

    pset_init(&pset1, &pset_node1);
    pset_node1.psets = &pset1;
    pset_node0.node_list = &pset_node1;

    pset_array[ecore_cluster_id] = ecore_set;
    pset_array[ecore_cluster_id]->pset_type = CLUSTER_TYPE_E;
    bitmap_set(pset_array[ecore_cluster_id]->foreign_psets, pcore_cluster_id);

    sched_edge_config_set(ecore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
    sched_edge_config_set(ecore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});

    pset_array[pcore_cluster_id] = pcore_set;
    pset_array[pcore_cluster_id]->pset_type = CLUSTER_TYPE_P;
    bitmap_set(pset_array[pcore_cluster_id]->foreign_psets, ecore_cluster_id);

    sched_edge_config_set(pcore_cluster_id, pcore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
    sched_edge_config_set(pcore_cluster_id, ecore_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});

    sched_timeshare_init();
    sched_clutch_tunables_init();
}
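/*
 * Illustrative sketch (not compiled): one way the boot-cluster "fixup" mentioned in
 * sched_edge_init() could look. Whether the cluster id globals are swapped or the
 * pset pointers are re-pointed is an assumption here; only the intent (the boot
 * cluster becomes cluster 0) is taken from the comment above.
 */
#if 0
static void
sched_edge_example_boot_cluster_fixup(void)
{
    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        /* Hypothetical: cluster 0 is the P-cluster when the device boots on P-cores */
        pcore_cluster_id = 0;
        ecore_cluster_id = 1;
    }
}
#endif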
static thread_t
sched_edge_choose_thread(
    processor_t     processor,
    int             priority,
    __unused ast_t  reason)
{
    int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);
    boolean_t choose_from_boundq = false;

    if ((bound_runq->highq < priority) &&
        (clutch_pri < priority)) {
        return THREAD_NULL;
    }

    if (bound_runq->highq >= clutch_pri) {
        choose_from_boundq = true;
    }

    thread_t thread = THREAD_NULL;
    if (choose_from_boundq == false) {
        sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
        thread = sched_clutch_thread_highest_remove(pset_clutch_root);
    } else {
        thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
    }
    return thread;
}

static boolean_t
sched_edge_processor_queue_empty(processor_t processor)
{
    return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
           (sched_clutch_bound_runq(processor)->count == 0);
}

static void
sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
{
    assert(thread->bound_processor == PROCESSOR_NULL);
}
__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
    SCHED_EDGE_YIELD_RUNQ_NONEMPTY      = 0x0,
    SCHED_EDGE_YIELD_FOREIGN_RUNNABLE   = 0x1,
    SCHED_EDGE_YIELD_FOREIGN_RUNNING    = 0x2,
    SCHED_EDGE_YIELD_STEAL_POSSIBLE     = 0x3,
    SCHED_EDGE_YIELD_DISALLOW           = 0x4,
});

static bool
sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
{
    if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
        return true;
    }

    /*
     * The yield logic should follow the same logic that steal_thread() does. The
     * thread_should_yield() is effectively trying to quickly check whether, if the
     * current thread gave up the CPU, there is any other thread that would execute
     * on this CPU. So it needs to provide the same answer as the steal_thread()/
     * processor idle logic.
     */
    if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
        return true;
    }
    if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
        return true;
    }

    processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
    if (steal_candidate != NULL) {
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
            thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
        return true;
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
        0, SCHED_EDGE_YIELD_DISALLOW);
    return false;
}
static ast_t
sched_edge_processor_csw_check(processor_t processor)
{
    boolean_t       has_higher;
    int             pri;

    int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);

    assert(processor->active_thread != NULL);

    pri = MAX(clutch_pri, bound_runq->highq);

    if (processor->first_timeslice) {
        has_higher = (pri > processor->current_pri);
    } else {
        has_higher = (pri >= processor->current_pri);
    }

    if (has_higher) {
        if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        if (bound_runq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        return AST_PREEMPT;
    }

    return AST_NONE;
}

static boolean_t
sched_edge_processor_queue_has_priority(processor_t processor,
    int priority,
    boolean_t gte)
{
    run_queue_t bound_runq = sched_clutch_bound_runq(processor);

    int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);

    if (gte) {
        return qpri >= priority;
    } else {
        return qpri > priority;
    }
}
static void
sched_edge_processor_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
    thread_t thread;
    queue_head_t tqueue;

    /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
    if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
        pset_unlock(pset);
        return;
    }

    bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);

    queue_init(&tqueue);
    while (sched_clutch_root_count(pset_clutch_root) > 0) {
        thread = sched_clutch_thread_highest_remove(pset_clutch_root);
        enqueue_tail(&tqueue, &thread->runq_links);
    }

    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }
}
/*
 * sched_edge_cluster_load_metric()
 *
 * The load metric for a cluster is a measure of the average scheduling latency
 * experienced by threads on that cluster. It is a product of the average number
 * of threads in the runqueue and the average execution time for threads. The metric
 * has special values in the following cases:
 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
 *   the maximum value to disallow any threads to migrate to this cluster.
 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
 *   to be spread across the platform quickly for ncpu wide workloads.
 */
static uint32_t
sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
{
    if (sched_edge_pset_available(pset) == false) {
        return UINT32_MAX;
    }
    return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
}
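/*
 * Illustrative sketch (not compiled): how the load metric above is typically consumed,
 * i.e. compared against an edge's migration weight to decide whether a source cluster
 * is loaded enough relative to a destination to push threads across that edge. The
 * weight value is hypothetical.
 */
#if 0
static bool
sched_edge_example_cluster_overloaded(processor_set_t src_pset, processor_set_t dst_pset, sched_bucket_t bucket, uint32_t migration_weight)
{
    uint32_t src_load = sched_edge_cluster_load_metric(src_pset, bucket);
    uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, bucket);
    /* e.g. src_load 100, dst_load 20, weight 64: delta 80 >= 64, so migration makes sense */
    return (src_load > dst_load) && ((src_load - dst_load) >= migration_weight);
}
#endif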
/*
 * Edge Scheduler Steal/Rebalance logic
 *
 * = Generic scheduler logic =
 *
 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
 * find any thread for execution in its runqueue. The aim of the steal operation
 * is to find other threads running/runnable in other clusters which should be
 * executed on this cluster instead.
 *
 * If the steal callout does not return a thread, the thread_select() logic calls
 * the SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
 * threads and idle out the current CPU.
 *
 * = SCHED(steal_thread) for Edge Scheduler =
 *
 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
 * routine tries to do the following operations in order:
 * (1) Find foreign runnable threads in non-native cluster
 *     runqueues (sched_edge_foreign_runnable_thread_remove())
 * (2) Check if foreign threads are running on the non-native
 *     clusters (sched_edge_foreign_running_thread_available())
 *     - If yes, return THREAD_NULL for the steal callout and
 *       perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
 * (3) Steal a thread from another cluster based on edge
 *     weights (sched_edge_steal_thread())
 *
 * = SCHED(processor_balance) for Edge Scheduler =
 *
 * If steal_thread did not return a thread for the processor, use
 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
 *
 * = Clutch Bucket Preferred Cluster Overrides =
 *
 * Since these operations (just like thread migrations on enqueue)
 * move threads across clusters, they need support for handling clutch
 * bucket group level preferred cluster recommendations.
 * For (1), a clutch bucket will be in the foreign runnable queue based
 * on the clutch bucket group preferred cluster.
 * For (2), the running thread will set the bit on the processor based
 * on its preferred cluster type.
 * For (3), the edge configuration would prevent threads from being stolen
 * in the wrong direction.
 *
 * = SCHED(thread_should_yield) =
 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
 * since that is expecting the same answer as if thread_select() was called on a core
 * with an empty runqueue.
 */
static bool
sched_edge_steal_thread_enabled(__unused processor_set_t pset)
{
    /*
     * For the edge scheduler, the gating for steal is done by sched_edge_steal_candidate()
     */
    return true;
}

static processor_set_t
sched_edge_steal_candidate(processor_set_t pset)
{
    /*
     * Edge Scheduler Optimization
     *
     * Investigate a better policy for stealing. The current implementation looks
     * at all the incoming weights for the pset that just became idle and sees which
     * clusters have loads > edge weights. It is effectively trying to simulate an
     * overload migration as if a thread had become runnable on the candidate cluster.
     *
     * The logic today bails as soon as it finds a cluster where the cluster load is
     * greater than the edge weight. This helps the check to be quick which is useful
     * for sched_edge_thread_should_yield() which uses this. Maybe it should have a
     * more advanced version for the actual steal operation which looks for the
     * maximum delta etc.
     */
    processor_set_t target_pset = NULL;
    uint32_t dst_cluster_id = pset->pset_cluster_id;

    for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t candidate_pset = pset_array[cluster_id];

        if (candidate_pset == pset) {
            continue;
        }

        sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
        if (incoming_edge->sce_steal_allowed == false) {
            continue;
        }

        uint32_t incoming_weight = incoming_edge->sce_migration_weight;
        int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
        if (highest_runnable_bucket == -1) {
            /* Candidate cluster runq is empty */
            continue;
        }

        /* Use the load metric for the highest runnable bucket since that would be stolen next */
        uint32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
        if (candidate_load > incoming_weight) {
            /* Only steal from the candidate if its load is higher than the incoming edge weight and it has runnable threads */
            target_pset = candidate_pset;
            break;
        }
    }

    return target_pset;
}
static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset)
{
    /* Find all the clusters that are foreign for this cluster */
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /*
         * For each cluster, see if there are any runnable foreign threads.
         * This check is currently being done without the pset lock to make it cheap for
         * the idle thread.
         */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
            return true;
        }
    }
    return false;
}
static thread_t
sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
{
    thread_t thread = THREAD_NULL;

    /* Find all the clusters that are foreign for this cluster */
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /*
         * For each cluster, see if there are any runnable foreign threads.
         * This check is currently being done without the pset lock to make it cheap for
         * the idle thread.
         */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
            continue;
        }

        /*
         * Looks like there are runnable foreign threads in the hierarchy; lock the pset
         * and get the highest priority thread.
         */
        pset_lock(target_pset);
        if (sched_edge_pset_available(target_pset)) {
            thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
            sched_update_pset_load_average(target_pset, ctime);
        }
        pset_unlock(target_pset);

        /*
         * Edge Scheduler Optimization
         *
         * The current implementation immediately returns as soon as it finds a foreign
         * runnable thread. This could be enhanced to look at highest priority threads
         * from all foreign clusters and pick the highest amongst them. That would need
         * some form of global state across psets to make that kind of a check cheap.
         */
        if (thread != THREAD_NULL) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
            break;
        }
        /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
    }

    return thread;
}
static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset)
{
    bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /* Skip the pset if it's not schedulable */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
        if (lsb_first(running_foreign_bitmap) != -1) {
            /* Found a non-native CPU running a foreign thread; rebalance is needed */
            return true;
        }
    }
    return false;
}
static thread_t
sched_edge_steal_thread(processor_set_t pset)
{
    thread_t thread = THREAD_NULL;
    processor_set_t steal_from_pset = sched_edge_steal_candidate(pset);
    if (steal_from_pset) {
        /*
         * sched_edge_steal_candidate() has found a pset which is ideal to steal from.
         * Lock the pset and select the highest thread in that runqueue.
         */
        pset_lock(steal_from_pset);
        if (bitmap_first(steal_from_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1) {
            uint64_t current_timestamp = mach_absolute_time();
            sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
            thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
            sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
            sched_update_pset_load_average(steal_from_pset, current_timestamp);
        }
        /*
         * Edge Scheduler Optimization
         * Maybe this needs to circle around if the steal candidate did not have any threads by
         * the time the pset lock was taken.
         */
        pset_unlock(steal_from_pset);
    }
    return thread;
}
/*
 * sched_edge_processor_idle()
 *
 * The routine is the implementation of steal_thread() for the Edge scheduler.
 */
static thread_t
sched_edge_processor_idle(processor_set_t pset)
{
    thread_t thread = THREAD_NULL;

    uint64_t ctime = mach_absolute_time();

    /* Each of the operations acquires the lock for the pset it targets */
    pset_unlock(pset);

    /* Find highest priority runnable thread on all non-native clusters */
    thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
    if (thread != THREAD_NULL) {
        return thread;
    }

    /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
    boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
    if (rebalance_needed) {
        return THREAD_NULL;
    }

    /* No foreign threads found; find a thread to steal from a pset based on weights/loads etc. */
    thread = sched_edge_steal_thread(pset);
    return thread;
}
/* Return true if this thread should not continue running on this processor */
static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
{
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    /*
     * For long running parallel workloads, it is important to rebalance threads across
     * E/P clusters so that they make equal forward progress. This is achieved through
     * threads expiring their quantum on the non-preferred cluster type and explicitly
     * rebalancing to the preferred cluster runqueue.
     *
     * <Edge Multi-Cluster Support Needed>
     * For multi-cluster platforms, it might be useful to move the thread in case its
     * preferred pset is idle now.
     */
    if (processor->processor_set->pset_type != preferred_pset->pset_type) {
        return true;
    }
    /* If the thread is already running on the preferred cluster, do not avoid */
    if (processor->processor_set == preferred_pset) {
        return false;
    }
    /*
     * The thread is running on a processor that is of the same type as the
     * preferred pset, but is not the actual preferred pset. In that case
     * look at edge weights to see if this thread should continue execution
     * here or go back to its preferred cluster.
     *
     * <Edge Multi-Cluster Support Needed>
     * This logic needs to ensure that the current thread is not counted against
     * the load average for the current pset otherwise it would always end up avoiding
     * the current cluster.
     */
    processor_set_t chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, processor->processor_set, false);
    return chosen_pset != processor->processor_set;
}
static void
sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
{
    assert(cprocessor == current_processor());
    pset_unlock(cpset);

    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
    for (int cluster = bitmap_first(foreign_pset_bitmap, MAX_PSETS); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
        /* Skip the pset if it's not schedulable */
        processor_set_t target_pset = pset_array[cluster];
        if (sched_edge_pset_available(target_pset) == false) {
            continue;
        }

        pset_lock(target_pset);
        uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
        for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
            processor_t target_cpu = processor_array[cpuid];
            ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[cpuid] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, cpuid);
            }
        }
        pset_unlock(target_pset);
    }

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
    }
}
/*
 * sched_edge_migrate_edges_evaluate()
 *
 * Routine to find the candidate for thread migration based on edge weights.
 *
 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 * return preferred_pset if it is the most ideal destination for this thread.
 */
static processor_set_t
sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
{
    processor_set_t selected_pset = preferred_pset;
    uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
    cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);

    /* Look at edge deltas with other clusters to find the ideal migration candidate */
    sched_clutch_edge *edge = preferred_pset->sched_edges;
    uint32_t max_edge_delta = 0;

    /*
     * Edge Scheduler Optimization
     *
     * For really large cluster count systems, it might make sense to optimize the
     * clusters iterated by using bitmaps and skipping over clusters that are not
     * available for scheduling or have migration disabled from this cluster.
     */
    for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t dst_pset = pset_array[cluster_id];
        if (cluster_id == preferred_cluster_id) {
            continue;
        }

        if (edge[cluster_id].sce_migration_allowed == false) {
            continue;
        }

        uint32_t dst_load = sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
        if (dst_load > preferred_cluster_load) {
            continue;
        }

        /*
         * Fast path for idle dst cluster
         *
         * For extremely parallel workloads, it is important to load up
         * all clusters as quickly as possible. This short-circuit allows
         * that to happen.
         * <Edge Multi-cluster Support Needed>
         *
         * For multi-cluster platforms, the loop should start with the homogeneous
         * clusters first.
         */
        if (dst_load == 0) {
            selected_pset = dst_pset;
            break;
        }

        uint32_t edge_delta = preferred_cluster_load - dst_load;
        if (edge_delta < edge[cluster_id].sce_migration_weight) {
            continue;
        }

        if (edge_delta < max_edge_delta) {
            continue;
        }

        if (edge_delta == max_edge_delta) {
            /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
            boolean_t selected_homogeneous = (pset_type_for_id(selected_pset->pset_cluster_id) == preferred_cluster_type);
            boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
            if (selected_homogeneous || !candidate_homogeneous) {
                continue;
            }
        }

        /* dst_pset seems to be the best candidate for migration */
        max_edge_delta = edge_delta;
        selected_pset = dst_pset;
    }
    return selected_pset;
}
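/*
 * Illustrative sketch (not compiled): a worked example of the edge-delta check in
 * sched_edge_migrate_edges_evaluate(). The loads and weight below are hypothetical
 * numbers, chosen only to show when a destination cluster wins.
 */
#if 0
static void
sched_edge_example_edge_delta(void)
{
    uint32_t preferred_cluster_load = 90;   /* hypothetical load on the preferred cluster */
    uint32_t dst_load = 20;                 /* hypothetical load on a candidate cluster */
    uint32_t migration_weight = 64;         /* hypothetical sce_migration_weight on the edge */

    uint32_t edge_delta = preferred_cluster_load - dst_load;   /* 70 */
    bool candidate_wins = (edge_delta >= migration_weight);    /* 70 >= 64, so the candidate is selected */
    (void)candidate_wins;
}
#endif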
/*
 * sched_edge_candidate_alternative()
 *
 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
 * selected_pset is not available for execution. The logic tries to prefer homogeneous
 * clusters over heterogeneous clusters since this is typically used in thread
 * placement decisions.
 */
_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
static processor_set_t
sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
{
    /*
     * It looks like the most ideal pset is not available for scheduling currently.
     * Try to find a homogeneous cluster that is still available.
     */
    bitmap_t *foreign_clusters = selected_pset->foreign_psets;
    uint64_t available_native_clusters = ~(foreign_clusters[0]) & candidate_cluster_bitmap;
    int available_cluster_id = lsb_first(available_native_clusters);
    if (available_cluster_id == -1) {
        /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
        available_cluster_id = bit_first(candidate_cluster_bitmap);
    }
    assert(available_cluster_id != -1);
    return pset_array[available_cluster_id];
}
/*
 * sched_edge_switch_pset_lock()
 *
 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
 * switch_pset_locks.
 * Returns the newly locked pset after the switch.
 */
static processor_set_t
sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
{
    if (!switch_pset_locks) {
        return locked_pset;
    }
    if (selected_pset != locked_pset) {
        pset_unlock(locked_pset);
        pset_lock(selected_pset);
        return selected_pset;
    } else {
        return locked_pset;
    }
}
/*
 * sched_edge_migrate_candidate()
 *
 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
 * the thread and the preferred cluster to determine the best available pset for scheduling.
 *
 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns the best effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
 *
 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
 * resultant pset lock is held.
 */
static processor_set_t
sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
{
    __kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
    processor_set_t selected_pset = preferred_pset;

    if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
        /* For bound threads always recommend the cluster it is bound to */
        selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
        locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
        if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
            /*
             * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
             * fall through to the regular cluster selection logic which handles unavailable clusters
             * appropriately. If the thread is hard bound, then return the bound cluster always.
             */
            return selected_pset;
        }
    }

    uint64_t candidate_cluster_bitmap = mask(MAX_PSETS);
    if (thread->sched_pri >= BASEPRI_RTQUEUES) {
        /* For realtime threads, try and schedule them on the preferred pset always */
        goto migrate_candidate_available_check;
    }

    /*
     * If a thread is being rebalanced for achieving equal progress of parallel workloads,
     * it needs to end up on the preferred runqueue.
     */
    uint32_t preferred_cluster_load = sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
    boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
    if ((preferred_cluster_load == 0) || amp_rebalance) {
        goto migrate_candidate_available_check;
    }

    /* Look at edge weights to decide the most ideal migration candidate for this thread */
    selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);

migrate_candidate_available_check:
    locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
    if (sched_edge_pset_available(selected_pset) == true) {
        if (selected_pset != preferred_pset) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
        }
        return selected_pset;
    }
    /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
    bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
    if (__improbable(bitmap_first(&candidate_cluster_bitmap, MAX_PSETS) == -1)) {
        /*
         * None of the clusters are available for scheduling; this situation should be rare but if it happens,
         * simply return the boot cluster.
         */
        selected_pset = &pset0;
        locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
        if (selected_pset != preferred_pset) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
        }
        return selected_pset;
    }
    /* Try and find an alternative for the selected pset */
    selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
    goto migrate_candidate_available_check;
}
static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
{
    /* Bound threads don't call this function */
    assert(thread->bound_processor == PROCESSOR_NULL);
    processor_t chosen_processor = PROCESSOR_NULL;

    /*
     * sched_edge_preferred_pset() returns the preferred pset for a given thread.
     * It should take the passed in "pset" as a hint which represents the recency metric for
     * pset selection logic.
     */
    processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
    processor_set_t chosen_pset = preferred_pset;
    /*
     * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
     * threads to. sched_edge_migrate_candidate() returns the preferred pset
     * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
     *
     * <Edge Multi-cluster Support Needed>
     * It might be useful to build a recency metric for the thread for multiple clusters and
     * factor that into the migration decisions.
     */
    chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
    chosen_processor = choose_processor(chosen_pset, processor, thread);
    assert(chosen_processor->processor_set == chosen_pset);
    return chosen_processor;
}
/*
 * sched_edge_clutch_bucket_threads_drain()
 *
 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
 * bucket overrides etc.) into a local thread queue.
 */
static void
sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
{
    thread_t thread = THREAD_NULL;
    uint64_t current_timestamp = mach_approximate_time();
    qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
        sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
        enqueue_tail(clutch_threads, &thread->runq_links);
    }
}

/*
 * sched_edge_run_drained_threads()
 *
 * Makes all drained threads in a local queue runnable.
 */
static void
sched_edge_run_drained_threads(queue_t clutch_threads)
{
    thread_t thread;
    /* Now setrun all the threads in the local queue */
    qe_foreach_element_safe(thread, clutch_threads, runq_links) {
        remqueue(&thread->runq_links);
        thread_lock(thread);
        thread_setrun(thread, SCHED_TAILQ);
        thread_unlock(thread);
    }
}
/*
 * sched_edge_update_preferred_cluster()
 *
 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
 */
static void
sched_edge_update_preferred_cluster(
    sched_clutch_t sched_clutch,
    bitmap_t *clutch_bucket_modify_bitmap,
    uint32_t *tg_bucket_preferred_cluster)
{
    for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
        os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
    }
}
/*
 * sched_edge_migrate_thread_group_runnable_threads()
 *
 * Routine to implement the migration of threads on a cluster when the thread group
 * recommendation is updated. The migration works using a 2-phase approach.
 *
 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
 * appropriate.
 *
 * Phase 2: After unlocking the pset, drain all the threads from the local thread
 * queue and mark them runnable which should land them in the right hierarchy.
 *
 * The routine assumes that the preferences for the clutch buckets/clutch bucket
 * groups have already been updated by the caller.
 *
 * - Called with the pset locked and interrupts disabled.
 * - Returns with the pset unlocked.
 */
static void
sched_edge_migrate_thread_group_runnable_threads(
    sched_clutch_t sched_clutch,
    sched_clutch_root_t root_clutch,
    bitmap_t *clutch_bucket_modify_bitmap,
    __unused uint32_t *tg_bucket_preferred_cluster,
    bool migrate_immediately)
{
    /* Queue to hold threads that have been drained from clutch buckets to be migrated */
    queue_head_t clutch_threads;
    queue_init(&clutch_threads);

    for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
        /* Get the clutch bucket for this cluster and sched bucket */
        sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
        sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
        sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
        if (scb_root == NULL) {
            /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
            assert(clutch_bucket->scb_thr_count == 0);
            continue;
        }
        assert(scb_root == root_clutch);
        uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);

        if (migrate_immediately) {
            /*
             * For transitions where threads need to be migrated immediately, drain the threads into a
             * local queue unless we are looking at the clutch buckets for the newly recommended
             * cluster.
             */
            if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
                sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
            } else {
                sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
            }
        } else {
            /* Check if this cluster is the same type as the newly recommended cluster */
            boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
            /*
             * If threads do not have to be migrated immediately, just change the native/foreign
             * flag on the clutch bucket.
             */
            if (homogeneous_cluster) {
                sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
            } else {
                sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
            }
        }
    }

    pset_unlock(root_clutch->scr_pset);
    sched_edge_run_drained_threads(&clutch_threads);
}
/*
 * sched_edge_migrate_thread_group_running_threads()
 *
 * Routine to find all running threads of a thread group on a specific cluster
 * and IPI them if they need to be moved immediately.
 */
static void
sched_edge_migrate_thread_group_running_threads(
    sched_clutch_t sched_clutch,
    sched_clutch_root_t root_clutch,
    __unused bitmap_t *clutch_bucket_modify_bitmap,
    uint32_t *tg_bucket_preferred_cluster,
    bool migrate_immediately)
{
    if (migrate_immediately == false) {
        /* If CLPC has recommended not to move threads immediately, nothing to do here */
        return;
    }

    /*
     * Edge Scheduler Optimization
     *
     * When the system has a large number of clusters and cores, it might be useful to
     * narrow down the iteration by using a thread running bitmap per clutch.
     */
    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
    /*
     * Iterate all CPUs and look for the ones running threads from this thread group and are
     * not restricted to the specific cluster (due to overrides etc.)
     */
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor_t src_processor = processor_array[cpuid];
        boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
        sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
        boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;

        if (expected_tg && non_preferred_cluster) {
            ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[cpuid] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, cpuid);
            } else if (src_processor == current_processor()) {
                ast_on(AST_PREEMPT);
                bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
            }
        }
    }

    /* Perform all the IPIs */
    if (bit_first(ast_processor_map) != -1) {
        for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
            processor_t ast_processor = processor_array[cpuid];
            sched_ipi_perform(ast_processor, ipi_type[cpuid]);
        }
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
    }
}
/*
 * sched_edge_tg_preferred_cluster_change()
 *
 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
 * thread migrations based on the policy specified by CLPC.
 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
 * for each QoS within the thread group.
 */
void
sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
{
    sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
    /*
     * In order to optimize the processing, create a bitmap which represents all QoS buckets
     * for which the preferred cluster has changed.
     */
    bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
    for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
        uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
        uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
        if (old_preferred_cluster != new_preferred_cluster) {
            bitmap_set(clutch_bucket_modify_bitmap, bucket);
        }
    }
    if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
        /* No changes in any clutch buckets; nothing to do here */
        return;
    }

    for (uint32_t cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
        processor_set_t pset = pset_array[cluster_id];
        spl_t s = splsched();
        pset_lock(pset);
        /*
         * The first operation is to update the preferred cluster for all QoS buckets within the
         * thread group so that any future threads becoming runnable would see the new preferred
         * cluster value.
         */
        sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
        /*
         * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
         * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
         *
         * <Edge Multi-cluster Support Needed>
         */
        /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
        sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
            tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
        /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
        sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
            tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
        /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
        splx(s);
    }
}
4472 * sched_edge_pset_made_schedulable()
4474 * Routine to migrate all the clutch buckets which are not in their recommended
4475 * pset hierarchy now that a new pset has become runnable. Its possible that this
4476 * routine is called when the pset is already marked schedulable.
4478 * Invoked with the pset lock held and interrupts disabled.
void
sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
{
	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
		/* Nothing to do here since pset is already marked schedulable */
		if (drop_lock) {
			pset_unlock(dst_pset);
		}
		return;
	}

	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);

	thread_t thread = sched_edge_processor_idle(dst_pset);
	if (thread != THREAD_NULL) {
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}

	if (!drop_lock) {
		pset_lock(dst_pset);
	}
}
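/*
 * Locking sketch (illustrative; the caller shown is hypothetical and this assumes
 * sched_edge_processor_idle() returns with the pset unlocked, which is why the lock
 * is re-acquired above when drop_lock is FALSE): with drop_lock == FALSE the routine
 * returns with the pset lock still held, so a caller that needs to keep operating on
 * the pset can do:
 *
 *	pset_lock(pset);
 *	sched_edge_pset_made_schedulable(processor, pset, FALSE);
 *	... pset lock is still held at this point ...
 *	pset_unlock(pset);
 *
 * With drop_lock == TRUE the routine returns with the pset unlocked.
 */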
extern int sched_amp_spill_deferred_ipi;
extern int sched_amp_pcores_preempt_immediate_ipi;
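/*
 * When sched_edge_migrate_ipi_immediate is set, preemption IPIs for non-RT threads are
 * upgraded from the deferred policy to immediate IPIs whenever the thread's preferred
 * pset is expected to stay busy for at least the deferred IPI timeout (see the average
 * execution time check in sched_edge_ipi_policy() below).
 */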
int sched_edge_migrate_ipi_immediate = 1;
sched_ipi_type_t
sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For the spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_edge_migrate_ipi_immediate) {
				/*
				 * For workloads that are going wide, it might be useful to use an immediate IPI to
				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
				 * latency on the preferred pset as an estimate of busyness.
				 *
				 * <Edge Multi-cluster Support Needed>
				 */
				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
				}
			}
			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}

	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
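/*
 * Worked example with made-up numbers for the sched_edge_migrate_ipi_immediate path
 * above: for a non-RT preemption, if the average thread execution time for the thread's
 * scheduling bucket on its preferred pset is 100us, the policy compares
 * 100 * NSEC_PER_USEC = 100000ns against ml_cpu_signal_deferred_get_timer(). With a
 * deferred IPI timer of, say, 64us (64000ns), 100000 >= 64000, so an idle destination
 * gets SCHED_IPI_IDLE and a busy one gets SCHED_IPI_IMMEDIATE instead of a deferred IPI.
 */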
/*
 * sched_edge_qos_max_parallelism()
 */
uint32_t
sched_edge_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = 0;
	uint32_t pcount = 0;

	for (int cluster_id = 0; cluster_id < MAX_PSETS; cluster_id++) {
		processor_set_t pset = pset_array[cluster_id];
		if (pset_type_for_id(cluster_id) == CLUSTER_TYPE_P) {
			pcount += pset->cpu_set_count;
		} else {
			ecount += pset->cpu_set_count;
		}
	}

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, limit the width to just the P-cores
		 * since we do not spill/rebalance for RT threads.
		 */
		return pcount;
	}

	/*
	 * The Edge scheduler supports per-QoS recommendations for thread groups.
	 * This enables lower QoS buckets (such as UT) to be scheduled on all
	 * CPUs on the system.
	 *
	 * The only restriction is for BG/Maintenance QoS classes for which the
	 * performance controller would never recommend execution on the P-cores.
	 * If that policy changes in the future, this value should be changed.
	 */
	switch (qos) {
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		return ecount;
	default:
		return ecount + pcount;
	}
}
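/*
 * Worked example with a hypothetical topology of one 4-CPU E-cluster and one 2-CPU
 * P-cluster (ecount = 4, pcount = 2):
 *
 *	sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, 0)                         -> 6
 *	sched_edge_qos_max_parallelism(THREAD_QOS_BACKGROUND, 0)                             -> 4
 *	sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, QOS_PARALLELISM_REALTIME)  -> 2
 */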
#endif /* CONFIG_SCHED_EDGE */

#endif /* CONFIG_SCHED_CLUTCH */