/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>
/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

static struct processor_set pset1;
static struct pset_node pset_node1;

#if DEVELOPMENT || DEBUG
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P-core and E-core processor sets.
 */
void
sched_amp_init(void)
{
    pset_init(&pset1, &pset_node1);
    pset_node1.psets = &pset1;
    pset_node0.node_list = &pset_node1;

    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        pcore_set = &pset0;
        ecore_set = &pset1;
    } else {
        ecore_set = &pset0;
        pcore_set = &pset1;
    }

    ecore_set->pset_cluster_type = PSET_AMP_E;
    ecore_set->pset_cluster_id = 0;

    pcore_set->pset_cluster_type = PSET_AMP_P;
    pcore_set->pset_cluster_id = 1;

#if DEVELOPMENT || DEBUG
    if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
        system_ecore_only = true;
    }
#endif /* DEVELOPMENT || DEBUG */

    sched_timeshare_init();
}
/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count/(1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * We see performance gains from doing immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations from using deferred IPIs
 * for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E".
 */
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;
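
/*
 * These per-class policies are read (relaxed) by sched_amp_qos_max_parallelism()
 * below: while a class remains at SCHED_PERFCTL_POLICY_DEFAULT its recommended
 * width is E-cores only, and any other value widens it to E-cores + P-cores.
 */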
/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides if a cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
    int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

    return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
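
/*
 * Worked example: the threshold is in the same fixed-point format as
 * sched_get_pset_load_average(). Assuming PSET_LOAD_FRACTIONAL_SHIFT == 4
 * (for illustration only), a pset with 4 recommended CPUs and the default
 * sched_amp_spill_count of 3 gives (4 << 4) + 3 = 67, i.e. the cluster
 * spills once its load average exceeds roughly 4 + 3/16 runnable threads.
 */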
/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with pset locked, returns with pset unlocked.
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
    processor_t processor;
    sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

    uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
    for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
        processor = processor_array[cpuid];
        if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

            processor->deadline = UINT64_MAX;
            pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);

            if (processor == current_processor()) {
                bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
            } else {
                ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL);
            }
            pset_unlock(pset);
            sched_ipi_perform(processor, ipi_type);
            return;
        }
    }

    processor_t ast_processor = NULL;
    uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor = processor_array[cpuid];
        if (processor->current_recommended_pset_type == PSET_AMP_P) {
            /* Already running a spilled P-core recommended thread */
            continue;
        }
        if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
            /* Already received a spill signal */
            continue;
        }
        if (processor->current_pri >= spilled_thread_priority) {
            /* Already running a higher or equal priority thread */
            continue;
        }

        /* Found a suitable processor */
        bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
        if (processor == current_processor()) {
            ast_on(AST_PREEMPT);
        }
        ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL);
        if (ipi_type != SCHED_IPI_NONE) {
            ast_processor = processor;
        }
        break;
    }

    pset_unlock(pset);
    sched_ipi_perform(ast_processor, ipi_type);
}
/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
    if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
        return true;
    }

    uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

    for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
        processor_t processor = processor_array[cpuid];

        if (processor->current_recommended_pset_type == PSET_AMP_P) {
            /* This processor is already running a spilled thread */
            continue;
        }

        if (processor->current_pri < spilled_thread_priority) {
            return true;
        }
    }

    return false;
}
/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here.
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
    if (nset->pset_cluster_type == PSET_AMP_E) {
        /* Not relevant if ecores already preferred */
        return false;
    }

    if (!pset_is_recommended(ecore_set)) {
        /* E cores must be recommended */
        return false;
    }

    if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) {
        return false;
    }

    if (thread->sched_pri >= BASEPRI_RTQUEUES) {
        /* Never spill realtime threads */
        return false;
    }

    if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
        /* Don't spill if there are idle P cores */
        return false;
    }

    if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) && /* There is already a load on P cores */
        pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) {           /* There are lower priority E cores */
        return true;
    }

    return false;
}
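
/*
 * In short, a thread spills from the P-cluster to the E-cluster only when all
 * of the following hold: the target pset is the P-cluster, E-cores are
 * recommended, the thread is neither P-core-bound nor realtime, no recommended
 * P-core is idle, the P-cluster load average is at or above
 * sched_amp_spill_threshold(), and the E-cluster can accept a thread at this
 * priority.
 */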
/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
    /* pset is unlocked */

    /* Bound threads don't call this function */
    assert(thread->bound_processor == PROCESSOR_NULL);

    if (should_spill_to_ecores(pset, thread)) {
        pset_lock(ecore_set);

        pset_signal_spill(ecore_set, thread->sched_pri);
        /* returns with ecore_set unlocked */
    }
}
/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold.
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
    int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

    return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}
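
/*
 * As with the spill threshold, this is compared against the fixed-point pset
 * load average: roughly one runnable thread per recommended CPU plus a small
 * fractional margin (sched_amp_spill_steal when a spill is pending,
 * sched_amp_idle_steal otherwise; both default to 1 above).
 */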
/*
 * sched_amp_steal_thread_enabled()
 *
 * Stealing is enabled only for the E-core pset, and only while P-cores are online.
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
    return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0);
}
/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked.
 */
void
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
    assert(cprocessor == current_processor());

    pset_unlock(cpset);

    if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
        return;
    }

    /*
     * cprocessor is an idle, recommended P core processor.
     * Look for P-eligible threads that have spilled to an E core
     * and coax them to come back.
     */
    processor_set_t pset = ecore_set;

    pset_lock(pset);

    processor_t eprocessor;
    uint64_t ast_processor_map = 0;

    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
    uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        eprocessor = processor_array[cpuid];
        if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
            (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
            ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, eprocessor->cpu_id);
                assert(eprocessor != cprocessor);
            }
        }
    }

    pset_unlock(pset);

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
    }
}
/*
 * Helper function for sched_amp_thread_group_recommendation_change()
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
    assert(pset->pset_cluster_type == PSET_AMP_E);
    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    spl_t s = splsched();
    pset_lock(pset);

    uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor_t eprocessor = processor_array[cpuid];
        if (eprocessor->current_thread_group == tg) {
            ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, eprocessor->cpu_id);
            } else if (eprocessor == current_processor()) {
                ast_on(AST_PREEMPT);
                bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
            }
        }
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

    pset_unlock(pset);

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
    }

    splx(s);
}
/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
    processor_set_t pset = dst->processor_set;
    assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
    assert(dst != current_processor());

    boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
    deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

    switch (event) {
    case SCHED_IPI_EVENT_SPILL:
        /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
        if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
            return sched_ipi_deferred_policy(pset, dst, event);
        }
        break;
    case SCHED_IPI_EVENT_PREEMPT:
        /*
         * For preemption, the default policy is to use deferred IPIs
         * for non-RT P-core preemption. Override that behavior if
         * sched_amp_pcores_preempt_immediate_ipi is set.
         */
        if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
            if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
                return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
            }
        }
        break;
    default:
        break;
    }
    /* Default back to the global policy for all other scenarios */
    return sched_ipi_policy(dst, thread, dst_idle, event);
}
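
/*
 * Decision summary: spill events take the deferred IPI path when
 * CONFIG_SCHED_DEFERRED_AST is built in and sched_amp_spill_deferred_ipi is
 * set; non-realtime preemptions targeting the P-cluster are upgraded to
 * idle/immediate IPIs when sched_amp_pcores_preempt_immediate_ipi is set;
 * everything else falls back to the global sched_ipi_policy().
 */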
/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
    uint32_t ecount = ecore_set->cpu_set_count;
    uint32_t pcount = pcore_set->cpu_set_count;

    if (options & QOS_PARALLELISM_REALTIME) {
        /*
         * For realtime threads on AMP, we want to limit the width to
         * just the P-cores since we do not spill/rebalance RT threads.
         */
        return pcount;
    }

    /*
     * The default AMP scheduler policy is to run utility and bg threads
     * on E-cores only. Run-time policy adjustment unlocks the ability for
     * utility and bg threads to be scheduled based on run-time conditions.
     */
    switch (qos) {
    case THREAD_QOS_UTILITY:
        return (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
    case THREAD_QOS_BACKGROUND:
    case THREAD_QOS_MAINTENANCE:
        return (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
    default:
        return ecount + pcount;
    }
}
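
/*
 * Worked example on a hypothetical 4 E-core + 2 P-core system: realtime
 * callers get a width of 2 (P-cores only); UTILITY, BACKGROUND and
 * MAINTENANCE get 4 while their class policy is SCHED_PERFCTL_POLICY_DEFAULT
 * and 6 once it has been changed at run-time; all other QoS classes get 6.
 */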
pset_node_t
sched_amp_choose_node(thread_t thread)
{
    if (recommended_pset_type(thread) == PSET_AMP_P) {
        return pcore_set->node;
    }
    return ecore_set->node;
}
/*
 * sched_amp_rt_runq()
 */
rt_queue_t
sched_amp_rt_runq(processor_set_t pset)
{
    return &pset->rt_runq;
}
/*
 * sched_amp_rt_init()
 */
void
sched_amp_rt_init(processor_set_t pset)
{
    pset_rt_init(pset);
}
/*
 * sched_amp_rt_queue_shutdown()
 */
void
sched_amp_rt_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    thread_t thread;
    queue_head_t tqueue;

    pset_lock(pset);

    /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
    if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
        pset_unlock(pset);
        return;
    }

    queue_init(&tqueue);

    while (rt_runq_count(pset) > 0) {
        thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
        thread->runq = PROCESSOR_NULL;
        SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats,
            os_atomic_load(&pset->rt_runq.count, relaxed));
        rt_runq_count_decr(pset);
        enqueue_tail(&tqueue, &thread->runq_links);
    }
    sched_update_pset_load_average(pset, 0);
    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);

        thread_lock(thread);

        thread_setrun(thread, SCHED_TAILQ);

        thread_unlock(thread);
    }
}
/*
 * sched_amp_rt_runq_scan()
 *
 * Assumes the RT lock is not held, and acquires splsched/rt_lock itself.
 */
void
sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context)
{
    thread_t thread;

    pset_node_t node = &pset_node0;
    processor_set_t pset = node->psets;

    spl_t s = splsched();
    do {
        while (pset != NULL) {
            pset_lock(pset);

            qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
                if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
                    scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
                }
            }

            pset_unlock(pset);

            pset = pset->pset_list;
        }
    } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

    splx(s);
}
/*
 * sched_amp_rt_runq_count_sum()
 */
int64_t
sched_amp_rt_runq_count_sum(void)
{
    pset_node_t node = &pset_node0;
    processor_set_t pset = node->psets;
    int64_t count = 0;

    do {
        while (pset != NULL) {
            count += pset->rt_runq.runq_stats.count_sum;

            pset = pset->pset_list;
        }
    } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

    return count;
}