osfmk/kern/sched_amp_common.c

   1 /*
   2  * Copyright (c) 2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <mach/mach_types.h>
  30 #include <mach/machine.h>
  31 #include <machine/machine_routines.h>
  32 #include <machine/sched_param.h>
  33 #include <machine/machine_cpu.h>
  34 #include <kern/kern_types.h>
  35 #include <kern/debug.h>
  36 #include <kern/machine.h>
  37 #include <kern/misc_protos.h>
  38 #include <kern/processor.h>
  39 #include <kern/queue.h>
  40 #include <kern/sched.h>
  41 #include <kern/sched_prim.h>
  42 #include <kern/task.h>
  43 #include <kern/thread.h>
  44 #include <machine/atomic.h>
  45 #include <sys/kdebug.h>
  46 #include <kern/sched_amp_common.h>
  47
  48 #if __AMP__
  49
  50 /* Exported globals */
  51 processor_set_t ecore_set = NULL;
  52 processor_set_t pcore_set = NULL;
  53
  54 static struct processor_set pset1;
  55 static struct pset_node pset_node1;
  56
  57 #if DEVELOPMENT || DEBUG
  58 bool system_ecore_only = false;
  59 #endif /* DEVELOPMENT || DEBUG */
  60
  61 /*
  62  * sched_amp_init()
  63  *
  64  * Initialize the pcore_set and ecore_set globals which describe the
  65  * P/E processor sets.
  66  */
  67 void
  68 sched_amp_init(void)
  69 {
  70         pset_init(&pset1, &pset_node1);
  71         pset_node1.psets = &pset1;
  72         pset_node0.node_list = &pset_node1;
  73
  74         if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
  75                 pcore_set = &pset0;
  76                 ecore_set = &pset1;
  77         } else {
  78                 ecore_set = &pset0;
  79                 pcore_set = &pset1;
  80         }
  81
  82         ecore_set->pset_cluster_type = PSET_AMP_E;
  83         ecore_set->pset_cluster_id = 0;
  84
  85         pcore_set->pset_cluster_type = PSET_AMP_P;
  86         pcore_set->pset_cluster_id = 1;
  87
  88 #if !CONFIG_SCHED_CLUTCH
  89         /*
  90          * For non-clutch scheduler, allow system to be e-core only.
  91          * Clutch scheduler support for this feature needs to be implemented.
  92          */
  93 #if DEVELOPMENT || DEBUG
  94         if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
  95                 system_ecore_only = true;
  96         }
  97 #endif /* DEVELOPMENT || DEBUG */
  98
  99 #endif /* !CONFIG_SCHED_CLUTCH */
 100         sched_timeshare_init();
 101 }
 102
 103 /* Spill threshold load average is ncpus in pset + (sched_amp_spill_count/(1 << PSET_LOAD_FRACTIONAL_SHIFT) */
 104 int sched_amp_spill_count = 3;
 105 int sched_amp_idle_steal = 1;
 106 int sched_amp_spill_steal = 1;
 107
 108 /*
 109  * We see performance gains from doing immediate IPIs to P-cores to run
 110  * P-eligible threads and lesser P-E migrations from using deferred IPIs
 111  * for spill.
 112  */
 113 int sched_amp_spill_deferred_ipi = 1;
 114 int sched_amp_pcores_preempt_immediate_ipi = 1;
 115
 116
 117 /*
 118  * sched_amp_spill_threshold()
 119  *
 120  * Routine to calulate spill threshold which decides if cluster should spill.
 121  */
 122 int
 123 sched_amp_spill_threshold(processor_set_t pset)
 124 {
 125         int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);
 126
 127         return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
 128 }
 129
 130 /*
 131  * pset_signal_spill()
 132  *
 133  * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 134  * Called with pset locked, returns unlocked
 135  */
 136 void
 137 pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
 138 {
 139         processor_t processor;
 140         sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
 141
 142         uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
 143         for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
 144                 processor = processor_array[cpuid];
 145                 if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
 146                         KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);
 147
 148                         processor->deadline = UINT64_MAX;
 149                         pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
 150
 151                         if (processor == current_processor()) {
 152                                 bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
 153                         } else {
 154                                 ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL);
 155                         }
 156                         pset_unlock(pset);
 157                         sched_ipi_perform(processor, ipi_type);
 158                         return;
 159                 }
 160         }
 161
 162         processor_t ast_processor = NULL;
 163         uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
 164         for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
 165                 processor = processor_array[cpuid];
 166                 if (processor->current_recommended_pset_type == PSET_AMP_P) {
 167                         /* Already running a spilled P-core recommended thread */
 168                         continue;
 169                 }
 170                 if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
 171                         /* Already received a spill signal */
 172                         continue;
 173                 }
 174                 if (processor->current_pri >= spilled_thread_priority) {
 175                         /* Already running a higher or equal priority thread */
 176                         continue;
 177                 }
 178
 179                 /* Found a suitable processor */
 180                 bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
 181                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
 182                 if (processor == current_processor()) {
 183                         ast_on(AST_PREEMPT);
 184                 }
 185                 ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL);
 186                 if (ipi_type != SCHED_IPI_NONE) {
 187                         ast_processor = processor;
 188                 }
 189                 break;
 190         }
 191
 192         pset_unlock(pset);
 193         sched_ipi_perform(ast_processor, ipi_type);
 194 }
 195
 196 /*
 197  * pset_should_accept_spilled_thread()
 198  *
 199  * Routine to decide if pset should accept spilled threads.
 200  * This function must be safe to call (to use as a hint) without holding the pset lock.
 201  */
 202 bool
 203 pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
 204 {
 205         if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
 206                 return true;
 207         }
 208
 209         uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);
 210
 211         for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
 212                 processor_t processor = processor_array[cpuid];
 213
 214                 if (processor->current_recommended_pset_type == PSET_AMP_P) {
 215                         /* This processor is already running a spilled thread */
 216                         continue;
 217                 }
 218
 219                 if (processor->current_pri < spilled_thread_priority) {
 220                         return true;
 221                 }
 222         }
 223
 224         return false;
 225 }
 226
 227 /*
 228  * should_spill_to_ecores()
 229  *
 230  * Spill policy is implemented here
 231  */
 232 bool
 233 should_spill_to_ecores(processor_set_t nset, thread_t thread)
 234 {
 235         if (nset->pset_cluster_type == PSET_AMP_E) {
 236                 /* Not relevant if ecores already preferred */
 237                 return false;
 238         }
 239
 240         if (!pset_is_recommended(ecore_set)) {
 241                 /* E cores must be recommended */
 242                 return false;
 243         }
 244
 245 #if !CONFIG_SCHED_CLUTCH
 246         /* Per-thread P-core scheduling support needs to be implemented for clutch scheduler */
 247         if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) {
 248                 return false;
 249         }
 250 #endif /* !CONFIG_SCHED_CLUTCH */
 251
 252         if (thread->sched_pri >= BASEPRI_RTQUEUES) {
 253                 /* Never spill realtime threads */
 254                 return false;
 255         }
 256
 257         if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
 258                 /* Don't spill if idle cores */
 259                 return false;
 260         }
 261
 262         if ((sched_get_pset_load_average(nset) >= sched_amp_spill_threshold(nset)) &&  /* There is already a load on P cores */
 263             pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* There are lower priority E cores */
 264                 return true;
 265         }
 266
 267         return false;
 268 }
 269
 270 /*
 271  * sched_amp_check_spill()
 272  *
 273  * Routine to check if the thread should be spilled and signal the pset if needed.
 274  */
 275 void
 276 sched_amp_check_spill(processor_set_t pset, thread_t thread)
 277 {
 278         /* pset is unlocked */
 279
 280         /* Bound threads don't call this function */
 281         assert(thread->bound_processor == PROCESSOR_NULL);
 282
 283         if (should_spill_to_ecores(pset, thread)) {
 284                 pset_lock(ecore_set);
 285
 286                 pset_signal_spill(ecore_set, thread->sched_pri);
 287                 /* returns with ecore_set unlocked */
 288         }
 289 }
 290
 291 /*
 292  * sched_amp_steal_threshold()
 293  *
 294  * Routine to calculate the steal threshold
 295  */
 296 int
 297 sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
 298 {
 299         int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);
 300
 301         return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
 302 }
 303
 304 /*
 305  * sched_amp_steal_thread_enabled()
 306  *
 307  */
 308 bool
 309 sched_amp_steal_thread_enabled(processor_set_t pset)
 310 {
 311         return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0);
 312 }
 313
 314 /*
 315  * sched_amp_balance()
 316  *
 317  * Invoked with pset locked, returns with pset unlocked
 318  */
 319 void
 320 sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
 321 {
 322         assert(cprocessor == current_processor());
 323
 324         pset_unlock(cpset);
 325
 326         if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
 327                 return;
 328         }
 329
 330         /*
 331          * cprocessor is an idle, recommended P core processor.
 332          * Look for P-eligible threads that have spilled to an E core
 333          * and coax them to come back.
 334          */
 335
 336         processor_set_t pset = ecore_set;
 337
 338         pset_lock(pset);
 339
 340         processor_t eprocessor;
 341         uint64_t ast_processor_map = 0;
 342
 343         sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
 344         uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
 345         for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
 346                 eprocessor = processor_array[cpuid];
 347                 if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
 348                     (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
 349                         ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
 350                         if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
 351                                 bit_set(ast_processor_map, eprocessor->cpu_id);
 352                                 assert(eprocessor != cprocessor);
 353                         }
 354                 }
 355         }
 356
 357         pset_unlock(pset);
 358
 359         for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
 360                 processor_t ast_processor = processor_array[cpuid];
 361                 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
 362         }
 363 }
 364
 365 /*
 366  * Helper function for sched_amp_thread_group_recommendation_change()
 367  * Find all the cores in the pset running threads from the thread_group tg
 368  * and send them a rebalance interrupt.
 369  */
 370 void
 371 sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
 372 {
 373         assert(pset->pset_cluster_type == PSET_AMP_E);
 374         uint64_t ast_processor_map = 0;
 375         sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
 376
 377         spl_t s = splsched();
 378         pset_lock(pset);
 379
 380         uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
 381         for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
 382                 processor_t eprocessor = processor_array[cpuid];
 383                 if (eprocessor->current_thread_group == tg) {
 384                         ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
 385                         if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
 386                                 bit_set(ast_processor_map, eprocessor->cpu_id);
 387                         } else if (eprocessor == current_processor()) {
 388                                 ast_on(AST_PREEMPT);
 389                                 bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
 390                         }
 391                 }
 392         }
 393
 394         KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);
 395
 396         pset_unlock(pset);
 397
 398         for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
 399                 processor_t ast_processor = processor_array[cpuid];
 400                 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
 401         }
 402
 403         splx(s);
 404 }
 405
 406 /*
 407  * sched_amp_ipi_policy()
 408  */
 409 sched_ipi_type_t
 410 sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
 411 {
 412         processor_set_t pset = dst->processor_set;
 413         assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
 414         assert(dst != current_processor());
 415
 416         boolean_t deferred_ipi_supported = false;
 417 #if defined(CONFIG_SCHED_DEFERRED_AST)
 418         deferred_ipi_supported = true;
 419 #endif /* CONFIG_SCHED_DEFERRED_AST */
 420
 421         switch (event) {
 422         case SCHED_IPI_EVENT_SPILL:
 423                 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
 424                 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
 425                         return sched_ipi_deferred_policy(pset, dst, event);
 426                 }
 427                 break;
 428         case SCHED_IPI_EVENT_PREEMPT:
 429                 /* For preemption, the default policy is to use deferred IPIs
 430                  * for Non-RT P-core preemption. Override that behavior if
 431                  * sched_amp_pcores_preempt_immediate_ipi is set
 432                  */
 433                 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
 434                         if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
 435                                 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
 436                         }
 437                 }
 438                 break;
 439         default:
 440                 break;
 441         }
 442         /* Default back to the global policy for all other scenarios */
 443         return sched_ipi_policy(dst, thread, dst_idle, event);
 444 }
 445
 446 /*
 447  * sched_amp_qos_max_parallelism()
 448  */
 449 uint32_t
 450 sched_amp_qos_max_parallelism(int qos, uint64_t options)
 451 {
 452         uint32_t ecount = ecore_set->cpu_set_count;
 453         uint32_t pcount = pcore_set->cpu_set_count;
 454
 455         if (options & QOS_PARALLELISM_REALTIME) {
 456                 /* For realtime threads on AMP, we would want them
 457                  * to limit the width to just the P-cores since we
 458                  * do not spill/rebalance for RT threads.
 459                  */
 460                 return pcount;
 461         }
 462
 463         /*
 464          * The current AMP scheduler policy is not run
 465          * background and utility threads on the P-Cores.
 466          */
 467         switch (qos) {
 468         case THREAD_QOS_UTILITY:
 469         case THREAD_QOS_BACKGROUND:
 470         case THREAD_QOS_MAINTENANCE:
 471                 return ecount;
 472         default:
 473                 return ecount + pcount;
 474         }
 475 }
 476
 477 /*
 478  * sched_amp_rt_runq()
 479  */
 480 rt_queue_t
 481 sched_amp_rt_runq(processor_set_t pset)
 482 {
 483         return &pset->rt_runq;
 484 }
 485
 486 /*
 487  * sched_amp_rt_init()
 488  */
 489 void
 490 sched_amp_rt_init(processor_set_t pset)
 491 {
 492         pset_rt_init(pset);
 493 }
 494
 495 /*
 496  * sched_amp_rt_queue_shutdown()
 497  */
 498 void
 499 sched_amp_rt_queue_shutdown(processor_t processor)
 500 {
 501         processor_set_t pset = processor->processor_set;
 502         thread_t        thread;
 503         queue_head_t    tqueue;
 504
 505         pset_lock(pset);
 506
 507         /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
 508         if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
 509                 pset_unlock(pset);
 510                 return;
 511         }
 512
 513         queue_init(&tqueue);
 514
 515         rt_lock_lock(pset);
 516
 517         while (rt_runq_count(pset) > 0) {
 518                 thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
 519                 thread->runq = PROCESSOR_NULL;
 520                 SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats, pset->rt_runq.count);
 521                 rt_runq_count_decr(pset);
 522                 enqueue_tail(&tqueue, &thread->runq_links);
 523         }
 524         rt_lock_unlock(pset);
 525         sched_update_pset_load_average(pset);
 526         pset_unlock(pset);
 527
 528         qe_foreach_element_safe(thread, &tqueue, runq_links) {
 529                 remqueue(&thread->runq_links);
 530
 531                 thread_lock(thread);
 532
 533                 thread_setrun(thread, SCHED_TAILQ);
 534
 535                 thread_unlock(thread);
 536         }
 537 }
 538
 539 /*
 540  * sched_amp_rt_runq_scan()
 541  *
 542  * Assumes RT lock is not held, and acquires splsched/rt_lock itself
 543  */
 544 void
 545 sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context)
 546 {
 547         thread_t        thread;
 548
 549         pset_node_t node = &pset_node0;
 550         processor_set_t pset = node->psets;
 551
 552         spl_t s = splsched();
 553         do {
 554                 while (pset != NULL) {
 555                         rt_lock_lock(pset);
 556
 557                         qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
 558                                 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
 559                                         scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
 560                                 }
 561                         }
 562
 563                         rt_lock_unlock(pset);
 564
 565                         pset = pset->pset_list;
 566                 }
 567         } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
 568         splx(s);
 569 }
 570
 571 /*
 572  * sched_amp_rt_runq_count_sum()
 573  */
 574 int64_t
 575 sched_amp_rt_runq_count_sum(void)
 576 {
 577         pset_node_t node = &pset_node0;
 578         processor_set_t pset = node->psets;
 579         int64_t count = 0;
 580
 581         do {
 582                 while (pset != NULL) {
 583                         count += pset->rt_runq.runq_stats.count_sum;
 584
 585                         pset = pset->pset_list;
 586                 }
 587         } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
 588
 589         return count;
 590 }
 591
 592 #endif /* __AMP__ */