/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>
/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

static struct processor_set pset1;
static struct pset_node pset_node1;

#if DEVELOPMENT || DEBUG
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */
/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P-core and E-core processor sets.
 */
void
sched_amp_init(void)
{
    pset_init(&pset1, &pset_node1);
    pset_node1.psets = &pset1;
    pset_node0.node_list = &pset_node1;

    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        pcore_set = &pset0;
        ecore_set = &pset1;
    } else {
        ecore_set = &pset0;
        pcore_set = &pset1;
    }

    ecore_set->pset_cluster_type = PSET_AMP_E;
    ecore_set->pset_cluster_id = 0;

    pcore_set->pset_cluster_type = PSET_AMP_P;
    pcore_set->pset_cluster_id = 1;

#if DEVELOPMENT || DEBUG
    if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
        system_ecore_only = true;
    }
#endif /* DEVELOPMENT || DEBUG */

    sched_timeshare_init();
}
/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count/(1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * We see performance gains from doing immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations from using deferred IPIs
 * for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E".
 */
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;
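
/*
 * These per-class policies are read (relaxed) by sched_amp_qos_max_parallelism()
 * below: while a class remains at SCHED_PERFCTL_POLICY_DEFAULT its recommended
 * width is E-cores only, and any other value widens it to E-cores + P-cores.
 */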
/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides if a cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
    int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

    return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
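
/*
 * Worked example: the threshold is in the same fixed-point format as
 * sched_get_pset_load_average(). Assuming PSET_LOAD_FRACTIONAL_SHIFT == 4
 * (for illustration only), a pset with 4 recommended CPUs and the default
 * sched_amp_spill_count of 3 gives (4 << 4) + 3 = 67, i.e. the cluster
 * spills once its load average exceeds roughly 4 + 3/16 runnable threads.
 */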
/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with pset locked, returns with pset unlocked.
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
    processor_t processor;
    sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

    uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
    for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
        processor = processor_array[cpuid];
        if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

            processor->deadline = UINT64_MAX;
            pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);

            if (processor == current_processor()) {
                bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
            } else {
                ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL);
            }
            pset_unlock(pset);
            sched_ipi_perform(processor, ipi_type);
            return;
        }
    }

    processor_t ast_processor = NULL;
    uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor = processor_array[cpuid];
        if (processor->current_recommended_pset_type == PSET_AMP_P) {
            /* Already running a spilled P-core recommended thread */
            continue;
        }
        if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
            /* Already received a spill signal */
            continue;
        }
        if (processor->current_pri >= spilled_thread_priority) {
            /* Already running a higher or equal priority thread */
            continue;
        }

        /* Found a suitable processor */
        bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
        if (processor == current_processor()) {
            ast_on(AST_PREEMPT);
        }
        ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL);
        if (ipi_type != SCHED_IPI_NONE) {
            ast_processor = processor;
        }
        break;
    }

    pset_unlock(pset);
    sched_ipi_perform(ast_processor, ipi_type);
}
/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
    if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
        return true;
    }

    uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

    for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
        processor_t processor = processor_array[cpuid];

        if (processor->current_recommended_pset_type == PSET_AMP_P) {
            /* This processor is already running a spilled thread */
            continue;
        }

        if (processor->current_pri < spilled_thread_priority) {
            return true;
        }
    }

    return false;
}
/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here.
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
    if (nset->pset_cluster_type == PSET_AMP_E) {
        /* Not relevant if ecores already preferred */
        return false;
    }

    if (!pset_is_recommended(ecore_set)) {
        /* E cores must be recommended */
        return false;
    }

    if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) {
        return false;
    }

    if (thread->sched_pri >= BASEPRI_RTQUEUES) {
        /* Never spill realtime threads */
        return false;
    }

    if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
        /* Don't spill if there are idle P cores */
        return false;
    }

    if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) && /* There is already a load on P cores */
        pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) {           /* There are lower priority E cores */
        return true;
    }

    return false;
}
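
/*
 * In short, a thread spills from the P-cluster to the E-cluster only when all
 * of the following hold: the target pset is the P-cluster, E-cores are
 * recommended, the thread is neither P-core-bound nor realtime, no recommended
 * P-core is idle, the P-cluster load average is at or above
 * sched_amp_spill_threshold(), and the E-cluster can accept a thread at this
 * priority.
 */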
/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
    /* pset is unlocked */

    /* Bound threads don't call this function */
    assert(thread->bound_processor == PROCESSOR_NULL);

    if (should_spill_to_ecores(pset, thread)) {
        pset_lock(ecore_set);

        pset_signal_spill(ecore_set, thread->sched_pri);
        /* returns with ecore_set unlocked */
    }
}
/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold.
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
    int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

    return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}
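
/*
 * As with the spill threshold, this is compared against the fixed-point pset
 * load average: roughly one runnable thread per recommended CPU plus a small
 * fractional margin (sched_amp_spill_steal when a spill is pending,
 * sched_amp_idle_steal otherwise; both default to 1 above).
 */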
/*
 * sched_amp_steal_thread_enabled()
 *
 * Stealing is enabled only for the E-core pset, and only while P-cores are online.
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
    return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0);
}
/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked.
 */
void
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
    assert(cprocessor == current_processor());

    pset_unlock(cpset);

    if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
        return;
    }

    /*
     * cprocessor is an idle, recommended P core processor.
     * Look for P-eligible threads that have spilled to an E core
     * and coax them to come back.
     */
    processor_set_t pset = ecore_set;

    pset_lock(pset);

    processor_t eprocessor;
    uint64_t ast_processor_map = 0;

    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
    uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        eprocessor = processor_array[cpuid];
        if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
            (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
            ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, eprocessor->cpu_id);
                assert(eprocessor != cprocessor);
            }
        }
    }

    pset_unlock(pset);

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
    }
}
/*
 * Helper function for sched_amp_thread_group_recommendation_change()
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
    assert(pset->pset_cluster_type == PSET_AMP_E);
    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    spl_t s = splsched();
    pset_lock(pset);

    uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor_t eprocessor = processor_array[cpuid];
        if (eprocessor->current_thread_group == tg) {
            ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, eprocessor->cpu_id);
            } else if (eprocessor == current_processor()) {
                ast_on(AST_PREEMPT);
                bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
            }
        }
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

    pset_unlock(pset);

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
    }

    splx(s);
}
/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
    processor_set_t pset = dst->processor_set;
    assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
    assert(dst != current_processor());

    boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
    deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

    switch (event) {
    case SCHED_IPI_EVENT_SPILL:
        /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
        if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
            return sched_ipi_deferred_policy(pset, dst, event);
        }
        break;
    case SCHED_IPI_EVENT_PREEMPT:
        /*
         * For preemption, the default policy is to use deferred IPIs
         * for non-RT P-core preemption. Override that behavior if
         * sched_amp_pcores_preempt_immediate_ipi is set.
         */
        if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
            if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
                return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
            }
        }
        break;
    default:
        break;
    }
    /* Default back to the global policy for all other scenarios */
    return sched_ipi_policy(dst, thread, dst_idle, event);
}
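
/*
 * Decision summary: spill events take the deferred IPI path when
 * CONFIG_SCHED_DEFERRED_AST is built in and sched_amp_spill_deferred_ipi is
 * set; non-realtime preemptions targeting the P-cluster are upgraded to
 * idle/immediate IPIs when sched_amp_pcores_preempt_immediate_ipi is set;
 * everything else falls back to the global sched_ipi_policy().
 */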
/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
    uint32_t ecount = ecore_set->cpu_set_count;
    uint32_t pcount = pcore_set->cpu_set_count;

    if (options & QOS_PARALLELISM_REALTIME) {
        /*
         * For realtime threads on AMP, we want to limit the width to
         * just the P-cores since we do not spill/rebalance RT threads.
         */
        return pcount;
    }

    /*
     * The default AMP scheduler policy is to run utility and bg threads
     * on E-cores only. Run-time policy adjustment unlocks the ability for
     * utility and bg threads to be scheduled based on run-time conditions.
     */
    switch (qos) {
    case THREAD_QOS_UTILITY:
        return (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
    case THREAD_QOS_BACKGROUND:
    case THREAD_QOS_MAINTENANCE:
        return (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
    default:
        return ecount + pcount;
    }
}
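
/*
 * Worked example on a hypothetical 4 E-core + 2 P-core system: realtime
 * callers get a width of 2 (P-cores only); UTILITY, BACKGROUND and
 * MAINTENANCE get 4 while their class policy is SCHED_PERFCTL_POLICY_DEFAULT
 * and 6 once it has been changed at run-time; all other QoS classes get 6.
 */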
pset_node_t
sched_amp_choose_node(thread_t thread)
{
    if (recommended_pset_type(thread) == PSET_AMP_P) {
        return pcore_set->node;
    }
    return ecore_set->node;
}
/*
 * sched_amp_rt_runq()
 */
rt_queue_t
sched_amp_rt_runq(processor_set_t pset)
{
    return &pset->rt_runq;
}
/*
 * sched_amp_rt_init()
 */
void
sched_amp_rt_init(processor_set_t pset)
{
    pset_rt_init(pset);
}
/*
 * sched_amp_rt_queue_shutdown()
 */
void
sched_amp_rt_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    thread_t thread;
    queue_head_t tqueue;

    pset_lock(pset);

    /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
    if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
        pset_unlock(pset);
        return;
    }

    queue_init(&tqueue);

    while (rt_runq_count(pset) > 0) {
        thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
        thread->runq = PROCESSOR_NULL;
        SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats,
            os_atomic_load(&pset->rt_runq.count, relaxed));
        rt_runq_count_decr(pset);
        enqueue_tail(&tqueue, &thread->runq_links);
    }
    sched_update_pset_load_average(pset, 0);
    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);

        thread_lock(thread);

        thread_setrun(thread, SCHED_TAILQ);

        thread_unlock(thread);
    }
}
/*
 * sched_amp_rt_runq_scan()
 *
 * Assumes the RT lock is not held, and acquires splsched/rt_lock itself.
 */
void
sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context)
{
    thread_t thread;

    pset_node_t node = &pset_node0;
    processor_set_t pset = node->psets;

    spl_t s = splsched();
    do {
        while (pset != NULL) {
            pset_lock(pset);

            qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
                if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
                    scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
                }
            }

            pset_unlock(pset);

            pset = pset->pset_list;
        }
    } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

    splx(s);
}
/*
 * sched_amp_rt_runq_count_sum()
 */
int64_t
sched_amp_rt_runq_count_sum(void)
{
    pset_node_t node = &pset_node0;
    processor_set_t pset = node->psets;
    int64_t count = 0;

    do {
        while (pset != NULL) {
            count += pset->rt_runq.runq_stats.count_sum;

            pset = pset->pset_list;
        }
    } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

    return count;
}