#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
-#include <machine/machlimits.h>
+#include <machine/limits.h>
#include <machine/atomic.h>
#include <machine/commpage.h>
#include <kern/kern_types.h>
#include <kern/backtrace.h>
#include <kern/clock.h>
-#include <kern/counters.h>
#include <kern/cpu_number.h>
#include <kern/cpu_data.h>
#include <kern/smp.h>
#include <kern/host.h>
#include <stdatomic.h>
+struct sched_statistics PERCPU_DATA(sched_stats);
+bool sched_stats_active;
+
int
rt_runq_count(processor_set_t pset)
{
}
#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
-int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
+TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
-int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
+TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
-#define MAX_UNSAFE_QUANTA 800
-int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
+#define MAX_UNSAFE_QUANTA 800
+TUNABLE(int, max_unsafe_quanta, "unsafe", MAX_UNSAFE_QUANTA);
-#define MAX_POLL_QUANTA 2
-int max_poll_quanta = MAX_POLL_QUANTA;
+#define MAX_POLL_QUANTA 2
+TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
-int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
+int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
uint64_t max_poll_computation;
uint32_t max_rt_quantum;
uint32_t min_rt_quantum;
+uint32_t rt_constraint_threshold;
+
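/*
 * Illustrative sketch, not part of this change: each
 * TUNABLE(type, var, "bootarg", default) above declares a global that
 * starts at its default value and is overridden during early startup
 * when the matching boot-arg is supplied, replacing the older pattern
 * of a plain global plus an explicit probe from an init routine, roughly:
 *
 *	int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
 *	...
 *	PE_parse_boot_argn("unsafe", &max_unsafe_quanta,
 *	    sizeof(max_unsafe_quanta));
 */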
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
unsigned sched_tick;
#endif /* __arm__ || __arm64__ */
uint64_t sched_one_second_interval;
+boolean_t allow_direct_handoff = TRUE;
/* Forwards */
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
-#if CONFIG_SCHED_IDLE_IN_PLACE
-static thread_t thread_select_idle(
- thread_t thread,
- processor_t processor);
-#endif
-
thread_t processor_idle(
thread_t thread,
processor_t processor);
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
int8_t sched_load_shifts[NRQS];
-bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
+bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
-const struct sched_dispatch_table *sched_current_dispatch = NULL;
-
/*
* Statically allocate a buffer to hold the longest possible
* scheduler description string, as currently implemented.
/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;
-#if DEBUG
-
-/* Since using the indirect function dispatch table has a negative impact on
- * context switch performance, only allow DEBUG kernels to use that mechanism.
- */
-static void
-sched_init_override(void)
-{
- char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
-
- /* Check for runtime selection of the scheduler algorithm */
- if (!PE_parse_boot_argn("sched", sched_arg, sizeof(sched_arg))) {
- sched_arg[0] = '\0';
- }
- if (strlen(sched_arg) > 0) {
- if (0) {
- /* Allow pattern below */
-#if defined(CONFIG_SCHED_TRADITIONAL)
- } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
- sched_current_dispatch = &sched_traditional_dispatch;
- } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
- sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
-#endif
-#if defined(CONFIG_SCHED_MULTIQ)
- } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
- sched_current_dispatch = &sched_multiq_dispatch;
- } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
- sched_current_dispatch = &sched_dualq_dispatch;
-#endif
- } else {
-#if defined(CONFIG_SCHED_TRADITIONAL)
- printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
- printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
- sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
-#else
- panic("Unrecognized scheduler algorithm: %s", sched_arg);
-#endif
- }
- kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
- } else {
-#if defined(CONFIG_SCHED_MULTIQ)
- sched_current_dispatch = &sched_dualq_dispatch;
-#elif defined(CONFIG_SCHED_TRADITIONAL)
- sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
-#else
-#error No default scheduler implementation
-#endif
- kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
- }
-}
-
-#endif /* DEBUG */
-
void
sched_init(void)
{
-#if DEBUG
- sched_init_override();
-#else /* DEBUG */
+ boolean_t direct_handoff = FALSE;
kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
-#endif /* DEBUG */
if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
/* No boot-args, check in device tree */
SCHED(pset_init)(&pset0);
SCHED(processor_init)(master_processor);
+
+ if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
+ allow_direct_handoff = direct_handoff;
+ }
}
void
/* timeshare load calculation interval & deadline initialization */
clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
- sched_load_compute_deadline = sched_load_compute_interval_abs;
+ os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);
/*
* Compute conversion factor from usage to
void
pset_rt_init(processor_set_t pset)
{
- rt_lock_init(pset);
-
- pset->rt_runq.count = 0;
+ os_atomic_init(&pset->rt_runq.count, 0);
queue_init(&pset->rt_runq.queue);
memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
}
-rt_queue_t
-sched_rtglobal_runq(processor_set_t pset)
-{
- (void)pset;
-
- return &pset0.rt_runq;
-}
-
-void
-sched_rtglobal_init(processor_set_t pset)
-{
- if (pset == &pset0) {
- return pset_rt_init(pset);
- }
-
- /* Only pset0 rt_runq is used, so make it easy to detect
- * buggy accesses to others.
- */
- memset(&pset->rt_runq, 0xfd, sizeof pset->rt_runq);
-}
-
-void
-sched_rtglobal_queue_shutdown(processor_t processor)
-{
- (void)processor;
-}
-
static void
sched_realtime_timebase_init(void)
{
50, 1000 * NSEC_PER_USEC, &abstime);
assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
max_rt_quantum = (uint32_t)abstime;
+
+ /* constraint threshold for sending backup IPIs (4 ms) */
+ clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
+ assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
+ rt_constraint_threshold = (uint32_t)abstime;
}
void
bool
sched_steal_thread_enabled(processor_set_t pset)
{
- return pset->node->pset_count > 1;
+ return bit_count(pset->node->pset_map) > 1;
}
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
thread->wait_timer_is_set = FALSE;
}
+ boolean_t aticontext, pidle;
+ ml_get_power_state(&aticontext, &pidle);
+
/*
* Update scheduling state: not waiting,
* set running.
}
/* Update the runnable thread count */
- new_run_count = sched_run_incr(thread);
+ new_run_count = SCHED(run_count_incr)(thread);
+
+#if CONFIG_SCHED_AUTO_JOIN
+ if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
+ work_interval_auto_join_propagate(cthread, thread);
+ }
+#endif /*CONFIG_SCHED_AUTO_JOIN */
} else {
/*
* Either the thread is idling in place on another processor,
* or it hasn't finished context switching yet.
*/
-#if CONFIG_SCHED_IDLE_IN_PLACE
- if (thread->state & TH_IDLE) {
- processor_t processor = thread->last_processor;
-
- if (processor != current_processor()) {
- machine_signal_idle(processor);
- }
- }
-#else
assert((thread->state & TH_IDLE) == 0);
-#endif
/*
* The run count is only dropped after the context switch completes
* and the thread is still waiting, so we should not run_incr here
*/
- new_run_count = sched_run_buckets[TH_BUCKET_RUN];
+ new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
}
-
/*
* Calculate deadline for real-time threads.
*/
ctime = mach_absolute_time();
thread->realtime.deadline = thread->realtime.constraint + ctime;
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
+ (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
}
/*
* DRK: consider removing the callout wakeup counters in the future
* they're present for verification at the moment.
*/
- boolean_t aticontext, pidle;
- ml_get_power_state(&aticontext, &pidle);
if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
- uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
+ uint64_t ttd = current_processor()->timer_call_ttd;
if (ttd) {
if (ttd <= timer_deadline_tracking_bin_1) {
}
if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
- thread->callout_woken_from_icontext = aticontext;
- thread->callout_woken_from_platform_idle = pidle;
+ thread->callout_woken_from_icontext = !!aticontext;
+ thread->callout_woken_from_platform_idle = !!pidle;
thread->callout_woke_thread = FALSE;
}
return ready_for_runq;
}
+/*
+ * Routine: thread_allowed_for_handoff
+ * Purpose:
+ * Check if the thread is allowed for handoff operation
+ * Conditions:
+ * thread lock held, IPC locks may be held.
+ * TODO: In future, do not allow handoff if threads have different cluster
+ * recommendations.
+ */
+boolean_t
+thread_allowed_for_handoff(
+ thread_t thread)
+{
+ thread_t self = current_thread();
+
+ if (allow_direct_handoff &&
+ thread->sched_mode == TH_MODE_REALTIME &&
+ self->sched_mode == TH_MODE_REALTIME) {
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
/*
* Routine: thread_go
* Purpose:
kern_return_t
thread_go(
thread_t thread,
- wait_result_t wresult)
+ wait_result_t wresult,
+ waitq_options_t option)
{
+ thread_t self = current_thread();
+
assert_thread_magic(thread);
assert(thread->at_safe_point == FALSE);
if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
backtrace(&thread->thread_wakeup_bt[0],
- (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)));
+ (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL);
#endif
- thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
+ if ((option & WQ_OPTION_HANDOFF) &&
+ thread_allowed_for_handoff(thread)) {
+ thread_reference(thread);
+ assert(self->handoff_thread == NULL);
+ self->handoff_thread = thread;
+ } else {
+ thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
+ }
}
return KERN_SUCCESS;
boolean_t at_safe_point;
wait_interrupt_t interruptible = interruptible_orig;
+ if (thread->state & TH_IDLE) {
+ panic("Invalid attempt to wait while running the idle thread");
+ }
+
assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
/*
/* TODO: Can we instead assert TH_TERMINATE is not set? */
if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
- return thread_go(thread, wresult);
+ return thread_go(thread, wresult, WQ_OPTION_NONE);
} else {
return KERN_NOT_WAITING;
}
int sched_smt_balance = 1;
#endif
-#if __SMP__
/* Invoked with pset locked, returns with pset unlocked */
void
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
sched_ipi_perform(ast_processor, ipi_type);
}
}
-#else
-/* Invoked with pset locked, returns with pset unlocked */
-void
-sched_SMT_balance(__unused processor_t cprocessor, processor_set_t cpset)
+
+static cpumap_t
+pset_available_cpumap(processor_set_t pset)
{
- pset_unlock(cpset);
+ return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]) &
+ pset->recommended_bitmask;
+}
+
+static cpumap_t
+pset_available_but_not_running_cpumap(processor_set_t pset)
+{
+ return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
+ pset->recommended_bitmask;
+}
+
+bool
+pset_has_stealable_threads(processor_set_t pset)
+{
+ pset_assert_locked(pset);
+
+ cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
+ /*
+ * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
+ * available primary CPUs
+ */
+ avail_map &= pset->primary_map;
+
+ return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
}
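/*
 * Sketch of a hypothetical caller, for illustration only: a processor
 * looking to steal work is expected to evaluate this predicate under the
 * remote pset lock before dequeueing anything, along the lines of:
 *
 *	pset_lock(pset);
 *	if (pset_has_stealable_threads(pset)) {
 *		thread = run_queue_dequeue(&pset->pset_runq, SCHED_HEADQ);
 *	}
 *	pset_unlock(pset);
 */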
-#endif /* __SMP__ */
/*
* Called with pset locked, on a processor that is committing to run a new thread
static void
pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
{
+ pset_assert_locked(pset);
+
if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
assert(current_thread() == processor->idle_thread);
}
processor_state_update_from_thread(processor, new_thread);
+
+ if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
+ bit_set(pset->realtime_map, processor->cpu_id);
+ } else {
+ bit_clear(pset->realtime_map, processor->cpu_id);
+ }
+
+ pset_node_t node = pset->node;
+
+ if (bit_count(node->pset_map) == 1) {
+ /* Node has only a single pset, so skip node pset map updates */
+ return;
+ }
+
+ cpumap_t avail_map = pset_available_cpumap(pset);
+
+ if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
+ if ((avail_map & pset->realtime_map) == avail_map) {
+ /* No more non-RT CPUs in this pset */
+ atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
+ }
+ avail_map &= pset->primary_map;
+ if ((avail_map & pset->realtime_map) == avail_map) {
+ /* No more non-RT primary CPUs in this pset */
+ atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
+ }
+ } else {
+ if ((avail_map & pset->realtime_map) != avail_map) {
+ if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
+ atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
+ }
+ }
+ avail_map &= pset->primary_map;
+ if ((avail_map & pset->realtime_map) != avail_map) {
+ if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
+ atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
+ }
+ }
+ }
}
-static processor_t choose_processor_for_realtime_thread(processor_set_t pset);
+static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries);
static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset);
+#if defined(__x86_64__)
static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map);
+#endif
static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor);
+static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
int sched_allow_rt_smt = 1;
int sched_avoid_cpu0 = 1;
*/
if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
if (rt_runq_count(pset) > 0) {
- rt_lock_lock(pset);
-
- if (rt_runq_count(pset) > 0) {
- thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
+ thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
- if (next_rt->realtime.deadline < processor->deadline &&
- (next_rt->bound_processor == PROCESSOR_NULL ||
- next_rt->bound_processor == processor)) {
- /* The next RT thread is better, so pick it off the runqueue. */
- goto pick_new_rt_thread;
- }
+ if (next_rt->realtime.deadline < processor->deadline &&
+ (next_rt->bound_processor == PROCESSOR_NULL ||
+ next_rt->bound_processor == processor)) {
+ /* The next RT thread is better, so pick it off the runqueue. */
+ goto pick_new_rt_thread;
}
-
- rt_lock_unlock(pset);
}
/* This is still the best RT thread to run. */
processor->deadline = thread->realtime.deadline;
- sched_update_pset_load_average(pset);
+ sched_update_pset_load_average(pset, 0);
processor_t next_rt_processor = PROCESSOR_NULL;
sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
- if (rt_runq_count(pset) > 0) {
- next_rt_processor = choose_processor_for_realtime_thread(pset);
+ if (rt_runq_count(pset) - bit_count(pset->pending_AST_URGENT_cpu_mask) > 0) {
+ next_rt_processor = choose_processor_for_realtime_thread(pset, processor, true);
if (next_rt_processor) {
+ SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
+ (uintptr_t)0, (uintptr_t)-4, next_rt_processor->cpu_id, next_rt_processor->state, 0);
if (next_rt_processor->state == PROCESSOR_IDLE) {
pset_update_processor_state(pset, next_rt_processor, PROCESSOR_DISPATCHING);
}
/* This thread is still the highest priority runnable (non-idle) thread */
processor->deadline = UINT64_MAX;
- sched_update_pset_load_average(pset);
+ sched_update_pset_load_average(pset, 0);
pset_unlock(pset);
return thread;
}
}
+ bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
+ (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
+ (processor->processor_secondary->state == PROCESSOR_IDLE));
+
/* OK, so we're not going to run the current thread. Look at the RT queue. */
bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor);
if ((rt_runq_count(pset) > 0) && ok_to_run_realtime_thread) {
- rt_lock_lock(pset);
+ thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
- if ((rt_runq_count(pset) > 0) && ok_to_run_realtime_thread) {
- thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
-
- if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
- (next_rt->bound_processor == processor)))) {
+ if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
+ (next_rt->bound_processor == processor)))) {
pick_new_rt_thread:
- new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
+ new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
- new_thread->runq = PROCESSOR_NULL;
- SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
- rt_runq_count_decr(pset);
+ new_thread->runq = PROCESSOR_NULL;
+ SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
+ rt_runq_count_decr(pset);
- processor->deadline = new_thread->realtime.deadline;
+ processor->deadline = new_thread->realtime.deadline;
- pset_commit_processor_to_new_thread(pset, processor, new_thread);
+ pset_commit_processor_to_new_thread(pset, processor, new_thread);
- rt_lock_unlock(pset);
- sched_update_pset_load_average(pset);
+ sched_update_pset_load_average(pset, 0);
- processor_t ast_processor = PROCESSOR_NULL;
- processor_t next_rt_processor = PROCESSOR_NULL;
- sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
- sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
+ processor_t ast_processor = PROCESSOR_NULL;
+ processor_t next_rt_processor = PROCESSOR_NULL;
+ sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
+ sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
- if (processor->processor_secondary != NULL) {
- processor_t sprocessor = processor->processor_secondary;
- if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
- ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
- ast_processor = sprocessor;
- }
+ if (processor->processor_secondary != NULL) {
+ processor_t sprocessor = processor->processor_secondary;
+ if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
+ ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
+ ast_processor = sprocessor;
}
- if (rt_runq_count(pset) > 0) {
- next_rt_processor = choose_processor_for_realtime_thread(pset);
- if (next_rt_processor) {
- if (next_rt_processor->state == PROCESSOR_IDLE) {
- pset_update_processor_state(pset, next_rt_processor, PROCESSOR_DISPATCHING);
- }
- next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
+ }
+ if (rt_runq_count(pset) - bit_count(pset->pending_AST_URGENT_cpu_mask) > 0) {
+ next_rt_processor = choose_processor_for_realtime_thread(pset, processor, true);
+ if (next_rt_processor) {
+ SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
+ (uintptr_t)0, (uintptr_t)-5, next_rt_processor->cpu_id, next_rt_processor->state, 0);
+ if (next_rt_processor->state == PROCESSOR_IDLE) {
+ pset_update_processor_state(pset, next_rt_processor, PROCESSOR_DISPATCHING);
}
+ next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
}
- pset_unlock(pset);
-
- if (ast_processor) {
- sched_ipi_perform(ast_processor, ipi_type);
- }
+ }
+ pset_unlock(pset);
- if (next_rt_processor) {
- sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
- }
+ if (ast_processor) {
+ sched_ipi_perform(ast_processor, ipi_type);
+ }
- return new_thread;
+ if (next_rt_processor) {
+ sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
}
- }
- rt_lock_unlock(pset);
+ return new_thread;
+ }
}
if (secondary_can_only_run_realtime_thread) {
goto idle;
/* No RT threads, so let's look at the regular threads. */
if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
- sched_update_pset_load_average(pset);
-
pset_commit_processor_to_new_thread(pset, processor, new_thread);
+ sched_update_pset_load_average(pset, 0);
processor_t ast_processor = PROCESSOR_NULL;
sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
ast_processor = sprocessor;
}
+ } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
+ pset_update_processor_state(pset, sprocessor, PROCESSOR_DISPATCHING);
+ ipi_type = sched_ipi_action(sprocessor, NULL, true, SCHED_IPI_EVENT_PREEMPT);
+ ast_processor = sprocessor;
}
pset_unlock(pset);
goto idle;
}
-#if __SMP__
- if (SCHED(steal_thread_enabled)(pset)) {
+ if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
/*
* No runnable threads, attempt to steal
* from other processors. Returns with pset lock dropped.
goto restart;
}
}
-#endif
idle:
/*
processor_state_update_idle(processor);
}
-#if __SMP__
/* Invoked with pset locked, returns with pset unlocked */
SCHED(processor_balance)(processor, pset);
-#else
- pset_unlock(pset);
-#endif
-
-#if CONFIG_SCHED_IDLE_IN_PLACE
- /*
- * Choose idle thread if fast idle is not possible.
- */
- if (processor->processor_primary != processor) {
- return processor->idle_thread;
- }
-
- if ((thread->state & (TH_IDLE | TH_TERMINATE | TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES) {
- return processor->idle_thread;
- }
-
- /*
- * Perform idling activities directly without a
- * context switch. Return dispatched thread,
- * else check again for a runnable thread.
- */
- new_thread = thread_select_idle(thread, processor);
-#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
-
- /*
- * Do a full context switch to idle so that the current
- * thread can start running on another processor without
- * waiting for the fast-idled processor to wake up.
- */
new_thread = processor->idle_thread;
-
-#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
} while (new_thread == THREAD_NULL);
return new_thread;
}
-#if CONFIG_SCHED_IDLE_IN_PLACE
-/*
- * thread_select_idle:
- *
- * Idle the processor using the current thread context.
- *
- * Called with thread locked, then dropped and relocked.
- */
-static thread_t
-thread_select_idle(
- thread_t thread,
- processor_t processor)
-{
- thread_t new_thread;
- uint64_t arg1, arg2;
- int urgency;
-
- sched_run_decr(thread);
-
- thread->state |= TH_IDLE;
-	processor_state_update_idle(processor);
-
- /* Reload precise timing global policy to thread-local policy */
- thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
-
- thread_unlock(thread);
-
- /*
- * Switch execution timing to processor idle thread.
- */
- processor->last_dispatch = mach_absolute_time();
-
-#ifdef CONFIG_MACH_APPROXIMATE_TIME
- commpage_update_mach_approximate_time(processor->last_dispatch);
-#endif
-
- thread->last_run_time = processor->last_dispatch;
- processor_timer_switch_thread(processor->last_dispatch,
- &processor->idle_thread->system_timer);
- PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
-
-
- /*
- * Cancel the quantum timer while idling.
- */
- timer_call_quantum_timer_cancel(&processor->quantum_timer);
- processor->first_timeslice = FALSE;
-
- if (thread->sched_call) {
- (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
- }
-
- thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
-
- /*
- * Enable interrupts and perform idling activities. No
- * preemption due to TH_IDLE being set.
- */
- spllo(); new_thread = processor_idle(thread, processor);
-
- /*
- * Return at splsched.
- */
- if (thread->sched_call) {
- (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
- }
-
- thread_lock(thread);
-
- /*
- * If awakened, switch to thread timer and start a new quantum.
- * Otherwise skip; we will context switch to another thread or return here.
- */
- if (!(thread->state & TH_WAIT)) {
- uint64_t time_now = processor->last_dispatch = mach_absolute_time();
- processor_timer_switch_thread(time_now, &thread->system_timer);
- timer_update(&thread->runnable_timer, time_now);
- PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
- thread_quantum_init(thread);
- processor->quantum_end = time_now + thread->quantum_remaining;
- timer_call_quantum_timer_enter(&processor->quantum_timer,
- thread, processor->quantum_end, time_now);
- processor->first_timeslice = TRUE;
-
- thread->computation_epoch = time_now;
- }
-
- thread->state &= ~TH_IDLE;
-
- urgency = thread_get_urgency(thread, &arg1, &arg2);
-
- thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
-
- sched_run_incr(thread);
-
- return new_thread;
-}
-#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
-
/*
* thread_invoke
*
#endif
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
- if ((thread->state & TH_IDLE) == 0) {
+ if (!((thread->state & TH_IDLE) != 0 ||
+ ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
sched_timeshare_consider_maintenance(ctime);
}
#endif
/*
* Context switch by performing a stack handoff.
+ * Requires both threads to be parked in a continuation.
*/
continuation = thread->continuation;
parameter = thread->parameter;
self->last_run_time = ctime;
processor_timer_switch_thread(ctime, &thread->system_timer);
timer_update(&thread->runnable_timer, ctime);
- PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
+ processor->kernel_timer = &thread->system_timer;
/*
* Since non-precise user/kernel time doesn't update the state timer
* during privilege transitions, synthesize an event now.
*/
if (!thread->precise_user_kernel_time) {
- timer_update(PROCESSOR_DATA(processor, current_state), ctime);
+ timer_update(processor->current_state, ctime);
}
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
kperf_off_cpu(self);
#endif /* KPERF */
+ /*
+ * This is where we actually switch thread identity,
+ * and address space if required. However, register
+ * state is not switched - this routine leaves the
+ * stack and register state active on the current CPU.
+ */
TLOG(1, "thread_invoke: calling stack_handoff\n");
stack_handoff(self, thread);
thread->continuation = thread->parameter = NULL;
- counter(c_thread_invoke_hits++);
+ boolean_t enable_interrupts = TRUE;
+
+ /* idle thread needs to stay interrupts-disabled */
+ if ((thread->state & TH_IDLE)) {
+ enable_interrupts = FALSE;
+ }
assert(continuation);
- call_continuation(continuation, parameter, thread->wait_result, TRUE);
+ call_continuation(continuation, parameter,
+ thread->wait_result, enable_interrupts);
/*NOTREACHED*/
} else if (thread == self) {
/* same thread but with continuation */
ast_context(self);
- counter(++c_thread_invoke_same);
thread_unlock(self);
self->continuation = self->parameter = NULL;
- call_continuation(continuation, parameter, self->wait_result, TRUE);
+ boolean_t enable_interrupts = TRUE;
+
+ /* idle thread needs to stay interrupts-disabled */
+ if ((self->state & TH_IDLE)) {
+ enable_interrupts = FALSE;
+ }
+
+ call_continuation(continuation, parameter,
+ self->wait_result, enable_interrupts);
/*NOTREACHED*/
}
} else {
if (!thread->kernel_stack) {
need_stack:
if (!stack_alloc_try(thread)) {
- counter(c_thread_invoke_misses++);
thread_unlock(thread);
thread_stack_enqueue(thread);
return FALSE;
}
} else if (thread == self) {
ast_context(self);
- counter(++c_thread_invoke_same);
thread_unlock(self);
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
thread_unlock(thread);
- counter(c_thread_invoke_csw++);
-
self->reason = reason;
processor->last_dispatch = ctime;
self->last_run_time = ctime;
processor_timer_switch_thread(ctime, &thread->system_timer);
timer_update(&thread->runnable_timer, ctime);
- PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
+ processor->kernel_timer = &thread->system_timer;
/*
* Since non-precise user/kernel time doesn't update the state timer
* during privilege transitions, synthesize an event now.
*/
if (!thread->precise_user_kernel_time) {
- timer_update(PROCESSOR_DATA(processor, current_state), ctime);
+ timer_update(processor->current_state, ctime);
}
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
* been stored on the stack or a non-volatile register, but a stale idea of
* what was on the CPU is newly-accurate because that thread is again
* running on the CPU.
+ *
+ * If one of the threads is using a continuation, thread_continue
+ * is used to stitch up its context.
+ *
+ * If we are invoking a thread which is resuming from a continuation,
+ * the CPU will invoke thread_continue next.
+ *
+ * If the current thread is parking in a continuation, then its state
+ * won't be saved and the stack will be discarded. When the stack is
+ * re-allocated, it will be configured to resume from thread_continue.
*/
assert(continuation == self->continuation);
thread = machine_switch_context(self, continuation, thread);
assert(self == current_thread_volatile());
TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
+ assert(continuation == NULL && self->continuation == NULL);
+
DTRACE_SCHED(on__cpu);
#if KPERF
kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */
- /*
- * We have been resumed and are set to run.
- */
+ /* We have been resumed and are set to run. */
thread_dispatch(thread, self);
- if (continuation) {
- self->continuation = self->parameter = NULL;
-
- call_continuation(continuation, parameter, self->wait_result, TRUE);
- /*NOTREACHED*/
- }
-
return TRUE;
}
uint32_t sampled_sched_run_count;
pset_lock(pset);
- sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
+ sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
/*
* If we have emptied the run queue, and our current thread is runnable, we
* reasonable facsimile of PROCESSOR_IDLE.
*/
- assert(active_processor->next_thread == THREAD_NULL);
processor_state_update_idle(active_processor);
active_processor->deadline = UINT64_MAX;
pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
* "self" is the new current thread that we have context switched to
*
* Called at splsched.
+ *
*/
void
thread_dispatch(
thread_t self)
{
processor_t processor = self->last_processor;
+ bool was_idle = false;
assert(processor == current_processor());
assert(self == current_thread_volatile());
}
if (thread->state & TH_IDLE) {
+ was_idle = true;
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
(uintptr_t)thread_tid(thread), 0, thread->state,
(consumed - thread->t_deduct_bank_ledger_time));
}
thread->t_deduct_bank_ledger_time = 0;
+ if (consumed > 0) {
+ /*
+ * This should never be negative, but in traces we are seeing some instances
+ * of consumed being negative.
+ * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
+ */
+ sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
+ }
}
wake_lock(thread);
* consumed the entire quantum.
*/
if (thread->quantum_remaining == 0) {
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
+ (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
thread->realtime.deadline = UINT64_MAX;
}
} else {
thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
thread->chosen_processor = PROCESSOR_NULL;
- new_run_count = sched_run_decr(thread);
+ new_run_count = SCHED(run_count_decr)(thread);
+
+#if CONFIG_SCHED_AUTO_JOIN
+ if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
+ work_interval_auto_join_unwind(thread);
+ }
+#endif /* CONFIG_SCHED_AUTO_JOIN */
#if CONFIG_SCHED_SFI
if (thread->reason & AST_SFI) {
}
}
}
+ /*
+ * The thread could have been added to the termination queue, so it's
+ * unsafe to use after this point.
+ */
+ thread = THREAD_NULL;
}
int urgency = THREAD_URGENCY_NONE;
uint64_t latency = 0;
- /* Update (new) current thread and reprogram quantum timer */
+ /* Update (new) current thread and reprogram running timers */
thread_lock(self);
if (!(self->state & TH_IDLE)) {
/*
* Set up quantum timer and timeslice.
*/
- processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
- timer_call_quantum_timer_enter(&processor->quantum_timer, self,
- processor->quantum_end, processor->last_dispatch);
+ processor->quantum_end = processor->last_dispatch +
+ self->quantum_remaining;
+ running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
+ processor->quantum_end, processor->last_dispatch);
+ if (was_idle) {
+ /*
+ * kperf's running timer is active whenever the idle thread for a
+ * CPU is not running.
+ */
+ kperf_running_setup(processor, processor->last_dispatch);
+ }
+ running_timers_activate(processor);
processor->first_timeslice = TRUE;
} else {
- timer_call_quantum_timer_cancel(&processor->quantum_timer);
+ running_timers_deactivate(processor);
processor->first_timeslice = FALSE;
-
thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
}
* TODO: Can we state that redispatching our old thread is also
* uninteresting?
*/
- if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
- !(self->state & TH_IDLE)) {
+ if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
pset_cancel_deferred_dispatch(processor->processor_set, processor);
}
#endif
* thread resumes, it will execute the continuation function
* on a new kernel stack.
*/
-counter(mach_counter_t c_thread_block_calls = 0; )
-
wait_result_t
thread_block_reason(
thread_continue_t continuation,
thread_t new_thread;
spl_t s;
- counter(++c_thread_block_calls);
-
s = splsched();
processor = current_processor();
reason = AST_HANDOFF;
}
+ /*
+ * If this thread hadn't been setrun'ed, it
+ * might not have a chosen processor, so give it one
+ */
+ if (new_thread->chosen_processor == NULL) {
+ new_thread->chosen_processor = current_processor();
+ }
+
self->continuation = continuation;
self->parameter = parameter;
*
* Called at splsched when a thread first receives
* a new stack after a continuation.
+ *
+ * Called with THREAD_NULL as the old thread when
+ * invoked by machine_load_context.
*/
void
thread_continue(
continuation = self->continuation;
parameter = self->parameter;
+ assert(continuation != NULL);
+
#if KPERF
kperf_on_cpu(self, continuation, NULL);
#endif
TLOG(1, "thread_continue: calling call_continuation\n");
- boolean_t enable_interrupts = thread != THREAD_NULL;
+ boolean_t enable_interrupts = TRUE;
+
+ /* bootstrap thread, idle thread need to stay interrupts-disabled */
+ if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
+ enable_interrupts = FALSE;
+ }
+
call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
/*NOTREACHED*/
}
}
rq->urgency = rq->count = 0;
for (int i = 0; i < NRQS; i++) {
- queue_init(&rq->queues[i]);
+ circle_queue_init(&rq->queues[i]);
}
}
*/
thread_t
run_queue_dequeue(
- run_queue_t rq,
- integer_t options)
+ run_queue_t rq,
+ sched_options_t options)
{
- thread_t thread;
- queue_t queue = &rq->queues[rq->highq];
-
- if (options & SCHED_PEEK) {
- if (options & SCHED_HEADQ) {
- thread = qe_queue_first(queue, struct thread, runq_links);
- } else {
- thread = qe_queue_last(queue, struct thread, runq_links);
- }
- return thread;
- }
+ thread_t thread;
+ circle_queue_t queue = &rq->queues[rq->highq];
if (options & SCHED_HEADQ) {
- thread = qe_dequeue_head(queue, struct thread, runq_links);
+ thread = cqe_dequeue_head(queue, struct thread, runq_links);
} else {
- thread = qe_dequeue_tail(queue, struct thread, runq_links);
+ thread = cqe_dequeue_tail(queue, struct thread, runq_links);
}
assert(thread != THREAD_NULL);
if (SCHED(priority_is_urgent)(rq->highq)) {
rq->urgency--; assert(rq->urgency >= 0);
}
- if (queue_empty(queue)) {
+ if (circle_queue_empty(queue)) {
bitmap_clear(rq->bitmap, rq->highq);
rq->highq = bitmap_first(rq->bitmap, NRQS);
}
*/
boolean_t
run_queue_enqueue(
- run_queue_t rq,
- thread_t thread,
- integer_t options)
+ run_queue_t rq,
+ thread_t thread,
+ sched_options_t options)
{
- queue_t queue = &rq->queues[thread->sched_pri];
- boolean_t result = FALSE;
+ circle_queue_t queue = &rq->queues[thread->sched_pri];
+ boolean_t result = FALSE;
assert_thread_magic(thread);
- if (queue_empty(queue)) {
- enqueue_tail(queue, &thread->runq_links);
+ if (circle_queue_empty(queue)) {
+ circle_enqueue_tail(queue, &thread->runq_links);
rq_bitmap_set(rq->bitmap, thread->sched_pri);
if (thread->sched_pri > rq->highq) {
}
} else {
if (options & SCHED_TAILQ) {
- enqueue_tail(queue, &thread->runq_links);
+ circle_enqueue_tail(queue, &thread->runq_links);
} else {
- enqueue_head(queue, &thread->runq_links);
+ circle_enqueue_head(queue, &thread->runq_links);
}
}
if (SCHED(priority_is_urgent)(thread->sched_pri)) {
run_queue_t rq,
thread_t thread)
{
+ circle_queue_t queue = &rq->queues[thread->sched_pri];
+
assert(thread->runq != PROCESSOR_NULL);
assert_thread_magic(thread);
- remqueue(&thread->runq_links);
+ circle_dequeue(queue, &thread->runq_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
if (SCHED(priority_is_urgent)(thread->sched_pri)) {
rq->urgency--; assert(rq->urgency >= 0);
}
- if (queue_empty(&rq->queues[thread->sched_pri])) {
+ if (circle_queue_empty(queue)) {
/* update run queue status */
bitmap_clear(rq->bitmap, thread->sched_pri);
rq->highq = bitmap_first(rq->bitmap, NRQS);
thread->runq = PROCESSOR_NULL;
}
-/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
+/*
+ * run_queue_peek
+ *
+ * Peek at the runq and return the highest
+ * priority thread from the runq.
+ *
+ * The run queue must be locked.
+ */
+thread_t
+run_queue_peek(
+ run_queue_t rq)
+{
+ if (rq->count > 0) {
+ circle_queue_t queue = &rq->queues[rq->highq];
+ thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
+ assert_thread_magic(thread);
+ return thread;
+ } else {
+ return THREAD_NULL;
+ }
+}
+
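/*
 * Minimal usage sketch of the circle-queue backed run_queue API above
 * (illustrative only; rq is assumed to be a suitably locked run queue):
 *
 *	run_queue_enqueue(rq, thread, SCHED_TAILQ);
 *	thread_t top = run_queue_peek(rq);        peek leaves the thread queued
 *	thread_t next = run_queue_dequeue(rq, SCHED_HEADQ);
 */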
+rt_queue_t
+sched_rtlocal_runq(processor_set_t pset)
+{
+ return &pset->rt_runq;
+}
+
void
-sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context)
+sched_rtlocal_init(processor_set_t pset)
{
- spl_t s;
+ pset_rt_init(pset);
+}
+
+void
+sched_rtlocal_queue_shutdown(processor_t processor)
+{
+ processor_set_t pset = processor->processor_set;
thread_t thread;
+ queue_head_t tqueue;
- processor_set_t pset = &pset0;
+ pset_lock(pset);
- s = splsched();
- rt_lock_lock(pset);
+ /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
+ if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
+ pset_unlock(pset);
+ return;
+ }
- qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
- if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
- scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
- }
+ queue_init(&tqueue);
+
+ while (rt_runq_count(pset) > 0) {
+ thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
+ thread->runq = PROCESSOR_NULL;
+ SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats, rt_runq_count(pset));
+ rt_runq_count_decr(pset);
+ enqueue_tail(&tqueue, &thread->runq_links);
+ }
+ sched_update_pset_load_average(pset, 0);
+ pset_unlock(pset);
+
+ qe_foreach_element_safe(thread, &tqueue, runq_links) {
+ remqueue(&thread->runq_links);
+
+ thread_lock(thread);
+
+ thread_setrun(thread, SCHED_TAILQ);
+
+ thread_unlock(thread);
}
+}
- rt_lock_unlock(pset);
+/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
+void
+sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
+{
+ thread_t thread;
+
+ pset_node_t node = &pset_node0;
+ processor_set_t pset = node->psets;
+
+ spl_t s = splsched();
+ do {
+ while (pset != NULL) {
+ pset_lock(pset);
+
+ qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
+ if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
+ scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
+ }
+ }
+
+ pset_unlock(pset);
+
+ pset = pset->pset_list;
+ }
+ } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
splx(s);
}
int64_t
-sched_rtglobal_runq_count_sum(void)
+sched_rtlocal_runq_count_sum(void)
{
- return pset0.rt_runq.runq_stats.count_sum;
+ pset_node_t node = &pset_node0;
+ processor_set_t pset = node->psets;
+ int64_t count = 0;
+
+ do {
+ while (pset != NULL) {
+ count += pset->rt_runq.runq_stats.count_sum;
+
+ pset = pset->pset_list;
+ }
+ } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
+
+ return count;
}
/*
uint64_t deadline = thread->realtime.deadline;
boolean_t preempt = FALSE;
- rt_lock_lock(pset);
+ pset_assert_locked(pset);
if (queue_empty(queue)) {
enqueue_tail(queue, &thread->runq_links);
SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
rt_runq_count_incr(pset);
- rt_lock_unlock(pset);
-
return preempt;
}
+#define MAX_BACKUP_PROCESSORS 7
+#if defined(__x86_64__)
+#define DEFAULT_BACKUP_PROCESSORS 1
+#else
+#define DEFAULT_BACKUP_PROCESSORS 0
+#endif
+
+int sched_rt_n_backup_processors = DEFAULT_BACKUP_PROCESSORS;
+
+int
+sched_get_rt_n_backup_processors(void)
+{
+ return sched_rt_n_backup_processors;
+}
+
+void
+sched_set_rt_n_backup_processors(int n)
+{
+ if (n < 0) {
+ n = 0;
+ } else if (n > MAX_BACKUP_PROCESSORS) {
+ n = MAX_BACKUP_PROCESSORS;
+ }
+
+ sched_rt_n_backup_processors = n;
+}
+
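/*
 * Worked example, for illustration only: with the x86_64 default of
 * sched_rt_n_backup_processors = 1, a realtime thread whose constraint
 * is at or below rt_constraint_threshold (4 ms) causes realtime_setrun()
 * below to signal up to two processors (the chosen one plus one backup),
 * so that whichever wakes first dequeues the thread. A debug or sysctl
 * hook (hypothetical) should adjust the knob through the clamped
 * accessors rather than writing the global directly:
 *
 *	sched_set_rt_n_backup_processors(2);
 *	assert(sched_get_rt_n_backup_processors() == 2);
 */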
/*
* realtime_setrun:
*
*/
static void
realtime_setrun(
- processor_t processor,
+ processor_t chosen_processor,
thread_t thread)
{
- processor_set_t pset = processor->processor_set;
+ processor_set_t pset = chosen_processor->processor_set;
pset_assert_locked(pset);
ast_t preempt;
- sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
+ int n_backup = 0;
- thread->chosen_processor = processor;
+ if (thread->realtime.constraint <= rt_constraint_threshold) {
+ n_backup = sched_rt_n_backup_processors;
+ }
+ assert((n_backup >= 0) && (n_backup <= MAX_BACKUP_PROCESSORS));
+
+ sched_ipi_type_t ipi_type[MAX_BACKUP_PROCESSORS + 1] = {};
+ processor_t ipi_processor[MAX_BACKUP_PROCESSORS + 1] = {};
+
+ thread->chosen_processor = chosen_processor;
/* <rdar://problem/15102234> */
assert(thread->bound_processor == PROCESSOR_NULL);
- /*
- * Dispatch directly onto idle processor.
- */
- if ((thread->bound_processor == processor)
- && processor->state == PROCESSOR_IDLE) {
- processor->next_thread = thread;
- processor_state_update_from_thread(processor, thread);
- processor->deadline = thread->realtime.deadline;
- pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
+ realtime_queue_insert(chosen_processor, pset, thread);
- ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
- pset_unlock(pset);
- sched_ipi_perform(processor, ipi_type);
- return;
- }
+ processor_t processor = chosen_processor;
+ bool chosen_process_is_secondary = chosen_processor->processor_primary != chosen_processor;
- if (processor->current_pri < BASEPRI_RTQUEUES) {
- preempt = (AST_PREEMPT | AST_URGENT);
- } else if (thread->realtime.deadline < processor->deadline) {
- preempt = (AST_PREEMPT | AST_URGENT);
- } else {
- preempt = AST_NONE;
- }
+ int count = 0;
+ for (int i = 0; i <= n_backup; i++) {
+ if (i > 0) {
+ processor = choose_processor_for_realtime_thread(pset, chosen_processor, chosen_process_is_secondary);
+ if ((processor == PROCESSOR_NULL) || (sched_avoid_cpu0 && (processor->cpu_id == 0))) {
+ break;
+ }
+ SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
+ (uintptr_t)thread_tid(thread), (uintptr_t)-3, processor->cpu_id, processor->state, 0);
+ }
+ ipi_type[i] = SCHED_IPI_NONE;
+ ipi_processor[i] = processor;
+ count++;
- realtime_queue_insert(processor, pset, thread);
+ if (processor->current_pri < BASEPRI_RTQUEUES) {
+ preempt = (AST_PREEMPT | AST_URGENT);
+ } else if (thread->realtime.deadline < processor->deadline) {
+ preempt = (AST_PREEMPT | AST_URGENT);
+ } else {
+ preempt = AST_NONE;
+ }
- ipi_type = SCHED_IPI_NONE;
- if (preempt != AST_NONE) {
- if (processor->state == PROCESSOR_IDLE) {
- processor->next_thread = THREAD_NULL;
- processor_state_update_from_thread(processor, thread);
- processor->deadline = thread->realtime.deadline;
- pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
- if (processor == current_processor()) {
- ast_on(preempt);
- } else {
- ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT);
- }
- } else if (processor->state == PROCESSOR_DISPATCHING) {
- if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
+ if (preempt != AST_NONE) {
+ if (processor->state == PROCESSOR_IDLE) {
processor_state_update_from_thread(processor, thread);
processor->deadline = thread->realtime.deadline;
- }
- } else {
- if (processor == current_processor()) {
- ast_on(preempt);
+ pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
+ if (processor == current_processor()) {
+ ast_on(preempt);
- if ((preempt & AST_URGENT) == AST_URGENT) {
- bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
- }
+ if ((preempt & AST_URGENT) == AST_URGENT) {
+ bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
+ }
- if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
- bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
+ if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
+ bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
+ }
+ } else {
+ ipi_type[i] = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT);
+ }
+ } else if (processor->state == PROCESSOR_DISPATCHING) {
+ if ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline)) {
+ processor_state_update_from_thread(processor, thread);
+ processor->deadline = thread->realtime.deadline;
}
} else {
- ipi_type = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT);
+ if (processor == current_processor()) {
+ ast_on(preempt);
+
+ if ((preempt & AST_URGENT) == AST_URGENT) {
+ bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
+ }
+
+ if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
+ bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
+ }
+ } else {
+ ipi_type[i] = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT);
+ }
}
+ } else {
+ /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
}
- } else {
- /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
}
pset_unlock(pset);
- sched_ipi_perform(processor, ipi_type);
+
+ assert((count > 0) && (count <= (n_backup + 1)));
+ for (int i = 0; i < count; i++) {
+ assert(ipi_processor[i] != PROCESSOR_NULL);
+ sched_ipi_perform(ipi_processor[i], ipi_type[i]);
+ }
}
#endif
if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
preempt = (AST_PREEMPT | AST_URGENT);
- } else if (processor->active_thread && thread_eager_preemption(processor->active_thread)) {
+ } else if (processor->current_is_eagerpreempt) {
preempt = (AST_PREEMPT | AST_URGENT);
} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
}
SCHED(processor_enqueue)(processor, thread, options);
- sched_update_pset_load_average(pset);
+ sched_update_pset_load_average(pset, 0);
if (preempt != AST_NONE) {
if (processor->state == PROCESSOR_IDLE) {
- processor->next_thread = THREAD_NULL;
processor_state_update_from_thread(processor, thread);
processor->deadline = UINT64_MAX;
pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
ipi_action = eExitIdle;
} else if (processor->state == PROCESSOR_DISPATCHING) {
- if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
+ if (processor->current_pri < thread->sched_pri) {
processor_state_update_from_thread(processor, thread);
processor->deadline = UINT64_MAX;
}
thread->sched_pri >= processor->current_pri) {
ipi_action = eInterruptRunning;
} else if (processor->state == PROCESSOR_IDLE) {
- processor->next_thread = THREAD_NULL;
processor_state_update_from_thread(processor, thread);
processor->deadline = UINT64_MAX;
pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
return nset;
}
+inline static processor_set_t
+change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
+{
+ if (current_pset != new_pset) {
+ pset_unlock(current_pset);
+ pset_lock(new_pset);
+ }
+
+ return new_pset;
+}
+
/*
* choose_processor:
*
processor = PROCESSOR_NULL;
} else if (!processor->is_recommended) {
processor = PROCESSOR_NULL;
- } else if ((thread->sched_pri >= BASEPRI_RTQUEUES) && !sched_ok_to_run_realtime_thread(pset, processor)) {
- processor = PROCESSOR_NULL;
} else {
switch (processor->state) {
case PROCESSOR_START:
* idle processor. The platform layer had an opportunity to provide
* the "least cost idle" processor above.
*/
- return processor;
+ if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
+ return processor;
+ }
+ processor = PROCESSOR_NULL;
+ break;
case PROCESSOR_RUNNING:
case PROCESSOR_DISPATCHING:
/*
* to regain their previous executing processor.
*/
if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
- (processor->current_pri < BASEPRI_RTQUEUES)) {
+ processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
return processor;
}
lc_processor = processor;
}
- do {
- int cpuid;
+ if (thread->sched_pri >= BASEPRI_RTQUEUES) {
+ pset_node_t node = pset->node;
+ int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0);
+ for (; consider_secondaries < 2; consider_secondaries++) {
+ pset = change_locked_pset(pset, starting_pset);
+ do {
+ processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries);
+ if (processor) {
+ return processor;
+ }
- if (thread->sched_pri >= BASEPRI_RTQUEUES) {
- processor = choose_processor_for_realtime_thread(pset);
- if (processor) {
- return processor;
- }
- } else {
- /*
- * Choose an idle processor, in pset traversal order
- */
+ /* NRG Collect processor stats for furthest deadline etc. here */
- uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
- pset->primary_map &
- pset->recommended_bitmask);
+ nset = next_pset(pset);
+
+ if (nset != starting_pset) {
+ pset = change_locked_pset(pset, nset);
+ }
+ } while (nset != starting_pset);
+ }
+ /* Or we could just let it change to starting_pset in the loop above */
+ pset = change_locked_pset(pset, starting_pset);
+ }
- /* there shouldn't be a pending AST if the processor is idle */
- assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
+ do {
+ /*
+ * Choose an idle processor, in pset traversal order
+ */
- cpuid = lsb_first(idle_primary_map);
- if (cpuid >= 0) {
- processor = processor_array[cpuid];
- return processor;
- }
+ uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
+ pset->primary_map &
+ pset->recommended_bitmask);
+
+ /* there shouldn't be a pending AST if the processor is idle */
+ assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
+
+ int cpuid = lsb_first(idle_primary_map);
+ if (cpuid >= 0) {
+ processor = processor_array[cpuid];
+ return processor;
}
/*
}
/*
+ * lc_processor is used to indicate the best processor set run queue
+ * on which to enqueue a thread when all available CPUs are busy with
+ * higher priority threads, so try to make sure it is initialized.
+ */
+ if (lc_processor == PROCESSOR_NULL) {
+ cpumap_t available_map = ((pset->cpu_state_map[PROCESSOR_IDLE] |
+ pset->cpu_state_map[PROCESSOR_RUNNING] |
+ pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
+ pset->recommended_bitmask);
+ cpuid = lsb_first(available_map);
+ if (cpuid >= 0) {
+ lc_processor = processor_array[cpuid];
+ lowest_count = SCHED(processor_runq_count)(lc_processor);
+ }
+ }
+
+ /*
+ * Move onto the next processor set.
+ *
* If all primary processors in this pset are running a higher
* priority thread, move on to next pset. Only when we have
* exhausted the search for primary processors do we
nset = next_pset(pset);
if (nset != starting_pset) {
- pset_unlock(pset);
-
- pset = nset;
- pset_lock(pset);
+ pset = change_locked_pset(pset, nset);
}
} while (nset != starting_pset);
* the secondary processor that would perturb the least priority
* primary, or the least busy primary.
*/
+ boolean_t fallback_processor = false;
do {
/* lowest_priority is evaluated in the main loops above */
if (lp_idle_secondary_processor != PROCESSOR_NULL) {
lc_processor = PROCESSOR_NULL;
} else {
/*
- * All processors are executing higher
- * priority threads, and the lowest_count
- * candidate was not usable, so we pick a processor
- * to give this thread somewhere to be enqueued.
+ * All processors are executing higher priority threads, and
+ * the lowest_count candidate was not usable.
*
- * TODO: Need tracepoint or something to show when this happens
- * TODO: Prefer a processor in the original pset
+ * For AMP platforms running the clutch scheduler always
+ * return a processor from the requested pset to allow the
+ * thread to be enqueued in the correct runq. For non-AMP
+ * platforms, simply return the master_processor.
*/
+ fallback_processor = true;
+#if CONFIG_SCHED_EDGE
+ processor = processor_array[lsb_first(starting_pset->primary_map)];
+#else /* CONFIG_SCHED_EDGE */
processor = master_processor;
+#endif /* CONFIG_SCHED_EDGE */
}
/*
* Check that the correct processor set is
* returned locked.
*/
- if (pset != processor->processor_set) {
- pset_unlock(pset);
- pset = processor->processor_set;
- pset_lock(pset);
- }
+ pset = change_locked_pset(pset, processor->processor_set);
/*
* We must verify that the chosen processor is still available.
- * master_processor is an exception, since we may need to preempt
- * a running thread on it during processor shutdown (for sleep),
- * and that thread needs to be enqueued on its runqueue to run
- * when the processor is restarted.
+ * The cases where we pick the master_processor or the fallback
+	 * processor are exceptions, since we may need to enqueue a thread
+ * on its runqueue if this is the last remaining processor
+ * during pset shutdown.
+ *
+ * <rdar://problem/47559304> would really help here since it
+ * gets rid of the weird last processor SHUTDOWN case where
+ * the pset is still schedulable.
*/
- if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) {
+ if (processor != master_processor && (fallback_processor == false) && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) {
processor = PROCESSOR_NULL;
}
} while (processor == PROCESSOR_NULL);
return processor;
}
+/*
+ * Default implementation of SCHED(choose_node)()
+ * for single node systems
+ */
+pset_node_t
+sched_choose_node(__unused thread_t thread)
+{
+ return &pset_node0;
+}
+
+/*
+ * choose_starting_pset:
+ *
+ * Choose a starting processor set for the thread.
+ * May return a processor hint within the pset.
+ *
+ * Returns a starting processor set, to be used by
+ * choose_processor.
+ *
+ * The thread must be locked. The resulting pset is unlocked on return,
+ * and is chosen without taking any pset locks.
+ */
+processor_set_t
+choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
+{
+ processor_set_t pset;
+ processor_t processor = PROCESSOR_NULL;
+
+ if (thread->affinity_set != AFFINITY_SET_NULL) {
+ /*
+ * Use affinity set policy hint.
+ */
+ pset = thread->affinity_set->aset_pset;
+ } else if (thread->last_processor != PROCESSOR_NULL) {
+ /*
+ * Simple (last processor) affinity case.
+ */
+ processor = thread->last_processor;
+ pset = processor->processor_set;
+ } else {
+ /*
+ * No Affinity case:
+ *
+ * Utilize a per-task hint to spread threads
+ * among the available processor sets.
+ * NRG this seems like the wrong thing to do.
+ * See also task->pset_hint = pset in thread_setrun()
+ */
+ task_t task = thread->task;
+
+ pset = task->pset_hint;
+ if (pset == PROCESSOR_SET_NULL) {
+ pset = current_processor()->processor_set;
+ }
+
+ pset = choose_next_pset(pset);
+ }
+
+ if (!bit_test(node->pset_map, pset->pset_id)) {
+ /* pset is not from this node so choose one that is */
+ int id = lsb_first(node->pset_map);
+ assert(id >= 0);
+ pset = pset_array[id];
+ }
+
+ if (bit_count(node->pset_map) == 1) {
+ /* Only a single pset in this node */
+ goto out;
+ }
+
+ bool avoid_cpu0 = false;
+
+#if defined(__x86_64__)
+ if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
+ /* Avoid the pset containing cpu0 */
+ avoid_cpu0 = true;
+ /* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
+ assert(bit_test(pset_array[0]->cpu_bitmask, 0));
+ }
+#endif
+
+ if (thread->sched_pri >= BASEPRI_RTQUEUES) {
+ pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
+ if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
+ if (avoid_cpu0) {
+ rt_target_map = bit_ror64(rt_target_map, 1);
+ }
+ int rotid = lsb_first(rt_target_map);
+ if (rotid >= 0) {
+ int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
+ pset = pset_array[id];
+ goto out;
+ }
+ }
+ if (!pset->is_SMT || !sched_allow_rt_smt) {
+ /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
+ goto out;
+ }
+ rt_target_map = atomic_load(&node->pset_non_rt_map);
+ if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
+ if (avoid_cpu0) {
+ rt_target_map = bit_ror64(rt_target_map, 1);
+ }
+ int rotid = lsb_first(rt_target_map);
+ if (rotid >= 0) {
+ int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
+ pset = pset_array[id];
+ goto out;
+ }
+ }
+ /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
+ } else {
+ pset_map_t idle_map = atomic_load(&node->pset_idle_map);
+ if (!bit_test(idle_map, pset->pset_id)) {
+ int next_idle_pset_id = lsb_first(idle_map);
+ if (next_idle_pset_id >= 0) {
+ pset = pset_array[next_idle_pset_id];
+ }
+ }
+ }
+
+out:
+ if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
+ processor = PROCESSOR_NULL;
+ }
+ if (processor != PROCESSOR_NULL) {
+ *processor_hint = processor;
+ }
+
+ return pset;
+}
+
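/*
 * Illustrative sketch (not part of the diff): the cpu0-avoidance rotation used
 * by choose_starting_pset() above (and again by
 * choose_processor_for_realtime_thread() further down). Rotating the candidate
 * map right by one moves bit 0 up to bit 63, so a lowest-set-bit scan only
 * falls back to cpu0 when nothing else is available; the winning rotated index
 * is mapped back with ((rotid + 1) & 63). The demo_* helpers are stand-ins for
 * the kernel's bit_ror64()/lsb_first() and are assumptions of this sketch.
 */
#include <stdint.h>
#include <stdio.h>

static inline uint64_t
demo_ror64(uint64_t map, unsigned n)   /* valid for 1 <= n <= 63 */
{
	return (map >> n) | (map << (64 - n));
}

static inline int
demo_lsb_first(uint64_t map)
{
	return map ? __builtin_ctzll(map) : -1;
}

static int
demo_pick_avoiding_cpu0(uint64_t candidate_map)
{
	uint64_t rotated = demo_ror64(candidate_map, 1);   /* cpu0 -> bit 63 */
	int rotid = demo_lsb_first(rotated);
	return (rotid < 0) ? -1 : ((rotid + 1) & 63);      /* undo the rotation */
}

int
main(void)
{
	printf("%d\n", demo_pick_avoiding_cpu0(0x5));   /* cpu0 and cpu2 available -> 2 */
	printf("%d\n", demo_pick_avoiding_cpu0(0x1));   /* only cpu0 available    -> 0 */
	return 0;
}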
/*
* thread_setrun:
*
void
thread_setrun(
thread_t thread,
- integer_t options)
+ sched_options_t options)
{
processor_t processor;
processor_set_t pset;
assert(thread->runq == PROCESSOR_NULL);
-#if __SMP__
if (thread->bound_processor == PROCESSOR_NULL) {
/*
* Unbound case.
*/
- if (thread->affinity_set != AFFINITY_SET_NULL) {
- /*
- * Use affinity set policy hint.
- */
- pset = thread->affinity_set->aset_pset;
- pset_lock(pset);
-
- processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
- pset = processor->processor_set;
-
- SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
- (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
- } else if (thread->last_processor != PROCESSOR_NULL) {
- /*
- * Simple (last processor) affinity case.
- */
- processor = thread->last_processor;
- pset = processor->processor_set;
- pset_lock(pset);
- processor = SCHED(choose_processor)(pset, processor, thread);
- pset = processor->processor_set;
+ processor_t processor_hint = PROCESSOR_NULL;
+ pset_node_t node = SCHED(choose_node)(thread);
+ processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
- SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
- (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
- } else {
- /*
- * No Affinity case:
- *
- * Utilitize a per task hint to spread threads
- * among the available processor sets.
- */
- task_t task = thread->task;
-
- pset = task->pset_hint;
- if (pset == PROCESSOR_SET_NULL) {
- pset = current_processor()->processor_set;
- }
-
- pset = choose_next_pset(pset);
- pset_lock(pset);
+ pset_lock(starting_pset);
- processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
- pset = processor->processor_set;
- task->pset_hint = pset;
+ processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
+ pset = processor->processor_set;
+ task_t task = thread->task;
+ task->pset_hint = pset; /* NRG this is done without holding the task lock */
- SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
- (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
- }
+ SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
+ (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
} else {
/*
* Bound case:
SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
(uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
}
-#else /* !__SMP__ */
- /* Only one processor to choose */
- assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
- processor = master_processor;
- pset = processor->processor_set;
- pset_lock(pset);
-#endif /* !__SMP__ */
/*
* Dispatch the thread on the chosen processor.
}
}
-#if __SMP__
/*
* If the current thread is running on a processor that is no longer recommended,
* urgently preempt it, at which point thread_select() should
if (!processor->is_recommended) {
return check_reason | AST_PREEMPT | AST_URGENT;
}
-#endif
result = SCHED(processor_csw_check)(processor);
if (result != AST_NONE) {
- return check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE);
+ return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
}
-#if __SMP__
/*
* Same for avoid-processor
*
processor->processor_primary != processor) {
return check_reason | AST_PREEMPT;
}
-#endif
if (thread->state & TH_SUSP) {
return check_reason | AST_PREEMPT;
void
set_sched_pri(
thread_t thread,
- int new_priority,
+ int16_t new_priority,
set_sched_pri_options_t options)
{
bool is_current_thread = (thread == current_thread());
bool removed_from_runq = false;
bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
- int old_priority = thread->sched_pri;
+ int16_t old_priority = thread->sched_pri;
/* If we're already at this priority, no need to mess with the runqueue */
if (new_priority == old_priority) {
+#if CONFIG_SCHED_CLUTCH
+ /* For the first thread in the system, the priority is correct but
+ * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
+ * scheduler relies on the bucket being set for all threads, update
+ * its bucket here.
+ */
+ if (thread->th_sched_bucket == TH_BUCKET_RUN) {
+ assert(is_current_thread);
+ SCHED(update_thread_bucket)(thread);
+ }
+#endif /* CONFIG_SCHED_CLUTCH */
+
return;
}
thread->sched_pri = new_priority;
+#if CONFIG_SCHED_CLUTCH
+ /*
+ * Since, for the clutch scheduler, the thread's bucket determines its runq
+ * in the hierarchy, it is important to update the bucket when the thread
+ * lock is held and the thread has been removed from the runq hierarchy.
+ */
+ SCHED(update_thread_bucket)(thread);
+
+#endif /* CONFIG_SCHED_CLUTCH */
+
KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
(uintptr_t)thread_tid(thread),
thread->base_pri,
thread_lock(thread);
/*
- * Check that the thread is not bound
- * to a different processor, and that realtime
- * is not involved.
+ * Check that the thread is not bound to a different processor,
+ * that the NO_SMT flag is not set on the thread, that the cluster
+ * type of the processor matches the thread if the thread is pinned
+ * to a particular cluster, and that realtime is not involved.
*
- * Next, pull it off its run queue. If it
- * doesn't come, it's not eligible.
+ * Next, pull it off its run queue. If it doesn't come, it's not eligible.
*/
-
processor_t processor = current_processor();
- if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
- (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
+ if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
+ && (!thread_no_smt(thread))
+ && (processor->current_pri < BASEPRI_RTQUEUES)
+ && (thread->sched_pri < BASEPRI_RTQUEUES)
+#if __AMP__
+ && ((!(thread->sched_flags & TH_SFLAG_PCORE_ONLY)) ||
+ processor->processor_set->pset_cluster_type == PSET_AMP_P)
+ && ((!(thread->sched_flags & TH_SFLAG_ECORE_ONLY)) ||
+ processor->processor_set->pset_cluster_type == PSET_AMP_E)
+#endif /* __AMP__ */
+ ) {
if (thread_run_queue_remove(thread)) {
pulled_thread = thread;
}
return pulled_thread;
}
+/*
+ * thread_prepare_for_handoff
+ *
+ * Make the thread ready for handoff.
+ * If the thread was runnable, pull it off its runq; if it could not be
+ * pulled, return THREAD_NULL.
+ *
+ * If the thread was woken up from a wait for handoff, make sure it is not
+ * bound to a different processor.
+ *
+ * Called at splsched
+ *
+ * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
+ * This may be different from the thread that was passed in.
+ */
+thread_t
+thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
+{
+ thread_t pulled_thread = THREAD_NULL;
+
+ if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
+ processor_t processor = current_processor();
+ thread_lock(thread);
+
+ /*
+ * Check that the thread is not bound to a different processor,
+ * that the NO_SMT flag is not set on the thread, and that the
+ * cluster type of the processor matches the thread if the thread
+ * is pinned to a particular cluster. Call thread_setrun() instead
+ * if the above conditions are not satisfied.
+ */
+ if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
+ && (!thread_no_smt(thread))
+#if __AMP__
+ && ((!(thread->sched_flags & TH_SFLAG_PCORE_ONLY)) ||
+ processor->processor_set->pset_cluster_type == PSET_AMP_P)
+ && ((!(thread->sched_flags & TH_SFLAG_ECORE_ONLY)) ||
+ processor->processor_set->pset_cluster_type == PSET_AMP_E)
+#endif /* __AMP__ */
+ ) {
+ pulled_thread = thread;
+ } else {
+ thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
+ }
+ thread_unlock(thread);
+ } else {
+ pulled_thread = thread_run_queue_remove_for_handoff(thread);
+ }
+
+ return pulled_thread;
+}
+
/*
* thread_run_queue_remove:
*
processor_set_t pset = processor->processor_set;
- rt_lock_lock(pset);
+ pset_lock(pset);
if (thread->runq != PROCESSOR_NULL) {
/*
removed = TRUE;
}
- rt_lock_unlock(pset);
+ pset_unlock(pset);
return removed;
}
* thread locked, at splsched
*/
void
-thread_run_queue_reinsert(thread_t thread, integer_t options)
+thread_run_queue_reinsert(thread_t thread, sched_options_t options)
{
assert(thread->runq == PROCESSOR_NULL);
assert(thread->state & (TH_RUN));
MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
(uintptr_t)thread_tid(thread), 0, 0, 0, 0);
- SCHED_STATS_CPU_IDLE_START(processor);
+ SCHED_STATS_INC(idle_transitions);
+ assert(processor->running_timers_active == false);
uint64_t ctime = mach_absolute_time();
- timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state));
- PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
+ timer_switch(&processor->system_state, ctime, &processor->idle_state);
+ processor->current_state = &processor->idle_state;
cpu_quiescent_counter_leave(ctime);
}
}
-#if CONFIG_SCHED_IDLE_IN_PLACE
- if (thread != THREAD_NULL) {
- /* Did idle-in-place thread wake up */
- if ((thread->state & (TH_WAIT | TH_SUSP)) != TH_WAIT || thread->wake_active) {
- break;
- }
- }
-#endif
-
IDLE_KERNEL_DEBUG_CONSTANT(
MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
machine_track_platform_idle(TRUE);
machine_idle();
+ /* returns with interrupts enabled */
machine_track_platform_idle(FALSE);
ctime = mach_absolute_time();
- timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state));
- PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
+ timer_switch(&processor->idle_state, ctime, &processor->system_state);
+ processor->current_state = &processor->system_state;
cpu_quiescent_counter_join(ctime);
- assert(processor->next_thread == NULL);
-
ast_t reason = AST_NONE;
/* We're handling all scheduling AST's */
thread_t new_thread = thread_select(current_thread, processor, &reason);
thread_unlock(current_thread);
+ assert(processor->running_timers_active == false);
+
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
(uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);
* Each processor has a dedicated thread which
* executes the idle loop when there is no suitable
* previous context.
+ *
+ * This continuation is entered with interrupts disabled.
*/
void
-idle_thread(void)
+idle_thread(__assert_only void* parameter,
+ __unused wait_result_t result)
{
- processor_t processor = current_processor();
- thread_t new_thread;
+ assert(ml_get_interrupts_enabled() == FALSE);
+ assert(parameter == NULL);
+
+ processor_t processor = current_processor();
+
+ /*
+ * Ensure that anything running in idle context triggers
+ * preemption-disabled checks.
+ */
+ disable_preemption();
+
+ /*
+ * Enable interrupts temporarily to handle any pending interrupts
+ * or IPIs before deciding to sleep
+ */
+ spllo();
+
+ thread_t new_thread = processor_idle(THREAD_NULL, processor);
+ /* returns with interrupts disabled */
+
+ enable_preemption();
- new_thread = processor_idle(THREAD_NULL, processor);
if (new_thread != THREAD_NULL) {
- thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
+ thread_run(processor->idle_thread,
+ idle_thread, NULL, new_thread);
/*NOTREACHED*/
}
- thread_block((thread_continue_t)idle_thread);
+ thread_block(idle_thread);
/*NOTREACHED*/
}
spl_t s;
char name[MAXTHREADNAMESIZE];
- result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
+ result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
if (result != KERN_SUCCESS) {
return result;
}
#endif /* __arm__ || __arm64__ */
result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
- (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
+ NULL, MAXPRI_KERNEL, &thread);
if (result != KERN_SUCCESS) {
panic("sched_startup");
}
sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
}
+ scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
sched_tick_delta, late_time, 0, 0, 0);
uint64_t ndeadline = ctime + sched_tick_interval;
- if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
+ if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
thread_wakeup((event_t)sched_timeshare_maintenance_continue);
sched_maintenance_wakeups++;
}
}
- uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed);
+#if !CONFIG_SCHED_CLUTCH
+ /*
+ * Only non-clutch schedulers use the global EWMA load calculation. For the
+ * clutch scheduler, the load is maintained at the thread group and bucket level.
+ */
+ uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);
if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
uint64_t new_deadline = 0;
- if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline,
- memory_order_relaxed, memory_order_relaxed)) {
+ if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
compute_sched_load();
new_deadline = ctime + sched_load_compute_interval_abs;
- __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed);
+ os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
}
}
+#endif /* CONFIG_SCHED_CLUTCH */
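/*
 * Illustrative sketch (not part of the diff): the claim-a-deadline pattern the
 * hunks above implement with os_atomic_cmpxchg(), written with portable C11
 * atomics so it stands alone. Whichever caller wins the compare-and-swap to 0
 * performs the periodic work and republishes the next deadline; racing callers
 * simply back off. All demo_* names are assumptions of this sketch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t demo_deadline;

bool
demo_claim_deadline(uint64_t now, uint64_t interval, void (*work)(void))
{
	uint64_t deadline = atomic_load_explicit(&demo_deadline, memory_order_relaxed);

	if (deadline == 0 || now < deadline) {
		return false;	/* disabled, or not due yet */
	}
	/* Only one racing caller wins the swap to 0 and runs the work. */
	if (!atomic_compare_exchange_strong_explicit(&demo_deadline, &deadline, 0,
	    memory_order_relaxed, memory_order_relaxed)) {
		return false;
	}
	work();
	atomic_store_explicit(&demo_deadline, now + interval, memory_order_relaxed);
	return true;
}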
#if __arm64__
- uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed);
+ uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);
if (__improbable(perf_deadline && ctime >= perf_deadline)) {
/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
- if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0,
- memory_order_relaxed, memory_order_relaxed)) {
+ if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
machine_perfcontrol_deadline_passed(perf_deadline);
}
}
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
void
-sched_init_thread(void (*continuation)(void))
+sched_init_thread(void)
{
thread_block(THREAD_CONTINUE_NULL);
sched_maintenance_thread = thread;
- continuation();
+ SCHED(maintenance_continuation)();
/*NOTREACHED*/
}
thread_update_count = 0;
}
+static boolean_t
+runq_scan_thread(
+ thread_t thread,
+ sched_update_scan_context_t scan_context)
+{
+ assert_thread_magic(thread);
+
+ if (thread->sched_stamp != sched_tick &&
+ thread->sched_mode == TH_MODE_TIMESHARE) {
+ if (thread_update_add_thread(thread) == FALSE) {
+ return TRUE;
+ }
+ }
+
+ if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
+ if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
+ scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
+ }
+ } else {
+ if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
+ scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
+ }
+ }
+
+ return FALSE;
+}
+
/*
* Scan a runq for candidate threads.
*
queue_index >= 0;
queue_index = bitmap_next(runq->bitmap, queue_index)) {
thread_t thread;
- queue_t queue = &runq->queues[queue_index];
+ circle_queue_t queue = &runq->queues[queue_index];
- qe_foreach_element(thread, queue, runq_links) {
+ cqe_foreach_element(thread, queue, runq_links) {
assert(count > 0);
- assert_thread_magic(thread);
-
- if (thread->sched_stamp != sched_tick &&
- thread->sched_mode == TH_MODE_TIMESHARE) {
- if (thread_update_add_thread(thread) == FALSE) {
- return TRUE;
- }
- }
-
- if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
- if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
- scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
- }
- } else {
- if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
- scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
- }
+ if (runq_scan_thread(thread, scan_context) == TRUE) {
+ return TRUE;
}
count--;
}
return FALSE;
}
-#endif /* CONFIG_SCHED_TIMESHARE_CORE */
+#if CONFIG_SCHED_CLUTCH
boolean_t
-thread_eager_preemption(thread_t thread)
+sched_clutch_timeshare_scan(
+ queue_t thread_queue,
+ uint16_t thread_count,
+ sched_update_scan_context_t scan_context)
+{
+ if (thread_count == 0) {
+ return FALSE;
+ }
+
+ thread_t thread;
+ qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
+ if (runq_scan_thread(thread, scan_context) == TRUE) {
+ return TRUE;
+ }
+ thread_count--;
+ }
+
+ assert(thread_count == 0);
+ return FALSE;
+}
+
+
+#endif /* CONFIG_SCHED_CLUTCH */
+
+#endif /* CONFIG_SCHED_TIMESHARE_CORE */
+
+bool
+thread_is_eager_preempt(thread_t thread)
{
- return (thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0;
+ return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
}
void
thread_set_eager_preempt(thread_t thread)
{
- spl_t x;
- processor_t p;
- ast_t ast = AST_NONE;
+ spl_t s = splsched();
+ thread_lock(thread);
- x = splsched();
- p = current_processor();
+ assert(!thread_is_eager_preempt(thread));
- thread_lock(thread);
thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
if (thread == current_thread()) {
- ast = csw_check(thread, p, AST_NONE);
+ /* csw_check updates current_is_eagerpreempt on the processor */
+ ast_t ast = csw_check(thread, current_processor(), AST_NONE);
+
thread_unlock(thread);
+
if (ast != AST_NONE) {
- (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
+ thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
}
} else {
- p = thread->last_processor;
+ processor_t last_processor = thread->last_processor;
- if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
- p->active_thread == thread) {
- cause_ast_check(p);
+ if (last_processor != PROCESSOR_NULL &&
+ last_processor->state == PROCESSOR_RUNNING &&
+ last_processor->active_thread == thread) {
+ cause_ast_check(last_processor);
}
thread_unlock(thread);
}
- splx(x);
+ splx(s);
}
void
thread_clear_eager_preempt(thread_t thread)
{
- spl_t x;
-
- x = splsched();
+ spl_t s = splsched();
thread_lock(thread);
+ assert(thread_is_eager_preempt(thread));
+
thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
+ if (thread == current_thread()) {
+ current_processor()->current_is_eagerpreempt = false;
+ }
+
thread_unlock(thread);
- splx(x);
+ splx(s);
}
/*
void
sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
{
- struct processor_sched_statistics *stats;
+ struct sched_statistics *stats;
boolean_t to_realtime = FALSE;
- stats = &processor->processor_data.sched_stats;
+ stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
stats->csw_count++;
if (otherpri >= BASEPRI_REALTIME) {
}
if (processor->state != PROCESSOR_OFF_LINE) {
avail_count++;
+ SCHED(pset_made_schedulable)(processor, pset, false);
}
}
} while ((processor = processor->processor_list) != NULL);
(host_info_t)&hinfo, &count);
assert(kret == KERN_SUCCESS);
- /* We would not want multiple realtime threads running on the
- * same physical core; even for SMT capable machines.
- */
- if (options & QOS_PARALLELISM_REALTIME) {
- return hinfo.physical_cpu;
- }
-
if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
return hinfo.logical_cpu;
} else {
bool
thread_no_smt(thread_t thread)
{
-#if DEBUG || DEVELOPMENT
return sched_allow_NO_SMT_threads && (thread->bound_processor == PROCESSOR_NULL) && ((thread->sched_flags & TH_SFLAG_NO_SMT) || (thread->task->t_flags & TF_NO_SMT));
-#else
- return sched_allow_NO_SMT_threads && (thread->bound_processor == PROCESSOR_NULL) && (thread->sched_flags & TH_SFLAG_NO_SMT);
-#endif
}
bool
* then I cancelled the callback, otherwise I didn't
*/
- uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline,
- memory_order_relaxed);
+ return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
+ relaxed) != 0;
+}
+
+#endif /* __arm64__ */
+
+#if CONFIG_SCHED_EDGE
+
+#define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
+
+/*
+ * sched_edge_pset_running_higher_bucket()
+ *
+ * Routine to calculate cumulative running counts for each scheduling
+ * bucket. This effectively lets the load calculation determine whether a
+ * cluster is running any threads at a QoS lower than the thread being
+ * migrated, etc.
+ */
+
+static void
+sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
+{
+ bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
+
+ /* Edge Scheduler Optimization */
+ for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
+ sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
+ for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
+ running_higher[bucket]++;
+ }
+ }
+}
+
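/*
 * Illustrative sketch (not part of the diff): the cumulative counting done by
 * sched_edge_pset_running_higher_bucket() above. With buckets ordered
 * highest-priority-first (index 0 = fixed priority), a CPU running at bucket b
 * bumps the counters for b and every lower-priority bucket, so counts[b] ends
 * up being "CPUs running at bucket b or higher". The bucket count and per-CPU
 * values below are assumptions of this sketch.
 */
#include <stdio.h>

#define DEMO_NBUCKETS 4	/* e.g. FIXPRI, SHARE_FG, SHARE_UT, SHARE_BG */

int
main(void)
{
	int cpu_bucket[] = { 0, 2, 2, 3 };	/* bucket currently running on each CPU */
	int ncpus = 4;
	unsigned counts[DEMO_NBUCKETS] = { 0 };

	for (int cpu = 0; cpu < ncpus; cpu++) {
		for (int b = cpu_bucket[cpu]; b < DEMO_NBUCKETS; b++) {
			counts[b]++;
		}
	}
	/* counts == { 1, 1, 3, 4 }: e.g. 3 CPUs run at bucket 2 or higher */
	for (int b = 0; b < DEMO_NBUCKETS; b++) {
		printf("bucket %d: %u CPUs at this bucket or higher\n", b, counts[b]);
	}
	return 0;
}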
+/*
+ * sched_update_pset_load_average()
+ *
+ * Updates the load average for each sched bucket for a cluster.
+ * This routine must be called with the pset lock held.
+ */
+void
+sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
+{
+ if (pset->online_processor_count == 0) {
+ /* Looks like the pset is not runnable any more; nothing to do here */
+ return;
+ }
+
+ /*
+ * Edge Scheduler Optimization
+ *
+ * See if more callers of this routine can pass in timestamps to avoid the
+ * mach_absolute_time() call here.
+ */
+
+ if (!curtime) {
+ curtime = mach_absolute_time();
+ }
+ uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
+ int64_t delta_ticks = curtime - last_update;
+ if (delta_ticks < 0) {
+ return;
+ }
+ uint64_t delta_nsecs = 0;
+ absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
- while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline,
- &old_deadline, new_deadline,
- memory_order_relaxed, memory_order_relaxed)) {
- ;
+ if (__improbable(delta_nsecs > UINT32_MAX)) {
+ delta_nsecs = UINT32_MAX;
}
+ uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
+ sched_edge_pset_running_higher_bucket(pset, running_higher);
- /* now old_deadline contains previous value, which might not be the same if it raced */
+ for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
+ uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
+ uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
+ uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / pset->online_processor_count;
- return (old_deadline != 0) ? TRUE : FALSE;
+ /*
+ * For the new load average, multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
+ * Since we want to maintain the load average as a 24.8 fixed-point value for precision, the
+ * new load average needs to be shifted before it can be added to the old load average.
+ */
+ uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
+
+ /*
+ * For extremely parallel workloads, it is important that the load average on a cluster moves from zero to non-zero
+ * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
+ * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
+ */
+ int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
+ boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
+ boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
+ uint64_t load_average;
+ if (load_uptick || load_downtick) {
+ load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
+ } else {
+ /* Indicates a loaded system; use EWMA for load average calculation */
+ load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
+ }
+ os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
+ }
+ os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
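/*
 * Illustrative sketch (not part of the diff): the 24.8 fixed-point EWMA that
 * sched_update_pset_load_average() computes above, in isolation. `old_avg` is
 * a 24.8 value, `depth` an integer run-queue depth, and `dt` the elapsed
 * nanoseconds (already clamped to UINT32_MAX). The constants and demo_* names
 * are assumptions of this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_EWMA_TC_NSECS   10000000u	/* 10ms time constant */
#define DEMO_EWMA_FRAC_BITS  8
#define DEMO_EWMA_ROUND_BIT  (1u << (DEMO_EWMA_FRAC_BITS - 1))

static uint64_t
demo_ewma_update(uint64_t old_avg, uint32_t depth, uint64_t dt)
{
	int old_int = (int)((old_avg + DEMO_EWMA_ROUND_BIT) >> DEMO_EWMA_FRAC_BITS);

	/* Snap instantly on a zero <-> non-zero transition, otherwise blend. */
	if ((old_int == 0) != (depth == 0)) {
		return (uint64_t)depth << DEMO_EWMA_FRAC_BITS;
	}
	return (old_avg * DEMO_EWMA_TC_NSECS +
	    (((uint64_t)depth * dt) << DEMO_EWMA_FRAC_BITS)) /
	    (dt + DEMO_EWMA_TC_NSECS);
}

int
main(void)
{
	uint64_t avg = 2 << DEMO_EWMA_FRAC_BITS;	/* load of 2.0 */

	avg = demo_ewma_update(avg, 4, 10000000);	/* 10ms at depth 4 -> 3.0 */
	printf("load average = %llu.%02llu\n",
	    (unsigned long long)(avg >> DEMO_EWMA_FRAC_BITS),
	    (unsigned long long)((avg & 0xff) * 100 / 256));
	return 0;
}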
-#endif /* __arm64__ */
+void
+sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
+{
+ pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
+ uint64_t avg_thread_execution_time = 0;
+
+ os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
+ old_execution_time_packed.pset_execution_time_packed,
+ new_execution_time_packed.pset_execution_time_packed, relaxed, {
+ uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
+ int64_t delta_ticks = curtime - last_update;
+ if (delta_ticks < 0) {
+ /*
+ * It's possible that another CPU came in and updated the pset_execution_time
+ * before this CPU could do it. Since the average execution time is meant to
+ * be an approximate measure per cluster, ignore the older update.
+ */
+ os_atomic_rmw_loop_give_up(return);
+ }
+ uint64_t delta_nsecs = 0;
+ absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
+
+ uint64_t nanotime = 0;
+ absolutetime_to_nanoseconds(execution_time, &nanotime);
+ uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
+
+ uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
+ uint64_t new_execution_time = (execution_time_us * delta_nsecs);
+
+ avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
+ new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
+ new_execution_time_packed.pset_execution_time_last_update = curtime;
+ });
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
+}
+
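/*
 * Illustrative sketch (not part of the diff): the lock-free update pattern
 * sched_update_pset_avg_execution_time() uses above via os_atomic_rmw_loop(),
 * rewritten with a portable C11 compare-and-swap loop. A 32-bit timestamp and
 * a 32-bit average are packed into one atomic 64-bit word so both fields are
 * read and updated together; a stale update (older timestamp) is simply
 * dropped. The field layout, demo_* names, and the simple halving blend are
 * assumptions of this sketch.
 */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t demo_packed;	/* [63:32] last-update stamp, [31:0] average */

void
demo_update_avg(uint32_t sample, uint32_t now)
{
	uint64_t old = atomic_load_explicit(&demo_packed, memory_order_relaxed);
	uint64_t updated;

	do {
		uint32_t last = (uint32_t)(old >> 32);
		uint32_t avg  = (uint32_t)old;

		if (now < last) {
			return;	/* a newer update already landed; drop this one */
		}
		avg = (avg + sample) / 2;	/* stand-in for the EWMA blend */
		updated = ((uint64_t)now << 32) | avg;
	} while (!atomic_compare_exchange_weak_explicit(&demo_packed, &old, updated,
	    memory_order_relaxed, memory_order_relaxed));
}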
+#else /* CONFIG_SCHED_EDGE */
void
-sched_update_pset_load_average(processor_set_t pset)
+sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
{
- int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
- int new_load_average = (pset->load_average + load) >> 1;
+ int non_rt_load = pset->pset_runq.count;
+ int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
+ int new_load_average = ((int)pset->load_average + load) >> 1;
pset->load_average = new_load_average;
-
#if (DEVELOPMENT || DEBUG)
+#if __AMP__
+ if (pset->pset_cluster_type == PSET_AMP_P) {
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
+ }
+#endif
+#endif
+}
+
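/*
 * Illustrative sketch (not part of the diff): the halving filter the non-Edge
 * sched_update_pset_load_average() above applies. Each update averages the
 * previous fixed-point value with the new sample, so older samples decay by
 * half per update. The shift constant is a demo-only stand-in for
 * PSET_LOAD_NUMERATOR_SHIFT.
 */
#include <stdio.h>

#define DEMO_LOAD_SHIFT 16

int
main(void)
{
	int load_average = 0;
	int samples[] = { 4, 4, 4, 0 };	/* running + runnable threads per update */

	for (int i = 0; i < 4; i++) {
		int load = samples[i] << DEMO_LOAD_SHIFT;
		load_average = (load_average + load) >> 1;
		printf("update %d: load_average = %.2f\n", i,
		    (double)load_average / (1 << DEMO_LOAD_SHIFT));
	}
	return 0;
}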
+void
+sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
+{
+}
+#endif /* CONFIG_SCHED_EDGE */
+
+/* pset is locked */
+static bool
+processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
+{
+ int cpuid = processor->cpu_id;
+#if defined(__x86_64__)
+ if (sched_avoid_cpu0 && (cpuid == 0)) {
+ return false;
+ }
#endif
+
+ cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
+
+ return bit_test(fasttrack_map, cpuid);
}
/* pset is locked */
static processor_t
-choose_processor_for_realtime_thread(processor_set_t pset)
+choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries)
{
#if defined(__x86_64__)
bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
const bool avoid_cpu0 = false;
#endif
- uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask & ~pset->pending_AST_URGENT_cpu_mask);
+ cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
+ if (skip_processor) {
+ bit_clear(cpu_map, skip_processor->cpu_id);
+ }
+
+ cpumap_t primary_map = cpu_map & pset->primary_map;
if (avoid_cpu0) {
- cpu_map = bit_ror64(cpu_map, 1);
+ primary_map = bit_ror64(primary_map, 1);
}
- for (int rotid = lsb_first(cpu_map); rotid >= 0; rotid = lsb_next(cpu_map, rotid)) {
- int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
+ int rotid = lsb_first(primary_map);
+ if (rotid >= 0) {
+ int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
processor_t processor = processor_array[cpuid];
- if (processor->processor_primary != processor) {
- continue;
- }
-
- if (processor->state == PROCESSOR_IDLE) {
- return processor;
- }
-
- if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
- continue;
- }
-
- if (processor->current_pri >= BASEPRI_RTQUEUES) {
- continue;
- }
-
return processor;
}
- if (!sched_allow_rt_smt) {
- return PROCESSOR_NULL;
+ if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
+ goto out;
}
/* Consider secondary processors */
+ cpumap_t secondary_map = cpu_map & ~pset->primary_map;
if (avoid_cpu0) {
/* Also avoid cpu1 */
- cpu_map = bit_ror64(cpu_map, 1);
+ secondary_map = bit_ror64(secondary_map, 2);
}
- for (int rotid = lsb_first(cpu_map); rotid >= 0; rotid = lsb_next(cpu_map, rotid)) {
+ rotid = lsb_first(secondary_map);
+ if (rotid >= 0) {
int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
processor_t processor = processor_array[cpuid];
- if (processor->processor_primary == processor) {
- continue;
- }
+ return processor;
+ }
- if (processor->state == PROCESSOR_IDLE) {
- return processor;
- }
+out:
+ if (skip_processor) {
+ return PROCESSOR_NULL;
+ }
- if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
- continue;
- }
+ /*
+ * If we didn't find an obvious processor to choose, but there are still more CPUs
+ * not already running realtime threads than realtime threads in the realtime run queue,
+ * this thread belongs in this pset, so choose some other processor in this pset
+ * to ensure the thread is enqueued here.
+ */
+ cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
+ if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
+ cpu_map = non_realtime_map;
+ assert(cpu_map != 0);
+ int cpuid = bit_first(cpu_map);
+ assert(cpuid >= 0);
+ return processor_array[cpuid];
+ }
- if (processor->current_pri >= BASEPRI_RTQUEUES) {
- continue;
- }
+ if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
+ goto skip_secondaries;
+ }
- return processor;
+ non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
+ if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
+ cpu_map = non_realtime_map;
+ assert(cpu_map != 0);
+ int cpuid = bit_first(cpu_map);
+ assert(cpuid >= 0);
+ return processor_array[cpuid];
}
+skip_secondaries:
return PROCESSOR_NULL;
}
static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset)
{
- return these_processors_are_running_realtime_threads(pset, pset->primary_map);
+ cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
+ return rt_runq_count(pset) > bit_count(cpu_map);
}
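/*
 * Illustrative sketch (not part of the diff): the count comparison that the
 * rewritten all_available_primaries_are_running_realtime_threads() above
 * reduces to; the same comparison (in the opposite direction) drives the
 * "enqueue here anyway" fallback in choose_processor_for_realtime_thread().
 * A pset is treated as saturated when the realtime run queue is deeper than
 * the number of available CPUs not already running realtime threads. The
 * demo_* names stand in for the pset fields and bit helpers.
 */
#include <stdbool.h>
#include <stdint.h>

static inline int
demo_bit_count(uint64_t map)
{
	return __builtin_popcountll(map);
}

bool
demo_pset_rt_saturated(uint64_t available_primary_map, uint64_t realtime_map,
    int rt_runq_count)
{
	/* CPUs that could take a realtime thread without displacing one */
	uint64_t non_rt_map = available_primary_map & ~realtime_map;

	return rt_runq_count > demo_bit_count(non_rt_map);
}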
+#if defined(__x86_64__)
/* pset is locked */
static bool
these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map)
{
- uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask) & these_map;
-
- for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
- processor_t processor = processor_array[cpuid];
-
- if (processor->state == PROCESSOR_IDLE) {
- return false;
- }
-
- if (processor->state == PROCESSOR_DISPATCHING) {
- return false;
- }
-
- if (processor->state != PROCESSOR_RUNNING) {
- /*
- * All other processor states are considered unavailable to run
- * realtime threads. In particular, we prefer an available secondary
- * processor over the risk of leaving a realtime thread on the run queue
- * while waiting for a processor in PROCESSOR_START state,
- * which should anyway be a rare case.
- */
- continue;
- }
-
- if (processor->current_pri < BASEPRI_RTQUEUES) {
- return false;
- }
- }
-
- return true;
+ cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
+ return rt_runq_count(pset) > bit_count(cpu_map);
}
+#endif
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor)
} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2);
} else if (processor->processor_primary != processor) {
- ok_to_run_realtime_thread = sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset);
+ ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset));
}
#else
(void)pset;
return ok_to_run_realtime_thread;
}
+void
+sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
+{
+ if (drop_lock) {
+ pset_unlock(pset);
+ }
+}
+
void
thread_set_no_smt(bool set)
{
+ if (!system_is_SMT) {
+ /* Not a machine that supports SMT */
+ return;
+ }
+
thread_t thread = current_thread();
spl_t s = splsched();
thread_lock(thread);
if (set) {
thread->sched_flags |= TH_SFLAG_NO_SMT;
- } else {
- thread->sched_flags &= ~TH_SFLAG_NO_SMT;
}
thread_unlock(thread);
splx(s);
return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
}
+extern void task_set_no_smt(task_t);
+void
+task_set_no_smt(task_t task)
+{
+ if (!system_is_SMT) {
+ /* Not a machine that supports SMT */
+ return;
+ }
+
+ if (task == TASK_NULL) {
+ task = current_task();
+ }
+
+ task_lock(task);
+ task->t_flags |= TF_NO_SMT;
+ task_unlock(task);
+}
+
#if DEBUG || DEVELOPMENT
extern void sysctl_task_set_no_smt(char no_smt);
void
sysctl_task_set_no_smt(char no_smt)
{
- thread_t thread = current_thread();
- task_t task = thread->task;
+ if (!system_is_SMT) {
+ /* Not a machine that supports SMT */
+ return;
+ }
+
+ task_t task = current_task();
+ task_lock(task);
if (no_smt == '1') {
task->t_flags |= TF_NO_SMT;
- } else {
- task->t_flags &= ~TF_NO_SMT;
}
+ task_unlock(task);
}
extern char sysctl_task_get_no_smt(void);
char
sysctl_task_get_no_smt(void)
{
- thread_t thread = current_thread();
- task_t task = thread->task;
+ task_t task = current_task();
if (task->t_flags & TF_NO_SMT) {
return '1';
}
return '0';
}
-#endif
+#endif /* DEVELOPMENT || DEBUG */
+
+
+__private_extern__ void
+thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
+{
+#if __AMP__
+ spl_t s = splsched();
+ thread_lock(thread);
+ thread->sched_flags &= ~(TH_SFLAG_ECORE_ONLY | TH_SFLAG_PCORE_ONLY | TH_SFLAG_BOUND_SOFT);
+ if (soft_bound) {
+ thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
+ }
+ switch (cluster_type) {
+ case 'e':
+ case 'E':
+ thread->sched_flags |= TH_SFLAG_ECORE_ONLY;
+ break;
+ case 'p':
+ case 'P':
+ thread->sched_flags |= TH_SFLAG_PCORE_ONLY;
+ break;
+ default:
+ break;
+ }
+ thread_unlock(thread);
+ splx(s);
+
+ if (thread == current_thread()) {
+ thread_block(THREAD_CONTINUE_NULL);
+ }
+#else /* __AMP__ */
+ (void)thread;
+ (void)cluster_type;
+ (void)soft_bound;
+#endif /* __AMP__ */
+}
+
+#if DEVELOPMENT || DEBUG
+extern int32_t sysctl_get_bound_cpuid(void);
+int32_t
+sysctl_get_bound_cpuid(void)
+{
+ int32_t cpuid = -1;
+ thread_t self = current_thread();
+
+ processor_t processor = self->bound_processor;
+ if (processor == NULL) {
+ cpuid = -1;
+ } else {
+ cpuid = processor->cpu_id;
+ }
+
+ return cpuid;
+}
+
+extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
+kern_return_t
+sysctl_thread_bind_cpuid(int32_t cpuid)
+{
+ processor_t processor = PROCESSOR_NULL;
+
+ if (cpuid == -1) {
+ goto unbind;
+ }
+
+ if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
+ return KERN_INVALID_VALUE;
+ }
+
+ processor = processor_array[cpuid];
+ if (processor == PROCESSOR_NULL) {
+ return KERN_INVALID_VALUE;
+ }
+
+#if __AMP__
+
+ thread_t thread = current_thread();
+
+ if (thread->sched_flags & (TH_SFLAG_ECORE_ONLY | TH_SFLAG_PCORE_ONLY)) {
+ if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
+ /* Cannot hard-bind an already hard-cluster-bound thread */
+ return KERN_NOT_SUPPORTED;
+ }
+ }
+
+#endif /* __AMP__ */
+
+unbind:
+ thread_bind(processor);
+
+ thread_block(THREAD_CONTINUE_NULL);
+ return KERN_SUCCESS;
+}
+#endif /* DEVELOPMENT || DEBUG */