X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/3e170ce000f1506b7b5d2c5c7faec85ceabb573d..d9a64523371fa019c4575bb400cbbc3a50ac9903:/osfmk/kern/sched_prim.c diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 9a0a9427c..751b57417 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,12 +76,12 @@ #include #include #include +#include -#ifdef CONFIG_MACH_APPROXIMATE_TIME #include -#endif #include +#include #include #include #include @@ -91,6 +91,9 @@ #include #include #include +#if MONOTONIC +#include +#endif /* MONOTONIC */ #include #include #include @@ -102,36 +105,40 @@ #include #include #include +#include +#include #include #include #include +#include #include +#include +#include #include - +#include +#include +#include #include +#include +#include -#if defined(CONFIG_TELEMETRY) && defined(CONFIG_SCHED_TIMESHARE_CORE) -#include -#endif - -struct rt_queue rt_runq; +int rt_runq_count(processor_set_t pset) +{ + return atomic_load_explicit(&SCHED(rt_runq)(pset)->count, memory_order_relaxed); +} -uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0; +void rt_runq_count_incr(processor_set_t pset) +{ + atomic_fetch_add_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed); +} -/* Lock RT runq, must be done with interrupts disabled (under splsched()) */ -#if __SMP__ -decl_simple_lock_data(static,rt_lock); -#define rt_lock_init() simple_lock_init(&rt_lock, 0) -#define rt_lock_lock() simple_lock(&rt_lock) -#define rt_lock_unlock() simple_unlock(&rt_lock) -#else -#define rt_lock_init() do { } while(0) -#define rt_lock_lock() do { } while(0) -#define rt_lock_unlock() do { } while(0) -#endif +void rt_runq_count_decr(processor_set_t pset) +{ + atomic_fetch_sub_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed); +} #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */ int default_preemption_rate = DEFAULT_PREEMPTION_RATE; @@ -173,17 +180,16 @@ uint32_t min_rt_quantum; #if defined(CONFIG_SCHED_TIMESHARE_CORE) -unsigned sched_tick; -uint32_t sched_tick_interval; -#if defined(CONFIG_TELEMETRY) -uint32_t sched_telemetry_interval; -#endif /* CONFIG_TELEMETRY */ +unsigned sched_tick; +uint32_t sched_tick_interval; -uint32_t sched_pri_shift = INT8_MAX; -uint32_t sched_background_pri_shift = INT8_MAX; -uint32_t sched_combined_fgbg_pri_shift = INT8_MAX; +/* Timeshare load calculation interval (15ms) */ +uint32_t sched_load_compute_interval_us = 15000; +uint64_t sched_load_compute_interval_abs; +static _Atomic uint64_t sched_load_compute_deadline; + +uint32_t sched_pri_shifts[TH_BUCKET_MAX]; uint32_t sched_fixed_shift; -uint32_t sched_use_combined_fgbg_decay = 0; uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */ @@ -200,14 +206,22 @@ int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT; uint64_t timer_deadline_tracking_bin_1; uint64_t timer_deadline_tracking_bin_2; +#endif /* CONFIG_SCHED_TIMESHARE_CORE */ + thread_t sched_maintenance_thread; -#endif /* CONFIG_SCHED_TIMESHARE_CORE */ +#if __arm__ || __arm64__ +/* interrupts disabled lock to guard recommended cores state */ +decl_simple_lock_data(static,sched_recommended_cores_lock); +static void sched_recommended_cores_maintenance(void); +static void sched_update_recommended_cores(uint32_t recommended_cores); -uint64_t sched_one_second_interval; +uint64_t 
perfcontrol_failsafe_starvation_threshold; +extern char *proc_name_address(struct proc *p); + +#endif /* __arm__ || __arm64__ */ -uint32_t sched_run_count, sched_share_count, sched_background_count; -uint32_t sched_load_average, sched_mach_factor; +uint64_t sched_one_second_interval; /* Forwards */ @@ -218,11 +232,6 @@ static void preempt_pri_init(void); #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -static thread_t thread_select( - thread_t thread, - processor_t processor, - ast_t reason); - #if CONFIG_SCHED_IDLE_IN_PLACE static thread_t thread_select_idle( thread_t thread, @@ -243,9 +252,6 @@ static void processor_setrun( thread_t thread, integer_t options); -static void -sched_realtime_init(void); - static void sched_realtime_timebase_init(void); @@ -269,7 +275,7 @@ sched_vm_group_maintenance(void); #if defined(CONFIG_SCHED_TIMESHARE_CORE) int8_t sched_load_shifts[NRQS]; -int sched_preempt_pri[NRQBM]; +bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)]; #endif /* CONFIG_SCHED_TIMESHARE_CORE */ const struct sched_dispatch_table *sched_current_dispatch = NULL; @@ -293,38 +299,25 @@ const struct sched_dispatch_table *sched_current_dispatch = NULL; */ char sched_string[SCHED_STRING_MAX_LENGTH]; -uint32_t sched_debug_flags; +uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS; /* Global flag which indicates whether Background Stepper Context is enabled */ static int cpu_throttle_enabled = 1; -void -sched_init(void) +#if DEBUG + +/* Since using the indirect function dispatch table has a negative impact on + * context switch performance, only allow DEBUG kernels to use that mechanism. + */ +static void +sched_init_override(void) { char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' }; /* Check for runtime selection of the scheduler algorithm */ if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) { - /* If no boot-args override, look in device tree */ - if (!PE_get_default("kern.sched", sched_arg, - SCHED_STRING_MAX_LENGTH)) { - sched_arg[0] = '\0'; - } - } - - - if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) { - /* No boot-args, check in device tree */ - if (!PE_get_default("kern.sched_pri_decay_limit", - &sched_pri_decay_band_limit, - sizeof(sched_pri_decay_band_limit))) { - /* Allow decay all the way to normal limits */ - sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT; - } + sched_arg[0] = '\0'; } - - kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit); - if (strlen(sched_arg) > 0) { if (0) { /* Allow pattern below */ @@ -334,14 +327,6 @@ sched_init(void) } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) { sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; #endif -#if defined(CONFIG_SCHED_PROTO) - } else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) { - sched_current_dispatch = &sched_proto_dispatch; -#endif -#if defined(CONFIG_SCHED_GRRR) - } else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) { - sched_current_dispatch = &sched_grrr_dispatch; -#endif #if defined(CONFIG_SCHED_MULTIQ) } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) { sched_current_dispatch = &sched_multiq_dispatch; @@ -360,28 +345,48 @@ sched_init(void) kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name)); } else { #if defined(CONFIG_SCHED_MULTIQ) - sched_current_dispatch = &sched_multiq_dispatch; + sched_current_dispatch = &sched_dualq_dispatch; #elif 
defined(CONFIG_SCHED_TRADITIONAL) sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; -#elif defined(CONFIG_SCHED_PROTO) - sched_current_dispatch = &sched_proto_dispatch; -#elif defined(CONFIG_SCHED_GRRR) - sched_current_dispatch = &sched_grrr_dispatch; #else #error No default scheduler implementation #endif kprintf("Scheduler: Default of %s\n", SCHED(sched_name)); } +} - strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string)); +#endif /* DEBUG */ + +void +sched_init(void) +{ +#if DEBUG + sched_init_override(); +#else /* DEBUG */ + kprintf("Scheduler: Default of %s\n", SCHED(sched_name)); +#endif /* DEBUG */ + + if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) { + /* No boot-args, check in device tree */ + if (!PE_get_default("kern.sched_pri_decay_limit", + &sched_pri_decay_band_limit, + sizeof(sched_pri_decay_band_limit))) { + /* Allow decay all the way to normal limits */ + sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT; + } + } + kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit); + if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) { kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags); } - + strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string)); + + cpu_quiescent_counter_init(); + SCHED(init)(); - sched_realtime_init(); - ast_init(); + SCHED(rt_init)(&pset0); sched_timer_deadline_tracking_init(); SCHED(pset_init)(&pset0); @@ -455,6 +460,10 @@ sched_timeshare_timebase_init(void) assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); sched_tick_interval = (uint32_t)abstime; + /* timeshare load calculation interval & deadline initialization */ + clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs); + sched_load_compute_deadline = sched_load_compute_interval_abs; + /* * Compute conversion factor from usage to * timesharing priorities with 5/8 ** n aging. 
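
/*
 * Aside on the load-computation deadline initialized above: a single
 * _Atomic deadline lets whichever CPU first notices that the interval has
 * expired elect itself to recompute the timeshare load, with no dedicated
 * lock.  The sketch below shows that pattern in portable C11; the names
 * are illustrative stand-ins rather than the kernel's own symbols, and the
 * caller is assumed to pass the current absolute time.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t         load_compute_interval_abs;  /* e.g. 15ms in abstime units */
static _Atomic uint64_t load_compute_deadline;

/* Returns true for exactly one caller per expired interval. */
static bool
should_compute_load(uint64_t ctime)
{
    uint64_t deadline = atomic_load_explicit(&load_compute_deadline,
        memory_order_relaxed);

    if (ctime < deadline) {
        return false;   /* interval has not elapsed yet */
    }

    /* The CPU that swaps the deadline forward performs the update. */
    return atomic_compare_exchange_strong_explicit(&load_compute_deadline,
        &deadline, ctime + load_compute_interval_abs,
        memory_order_relaxed, memory_order_relaxed);
}
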
@@ -464,31 +473,59 @@ sched_timeshare_timebase_init(void) abstime >>= 1; sched_fixed_shift = shift; + for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++) + sched_pri_shifts[i] = INT8_MAX; + max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum; sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum; - + max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum; thread_depress_time = 1 * std_quantum; default_timeshare_computation = std_quantum / 2; default_timeshare_constraint = std_quantum; -#if defined(CONFIG_TELEMETRY) - /* interval for high frequency telemetry */ - clock_interval_to_absolutetime_interval(10, NSEC_PER_MSEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); - sched_telemetry_interval = (uint32_t)abstime; -#endif +#if __arm__ || __arm64__ + perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval); +#endif /* __arm__ || __arm64__ */ } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -static void -sched_realtime_init(void) +void +pset_rt_init(processor_set_t pset) +{ + rt_lock_init(pset); + + pset->rt_runq.count = 0; + queue_init(&pset->rt_runq.queue); + memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats); +} + +rt_queue_t +sched_rtglobal_runq(processor_set_t pset) +{ + (void)pset; + + return &pset0.rt_runq; +} + +void +sched_rtglobal_init(processor_set_t pset) { - rt_lock_init(); + if (pset == &pset0) { + return pset_rt_init(pset); + } + + /* Only pset0 rt_runq is used, so make it easy to detect + * buggy accesses to others. + */ + memset(&pset->rt_runq, 0xfd, sizeof pset->rt_runq); +} - rt_runq.count = 0; - queue_init(&rt_runq.queue); +void +sched_rtglobal_queue_shutdown(processor_t processor) +{ + (void)processor; } static void @@ -509,6 +546,23 @@ sched_realtime_timebase_init(void) } +void +sched_check_spill(processor_set_t pset, thread_t thread) +{ + (void)pset; + (void)thread; + + return; +} + +bool +sched_thread_should_yield(processor_t processor, thread_t thread) +{ + (void)thread; + + return (!SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0); +} + #if defined(CONFIG_SCHED_TIMESHARE_CORE) /* @@ -531,10 +585,6 @@ load_shift_init(void) kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor); } - if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) { - kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay); - } - if (sched_decay_penalty == 0) { /* * There is no penalty for timeshare threads for using too much @@ -567,13 +617,13 @@ load_shift_init(void) static void preempt_pri_init(void) { - int i, *p = sched_preempt_pri; + bitmap_t *p = sched_preempt_pri; - for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) - setbit(i, p); + for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) + bitmap_set(p, i); - for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) - setbit(i, p); + for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) + bitmap_set(p, i); } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -589,6 +639,8 @@ thread_timer_expire( thread_t thread = p0; spl_t s; + assert_thread_magic(thread); + s = splsched(); thread_lock(thread); if (--thread->wait_timer_active == 0) { @@ -620,6 +672,7 @@ thread_unblock( boolean_t ready_for_runq = FALSE; thread_t cthread = current_thread(); uint32_t new_run_count; + int old_thread_state; /* * Set wait_result. 
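
/*
 * Aside on the relaxed atomics used for the per-pset realtime run-queue
 * count (rt_runq_count / _incr / _decr earlier in this change): the count
 * is only an advisory hint -- sched_thread_should_yield() above just asks
 * "is anything else runnable?" -- so relaxed ordering suffices and the
 * queue itself remains protected by its own lock.  A minimal user-space
 * analogue with illustrative names:
 */
#include <stdatomic.h>
#include <stdbool.h>

struct rt_counter {
    _Atomic int count;
};

static int
rt_count(struct rt_counter *c)
{
    return atomic_load_explicit(&c->count, memory_order_relaxed);
}

static void
rt_count_incr(struct rt_counter *c)
{
    atomic_fetch_add_explicit(&c->count, 1, memory_order_relaxed);
}

static void
rt_count_decr(struct rt_counter *c)
{
    atomic_fetch_sub_explicit(&c->count, 1, memory_order_relaxed);
}

/* Shape of sched_thread_should_yield(): yield if competing work exists. */
static bool
should_yield(bool local_runq_nonempty, struct rt_counter *pset_rt)
{
    return local_runq_nonempty || rt_count(pset_rt) > 0;
}
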
@@ -639,29 +692,27 @@ thread_unblock( * Update scheduling state: not waiting, * set running. */ - thread->state &= ~(TH_WAIT|TH_UNINT); + old_thread_state = thread->state; + thread->state = (old_thread_state | TH_RUN) & + ~(TH_WAIT|TH_UNINT|TH_WAIT_REPORT); - if (!(thread->state & TH_RUN)) { - thread->state |= TH_RUN; - thread->last_made_runnable_time = mach_approximate_time(); + if ((old_thread_state & TH_RUN) == 0) { + uint64_t ctime = mach_approximate_time(); + thread->last_made_runnable_time = thread->last_basepri_change_time = ctime; + timer_start(&thread->runnable_timer, ctime); ready_for_runq = TRUE; - (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + if (old_thread_state & TH_WAIT_REPORT) { + (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + } - /* - * Update run counts. - */ + /* Update the runnable thread count */ new_run_count = sched_run_incr(thread); - if (thread->sched_mode == TH_MODE_TIMESHARE) { - sched_share_incr(thread); - - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_incr(thread); - } } else { /* - * Signal if idling on another processor. + * Either the thread is idling in place on another processor, + * or it hasn't finished context switching yet. */ #if CONFIG_SCHED_IDLE_IN_PLACE if (thread->state & TH_IDLE) { @@ -673,8 +724,11 @@ thread_unblock( #else assert((thread->state & TH_IDLE) == 0); #endif - - new_run_count = sched_run_count; /* updated in thread_select_idle() */ + /* + * The run count is only dropped after the context switch completes + * and the thread is still waiting, so we should not run_incr here + */ + new_run_count = sched_run_buckets[TH_BUCKET_RUN]; } @@ -694,6 +748,7 @@ thread_unblock( thread->quantum_remaining = 0; thread->computation_metered = 0; thread->reason = AST_NONE; + thread->block_hint = kThreadWaitNone; /* Obtain power-relevant interrupt and "platform-idle exit" statistics. * We also account for "double hop" thread signaling via @@ -705,7 +760,6 @@ thread_unblock( ml_get_power_state(&aticontext, &pidle); if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) { - ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1); DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info); uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd); @@ -718,32 +772,46 @@ thread_unblock( thread->thread_timer_wakeups_bin_2++; } + ledger_credit_thread(thread, thread->t_ledger, + task_ledgers.interrupt_wakeups, 1); if (pidle) { - ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1); + ledger_credit_thread(thread, thread->t_ledger, + task_ledgers.platform_idle_wakeups, 1); } } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) { + /* TODO: what about an interrupt that does a wake taken on a callout thread? 
*/ if (cthread->callout_woken_from_icontext) { - ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1); + ledger_credit_thread(thread, thread->t_ledger, + task_ledgers.interrupt_wakeups, 1); thread->thread_callout_interrupt_wakeups++; + if (cthread->callout_woken_from_platform_idle) { - ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1); + ledger_credit_thread(thread, thread->t_ledger, + task_ledgers.platform_idle_wakeups, 1); thread->thread_callout_platform_idle_wakeups++; } - + cthread->callout_woke_thread = TRUE; } } - + if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) { thread->callout_woken_from_icontext = aticontext; thread->callout_woken_from_platform_idle = pidle; thread->callout_woke_thread = FALSE; } +#if KPERF + if (ready_for_runq) { + kperf_make_runnable(thread, aticontext); + } +#endif /* KPERF */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, new_run_count, 0); + (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, + sched_run_buckets[TH_BUCKET_RUN], 0); DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info); @@ -768,6 +836,8 @@ thread_go( thread_t thread, wait_result_t wresult) { + assert_thread_magic(thread); + assert(thread->at_safe_point == FALSE); assert(thread->wait_event == NO_EVENT64); assert(thread->waitq == NULL); @@ -776,8 +846,13 @@ thread_go( assert(thread->state & TH_WAIT); - if (thread_unblock(thread, wresult)) + if (thread_unblock(thread, wresult)) { +#if SCHED_TRACE_THREAD_WAKEUPS + backtrace(&thread->thread_wakeup_bt[0], + (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t))); +#endif thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); + } return (KERN_SUCCESS); } @@ -795,12 +870,12 @@ __private_extern__ wait_result_t thread_mark_wait_locked( thread_t thread, - wait_interrupt_t interruptible) + wait_interrupt_t interruptible_orig) { - boolean_t at_safe_point; + boolean_t at_safe_point; + wait_interrupt_t interruptible = interruptible_orig; - assert(thread == current_thread()); - assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2))); + assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2|TH_WAIT_REPORT))); /* * The thread may have certain types of interrupts/aborts masked @@ -808,6 +883,7 @@ thread_mark_wait_locked( * are OK, we have to honor mask settings (outer-scoped code may * not be able to handle aborts at the moment). */ + interruptible &= TH_OPT_INTMASK; if (interruptible > (thread->options & TH_OPT_INTMASK)) interruptible = thread->options & TH_OPT_INTMASK; @@ -821,13 +897,34 @@ thread_mark_wait_locked( if ( !(thread->state & TH_TERMINATE)) DTRACE_SCHED(sleep); - thread->state |= (interruptible) ? 
TH_WAIT : (TH_WAIT | TH_UNINT); + int state_bits = TH_WAIT; + if (!interruptible) { + state_bits |= TH_UNINT; + } + if (thread->sched_call) { + wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER; + if (is_kerneltask(thread->task)) { + mask = THREAD_WAIT_NOREPORT_KERNEL; + } + if ((interruptible_orig & mask) == 0) { + state_bits |= TH_WAIT_REPORT; + } + } + thread->state |= state_bits; thread->at_safe_point = at_safe_point; + + /* TODO: pass this through assert_wait instead, have + * assert_wait just take a struct as an argument */ + assert(!thread->block_hint); + thread->block_hint = thread->pending_block_hint; + thread->pending_block_hint = kThreadWaitNone; + return (thread->wait_result = THREAD_WAITING); + } else { + if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) + thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; } - else - if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) - thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; + thread->pending_block_hint = kThreadWaitNone; return (thread->wait_result = THREAD_INTERRUPTED); } @@ -858,28 +955,6 @@ thread_interrupt_level( return result; } -/* - * Check to see if an assert wait is possible, without actually doing one. - * This is used by debug code in locks and elsewhere to verify that it is - * always OK to block when trying to take a blocking lock (since waiting - * for the actual assert_wait to catch the case may make it hard to detect - * this case. - */ -boolean_t -assert_wait_possible(void) -{ - - thread_t thread; - -#if DEBUG - if(debug_mode) return TRUE; /* Always succeed in debug mode */ -#endif - - thread = current_thread(); - - return (thread == NULL || waitq_wait_possible(thread)); -} - /* * assert_wait: * @@ -896,13 +971,25 @@ assert_wait( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), 0, 0, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0); struct waitq *waitq; waitq = global_eventq(event); return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER); } +/* + * assert_wait_queue: + * + * Return the global waitq for the specified event + */ +struct waitq * +assert_wait_queue( + event_t event) +{ + return global_eventq(event); +} + wait_result_t assert_wait_timeout( event_t event, @@ -923,13 +1010,12 @@ assert_wait_timeout( s = splsched(); waitq_lock(waitq); - thread_lock(thread); clock_interval_to_deadline(interval, scale_factor, &deadline); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, @@ -937,7 +1023,6 @@ assert_wait_timeout( deadline, TIMEOUT_NO_LEEWAY, thread); - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -974,18 +1059,16 @@ assert_wait_timeout_with_leeway( s = splsched(); waitq_lock(waitq); - thread_lock(thread); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, urgency, deadline, slop, thread); - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -1009,17 +1092,15 @@ assert_wait_deadline( s = splsched(); waitq_lock(waitq); - thread_lock(thread); 
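
/*
 * For orientation -- not part of this change -- the assert_wait family
 * being reworked here is normally used in the following shape: queue the
 * thread on the event, drop the lock, then block.  A wakeup that arrives
 * between assert_wait() and thread_block() is not lost because the thread
 * is already on the wait queue.  The example_obj structure and its mutex
 * are hypothetical; assert_wait(), thread_block() and thread_wakeup() are
 * the real entry points from this file.
 */
#include <kern/locks.h>
#include <kern/sched_prim.h>

struct example_obj {
    lck_mtx_t   lock;
    boolean_t   ready;
};

static void
example_wait_until_ready(struct example_obj *obj)
{
    lck_mtx_lock(&obj->lock);
    while (!obj->ready) {
        assert_wait((event_t)&obj->ready, THREAD_UNINT);
        lck_mtx_unlock(&obj->lock);
        thread_block(THREAD_CONTINUE_NULL);
        lck_mtx_lock(&obj->lock);
    }
    lck_mtx_unlock(&obj->lock);
}

static void
example_mark_ready(struct example_obj *obj)
{
    lck_mtx_lock(&obj->lock);
    obj->ready = TRUE;
    lck_mtx_unlock(&obj->lock);
    thread_wakeup((event_t)&obj->ready);
}
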
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread); - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -1045,18 +1126,15 @@ assert_wait_deadline_with_leeway( s = splsched(); waitq_lock(waitq); - thread_lock(thread); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, urgency, deadline, leeway, thread); - - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -1309,21 +1387,19 @@ clear_wait_internal( thread_t thread, wait_result_t wresult) { - uint32_t i = LockTimeOut; + uint32_t i = LockTimeOutUsec; struct waitq *waitq = thread->waitq; - + do { if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) return (KERN_FAILURE); if (waitq != NULL) { - assert(waitq_irq_safe(waitq)); //irqs are already disabled! - if (waitq_lock_try(waitq)) { - waitq_pull_thread_locked(waitq, thread); - waitq_unlock(waitq); - } else { + if (!waitq_pull_thread_locked(waitq, thread)) { thread_unlock(thread); delay(1); + if (i > 0 && !machine_timeout_suspended()) + i--; thread_lock(thread); if (waitq != thread->waitq) return KERN_NOT_WAITING; @@ -1336,7 +1412,7 @@ clear_wait_internal( return (thread_go(thread, wresult)); else return (KERN_NOT_WAITING); - } while ((--i > 0) || machine_timeout_suspended()); + } while (i > 0); panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n", thread, waitq, cpu_number()); @@ -1381,33 +1457,75 @@ clear_wait( */ kern_return_t thread_wakeup_prim( - event_t event, - boolean_t one_thread, - wait_result_t result) + event_t event, + boolean_t one_thread, + wait_result_t result) { - return (thread_wakeup_prim_internal(event, one_thread, result, -1)); + if (__improbable(event == NO_EVENT)) + panic("%s() called with NO_EVENT", __func__); + + struct waitq *wq = global_eventq(event); + + if (one_thread) + return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); + else + return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); } +/* + * Wakeup a specified thread if and only if it's waiting for this event + */ +kern_return_t +thread_wakeup_thread( + event_t event, + thread_t thread) +{ + if (__improbable(event == NO_EVENT)) + panic("%s() called with NO_EVENT", __func__); + + if (__improbable(thread == THREAD_NULL)) + panic("%s() called with THREAD_NULL", __func__); + + struct waitq *wq = global_eventq(event); + + return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED); +} +/* + * Wakeup a thread waiting on an event and promote it to a priority. + * + * Requires woken thread to un-promote itself when done. 
+ */ kern_return_t -thread_wakeup_prim_internal( - event_t event, - boolean_t one_thread, - wait_result_t result, - int priority) +thread_wakeup_one_with_pri( + event_t event, + int priority) { if (__improbable(event == NO_EVENT)) panic("%s() called with NO_EVENT", __func__); - struct waitq *wq; + struct waitq *wq = global_eventq(event); - wq = global_eventq(event); - priority = (priority == -1 ? WAITQ_ALL_PRIORITIES : priority); + return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); +} - if (one_thread) - return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, priority); - else - return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, priority); +/* + * Wakeup a thread waiting on an event, + * promote it to a priority, + * and return a reference to the woken thread. + * + * Requires woken thread to un-promote itself when done. + */ +thread_t +thread_wakeup_identify(event_t event, + int priority) +{ + if (__improbable(event == NO_EVENT)) + panic("%s() called with NO_EVENT", __func__); + + struct waitq *wq = global_eventq(event); + + return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); } /* @@ -1640,7 +1758,7 @@ int sched_smt_balance = 1; #if __SMP__ /* Invoked with pset locked, returns with pset unlocked */ -static void +void sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) { processor_t ast_processor = NULL; @@ -1663,19 +1781,21 @@ sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) { processor_t sprocessor; - sprocessor = (processor_t)queue_first(&cpset->active_queue); - - while (!queue_end(&cpset->active_queue, (queue_entry_t)sprocessor)) { - if ((sprocessor->state == PROCESSOR_RUNNING) && - (sprocessor->processor_primary != sprocessor) && - (sprocessor->processor_primary->state == PROCESSOR_RUNNING) && - (sprocessor->current_pri < BASEPRI_RTQUEUES) && - ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) { - assert(sprocessor != cprocessor); - ast_processor = sprocessor; - break; + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] & + ~cpset->primary_map); + for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) { + sprocessor = processor_array[cpuid]; + if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) && + (sprocessor->current_pri < BASEPRI_RTQUEUES)) { + + ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); + if (ipi_type != SCHED_IPI_NONE) { + assert(sprocessor != cprocessor); + ast_processor = sprocessor; + break; + } } - sprocessor = (processor_t)queue_next((queue_entry_t)sprocessor); } smt_balance_exit: @@ -1683,11 +1803,22 @@ smt_balance_exit: if (ast_processor) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0); - cause_ast_check(ast_processor); + sched_ipi_perform(ast_processor, ipi_type); } } +#else +/* Invoked with pset locked, returns with pset unlocked */ +void +sched_SMT_balance(__unused processor_t cprocessor, processor_set_t cpset) +{ + pset_unlock(cpset); +} #endif /* __SMP__ */ +static processor_t choose_processor_for_realtime_thread(processor_set_t pset); +static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset); +int sched_allow_rt_smt = 1; + /* * thread_select: * @@ -1696,10 +1827,9 @@ smt_balance_exit: * May select the current thread, 
which must be locked. */ static thread_t -thread_select( - thread_t thread, - processor_t processor, - ast_t reason) +thread_select(thread_t thread, + processor_t processor, + ast_t *reason) { processor_set_t pset = processor->processor_set; thread_t new_thread = THREAD_NULL; @@ -1714,9 +1844,7 @@ thread_select( if (SCHED(can_update_priority)(thread)) SCHED(update_priority)(thread); - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); pset_lock(pset); @@ -1739,13 +1867,22 @@ thread_select( * An exception is that bound threads are dispatched to a processor without going through * choose_processor(), so in those cases we should continue trying to dequeue work. */ - if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) { - goto idle; + if (!SCHED(processor_bound_count)(processor)) { + if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) { + goto idle; + } + + /* There are no idle primaries */ + + if (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) { + bool secondary_can_run_realtime_thread = sched_allow_rt_smt && rt_runq_count(pset) && all_available_primaries_are_running_realtime_threads(pset); + if (!secondary_can_run_realtime_thread) { + goto idle; + } + } } } - rt_lock_lock(); - /* * Test to see if the current thread should continue * to run on this processor. Must not be attempting to wait, and not @@ -1757,80 +1894,154 @@ thread_select( * This code is very insanely tricky. */ - if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) && - (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) && - (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) && - (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) { + /* i.e. not waiting, not TH_SUSP'ed */ + boolean_t still_running = ((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN); + + /* + * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads. + * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors' + */ + boolean_t needs_smt_rebalance = (thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor); + + boolean_t affinity_mismatch = (thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset); + + boolean_t bound_elsewhere = (thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor); + + boolean_t avoid_processor = (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)); + + if (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor) { /* + * This thread is eligible to keep running on this processor. + * * RT threads with un-expired quantum stay on processor, * unless there's a valid RT thread with an earlier deadline. 
*/ if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) { - if (rt_runq.count > 0) { - thread_t next_rt; - - next_rt = (thread_t)queue_first(&rt_runq.queue); + if (rt_runq_count(pset) > 0) { - assert(next_rt->runq == THREAD_ON_RT_RUNQ); - - if (next_rt->realtime.deadline < processor->deadline && - (next_rt->bound_processor == PROCESSOR_NULL || - next_rt->bound_processor == processor)) { - /* The next RT thread is better, so pick it off the runqueue. */ - goto pick_new_rt_thread; + rt_lock_lock(pset); + + if (rt_runq_count(pset) > 0) { + + thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links); + + if (next_rt->realtime.deadline < processor->deadline && + (next_rt->bound_processor == PROCESSOR_NULL || + next_rt->bound_processor == processor)) { + /* The next RT thread is better, so pick it off the runqueue. */ + goto pick_new_rt_thread; + } } + + rt_lock_unlock(pset); } /* This is still the best RT thread to run. */ processor->deadline = thread->realtime.deadline; - rt_lock_unlock(); + sched_update_pset_load_average(pset); + + processor_t next_rt_processor = PROCESSOR_NULL; + sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE; + + if (rt_runq_count(pset) > 0) { + next_rt_processor = choose_processor_for_realtime_thread(pset); + if (next_rt_processor) { + next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT); + } + } pset_unlock(pset); + if (next_rt_processor) { + sched_ipi_perform(next_rt_processor, next_rt_ipi_type); + } + return (thread); } - if ((rt_runq.count == 0) && + if ((rt_runq_count(pset) == 0) && SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) { /* This thread is still the highest priority runnable (non-idle) thread */ processor->deadline = UINT64_MAX; - rt_lock_unlock(); + sched_update_pset_load_average(pset); pset_unlock(pset); return (thread); } + } else { + /* + * This processor must context switch. + * If it's due to a rebalance, we should aggressively find this thread a new home. + */ + if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) + *reason |= AST_REBALANCE; } /* OK, so we're not going to run the current thread. Look at the RT queue. 
*/ - if (rt_runq.count > 0) { - thread_t next_rt = (thread_t)queue_first(&rt_runq.queue); + if (rt_runq_count(pset) > 0) { - assert(next_rt->runq == THREAD_ON_RT_RUNQ); + rt_lock_lock(pset); - if (__probable((next_rt->bound_processor == PROCESSOR_NULL || - (next_rt->bound_processor == processor)))) { + if (rt_runq_count(pset) > 0) { + thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links); + + if (__probable((next_rt->bound_processor == PROCESSOR_NULL || + (next_rt->bound_processor == processor)))) { pick_new_rt_thread: - new_thread = (thread_t)dequeue_head(&rt_runq.queue); + new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links); - new_thread->runq = PROCESSOR_NULL; - SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); - rt_runq.count--; + new_thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset)); + rt_runq_count_decr(pset); - processor->deadline = new_thread->realtime.deadline; + processor->deadline = new_thread->realtime.deadline; + processor_state_update_from_thread(processor, new_thread); - rt_lock_unlock(); - pset_unlock(pset); + rt_lock_unlock(pset); + sched_update_pset_load_average(pset); - return (new_thread); + processor_t ast_processor = PROCESSOR_NULL; + processor_t next_rt_processor = PROCESSOR_NULL; + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE; + + if (processor->processor_secondary != NULL) { + processor_t sprocessor = processor->processor_secondary; + if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) { + ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); + ast_processor = sprocessor; + } + } + if (rt_runq_count(pset) > 0) { + next_rt_processor = choose_processor_for_realtime_thread(pset); + if (next_rt_processor) { + next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT); + } + } + pset_unlock(pset); + + if (ast_processor) { + sched_ipi_perform(ast_processor, ipi_type); + } + + if (next_rt_processor) { + sched_ipi_perform(next_rt_processor, next_rt_ipi_type); + } + + return (new_thread); + } } + + rt_lock_unlock(pset); } processor->deadline = UINT64_MAX; - rt_lock_unlock(); /* No RT threads, so let's look at the regular threads. */ - if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) { + if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) { + sched_update_pset_load_average(pset); + processor_state_update_from_thread(processor, new_thread); pset_unlock(pset); return (new_thread); } @@ -1850,7 +2061,7 @@ pick_new_rt_thread: * If other threads have appeared, shortcut * around again. */ - if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0) + if (!SCHED(processor_queue_empty)(processor) || rt_runq_count(pset) > 0) continue; pset_lock(pset); @@ -1863,20 +2074,12 @@ pick_new_rt_thread: * was running. 
*/ if (processor->state == PROCESSOR_RUNNING) { - remqueue((queue_entry_t)processor); - processor->state = PROCESSOR_IDLE; - - if (processor->processor_primary == processor) { - enqueue_head(&pset->idle_queue, (queue_entry_t)processor); - } - else { - enqueue_head(&pset->idle_secondary_queue, (queue_entry_t)processor); - } + pset_update_processor_state(pset, processor, PROCESSOR_IDLE); } #if __SMP__ /* Invoked with pset locked, returns with pset unlocked */ - sched_SMT_balance(processor, pset); + SCHED(processor_balance)(processor, pset); #else pset_unlock(pset); #endif @@ -1931,18 +2134,10 @@ thread_select_idle( uint64_t arg1, arg2; int urgency; - if (thread->sched_mode == TH_MODE_TIMESHARE) { - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_decr(thread); - - sched_share_decr(thread); - } sched_run_decr(thread); thread->state |= TH_IDLE; - processor->current_pri = IDLEPRI; - processor->current_thmode = TH_MODE_NONE; - processor->current_sfi_class = SFI_CLASS_KERNEL; + processor_state_update_idle(procssor); /* Reload precise timing global policy to thread-local policy */ thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); @@ -1959,16 +2154,20 @@ thread_select_idle( #endif thread->last_run_time = processor->last_dispatch; - thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer); + processor_timer_switch_thread(processor->last_dispatch, + &processor->idle_thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer; + /* * Cancel the quantum timer while idling. */ - timer_call_cancel(&processor->quantum_timer); + timer_call_quantum_timer_cancel(&processor->quantum_timer); processor->first_timeslice = FALSE; - (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + if (thread->sched_call) { + (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + } thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL); @@ -1981,7 +2180,9 @@ thread_select_idle( /* * Return at splsched. */ - (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + if (thread->sched_call) { + (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + } thread_lock(thread); @@ -1990,16 +2191,17 @@ thread_select_idle( * Otherwise skip; we will context switch to another thread or return here. 
*/ if (!(thread->state & TH_WAIT)) { - processor->last_dispatch = mach_absolute_time(); - thread_timer_event(processor->last_dispatch, &thread->system_timer); + uint64_t time_now = processor->last_dispatch = mach_absolute_time(); + processor_timer_switch_thread(time_now, &thread->system_timer); + timer_update(&thread->runnable_timer, time_now); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; - thread_quantum_init(thread); - processor->quantum_end = processor->last_dispatch + thread->quantum_remaining; - timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL); + processor->quantum_end = time_now + thread->quantum_remaining; + timer_call_quantum_timer_enter(&processor->quantum_timer, + thread, processor->quantum_end, time_now); processor->first_timeslice = TRUE; - thread->computation_epoch = processor->last_dispatch; + thread->computation_epoch = time_now; } thread->state &= ~TH_IDLE; @@ -2009,12 +2211,6 @@ thread_select_idle( thread_tell_urgency(urgency, arg1, arg2, 0, new_thread); sched_run_incr(thread); - if (thread->sched_mode == TH_MODE_TIMESHARE) { - sched_share_incr(thread); - - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_incr(thread); - } return (new_thread); } @@ -2058,15 +2254,22 @@ thread_invoke( #endif #if defined(CONFIG_SCHED_TIMESHARE_CORE) - sched_timeshare_consider_maintenance(ctime); + if ((thread->state & TH_IDLE) == 0) + sched_timeshare_consider_maintenance(ctime); #endif +#if MONOTONIC + mt_sched_update(self); +#endif /* MONOTONIC */ + + assert_thread_magic(self); assert(self == current_thread()); assert(self->runq == PROCESSOR_NULL); assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN); thread_lock(thread); + assert_thread_magic(thread); assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN); assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor()); assert(thread->runq == PROCESSOR_NULL); @@ -2076,11 +2279,25 @@ thread_invoke( /* Update SFI class based on other factors */ thread->sfi_class = sfi_thread_classify(thread); + + /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */ + thread->same_pri_latency = ctime - thread->last_basepri_change_time; + /* + * In case a base_pri update happened between the timestamp and + * taking the thread lock + */ + if (ctime <= thread->last_basepri_change_time) + thread->same_pri_latency = ctime - thread->last_made_runnable_time; /* Allow realtime threads to hang onto a stack. 
*/ if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) self->reserved_stack = self->kernel_stack; + /* Prepare for spin debugging */ +#if INTERRUPT_MASKED_DEBUG + ml_spin_debug_clear(thread); +#endif + if (continuation != NULL) { if (!thread->kernel_stack) { /* @@ -2099,9 +2316,8 @@ thread_invoke( processor = current_processor(); processor->active_thread = thread; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); + if (thread->last_processor != processor && thread->last_processor != NULL) { if (thread->last_processor->processor_set != processor->processor_set) thread->ps_switch++; @@ -2117,7 +2333,8 @@ thread_invoke( processor->last_dispatch = ctime; self->last_run_time = ctime; - thread_timer_event(ctime, &thread->system_timer); + processor_timer_switch_thread(ctime, &thread->system_timer); + timer_update(&thread->runnable_timer, ctime); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; /* @@ -2125,11 +2342,9 @@ thread_invoke( * during privilege transitions, synthesize an event now. */ if (!thread->precise_user_kernel_time) { - timer_switch(PROCESSOR_DATA(processor, current_state), - ctime, - PROCESSOR_DATA(processor, current_state)); + timer_update(PROCESSOR_DATA(processor, current_state), ctime); } - + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); @@ -2143,24 +2358,36 @@ thread_invoke( SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); +#if KPERF + kperf_off_cpu(self); +#endif /* KPERF */ + TLOG(1, "thread_invoke: calling stack_handoff\n"); stack_handoff(self, thread); /* 'self' is now off core */ - assert(thread == current_thread()); + assert(thread == current_thread_volatile()); DTRACE_SCHED(on__cpu); +#if KPERF + kperf_on_cpu(thread, continuation, NULL); +#endif /* KPERF */ + thread_dispatch(self, thread); +#if KASAN + /* Old thread's stack has been moved to the new thread, so explicitly + * unpoison it. 
*/ + kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); +#endif + thread->continuation = thread->parameter = NULL; counter(c_thread_invoke_hits++); - (void) spllo(); - assert(continuation); - call_continuation(continuation, parameter, thread->wait_result); + call_continuation(continuation, parameter, thread->wait_result, TRUE); /*NOTREACHED*/ } else if (thread == self) { @@ -2170,15 +2397,25 @@ thread_invoke( thread_unlock(self); +#if KPERF + kperf_on_cpu(thread, continuation, NULL); +#endif /* KPERF */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); - self->continuation = self->parameter = NULL; +#if KASAN + /* stack handoff to self - no thread_dispatch(), so clear the stack + * and free the fakestack directly */ + kasan_fakestack_drop(self); + kasan_fakestack_gc(self); + kasan_unpoison_stack(self->kernel_stack, kernel_stack_size); +#endif - (void) spllo(); + self->continuation = self->parameter = NULL; - call_continuation(continuation, parameter, self->wait_result); + call_continuation(continuation, parameter, self->wait_result, TRUE); /*NOTREACHED*/ } } else { @@ -2211,9 +2448,8 @@ need_stack: */ processor = current_processor(); processor->active_thread = thread; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); + if (thread->last_processor != processor && thread->last_processor != NULL) { if (thread->last_processor->processor_set != processor->processor_set) thread->ps_switch++; @@ -2231,7 +2467,8 @@ need_stack: processor->last_dispatch = ctime; self->last_run_time = ctime; - thread_timer_event(ctime, &thread->system_timer); + processor_timer_switch_thread(ctime, &thread->system_timer); + timer_update(&thread->runnable_timer, ctime); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; /* @@ -2239,9 +2476,7 @@ need_stack: * during privilege transitions, synthesize an event now. */ if (!thread->precise_user_kernel_time) { - timer_switch(PROCESSOR_DATA(processor, current_state), - ctime, - PROCESSOR_DATA(processor, current_state)); + timer_update(PROCESSOR_DATA(processor, current_state), ctime); } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -2257,6 +2492,10 @@ need_stack: SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); +#if KPERF + kperf_off_cpu(self); +#endif /* KPERF */ + /* * This is where we actually switch register context, * and address space if required. We will next run @@ -2274,11 +2513,15 @@ need_stack: */ assert(continuation == self->continuation); thread = machine_switch_context(self, continuation, thread); - assert(self == current_thread()); + assert(self == current_thread_volatile()); TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread); DTRACE_SCHED(on__cpu); +#if KPERF + kperf_on_cpu(self, NULL, __builtin_frame_address(0)); +#endif /* KPERF */ + /* * We have been resumed and are set to run. 
*/ @@ -2287,9 +2530,7 @@ need_stack: if (continuation) { self->continuation = self->parameter = NULL; - (void) spllo(); - - call_continuation(continuation, parameter, self->wait_result); + call_continuation(continuation, parameter, self->wait_result, TRUE); /*NOTREACHED*/ } @@ -2316,7 +2557,7 @@ pset_cancel_deferred_dispatch( uint32_t sampled_sched_run_count; pset_lock(pset); - sampled_sched_run_count = (volatile uint32_t) sched_run_count; + sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN]; /* * If we have emptied the run queue, and our current thread is runnable, we @@ -2333,11 +2574,14 @@ pset_cancel_deferred_dispatch( * correct (we won't accidentally have a runnable thread that hasn't been * dispatched to an idle processor), if not ideal (we may be restarting the * dispatch process, which could have some overhead). - * */ - if ((sampled_sched_run_count == 1) && - (pset->pending_deferred_AST_cpu_mask)) { - qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) { + + if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) { + uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] & + pset->pending_deferred_AST_cpu_mask & + ~pset->pending_AST_cpu_mask); + for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) { + active_processor = processor_array[cpuid]; /* * If a processor is DISPATCHING, it could be because of * a cancelable signal. @@ -2359,30 +2603,17 @@ pset_cancel_deferred_dispatch( * should be no different than if the core took some * interrupt while IDLE. */ - if ((active_processor->state == PROCESSOR_DISPATCHING) && - (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) && - (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) && - (active_processor != processor)) { + if (active_processor != processor) { /* * Squash all of the processor state back to some * reasonable facsimile of PROCESSOR_IDLE. - * - * TODO: What queue policy do we actually want here? - * We want to promote selection of a good processor - * to run on. Do we want to enqueue at the head? - * The tail? At the (relative) old position in the - * queue? Or something else entirely? */ - re_queue_head(&pset->idle_queue, (queue_entry_t)active_processor); assert(active_processor->next_thread == THREAD_NULL); - - active_processor->current_pri = IDLEPRI; - active_processor->current_thmode = TH_MODE_FIXED; - active_processor->current_sfi_class = SFI_CLASS_KERNEL; + processor_state_update_idle(active_processor); active_processor->deadline = UINT64_MAX; - active_processor->state = PROCESSOR_IDLE; - pset->pending_deferred_AST_cpu_mask &= ~(1U << active_processor->cpu_id); + pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE); + bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id); machine_signal_idle_cancel(active_processor); } @@ -2395,6 +2626,19 @@ pset_cancel_deferred_dispatch( /* We don't support deferred ASTs; everything is candycanes and sunshine. */ #endif +static void +thread_csw_callout( + thread_t old, + thread_t new, + uint64_t timestamp) +{ + perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH; + uint64_t same_pri_latency = (new->state & TH_IDLE) ? 
0 : new->same_pri_latency; + machine_switch_perfcontrol_context(event, timestamp, 0, + same_pri_latency, old, new); +} + + /* * thread_dispatch: * @@ -2416,10 +2660,42 @@ thread_dispatch( processor_t processor = self->last_processor; assert(processor == current_processor()); - assert(self == current_thread()); + assert(self == current_thread_volatile()); assert(thread != self); if (thread != THREAD_NULL) { + /* + * Do the perfcontrol callout for context switch. + * The reason we do this here is: + * - thread_dispatch() is called from various places that are not + * the direct context switch path for eg. processor shutdown etc. + * So adding the callout here covers all those cases. + * - We want this callout as early as possible to be close + * to the timestamp taken in thread_invoke() + * - We want to avoid holding the thread lock while doing the + * callout + * - We do not want to callout if "thread" is NULL. + */ + thread_csw_callout(thread, self, processor->last_dispatch); + +#if KASAN + if (thread->continuation != NULL) { + /* + * Thread has a continuation and the normal stack is going away. + * Unpoison the stack and mark all fakestack objects as unused. + */ + kasan_fakestack_drop(thread); + if (thread->kernel_stack) { + kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); + } + } + + /* + * Free all unused fakestack objects. + */ + kasan_fakestack_gc(thread); +#endif + /* * If blocked at a continuation, discard * the stack. @@ -2429,8 +2705,9 @@ thread_dispatch( if (thread->state & TH_IDLE) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), 0, thread->state, sched_run_count, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), 0, thread->state, + sched_run_buckets[TH_BUCKET_RUN], 0); } else { int64_t consumed; int64_t remainder = 0; @@ -2446,28 +2723,40 @@ thread_dispatch( * Bill CPU time to both the task and * the individual thread. */ - ledger_credit(thread->t_ledger, - task_ledgers.cpu_time, consumed); - ledger_credit(thread->t_threadledger, - thread_ledgers.cpu_time, consumed); -#ifdef CONFIG_BANK + ledger_credit_thread(thread, thread->t_ledger, + task_ledgers.cpu_time, consumed); + ledger_credit_thread(thread, thread->t_threadledger, + thread_ledgers.cpu_time, consumed); if (thread->t_bankledger) { - ledger_credit(thread->t_bankledger, - bank_ledgers.cpu_time, - (consumed - thread->t_deduct_bank_ledger_time)); - + ledger_credit_thread(thread, thread->t_bankledger, + bank_ledgers.cpu_time, + (consumed - thread->t_deduct_bank_ledger_time)); } - thread->t_deduct_bank_ledger_time =0; -#endif + thread->t_deduct_bank_ledger_time = 0; } wake_lock(thread); thread_lock(thread); /* - * Compute remainder of current quantum. + * Apply a priority floor if the thread holds a kernel resource + * Do this before checking starting_pri to avoid overpenalizing + * repeated rwlock blockers. */ - if (processor->first_timeslice && + if (__improbable(thread->rwlock_count != 0)) + lck_rw_set_promotion_locked(thread); + + boolean_t keep_quantum = processor->first_timeslice; + + /* + * Treat a thread which has dropped priority since it got on core + * as having expired its quantum. + */ + if (processor->starting_pri > thread->sched_pri) + keep_quantum = FALSE; + + /* Compute remainder of current quantum. 
*/ + if (keep_quantum && processor->quantum_end > processor->last_dispatch) thread->quantum_remaining = (uint32_t)remainder; else @@ -2521,46 +2810,44 @@ thread_dispatch( thread->computation_metered += (processor->last_dispatch - thread->computation_epoch); - if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) { - integer_t priority; - - priority = thread->sched_pri; - - if (priority < thread->base_pri) - priority = thread->base_pri; - if (priority < BASEPRI_BACKGROUND) - priority = BASEPRI_BACKGROUND; - - if ((thread->sched_pri < priority) || !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, priority, 0); - - thread->sched_flags |= TH_SFLAG_RW_PROMOTED; - - if (thread->sched_pri < priority) - set_sched_pri(thread, priority); - } - } - if (!(thread->state & TH_WAIT)) { /* * Still runnable. */ - thread->last_made_runnable_time = mach_approximate_time(); - - machine_thread_going_off_core(thread, FALSE); + thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch; + + machine_thread_going_off_core(thread, FALSE, processor->last_dispatch); + + ast_t reason = thread->reason; + sched_options_t options = SCHED_NONE; + + if (reason & AST_REBALANCE) { + options |= SCHED_REBALANCE; + if (reason & AST_QUANTUM) { + /* + * Having gone to the trouble of forcing this thread off a less preferred core, + * we should force the preferable core to reschedule immediately to give this + * thread a chance to run instead of just sitting on the run queue where + * it may just be stolen back by the idle core we just forced it off. + * But only do this at the end of a quantum to prevent cascading effects. 
+ */ + options |= SCHED_PREEMPT; + } + } - if (thread->reason & AST_QUANTUM) - thread_setrun(thread, SCHED_TAILQ); - else if (thread->reason & AST_PREEMPT) - thread_setrun(thread, SCHED_HEADQ); + if (reason & AST_QUANTUM) + options |= SCHED_TAILQ; + else if (reason & AST_PREEMPT) + options |= SCHED_HEADQ; else - thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); + options |= (SCHED_PREEMPT | SCHED_TAILQ); + + thread_setrun(thread, options); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_count, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->reason, thread->state, + sched_run_buckets[TH_BUCKET_RUN], 0); if (thread->wake_active) { thread->wake_active = FALSE; @@ -2578,43 +2865,43 @@ thread_dispatch( */ boolean_t should_terminate = FALSE; uint32_t new_run_count; + int thread_state = thread->state; /* Only the first call to thread_dispatch * after explicit termination should add * the thread to the termination queue */ - if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) { + if ((thread_state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) { should_terminate = TRUE; - thread->state |= TH_TERMINATE2; + thread_state |= TH_TERMINATE2; } - thread->state &= ~TH_RUN; - thread->last_made_runnable_time = ~0ULL; - thread->chosen_processor = PROCESSOR_NULL; + timer_stop(&thread->runnable_timer, processor->last_dispatch); - if (thread->sched_mode == TH_MODE_TIMESHARE) { - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_decr(thread); + thread_state &= ~TH_RUN; + thread->state = thread_state; + + thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE; + thread->chosen_processor = PROCESSOR_NULL; - sched_share_decr(thread); - } new_run_count = sched_run_decr(thread); #if CONFIG_SCHED_SFI - if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) { - if (thread->reason & AST_SFI) { - thread->wait_sfi_begin_time = processor->last_dispatch; - } + if (thread->reason & AST_SFI) { + thread->wait_sfi_begin_time = processor->last_dispatch; } #endif - machine_thread_going_off_core(thread, should_terminate); + machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->reason, thread->state, new_run_count, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->reason, thread_state, + new_run_count, 0); - (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + if (thread_state & TH_WAIT_REPORT) { + (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + } if (thread->wake_active) { thread->wake_active = FALSE; @@ -2633,12 +2920,14 @@ thread_dispatch( } } + int urgency = THREAD_URGENCY_NONE; + uint64_t latency = 0; + /* Update (new) current thread and reprogram quantum timer */ thread_lock(self); + if (!(self->state & TH_IDLE)) { uint64_t arg1, arg2; - int urgency; - uint64_t latency; #if CONFIG_SCHED_SFI ast_t new_ast; @@ -2650,15 +2939,19 @@ thread_dispatch( } #endif - assert(processor->last_dispatch >= self->last_made_runnable_time); + assertf(processor->last_dispatch >= self->last_made_runnable_time, + "Non-monotonic time? 
dispatch at 0x%llx, runnable at 0x%llx", + processor->last_dispatch, self->last_made_runnable_time); + + assert(self->last_made_runnable_time <= self->last_basepri_change_time); + latency = processor->last_dispatch - self->last_made_runnable_time; + assert(latency >= self->same_pri_latency); urgency = thread_get_urgency(self, &arg1, &arg2); thread_tell_urgency(urgency, arg1, arg2, latency, self); - machine_thread_going_on_core(self, urgency, latency); - /* * Get a new quantum if none remaining. */ @@ -2670,33 +2963,37 @@ thread_dispatch( * Set up quantum timer and timeslice. */ processor->quantum_end = processor->last_dispatch + self->quantum_remaining; - timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL); + timer_call_quantum_timer_enter(&processor->quantum_timer, self, + processor->quantum_end, processor->last_dispatch); processor->first_timeslice = TRUE; } else { - timer_call_cancel(&processor->quantum_timer); + timer_call_quantum_timer_cancel(&processor->quantum_timer); processor->first_timeslice = FALSE; thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self); - machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0); } + assert(self->block_hint == kThreadWaitNone); self->computation_epoch = processor->last_dispatch; self->reason = AST_NONE; + processor->starting_pri = self->sched_pri; thread_unlock(self); + machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency, + processor->last_dispatch); + #if defined(CONFIG_SCHED_DEFERRED_AST) /* * TODO: Can we state that redispatching our old thread is also * uninteresting? */ - if ((((volatile uint32_t)sched_run_count) == 1) && + if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) && !(self->state & TH_IDLE)) { pset_cancel_deferred_dispatch(processor->processor_set, processor); } #endif - } /* @@ -2736,6 +3033,14 @@ thread_block_reason( /* We're handling all scheduling AST's */ ast_off(AST_SCHEDULING); +#if PROC_REF_DEBUG + if ((continuation != NULL) && (self->task != kernel_task)) { + if (uthread_get_proc_refcount(self->uthread) != 0) { + panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread); + } + } +#endif + self->continuation = continuation; self->parameter = parameter; @@ -2747,7 +3052,7 @@ thread_block_reason( do { thread_lock(self); - new_thread = thread_select(self, processor, reason); + new_thread = thread_select(self, processor, &reason); thread_unlock(self); } while (!thread_invoke(self, new_thread, reason)); @@ -2793,18 +3098,20 @@ thread_run( void *parameter, thread_t new_thread) { - ast_t handoff = AST_HANDOFF; + ast_t reason = AST_HANDOFF; self->continuation = continuation; self->parameter = parameter; - while (!thread_invoke(self, new_thread, handoff)) { - processor_t processor = current_processor(); + while (!thread_invoke(self, new_thread, reason)) { + /* the handoff failed, so we have to fall back to the normal block path */ + processor_t processor = current_processor(); + + reason = AST_NONE; thread_lock(self); - new_thread = thread_select(self, processor, AST_NONE); + new_thread = thread_select(self, processor, &reason); thread_unlock(self); - handoff = AST_NONE; } return (self->wait_result); @@ -2829,15 +3136,23 @@ thread_continue( continuation = self->continuation; parameter = self->parameter; +#if KPERF + kperf_on_cpu(self, continuation, NULL); +#endif + thread_dispatch(thread, self); self->continuation = self->parameter = NULL; - if (thread != THREAD_NULL) - 
(void)spllo(); +#if INTERRUPT_MASKED_DEBUG + /* Reset interrupt-masked spin debugging timeout */ + ml_spin_debug_clear(self); +#endif - TLOG(1, "thread_continue: calling call_continuation \n"); - call_continuation(continuation, parameter, self->wait_result); + TLOG(1, "thread_continue: calling call_continuation\n"); + + boolean_t enable_interrupts = thread != THREAD_NULL; + call_continuation(continuation, parameter, self->wait_result, enable_interrupts); /*NOTREACHED*/ } @@ -2854,10 +3169,10 @@ thread_quantum_init(thread_t thread) uint32_t sched_timeshare_initial_quantum_size(thread_t thread) { - if ((thread == THREAD_NULL) || !(thread->sched_flags & TH_SFLAG_THROTTLED)) - return std_quantum; - else + if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) return bg_quantum; + else + return std_quantum; } /* @@ -2869,14 +3184,11 @@ void run_queue_init( run_queue_t rq) { - int i; - - rq->highq = IDLEPRI; - for (i = 0; i < NRQBM; i++) + rq->highq = NOPRI; + for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) rq->bitmap[i] = 0; - setbit(MAXPRI - IDLEPRI, rq->bitmap); rq->urgency = rq->count = 0; - for (i = 0; i < NRQS; i++) + for (int i = 0; i < NRQS; i++) queue_init(&rq->queues[i]); } @@ -2891,19 +3203,21 @@ run_queue_init( */ thread_t run_queue_dequeue( - run_queue_t rq, - integer_t options) + run_queue_t rq, + integer_t options) { - thread_t thread; - queue_t queue = rq->queues + rq->highq; + thread_t thread; + queue_t queue = &rq->queues[rq->highq]; if (options & SCHED_HEADQ) { - thread = (thread_t)dequeue_head(queue); - } - else { - thread = (thread_t)dequeue_tail(queue); + thread = qe_dequeue_head(queue, struct thread, runq_links); + } else { + thread = qe_dequeue_tail(queue, struct thread, runq_links); } + assert(thread != THREAD_NULL); + assert_thread_magic(thread); + thread->runq = PROCESSOR_NULL; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; @@ -2911,12 +3225,11 @@ run_queue_dequeue( rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { - if (rq->highq != IDLEPRI) - clrbit(MAXPRI - rq->highq, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, rq->highq); + rq->highq = bitmap_first(rq->bitmap, NRQS); } - return (thread); + return thread; } /* @@ -2929,34 +3242,35 @@ run_queue_dequeue( */ boolean_t run_queue_enqueue( - run_queue_t rq, - thread_t thread, - integer_t options) + run_queue_t rq, + thread_t thread, + integer_t options) { - queue_t queue = rq->queues + thread->sched_pri; - boolean_t result = FALSE; - + queue_t queue = &rq->queues[thread->sched_pri]; + boolean_t result = FALSE; + + assert_thread_magic(thread); + if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)thread); - - setbit(MAXPRI - thread->sched_pri, rq->bitmap); + enqueue_tail(queue, &thread->runq_links); + + rq_bitmap_set(rq->bitmap, thread->sched_pri); if (thread->sched_pri > rq->highq) { rq->highq = thread->sched_pri; result = TRUE; } } else { if (options & SCHED_TAILQ) - enqueue_tail(queue, (queue_entry_t)thread); + enqueue_tail(queue, &thread->runq_links); else - enqueue_head(queue, (queue_entry_t)thread); + enqueue_head(queue, &thread->runq_links); } if (SCHED(priority_is_urgent)(thread->sched_pri)) rq->urgency++; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count++; - + return (result); - } /* @@ -2968,47 +3282,55 @@ run_queue_enqueue( */ void run_queue_remove( - run_queue_t rq, - thread_t thread) + run_queue_t rq, + thread_t thread) { + assert(thread->runq != PROCESSOR_NULL); + 
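/*
 * Illustrative aside (not part of the diff): a minimal sketch of the bitmap-indexed
 * run queue idea used by run_queue_enqueue()/run_queue_dequeue() above. One bit per
 * priority level records whether that level has runnable threads, and the highest
 * runnable priority is recovered with a find-first-set over the bitmap. A simple
 * per-priority count stands in for the kernel's circular queues.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EX_NRQS 64   /* one 64-bit word of priorities keeps the sketch simple */

struct ex_runq {
    uint64_t bitmap;          /* bit p set <=> at least one thread at priority p */
    int      count[EX_NRQS];  /* stand-in for the per-priority queues */
    int      highq;           /* highest runnable priority, -1 when empty */
};

static void ex_enqueue(struct ex_runq *rq, int pri)
{
    rq->bitmap |= 1ULL << pri;
    rq->count[pri]++;
    if (pri > rq->highq)
        rq->highq = pri;
}

static int ex_dequeue_highest(struct ex_runq *rq)
{
    int pri = rq->highq;
    assert(pri >= 0 && rq->count[pri] > 0);
    if (--rq->count[pri] == 0) {
        rq->bitmap &= ~(1ULL << pri);
        /* recompute highq: index of the most significant set bit, or -1 */
        rq->highq = rq->bitmap ? 63 - __builtin_clzll(rq->bitmap) : -1;
    }
    return pri;
}

int main(void)
{
    struct ex_runq rq = { .highq = -1 };
    ex_enqueue(&rq, 31);
    ex_enqueue(&rq, 47);
    int first = ex_dequeue_highest(&rq);
    int second = ex_dequeue_highest(&rq);
    printf("dequeued pri %d, then %d\n", first, second);
    return 0;
}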
assert_thread_magic(thread); - remqueue((queue_entry_t)thread); + remqueue(&thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (SCHED(priority_is_urgent)(thread->sched_pri)) { rq->urgency--; assert(rq->urgency >= 0); } - - if (queue_empty(rq->queues + thread->sched_pri)) { + + if (queue_empty(&rq->queues[thread->sched_pri])) { /* update run queue status */ - if (thread->sched_pri != IDLEPRI) - clrbit(MAXPRI - thread->sched_pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, thread->sched_pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } - + thread->runq = PROCESSOR_NULL; } /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */ void -rt_runq_scan(sched_update_scan_context_t scan_context) +sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context) { spl_t s; thread_t thread; + processor_set_t pset = &pset0; + s = splsched(); - rt_lock_lock(); + rt_lock_lock(pset); - qe_foreach_element_safe(thread, &rt_runq.queue, links) { + qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) { if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; } } - rt_lock_unlock(); + rt_lock_unlock(pset); splx(s); } +int64_t +sched_rtglobal_runq_count_sum(void) +{ + return pset0.rt_runq.runq_stats.count_sum; +} /* * realtime_queue_insert: @@ -3016,43 +3338,41 @@ rt_runq_scan(sched_update_scan_context_t scan_context) * Enqueue a thread for realtime execution. */ static boolean_t -realtime_queue_insert( - thread_t thread) +realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread) { - queue_t queue = &rt_runq.queue; - uint64_t deadline = thread->realtime.deadline; - boolean_t preempt = FALSE; + queue_t queue = &SCHED(rt_runq)(pset)->queue; + uint64_t deadline = thread->realtime.deadline; + boolean_t preempt = FALSE; - rt_lock_lock(); + rt_lock_lock(pset); if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)thread); + enqueue_tail(queue, &thread->runq_links); preempt = TRUE; - } - else { - register thread_t entry = (thread_t)queue_first(queue); - - while (TRUE) { - if ( queue_end(queue, (queue_entry_t)entry) || - deadline < entry->realtime.deadline ) { - entry = (thread_t)queue_prev((queue_entry_t)entry); + } else { + /* Insert into rt_runq in thread deadline order */ + queue_entry_t iter; + qe_foreach(iter, queue) { + thread_t iter_thread = qe_element(iter, struct thread, runq_links); + assert_thread_magic(iter_thread); + + if (deadline < iter_thread->realtime.deadline) { + if (iter == queue_first(queue)) + preempt = TRUE; + insque(&thread->runq_links, queue_prev(iter)); + break; + } else if (iter == queue_last(queue)) { + enqueue_tail(queue, &thread->runq_links); break; } - - entry = (thread_t)queue_next((queue_entry_t)entry); } - - if ((queue_entry_t)entry == queue) - preempt = TRUE; - - insque((queue_entry_t)thread, (queue_entry_t)entry); } - thread->runq = THREAD_ON_RT_RUNQ; - SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); - rt_runq.count++; + thread->runq = processor; + SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset)); + rt_runq_count_incr(pset); - rt_lock_unlock(); + rt_lock_unlock(pset); return (preempt); } @@ -3070,10 +3390,11 @@ realtime_setrun( processor_t processor, thread_t thread) { - processor_set_t pset = processor->processor_set; - ast_t preempt; + processor_set_t pset = processor->processor_set; + 
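/*
 * Illustrative aside (not part of the diff): a minimal sketch of the earliest-deadline
 * insertion used by realtime_queue_insert() above. Walk the queue in deadline order,
 * insert before the first entry with a later deadline, and report whether the new
 * thread became the head so the caller knows a preemption check is warranted. A plain
 * singly linked list stands in for the kernel's queue primitives.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ex_rt_thread {
    uint64_t deadline;
    struct ex_rt_thread *next;
};

/* Insert in ascending deadline order; return true if the new entry now has the
 * earliest deadline (equal deadlines keep their existing order). */
static bool ex_rt_insert(struct ex_rt_thread **head, struct ex_rt_thread *t)
{
    struct ex_rt_thread **link = head;

    while (*link != NULL && (*link)->deadline <= t->deadline)
        link = &(*link)->next;

    t->next = *link;
    *link = t;
    return link == head;
}

int main(void)
{
    struct ex_rt_thread *q = NULL;
    struct ex_rt_thread a = { .deadline = 300 }, b = { .deadline = 100 }, c = { .deadline = 200 };

    printf("a head? %d\n", ex_rt_insert(&q, &a)); /* 1: queue was empty */
    printf("b head? %d\n", ex_rt_insert(&q, &b)); /* 1: earlier deadline than a */
    printf("c head? %d\n", ex_rt_insert(&q, &c)); /* 0: lands between b and a */
    return 0;
}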
pset_assert_locked(pset); + ast_t preempt; - boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE; + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; thread->chosen_processor = processor; @@ -3085,28 +3406,15 @@ realtime_setrun( */ if ( (thread->bound_processor == processor) && processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = thread; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); - if (processor != current_processor()) { - if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared on exit from main processor_idle() loop */ - pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_signal_idle = TRUE; - } - } + ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); pset_unlock(pset); - - if (do_signal_idle) { - machine_signal_idle(processor); - } + sched_ipi_perform(processor, ipi_type); return; } @@ -3117,43 +3425,30 @@ realtime_setrun( else preempt = AST_NONE; - realtime_queue_insert(thread); + realtime_queue_insert(processor, pset, thread); + ipi_type = SCHED_IPI_NONE; if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = THREAD_NULL; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); if (processor == current_processor()) { ast_on(preempt); } else { - if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared on exit from main processor_idle() loop */ - pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_signal_idle = TRUE; - } + ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT); } } else if (processor->state == PROCESSOR_DISPATCHING) { if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) { - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; } } else { if (processor == current_processor()) { ast_on(preempt); } else { - if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared after IPI causes csw_check() to be called */ - pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_cause_ast = TRUE; - } + ipi_type = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT); } } } else { @@ -3161,21 +3456,123 @@ realtime_setrun( } pset_unlock(pset); + sched_ipi_perform(processor, ipi_type); +} - if (do_signal_idle) { - machine_signal_idle(processor); - } else if (do_cause_ast) { - cause_ast_check(processor); - } + +sched_ipi_type_t sched_ipi_deferred_policy(processor_set_t 
pset, processor_t dst, + __unused sched_ipi_event_t event) +{ +#if defined(CONFIG_SCHED_DEFERRED_AST) + if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) { + return SCHED_IPI_DEFERRED; + } +#else /* CONFIG_SCHED_DEFERRED_AST */ + panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id); +#endif /* CONFIG_SCHED_DEFERRED_AST */ + return SCHED_IPI_NONE; +} + +sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event) +{ + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + assert(dst != NULL); + + processor_set_t pset = dst->processor_set; + if (current_processor() == dst) { + return SCHED_IPI_NONE; + } + + if (bit_test(pset->pending_AST_cpu_mask, dst->cpu_id)) { + return SCHED_IPI_NONE; + } + + ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event); + switch(ipi_type) { + case SCHED_IPI_NONE: + return SCHED_IPI_NONE; +#if defined(CONFIG_SCHED_DEFERRED_AST) + case SCHED_IPI_DEFERRED: + bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id); + break; +#endif /* CONFIG_SCHED_DEFERRED_AST */ + default: + bit_set(pset->pending_AST_cpu_mask, dst->cpu_id); + break; + } + return ipi_type; +} + +sched_ipi_type_t sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event) +{ + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + boolean_t deferred_ipi_supported = false; + processor_set_t pset = dst->processor_set; + +#if defined(CONFIG_SCHED_DEFERRED_AST) + deferred_ipi_supported = true; +#endif /* CONFIG_SCHED_DEFERRED_AST */ + + switch(event) { + case SCHED_IPI_EVENT_SPILL: + case SCHED_IPI_EVENT_SMT_REBAL: + case SCHED_IPI_EVENT_REBALANCE: + case SCHED_IPI_EVENT_BOUND_THR: + /* + * The spill, SMT rebalance, rebalance and the bound thread + * scenarios use immediate IPIs always. + */ + ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; + break; + case SCHED_IPI_EVENT_PREEMPT: + /* In the preemption case, use immediate IPIs for RT threads */ + if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) { + ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; + break; + } + + /* + * For Non-RT threads preemption, + * If the core is active, use immediate IPIs. + * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI. + */ + if (deferred_ipi_supported && dst_idle) { + return sched_ipi_deferred_policy(pset, dst, event); + } + ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; + break; + default: + panic("Unrecognized scheduler IPI event type %d", event); + } + assert(ipi_type != SCHED_IPI_NONE); + return ipi_type; } +void sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi) +{ + switch (ipi) { + case SCHED_IPI_NONE: + break; + case SCHED_IPI_IDLE: + machine_signal_idle(dst); + break; + case SCHED_IPI_IMMEDIATE: + cause_ast_check(dst); + break; + case SCHED_IPI_DEFERRED: + machine_signal_idle_deferred(dst); + break; + default: + panic("Unrecognized scheduler IPI type: %d", ipi); + } +} #if defined(CONFIG_SCHED_TIMESHARE_CORE) boolean_t priority_is_urgent(int priority) { - return testbit(priority, sched_preempt_pri) ? TRUE : FALSE; + return bitmap_test(sched_preempt_pri, priority) ? 
TRUE : FALSE; } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -3195,12 +3592,12 @@ processor_setrun( thread_t thread, integer_t options) { - processor_set_t pset = processor->processor_set; - ast_t preempt; + processor_set_t pset = processor->processor_set; + pset_assert_locked(pset); + ast_t preempt; enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing; - enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal; - boolean_t do_cause_ast = FALSE; + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; thread->chosen_processor = processor; @@ -3210,28 +3607,15 @@ processor_setrun( if ( (SCHED(direct_dispatch_to_idle_processors) || thread->bound_processor == processor) && processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = thread; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; - processor->state = PROCESSOR_DISPATCHING; - - if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared on exit from main processor_idle() loop */ - pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_signal_idle = eDoSignal; - } + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); + ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); pset_unlock(pset); - - if (do_signal_idle == eDoSignal) { - machine_signal_idle(processor); - } - + sched_ipi_perform(processor, ipi_type); return; } @@ -3254,25 +3638,29 @@ processor_setrun( } else preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; + if ((options & (SCHED_PREEMPT|SCHED_REBALANCE)) == (SCHED_PREEMPT|SCHED_REBALANCE)) { + /* + * Having gone to the trouble of forcing this thread off a less preferred core, + * we should force the preferable core to reschedule immediately to give this + * thread a chance to run instead of just sitting on the run queue where + * it may just be stolen back by the idle core we just forced it off. 
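/*
 * Illustrative aside (not part of the diff): a minimal sketch of the IPI coalescing
 * idea behind sched_ipi_action() above. Before signalling a CPU, its bit in a pending
 * mask is tested and set, and the IPI is skipped if one is already outstanding; the
 * target clears its bit when it handles the request. C11 atomics are used here for a
 * self-contained demo, whereas the kernel manipulates the mask under the pset lock.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t ex_pending_ast_mask;  /* one bit per CPU */

/* Returns true if this caller should actually send the IPI,
 * false if one is already outstanding for that CPU. */
static bool ex_mark_ipi_pending(int cpu_id)
{
    uint64_t bit = 1ULL << cpu_id;
    uint64_t old = atomic_fetch_or(&ex_pending_ast_mask, bit);
    return (old & bit) == 0;
}

/* Called on behalf of the target CPU once it has acted on the request. */
static void ex_ack_ipi(int cpu_id)
{
    atomic_fetch_and(&ex_pending_ast_mask, ~(1ULL << cpu_id));
}

int main(void)
{
    printf("first request sends IPI: %d\n", ex_mark_ipi_pending(3));  /* 1 */
    printf("second request coalesced: %d\n", ex_mark_ipi_pending(3)); /* 0 */
    ex_ack_ipi(3);
    printf("after ack, sends again: %d\n", ex_mark_ipi_pending(3));   /* 1 */
    return 0;
}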
+ */ + preempt |= AST_PREEMPT; + } + SCHED(processor_enqueue)(processor, thread, options); + sched_update_pset_load_average(pset); if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = THREAD_NULL; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; - processor->state = PROCESSOR_DISPATCHING; - + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_action = eExitIdle; } else if ( processor->state == PROCESSOR_DISPATCHING) { if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) { - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; } } else if ( (processor->state == PROCESSOR_RUNNING || @@ -3288,82 +3676,28 @@ processor_setrun( if (processor->state == PROCESSOR_SHUTDOWN && thread->sched_pri >= processor->current_pri ) { ipi_action = eInterruptRunning; - } else if ( processor->state == PROCESSOR_IDLE && - processor != current_processor() ) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + } else if (processor->state == PROCESSOR_IDLE) { + processor->next_thread = THREAD_NULL; - processor->current_pri = thread->sched_pri; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class; + processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_action = eExitIdle; } } - switch (ipi_action) { - case eDoNothing: - break; - case eExitIdle: - if (processor == current_processor()) { - if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE) - ast_on(preempt); - } else { -#if defined(CONFIG_SCHED_DEFERRED_AST) - if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) && - !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared on exit from main processor_idle() loop */ - pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_signal_idle = eDoDeferredSignal; - } -#else - if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared on exit from main processor_idle() loop */ - pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_signal_idle = eDoSignal; - } -#endif - } - break; - case eInterruptRunning: - if (processor == current_processor()) { - if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE) - ast_on(preempt); - } else { - if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) { - /* cleared after IPI causes csw_check() to be called */ - pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id); - do_cause_ast = TRUE; - } - } - break; + if (ipi_action != eDoNothing) { + if (processor == current_processor()) { + if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE) + ast_on(preempt); + } else { + sched_ipi_event_t event = (options & SCHED_REBALANCE) ? 
SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT; + ipi_type = sched_ipi_action(processor, thread, (ipi_action == eExitIdle), event); + } } - pset_unlock(pset); - - if (do_signal_idle == eDoSignal) { - machine_signal_idle(processor); - } -#if defined(CONFIG_SCHED_DEFERRED_AST) - else if (do_signal_idle == eDoDeferredSignal) { - /* - * TODO: The ability to cancel this signal could make - * sending it outside of the pset lock an issue. Do - * we need to address this? Or would the only fallout - * be that the core takes a signal? As long as we do - * not run the risk of having a core marked as signal - * outstanding, with no real signal outstanding, the - * only result should be that we fail to cancel some - * signals. - */ - machine_signal_idle_deferred(processor); - } -#endif - else if (do_cause_ast) { - cause_ast_check(processor); - } + sched_ipi_perform(processor, ipi_type); } /* @@ -3402,12 +3736,15 @@ choose_next_pset( */ processor_t choose_processor( - processor_set_t pset, - processor_t processor, - thread_t thread) + processor_set_t starting_pset, + processor_t processor, + thread_t thread) { - processor_set_t nset, cset = pset; - + processor_set_t pset = starting_pset; + processor_set_t nset; + + assert(thread->sched_pri <= BASEPRI_RTQUEUES); + /* * Prefer the hinted processor, when appropriate. */ @@ -3458,7 +3795,6 @@ choose_processor( * the "least cost idle" processor above. */ return (processor); - break; case PROCESSOR_RUNNING: case PROCESSOR_DISPATCHING: /* @@ -3493,12 +3829,14 @@ choose_processor( */ integer_t lowest_priority = MAXPRI + 1; + integer_t lowest_secondary_priority = MAXPRI + 1; integer_t lowest_unpaired_primary_priority = MAXPRI + 1; integer_t lowest_count = INT_MAX; uint64_t furthest_deadline = 1; processor_t lp_processor = PROCESSOR_NULL; processor_t lp_unpaired_primary_processor = PROCESSOR_NULL; processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL; + processor_t lp_paired_secondary_processor = PROCESSOR_NULL; processor_t lc_processor = PROCESSOR_NULL; processor_t fd_processor = PROCESSOR_NULL; @@ -3519,30 +3857,45 @@ choose_processor( } do { - /* * Choose an idle processor, in pset traversal order */ - qe_foreach_element(processor, &cset->idle_queue, processor_queue) { - if (processor->is_recommended) - return processor; + + uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & + pset->primary_map & + pset->recommended_bitmask & + ~pset->pending_AST_cpu_mask); + + int cpuid = lsb_first(idle_primary_map); + if (cpuid >= 0) { + processor = processor_array[cpuid]; + return processor; } /* - * Otherwise, enumerate active and idle processors to find candidates + * Otherwise, enumerate active and idle processors to find primary candidates * with lower priority/etc. 
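/*
 * Illustrative aside (not part of the diff): a minimal sketch of the bitmask-based
 * idle-processor pick used by choose_processor() above. The per-state CPU map is
 * intersected with the primary, recommended, and not-already-signalled masks, and
 * the lowest set bit of the result is the chosen CPU. The masks below are hand-rolled
 * stand-ins for the kernel's pset fields.
 */
#include <stdint.h>
#include <stdio.h>

/* Lowest set bit index, or -1 if the mask is empty (the role of lsb_first()). */
static int ex_lsb_first(uint64_t map)
{
    return map ? __builtin_ctzll(map) : -1;
}

int main(void)
{
    uint64_t idle_map        = 0x00F0;  /* CPUs 4-7 idle */
    uint64_t primary_map     = 0x0055;  /* even CPUs are SMT primaries */
    uint64_t recommended_map = 0x00FF;  /* CPUs 0-7 recommended */
    uint64_t pending_ast     = 0x0010;  /* CPU 4 already has a signal outstanding */

    uint64_t candidates = idle_map & primary_map & recommended_map & ~pending_ast;

    printf("chosen idle primary CPU: %d\n", ex_lsb_first(candidates)); /* expect 6 */
    return 0;
}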
*/ - qe_foreach_element(processor, &cset->active_queue, processor_queue) { - - if (!processor->is_recommended) { - continue; - } + uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) & + pset->recommended_bitmask & + ~pset->pending_AST_cpu_mask); + active_map = bit_ror64(active_map, (pset->last_chosen + 1)); + for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) { + cpuid = ((rotid + pset->last_chosen + 1) & 63); + processor = processor_array[cpuid]; integer_t cpri = processor->current_pri; - if (cpri < lowest_priority) { - lowest_priority = cpri; - lp_processor = processor; + if (processor->processor_primary != processor) { + if (cpri < lowest_secondary_priority) { + lowest_secondary_priority = cpri; + lp_paired_secondary_processor = processor; + } + } else { + if (cpri < lowest_priority) { + lowest_priority = cpri; + lp_processor = processor; + } } if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) { @@ -3561,14 +3914,23 @@ choose_processor( * For SMT configs, these idle secondary processors must have active primary. Otherwise * the idle primary would have short-circuited the loop above */ - qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) { + uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & + ~pset->primary_map & + pset->recommended_bitmask & + ~pset->pending_AST_cpu_mask); - if (!processor->is_recommended) { - continue; - } + for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) { + processor = processor_array[cpuid]; processor_t cprimary = processor->processor_primary; + if (!cprimary->is_recommended) { + continue; + } + if (bit_test(pset->pending_AST_cpu_mask, cprimary->cpu_id)) { + continue; + } + /* If the primary processor is offline or starting up, it's not a candidate for this path */ if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) { integer_t primary_pri = cprimary->current_pri; @@ -3592,15 +3954,17 @@ choose_processor( */ if (thread->sched_pri > lowest_unpaired_primary_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor); + pset->last_chosen = lp_unpaired_primary_processor->cpu_id; return lp_unpaired_primary_processor; } if (thread->sched_pri > lowest_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor); + pset->last_chosen = lp_processor->cpu_id; return lp_processor; } + if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) { + pset->last_chosen = lp_paired_secondary_processor->cpu_id; + return lp_paired_secondary_processor; + } if (thread->realtime.deadline < furthest_deadline) return fd_processor; @@ -3613,13 +3977,11 @@ choose_processor( else { if (thread->sched_pri > lowest_unpaired_primary_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor); + pset->last_chosen = lp_unpaired_primary_processor->cpu_id; return lp_unpaired_primary_processor; } if (thread->sched_pri > lowest_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor); + pset->last_chosen = 
lp_processor->cpu_id; return lp_processor; } @@ -3633,15 +3995,15 @@ choose_processor( /* * Move onto the next processor set. */ - nset = next_pset(cset); + nset = next_pset(pset); - if (nset != pset) { - pset_unlock(cset); + if (nset != starting_pset) { + pset_unlock(pset); - cset = nset; - pset_lock(cset); + pset = nset; + pset_lock(pset); } - } while (nset != pset); + } while (nset != starting_pset); /* * Make sure that we pick a running processor, @@ -3660,6 +4022,9 @@ choose_processor( if (lp_unpaired_secondary_processor != PROCESSOR_NULL) { processor = lp_unpaired_secondary_processor; lp_unpaired_secondary_processor = PROCESSOR_NULL; + } else if (lp_paired_secondary_processor != PROCESSOR_NULL) { + processor = lp_paired_secondary_processor; + lp_paired_secondary_processor = PROCESSOR_NULL; } else if (lc_processor != PROCESSOR_NULL) { processor = lc_processor; lc_processor = PROCESSOR_NULL; @@ -3676,10 +4041,10 @@ choose_processor( * Check that the correct processor set is * returned locked. */ - if (cset != processor->processor_set) { - pset_unlock(cset); - cset = processor->processor_set; - pset_lock(cset); + if (pset != processor->processor_set) { + pset_unlock(pset); + pset = processor->processor_set; + pset_lock(pset); } /* @@ -3694,7 +4059,8 @@ choose_processor( } while (processor == PROCESSOR_NULL); - return (processor); + pset->last_chosen = processor->cpu_id; + return processor; } /* @@ -3740,6 +4106,7 @@ thread_setrun( pset_lock(pset); processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread); + pset = processor->processor_set; SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0); @@ -3751,6 +4118,7 @@ thread_setrun( pset = processor->processor_set; pset_lock(pset); processor = SCHED(choose_processor)(pset, processor, thread); + pset = processor->processor_set; SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0); @@ -3771,7 +4139,8 @@ thread_setrun( pset_lock(pset); processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread); - task->pset_hint = processor->processor_set; + pset = processor->processor_set; + task->pset_hint = pset; SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0); @@ -3801,10 +4170,15 @@ thread_setrun( * Dispatch the thread on the chosen processor. 
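/*
 * Illustrative aside (not part of the diff): a minimal sketch of the rotate-then-scan
 * trick choose_processor() uses when walking the active CPUs above. Rotating the
 * candidate mask right by (last_chosen + 1) makes a low-bit-first scan start just
 * after the previously chosen CPU, giving round-robin order. bit_ror64 and last_chosen
 * are the kernel's names; the helpers below are local stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ex_ror64(uint64_t map, unsigned shift)
{
    shift &= 63;
    return shift ? (map >> shift) | (map << (64 - shift)) : map;
}

/* Visit the set bits of 'active_map' starting just after 'last_chosen', wrapping. */
static void ex_rotated_scan(uint64_t active_map, int last_chosen)
{
    uint64_t rotated = ex_ror64(active_map, (unsigned)(last_chosen + 1));

    for (uint64_t m = rotated; m != 0; m &= m - 1) {
        int rotid = __builtin_ctzll(m);
        int cpuid = (rotid + last_chosen + 1) & 63;
        printf("consider CPU %d\n", cpuid);
    }
}

int main(void)
{
    /* CPUs 1, 3 and 5 active; CPU 3 was chosen last time,
     * so the scan visits 5, then 1, then 3. */
    ex_rotated_scan((1ULL << 1) | (1ULL << 3) | (1ULL << 5), 3);
    return 0;
}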
* TODO: This should be based on sched_mode, not sched_pri */ - if (thread->sched_pri >= BASEPRI_RTQUEUES) + if (thread->sched_pri >= BASEPRI_RTQUEUES) { realtime_setrun(processor, thread); - else + } else { processor_setrun(processor, thread, options); + } + /* pset is now unlocked */ + if (thread->bound_processor == PROCESSOR_NULL) { + SCHED(check_spill)(pset, thread); + } } processor_set_t @@ -3836,7 +4210,7 @@ csw_check( pset_lock(pset); /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */ - pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id); + bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id); result = csw_check_locked(processor, pset, check_reason); @@ -3852,18 +4226,18 @@ csw_check( ast_t csw_check_locked( processor_t processor, - processor_set_t pset __unused, + processor_set_t pset, ast_t check_reason) { ast_t result; thread_t thread = processor->active_thread; if (processor->first_timeslice) { - if (rt_runq.count > 0) + if (rt_runq_count(pset) > 0) return (check_reason | AST_PREEMPT | AST_URGENT); } else { - if (rt_runq.count > 0) { + if (rt_runq_count(pset) > 0) { if (BASEPRI_RTQUEUES > processor->current_pri) return (check_reason | AST_PREEMPT | AST_URGENT); else @@ -3871,19 +4245,30 @@ csw_check_locked( } } +#if __SMP__ + /* + * If the current thread is running on a processor that is no longer recommended, + * urgently preempt it, at which point thread_select() should + * try to idle the processor and re-dispatch the thread to a recommended processor. + */ + if (!processor->is_recommended) { + return (check_reason | AST_PREEMPT | AST_URGENT); + } +#endif + result = SCHED(processor_csw_check)(processor); if (result != AST_NONE) return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE)); #if __SMP__ - /* - * If the current thread is running on a processor that is no longer recommended, gently - * (non-urgently) get to a point and then block, and which point thread_select() should - * try to idle the processor and re-dispatch the thread to a recommended processor. + * Same for avoid-processor + * + * TODO: Should these set AST_REBALANCE? */ - if (!processor->is_recommended) + if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) { return (check_reason | AST_PREEMPT); + } /* * Even though we could continue executing on this processor, a @@ -3892,11 +4277,6 @@ csw_check_locked( * TODO: Should this do the same check that thread_select does? i.e. * if no bound threads target this processor, and idle primaries exist, preempt * The case of RT threads existing is already taken care of above - * Consider Capri in this scenario. - * - * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue)) - * - * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine. */ if (processor->current_pri < BASEPRI_RTQUEUES && @@ -3932,7 +4312,8 @@ csw_check_locked( void set_sched_pri( thread_t thread, - int priority) + int new_priority, + set_sched_pri_options_t options) { thread_t cthread = current_thread(); boolean_t is_current_thread = (thread == cthread) ? 
TRUE : FALSE; @@ -3940,8 +4321,12 @@ set_sched_pri( uint64_t urgency_param1, urgency_param2; boolean_t removed_from_runq = FALSE; + bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY); + + int old_priority = thread->sched_pri; + /* If we're already at this priority, no need to mess with the runqueue */ - if (priority == thread->sched_pri) + if (new_priority == old_priority) return; if (is_current_thread) { @@ -3951,15 +4336,15 @@ set_sched_pri( removed_from_runq = thread_run_queue_remove(thread); } + thread->sched_pri = new_priority; + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), (uintptr_t)thread_tid(thread), thread->base_pri, thread->sched_pri, - 0, /* eventually, 'reason' */ + thread->sched_usage, 0); - thread->sched_pri = priority; - if (is_current_thread) { nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2); /* @@ -3968,28 +4353,36 @@ set_sched_pri( * those are lazily handled. QoS classes have distinct priority bands, and QoS * inheritance is expected to involve priority changes. */ + uint64_t ctime = mach_approximate_time(); if (nurgency != curgency) { thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread); - machine_thread_going_on_core(thread, nurgency, 0); } + machine_thread_going_on_core(thread, nurgency, 0, 0, ctime); } - /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */ if (removed_from_runq) thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ); else if (thread->state & TH_RUN) { processor_t processor = thread->last_processor; if (is_current_thread) { - ast_t preempt; + processor_state_update_from_thread(processor, thread); - processor->current_pri = priority; - processor->current_thmode = thread->sched_mode; - processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread); - if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE) - ast_on(preempt); - } else if (processor != PROCESSOR_NULL && processor->active_thread == thread) + /* + * When dropping in priority, check if the thread no longer belongs on core. + * If a thread raises its own priority, don't aggressively rebalance it. + * + */ + if (!lazy_update && new_priority < old_priority) { + ast_t preempt; + + if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE) + ast_on(preempt); + } + } else if (!lazy_update && processor != PROCESSOR_NULL && + processor != current_processor() && processor->active_thread == thread) { cause_ast_check(processor); + } } } @@ -4082,7 +4475,9 @@ thread_run_queue_remove( return SCHED(processor_queue_remove)(processor, thread); } - rt_lock_lock(); + processor_set_t pset = processor->processor_set; + + rt_lock_lock(pset); if (thread->runq != PROCESSOR_NULL) { /* @@ -4090,18 +4485,16 @@ thread_run_queue_remove( * that run queue. 
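/*
 * Illustrative aside (not part of the diff): a tiny sketch of the decision described
 * in set_sched_pri() above. A runnable thread only triggers an immediate preemption
 * re-check when its priority drops and the caller did not ask for a lazy update;
 * raising one's own priority is left to the normal scheduling points. The function
 * name is a local stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

/* Should changing a running thread's priority force an immediate
 * csw_check-style re-evaluation on its processor? */
static bool ex_needs_preempt_check(int old_pri, int new_pri, bool lazy_update)
{
    if (lazy_update)
        return false;          /* caller asked not to chase it */
    return new_pri < old_pri;  /* only a drop can make someone else more deserving */
}

int main(void)
{
    printf("drop 80->60: %d\n", ex_needs_preempt_check(80, 60, false));      /* 1 */
    printf("raise 60->80: %d\n", ex_needs_preempt_check(60, 80, false));     /* 0 */
    printf("lazy drop 80->60: %d\n", ex_needs_preempt_check(80, 60, true));  /* 0 */
    return 0;
}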
*/ - assert(thread->runq == THREAD_ON_RT_RUNQ); - - remqueue((queue_entry_t)thread); - SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); - rt_runq.count--; + remqueue(&thread->runq_links); + SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset)); + rt_runq_count_decr(pset); thread->runq = PROCESSOR_NULL; removed = TRUE; } - rt_lock_unlock(); + rt_lock_unlock(pset); return (removed); } @@ -4117,19 +4510,18 @@ void thread_run_queue_reinsert(thread_t thread, integer_t options) { assert(thread->runq == PROCESSOR_NULL); + assert(thread->state & (TH_RUN)); - assert(thread->state & (TH_RUN)); - thread_setrun(thread, options); - + thread_setrun(thread, options); } void -sys_override_cpu_throttle(int flag) +sys_override_cpu_throttle(boolean_t enable_override) { - if (flag == CPU_THROTTLE_ENABLE) - cpu_throttle_enabled = 1; - if (flag == CPU_THROTTLE_DISABLE) + if (enable_override) cpu_throttle_enabled = 0; + else + cpu_throttle_enabled = 1; } int @@ -4149,7 +4541,6 @@ thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2) ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { /* * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted - * TODO: Use TH_SFLAG_THROTTLED instead? */ *arg1 = thread->sched_pri; *arg2 = thread->base_pri; @@ -4159,13 +4550,36 @@ thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2) /* For otherwise unclassified threads, report throughput QoS * parameters */ - *arg1 = thread->effective_policy.t_through_qos; - *arg2 = thread->task->effective_policy.t_through_qos; - + *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS); + *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS); + return (THREAD_URGENCY_NORMAL); } } +perfcontrol_class_t +thread_get_perfcontrol_class(thread_t thread) +{ + /* Special case handling */ + if (thread->state & TH_IDLE) + return PERFCONTROL_CLASS_IDLE; + if (thread->task == kernel_task) + return PERFCONTROL_CLASS_KERNEL; + if (thread->sched_mode == TH_MODE_REALTIME) + return PERFCONTROL_CLASS_REALTIME; + + /* perfcontrol_class based on base_pri */ + if (thread->base_pri <= MAXPRI_THROTTLE) + return PERFCONTROL_CLASS_BACKGROUND; + else if (thread->base_pri <= BASEPRI_UTILITY) + return PERFCONTROL_CLASS_UTILITY; + else if (thread->base_pri <= BASEPRI_DEFAULT) + return PERFCONTROL_CLASS_NONUI; + else if (thread->base_pri <= BASEPRI_FOREGROUND) + return PERFCONTROL_CLASS_UI; + else + return PERFCONTROL_CLASS_ABOVEUI; +} /* * This is the processor idle loop, which just looks for other threads @@ -4197,17 +4611,32 @@ processor_idle( SCHED_STATS_CPU_IDLE_START(processor); - timer_switch(&PROCESSOR_DATA(processor, system_state), - mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state)); - PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state); + uint64_t ctime = mach_absolute_time(); + + timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state)); + PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state); + + cpu_quiescent_counter_leave(ctime); while (1) { - if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */ + /* + * Ensure that updates to my processor and pset state, + * made by the IPI source processor before sending the IPI, + * are visible on this processor now (even though we don't + * take the pset lock yet). 
+ */ + atomic_thread_fence(memory_order_acquire); + + if (processor->state != PROCESSOR_IDLE) + break; + if (bit_test(pset->pending_AST_cpu_mask, processor->cpu_id)) break; - if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id)) +#if defined(CONFIG_SCHED_DEFERRED_AST) + if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) break; - if (processor->is_recommended) { - if (rt_runq.count) +#endif + if (processor->is_recommended && (processor->processor_primary == processor)) { + if (rt_runq_count(pset)) break; } else { if (SCHED(processor_bound_count)(processor)) @@ -4223,7 +4652,7 @@ processor_idle( #endif IDLE_KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0); machine_track_platform_idle(TRUE); @@ -4233,8 +4662,19 @@ processor_idle( (void)splsched(); + /* + * Check if we should call sched_timeshare_consider_maintenance() here. + * The CPU was woken out of idle due to an interrupt and we should do the + * call only if the processor is still idle. If the processor is non-idle, + * the threads running on the processor would do the call as part of + * context swithing. + */ + if (processor->state == PROCESSOR_IDLE) { + sched_timeshare_consider_maintenance(mach_absolute_time()); + } + IDLE_KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0); if (!SCHED(processor_queue_empty)(processor)) { /* Secondary SMT processors respond to directed wakeups @@ -4245,16 +4685,19 @@ processor_idle( } } - timer_switch(&PROCESSOR_DATA(processor, idle_state), - mach_absolute_time(), &PROCESSOR_DATA(processor, system_state)); + ctime = mach_absolute_time(); + + timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state)); PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state); + cpu_quiescent_counter_join(ctime); + pset_lock(pset); /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */ - pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id); + bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id); #if defined(CONFIG_SCHED_DEFERRED_AST) - pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id); + bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id); #endif state = processor->state; @@ -4264,20 +4707,18 @@ processor_idle( */ new_thread = processor->next_thread; processor->next_thread = THREAD_NULL; - processor->state = PROCESSOR_RUNNING; + pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) || - (rt_runq.count > 0)) ) { - /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */ - processor->current_pri = IDLEPRI; - processor->current_thmode = TH_MODE_FIXED; - processor->current_sfi_class = SFI_CLASS_KERNEL; + (rt_runq_count(pset) > 0)) ) { + /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */ + 
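/*
 * Illustrative aside (not part of the diff): a minimal sketch of the idle-loop pattern
 * in processor_idle() above. Before committing to stay idle, issue an acquire fence so
 * state published by a remote CPU ahead of its wakeup IPI is visible, then re-check the
 * processor state and the pending-AST bit. A flag plus C11 atomics stand in for the
 * processor and pset state.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic int      ex_state;            /* 0 = idle, 1 = dispatching */
static _Atomic uint64_t ex_pending_ast_mask; /* one bit per CPU */

/* Returns true when the CPU should leave the idle loop. The acquire fence pairs
 * with the release ordering of whoever updated state or the pending bit before
 * sending the wakeup. */
static bool ex_should_leave_idle(int cpu_id)
{
    atomic_thread_fence(memory_order_acquire);

    if (atomic_load_explicit(&ex_state, memory_order_relaxed) != 0)
        return true;
    if (atomic_load_explicit(&ex_pending_ast_mask, memory_order_relaxed) & (1ULL << cpu_id))
        return true;
    return false;
}

int main(void)
{
    /* Simulate a remote CPU handing us work, then the idle loop noticing it. */
    atomic_fetch_or_explicit(&ex_pending_ast_mask, 1ULL << 2, memory_order_release);
    return ex_should_leave_idle(2) ? 0 : 1;
}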
processor_state_update_idle(processor); processor->deadline = UINT64_MAX; pset_unlock(pset); thread_lock(new_thread); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq_count(pset), 0, 0); thread_setrun(new_thread, SCHED_HEADQ); thread_unlock(new_thread); @@ -4288,36 +4729,29 @@ processor_idle( return (THREAD_NULL); } + sched_update_pset_load_average(pset); + pset_unlock(pset); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0); - + return (new_thread); - } - else - if (state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - processor->state = PROCESSOR_RUNNING; - processor->current_pri = IDLEPRI; - processor->current_thmode = TH_MODE_FIXED; - processor->current_sfi_class = SFI_CLASS_KERNEL; + } else if (state == PROCESSOR_IDLE) { + pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); + processor_state_update_idle(processor); processor->deadline = UINT64_MAX; - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); - } - else - if (state == PROCESSOR_SHUTDOWN) { + + } else if (state == PROCESSOR_SHUTDOWN) { /* * Going off-line. Force a * reschedule. */ if ((new_thread = processor->next_thread) != THREAD_NULL) { processor->next_thread = THREAD_NULL; - processor->current_pri = IDLEPRI; - processor->current_thmode = TH_MODE_FIXED; - processor->current_sfi_class = SFI_CLASS_KERNEL; + processor_state_update_idle(processor); processor->deadline = UINT64_MAX; pset_unlock(pset); @@ -4371,11 +4805,15 @@ idle_thread_create( kern_return_t result; thread_t thread; spl_t s; + char name[MAXTHREADNAMESIZE]; result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) return (result); + snprintf(name, sizeof(name), "idle #%d", processor->cpu_id); + thread_set_thread_name(thread, name); + s = splsched(); thread_lock(thread); thread->bound_processor = processor; @@ -4406,6 +4844,10 @@ sched_startup(void) simple_lock_init(&sched_vm_group_list_lock, 0); +#if __arm__ || __arm64__ + simple_lock_init(&sched_recommended_cores_lock, 0); +#endif /* __arm__ || __arm64__ */ + result = kernel_thread_start_priority((thread_continue_t)sched_init_thread, (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) @@ -4413,6 +4855,8 @@ sched_startup(void) thread_deallocate(thread); + assert_thread_magic(thread); + /* * Yield to the sched_init_thread once, to * initialize our own thread after being switched @@ -4424,15 +4868,19 @@ sched_startup(void) thread_block(THREAD_CONTINUE_NULL); } +#if __arm64__ +static _Atomic uint64_t sched_perfcontrol_callback_deadline; +#endif /* __arm64__ */ + + #if defined(CONFIG_SCHED_TIMESHARE_CORE) static volatile uint64_t sched_maintenance_deadline; -#if defined(CONFIG_TELEMETRY) -static volatile uint64_t sched_telemetry_deadline = 0; -#endif static uint64_t sched_tick_last_abstime; static uint64_t sched_tick_delta; uint64_t sched_tick_max_delta; + + /* * sched_init_thread: * @@ -4478,19 +4926,17 @@ sched_timeshare_maintenance_continue(void) } KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START, - sched_tick_delta, - late_time, - 0, - 0, - 0); + sched_tick_delta, late_time, 0, 
0, 0); /* Add a number of pseudo-ticks corresponding to the elapsed interval * This could be greater than 1 if substantial intervals where * all processors are idle occur, which rarely occurs in practice. */ - + sched_tick += sched_tick_delta; + update_vm_info(); + /* * Compute various averages. */ @@ -4498,29 +4944,48 @@ sched_timeshare_maintenance_continue(void) /* * Scan the run queues for threads which - * may need to be updated. + * may need to be updated, and find the earliest runnable thread on the runqueue + * to report its latency. */ SCHED(thread_update_scan)(&scan_context); - rt_runq_scan(&scan_context); + SCHED(rt_runq_scan)(&scan_context); uint64_t ctime = mach_absolute_time(); - machine_max_runnable_latency(ctime > scan_context.earliest_bg_make_runnable_time ? ctime - scan_context.earliest_bg_make_runnable_time : 0, - ctime > scan_context.earliest_normal_make_runnable_time ? ctime - scan_context.earliest_normal_make_runnable_time : 0, - ctime > scan_context.earliest_rt_make_runnable_time ? ctime - scan_context.earliest_rt_make_runnable_time : 0); + uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ? + ctime - scan_context.earliest_bg_make_runnable_time : 0; + + uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ? + ctime - scan_context.earliest_normal_make_runnable_time : 0; + + uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ? + ctime - scan_context.earliest_rt_make_runnable_time : 0; + + machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency); /* * Check to see if the special sched VM group needs attention. */ sched_vm_group_maintenance(); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_END, - sched_pri_shift, - sched_background_pri_shift, - 0, - 0, - 0); +#if __arm__ || __arm64__ + /* Check to see if the recommended cores failsafe is active */ + sched_recommended_cores_maintenance(); +#endif /* __arm__ || __arm64__ */ + + +#if DEBUG || DEVELOPMENT +#if __x86_64__ +#include + /* Check for long-duration interrupts */ + mp_interrupt_watchdog(); +#endif /* __x86_64__ */ +#endif /* DEBUG || DEVELOPMENT */ + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END, + sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG], + sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0); assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT); thread_block((thread_continue_t)sched_timeshare_maintenance_continue); @@ -4541,14 +5006,17 @@ static uint64_t sched_maintenance_wakeups; */ void sched_timeshare_consider_maintenance(uint64_t ctime) { - uint64_t ndeadline, deadline = sched_maintenance_deadline; + + cpu_quiescent_counter_checkin(ctime); + + uint64_t deadline = sched_maintenance_deadline; if (__improbable(ctime >= deadline)) { if (__improbable(current_thread() == sched_maintenance_thread)) return; OSMemoryBarrier(); - ndeadline = ctime + sched_tick_interval; + uint64_t ndeadline = ctime + sched_tick_interval; if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) { thread_wakeup((event_t)sched_timeshare_maintenance_continue); @@ -4556,25 +5024,30 @@ sched_timeshare_consider_maintenance(uint64_t ctime) { } } -#if defined(CONFIG_TELEMETRY) - /* - * Windowed telemetry is driven by the scheduler. 
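/*
 * Illustrative aside (not part of the diff): a minimal sketch of the one-winner
 * deadline arming used by sched_timeshare_consider_maintenance() above. Every CPU
 * compares the current time against a shared deadline, and only the CPU that wins
 * the compare-and-swap to advance the deadline performs the wakeup (the load
 * computation uses the same shape). C11 atomics stand in for the kernel's
 * OSMemoryBarrier/__sync primitives.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t ex_maintenance_deadline = 100;
static const uint64_t   ex_tick_interval        = 100;

/* Returns true for exactly one caller per elapsed deadline;
 * that caller is responsible for kicking the maintenance work. */
static bool ex_consider_maintenance(uint64_t now)
{
    uint64_t deadline = atomic_load_explicit(&ex_maintenance_deadline, memory_order_relaxed);

    if (now < deadline)
        return false;

    uint64_t ndeadline = now + ex_tick_interval;
    return atomic_compare_exchange_strong(&ex_maintenance_deadline, &deadline, ndeadline);
}

int main(void)
{
    printf("cpu A wins: %d\n", ex_consider_maintenance(150));  /* 1: deadline moves to 250 */
    printf("cpu B loses: %d\n", ex_consider_maintenance(150)); /* 0: 150 < new deadline */
    return 0;
}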
It should be safe - * to call compute_telemetry_windowed() even when windowed telemetry - * is disabled, but we should try to avoid doing extra work for no - * reason. - */ - if (telemetry_window_enabled) { - deadline = sched_telemetry_deadline; + uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed); - if (__improbable(ctime >= deadline)) { - ndeadline = ctime + sched_telemetry_interval; + if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) { + uint64_t new_deadline = 0; + if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline, + memory_order_relaxed, memory_order_relaxed)) { + compute_sched_load(); + new_deadline = ctime + sched_load_compute_interval_abs; + __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed); + } + } - if (__probable(__sync_bool_compare_and_swap(&sched_telemetry_deadline, deadline, ndeadline))) { - compute_telemetry_windowed(); - } +#if __arm64__ + uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed); + + if (__improbable(perf_deadline && ctime >= perf_deadline)) { + /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */ + if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0, + memory_order_relaxed, memory_order_relaxed)) { + machine_perfcontrol_deadline_passed(perf_deadline); } } -#endif /* CONFIG_TELEMETRY */ +#endif /* __arm64__ */ + } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -4584,7 +5057,12 @@ sched_init_thread(void (*continuation)(void)) { thread_block(THREAD_CONTINUE_NULL); - sched_maintenance_thread = current_thread(); + thread_t thread = current_thread(); + + thread_set_thread_name(thread, "sched_maintenance_thread"); + + sched_maintenance_thread = thread; + continuation(); /*NOTREACHED*/ @@ -4610,8 +5088,8 @@ sched_init_thread(void (*continuation)(void)) #define THREAD_UPDATE_SIZE 128 -static thread_t thread_update_array[THREAD_UPDATE_SIZE]; -static int thread_update_count = 0; +static thread_t thread_update_array[THREAD_UPDATE_SIZE]; +static uint32_t thread_update_count = 0; /* Returns TRUE if thread was added, FALSE if thread_update_array is full */ boolean_t @@ -4628,14 +5106,16 @@ thread_update_add_thread(thread_t thread) void thread_update_process_threads(void) { - while (thread_update_count > 0) { - spl_t s; - thread_t thread = thread_update_array[--thread_update_count]; - thread_update_array[thread_update_count] = THREAD_NULL; + assert(thread_update_count <= THREAD_UPDATE_SIZE); - s = splsched(); + for (uint32_t i = 0 ; i < thread_update_count ; i++) { + thread_t thread = thread_update_array[i]; + assert_thread_magic(thread); + thread_update_array[i] = THREAD_NULL; + + spl_t s = splsched(); thread_lock(thread); - if (!(thread->state & (TH_WAIT)) && (SCHED(can_update_priority)(thread))) { + if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) { SCHED(update_priority)(thread); } thread_unlock(thread); @@ -4643,6 +5123,8 @@ thread_update_process_threads(void) thread_deallocate(thread); } + + thread_update_count = 0; } /* @@ -4652,41 +5134,48 @@ thread_update_process_threads(void) */ boolean_t runq_scan( - run_queue_t runq, - sched_update_scan_context_t scan_context) + run_queue_t runq, + sched_update_scan_context_t scan_context) { - register int count; - register queue_t q; - register thread_t thread; - - if ((count = runq->count) > 0) { - q = 
runq->queues + runq->highq; - while (count > 0) { - queue_iterate(q, thread, thread_t, links) { - if ( thread->sched_stamp != sched_tick && - (thread->sched_mode == TH_MODE_TIMESHARE) ) { - if (thread_update_add_thread(thread) == FALSE) - return (TRUE); - } + int count = runq->count; + int queue_index; - if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { - if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) { - scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time; - } - } else { - if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) { - scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time; - } - } + assert(count >= 0); - count--; + if (count == 0) + return FALSE; + + for (queue_index = bitmap_first(runq->bitmap, NRQS); + queue_index >= 0; + queue_index = bitmap_next(runq->bitmap, queue_index)) { + + thread_t thread; + queue_t queue = &runq->queues[queue_index]; + + qe_foreach_element(thread, queue, runq_links) { + assert(count > 0); + assert_thread_magic(thread); + + if (thread->sched_stamp != sched_tick && + thread->sched_mode == TH_MODE_TIMESHARE) { + if (thread_update_add_thread(thread) == FALSE) + return TRUE; } - q--; + if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { + if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) { + scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time; + } + } else { + if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) { + scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time; + } + } + count--; } } - return (FALSE); + return FALSE; } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -4812,36 +5301,388 @@ sched_timer_deadline_tracking_init(void) { nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2); } +#if __arm__ || __arm64__ -kern_return_t -sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags) +uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED; +uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS; +bool perfcontrol_failsafe_active = false; +bool perfcontrol_sleep_override = false; + +uint64_t perfcontrol_failsafe_maintenance_runnable_time; +uint64_t perfcontrol_failsafe_activation_time; +uint64_t perfcontrol_failsafe_deactivation_time; + +/* data covering who likely caused it and how long they ran */ +#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */ +char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN]; +int perfcontrol_failsafe_pid; +uint64_t perfcontrol_failsafe_tid; +uint64_t perfcontrol_failsafe_thread_timer_at_start; +uint64_t perfcontrol_failsafe_thread_timer_last_seen; +uint32_t perfcontrol_failsafe_recommended_at_trigger; + +/* + * Perf controller calls here to update the recommended core bitmask. + * If the failsafe is active, we don't immediately apply the new value. + * Instead, we store the new request and use it after the failsafe deactivates. + * + * If the failsafe is not active, immediately apply the update. 
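/*
 * Illustrative aside (not part of the diff): a minimal sketch of the request/override
 * split described in the comment above. Callers always record the latest requested
 * core mask, but it is only applied immediately when no failsafe or sleep override is
 * active; when the override clears, the most recent request is applied. The mutex and
 * names below are local stand-ins for sched_recommended_cores_lock and the globals.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t ex_requested_cores = ~0ULL;  /* last mask asked for by the perf controller */
static uint64_t ex_applied_cores   = ~0ULL;  /* mask currently in force */
static bool     ex_override_active = false;  /* failsafe or sleep override */

static void ex_update_recommended_cores(uint64_t mask)
{
    pthread_mutex_lock(&ex_lock);
    ex_requested_cores = mask;
    if (!ex_override_active)
        ex_applied_cores = mask;           /* apply immediately */
    /* else: remember it, apply when the override ends */
    pthread_mutex_unlock(&ex_lock);
}

static void ex_clear_override(void)
{
    pthread_mutex_lock(&ex_lock);
    ex_override_active = false;
    ex_applied_cores = ex_requested_cores; /* catch up with the latest request */
    pthread_mutex_unlock(&ex_lock);
}

int main(void)
{
    ex_override_active = true;
    ex_update_recommended_cores(0x0F);     /* deferred: override is active */
    printf("applied while overridden: 0x%llx\n", (unsigned long long)ex_applied_cores);
    ex_clear_override();
    printf("applied after override:  0x%llx\n", (unsigned long long)ex_applied_cores);
    return 0;
}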
+ * + * No scheduler locks are held, no other locks are held that scheduler might depend on, + * interrupts are enabled + * + * currently prototype is in osfmk/arm/machine_routines.h + */ +void +sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores) { - int urgency; - uint64_t urgency_param1, urgency_param2; - spl_t s; + assert(preemption_enabled()); - if (work_interval_id == 0) { - return (KERN_INVALID_ARGUMENT); + spl_t s = splsched(); + simple_lock(&sched_recommended_cores_lock); + + perfcontrol_requested_recommended_cores = recommended_cores; + perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores); + + if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) + sched_update_recommended_cores(perfcontrol_requested_recommended_cores); + else + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE, + perfcontrol_requested_recommended_cores, + sched_maintenance_thread->last_made_runnable_time, 0, 0, 0); + + simple_unlock(&sched_recommended_cores_lock); + splx(s); +} + +void +sched_override_recommended_cores_for_sleep(void) +{ + spl_t s = splsched(); + simple_lock(&sched_recommended_cores_lock); + + if (perfcontrol_sleep_override == false) { + perfcontrol_sleep_override = true; + sched_update_recommended_cores(ALL_CORES_RECOMMENDED); } - assert(thread == current_thread()); + simple_unlock(&sched_recommended_cores_lock); + splx(s); +} + +void +sched_restore_recommended_cores_after_sleep(void) +{ + spl_t s = splsched(); + simple_lock(&sched_recommended_cores_lock); - thread_mtx_lock(thread); - if (thread->work_interval_id != work_interval_id) { - thread_mtx_unlock(thread); - return (KERN_INVALID_ARGUMENT); + if (perfcontrol_sleep_override == true) { + perfcontrol_sleep_override = false; + sched_update_recommended_cores(perfcontrol_requested_recommended_cores); } - thread_mtx_unlock(thread); - s = splsched(); - thread_lock(thread); - urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2); - thread_unlock(thread); + simple_unlock(&sched_recommended_cores_lock); splx(s); +} - machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags); - return (KERN_SUCCESS); +/* + * Consider whether we need to activate the recommended cores failsafe + * + * Called from quantum timer interrupt context of a realtime thread + * No scheduler locks are held, interrupts are disabled + */ +void +sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread) +{ + /* + * Check if a realtime thread is starving the system + * and bringing up non-recommended cores would help + * + * TODO: Is this the correct check for recommended == possible cores? + * TODO: Validate the checks without the relevant lock are OK. 
+ */ + + if (__improbable(perfcontrol_failsafe_active == TRUE)) { + /* keep track of how long the responsible thread runs */ + + simple_lock(&sched_recommended_cores_lock); + + if (perfcontrol_failsafe_active == TRUE && + cur_thread->thread_id == perfcontrol_failsafe_tid) { + perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) + + timer_grab(&cur_thread->system_timer); + } + + simple_unlock(&sched_recommended_cores_lock); + + /* we're already trying to solve the problem, so bail */ + return; + } + + /* The failsafe won't help if there are no more processors to enable */ + if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) + return; + + uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold; + + /* Use the maintenance thread as our canary in the coal mine */ + thread_t m_thread = sched_maintenance_thread; + + /* If it doesn't look bad, nothing to see here */ + if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) + return; + + /* It looks bad, take the lock to be sure */ + thread_lock(m_thread); + + if (m_thread->runq == PROCESSOR_NULL || + (m_thread->state & (TH_RUN|TH_WAIT)) != TH_RUN || + m_thread->last_made_runnable_time >= too_long_ago) { + /* + * Maintenance thread is either on cpu or blocked, and + * therefore wouldn't benefit from more cores + */ + thread_unlock(m_thread); + return; + } + + uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time; + + thread_unlock(m_thread); + + /* + * There are cores disabled at perfcontrol's recommendation, but the + * system is so overloaded that the maintenance thread can't run. + * That likely means that perfcontrol can't run either, so it can't fix + * the recommendation. We have to kick in a failsafe to keep from starving. + * + * When the maintenance thread has been starved for too long, + * ignore the recommendation from perfcontrol and light up all the cores. 
+ * + * TODO: Consider weird states like boot, sleep, or debugger + */ + + simple_lock(&sched_recommended_cores_lock); + + if (perfcontrol_failsafe_active == TRUE) { + simple_unlock(&sched_recommended_cores_lock); + return; + } + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START, + perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0); + + perfcontrol_failsafe_active = TRUE; + perfcontrol_failsafe_activation_time = mach_absolute_time(); + perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time; + perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores; + + /* Capture some data about who screwed up (assuming that the thread on core is at fault) */ + task_t task = cur_thread->task; + perfcontrol_failsafe_pid = task_pid(task); + strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name)); + + perfcontrol_failsafe_tid = cur_thread->thread_id; + + /* Blame the thread for time it has run recently */ + uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered; + + uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer); + + /* Compute the start time of the bad behavior in terms of the thread's on core time */ + perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation; + perfcontrol_failsafe_thread_timer_last_seen = last_seen; + + /* Ignore the previously recommended core configuration */ + sched_update_recommended_cores(ALL_CORES_RECOMMENDED); + + simple_unlock(&sched_recommended_cores_lock); +} + +/* + * Now that our bacon has been saved by the failsafe, consider whether to turn it off + * + * Runs in the context of the maintenance thread, no locks held + */ +static void +sched_recommended_cores_maintenance(void) +{ + /* Common case - no failsafe, nothing to be done here */ + if (__probable(perfcontrol_failsafe_active == FALSE)) + return; + + uint64_t ctime = mach_absolute_time(); + + boolean_t print_diagnostic = FALSE; + char p_name[FAILSAFE_NAME_LEN] = ""; + + spl_t s = splsched(); + simple_lock(&sched_recommended_cores_lock); + + /* Check again, under the lock, to avoid races */ + if (perfcontrol_failsafe_active == FALSE) + goto out; + + /* + * Ensure that the other cores get another few ticks to run some threads + * If we don't have this hysteresis, the maintenance thread is the first + * to run, and then it immediately kills the other cores + */ + if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) + goto out; + + /* Capture some diagnostic state under the lock so we can print it out later */ + + int pid = perfcontrol_failsafe_pid; + uint64_t tid = perfcontrol_failsafe_tid; + + uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen - + perfcontrol_failsafe_thread_timer_at_start; + uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger; + uint32_t rec_cores_after = perfcontrol_requested_recommended_cores; + uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time; + strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name)); + + print_diagnostic = TRUE; + + /* Deactivate the failsafe and reinstate the requested recommendation settings */ + + perfcontrol_failsafe_deactivation_time = ctime; + perfcontrol_failsafe_active = FALSE; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + 
MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END, + perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0); + + sched_update_recommended_cores(perfcontrol_requested_recommended_cores); + +out: + simple_unlock(&sched_recommended_cores_lock); + splx(s); + + if (print_diagnostic) { + uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0; + + absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms); + failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC; + + absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms); + thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC; + + printf("recommended core failsafe kicked in for %lld ms " + "likely due to %s[%d] thread 0x%llx spending " + "%lld ms on cpu at realtime priority - " + "new recommendation: 0x%x -> 0x%x\n", + failsafe_duration_ms, p_name, pid, tid, thread_usage_ms, + rec_cores_before, rec_cores_after); + } +} + +/* + * Apply a new recommended cores mask to the processors it affects + * Runs after considering failsafes and such + * + * Iterate over processors and update their ->is_recommended field. + * If a processor is running, we let it drain out at its next + * quantum expiration or blocking point. If a processor is idle, there + * may be more work for it to do, so IPI it. + * + * interrupts disabled, sched_recommended_cores_lock is held + */ +static void +sched_update_recommended_cores(uint32_t recommended_cores) +{ + processor_set_t pset, nset; + processor_t processor; + uint64_t needs_exit_idle_mask = 0x0; + + processor = processor_list; + pset = processor->processor_set; + + KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START, + recommended_cores, perfcontrol_failsafe_active, 0, 0); + + if (__builtin_popcount(recommended_cores) == 0) { + bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */ + } + + /* First set recommended cores */ + pset_lock(pset); + do { + + nset = processor->processor_set; + if (nset != pset) { + pset_unlock(pset); + pset = nset; + pset_lock(pset); + } + + if (bit_test(recommended_cores, processor->cpu_id)) { + processor->is_recommended = TRUE; + bit_set(pset->recommended_bitmask, processor->cpu_id); + + if (processor->state == PROCESSOR_IDLE) { + if (processor != current_processor()) { + bit_set(needs_exit_idle_mask, processor->cpu_id); + } + } + } + } while ((processor = processor->processor_list) != NULL); + pset_unlock(pset); + + /* Now shutdown not recommended cores */ + processor = processor_list; + pset = processor->processor_set; + + pset_lock(pset); + do { + + nset = processor->processor_set; + if (nset != pset) { + pset_unlock(pset); + pset = nset; + pset_lock(pset); + } + + if (!bit_test(recommended_cores, processor->cpu_id)) { + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + + processor->is_recommended = FALSE; + bit_clear(pset->recommended_bitmask, processor->cpu_id); + + if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) { + ipi_type = SCHED_IPI_IMMEDIATE; + } + SCHED(processor_queue_shutdown)(processor); + /* pset unlocked */ + + SCHED(rt_queue_shutdown)(processor); + + if (ipi_type != SCHED_IPI_NONE) { + if (processor == current_processor()) { + ast_on(AST_PREEMPT); + } else { + sched_ipi_perform(processor, ipi_type); + } + } + + pset_lock(pset); + } + } while ((processor = processor->processor_list) != NULL); + pset_unlock(pset); + + /* Issue all pending IPIs now that the pset lock has been dropped */ + for (int cpuid = 
lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) { + processor = processor_array[cpuid]; + machine_signal_idle(processor); + } + + KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END, + needs_exit_idle_mask, 0, 0, 0); } +#endif /* __arm__ || __arm64__ */ void thread_set_options(uint32_t thopt) { spl_t x; @@ -4855,3 +5696,180 @@ void thread_set_options(uint32_t thopt) { thread_unlock(t); splx(x); } + +void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) { + thread->pending_block_hint = block_hint; +} + +uint32_t qos_max_parallelism(int qos, uint64_t options) +{ + return SCHED(qos_max_parallelism)(qos, options); +} + +uint32_t sched_qos_max_parallelism(__unused int qos, uint64_t options) +{ + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + /* Query the machine layer for core information */ + __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO, + (host_info_t)&hinfo, &count); + assert(kret == KERN_SUCCESS); + + /* We would not want multiple realtime threads running on the + * same physical core; even for SMT capable machines. + */ + if (options & QOS_PARALLELISM_REALTIME) { + return hinfo.physical_cpu; + } + + if (options & QOS_PARALLELISM_COUNT_LOGICAL) { + return hinfo.logical_cpu; + } else { + return hinfo.physical_cpu; + } +} + +#if __arm64__ + +/* + * Set up or replace old timer with new timer + * + * Returns true if canceled old timer, false if it did not + */ +boolean_t +sched_perfcontrol_update_callback_deadline(uint64_t new_deadline) +{ + /* + * Exchange deadline for new deadline, if old deadline was nonzero, + * then I cancelled the callback, otherwise I didn't + */ + + uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, + memory_order_relaxed); + + + while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline, + &old_deadline, new_deadline, + memory_order_relaxed, memory_order_relaxed)); + + + /* now old_deadline contains previous value, which might not be the same if it raced */ + + return (old_deadline != 0) ? 
TRUE : FALSE; +} + +#endif /* __arm64__ */ + +void +sched_update_pset_load_average(processor_set_t pset) +{ + int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); + int new_load_average = (pset->load_average + load) >> 1; + + pset->load_average = new_load_average; + +#if (DEVELOPMENT || DEBUG) +#endif +} + +/* pset is locked */ +static processor_t +choose_processor_for_realtime_thread(processor_set_t pset) +{ + uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask & ~pset->pending_AST_cpu_mask); + + for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { + processor_t processor = processor_array[cpuid]; + + if (processor->processor_primary != processor) { + continue; + } + + if (processor->state == PROCESSOR_IDLE) { + return processor; + } + + if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) { + continue; + } + + if (processor->current_pri >= BASEPRI_RTQUEUES) { + continue; + } + + return processor; + + } + + if (!sched_allow_rt_smt) { + return PROCESSOR_NULL; + } + + /* Consider secondary processors */ + for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { + processor_t processor = processor_array[cpuid]; + + if (processor->processor_primary == processor) { + continue; + } + + if (processor->state == PROCESSOR_IDLE) { + return processor; + } + + if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) { + continue; + } + + if (processor->current_pri >= BASEPRI_RTQUEUES) { + continue; + } + + return processor; + + } + + return PROCESSOR_NULL; +} + +/* pset is locked */ +static bool +all_available_primaries_are_running_realtime_threads(processor_set_t pset) +{ + uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask); + + for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { + processor_t processor = processor_array[cpuid]; + + if (processor->processor_primary != processor) { + continue; + } + + if (processor->state == PROCESSOR_IDLE) { + return false; + } + + if (processor->state == PROCESSOR_DISPATCHING) { + return false; + } + + if (processor->state != PROCESSOR_RUNNING) { + /* + * All other processor states are considered unavailable to run + * realtime threads. In particular, we prefer an available secondary + * processor over the risk of leaving a realtime thread on the run queue + * while waiting for a processor in PROCESSOR_START state, + * which should anyway be a rare case. + */ + continue; + } + + if (processor->current_pri < BASEPRI_RTQUEUES) { + return false; + } + } + + return true; +} + +
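The deadline-claiming idiom used earlier in this hunk for sched_load_compute_deadline (and again for sched_perfcontrol_callback_deadline on arm64) is: load the deadline, and if it has expired, compare-and-swap it to 0 so that exactly one CPU performs the periodic work before publishing the next deadline. Below is a minimal user-space sketch of that pattern, not kernel code: tick_handler, do_periodic_work, and periodic_interval are illustrative names, and the relaxed ordering mirrors the kernel's usage, where the only invariant needed is single ownership of the expired deadline.

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t periodic_deadline = 1;      /* 0 means "work currently being run" */
static const uint64_t   periodic_interval = 15000;  /* hypothetical interval, same units as now_abs */

/* Called from every CPU's tick path; at most one caller wins the CAS and does the work. */
void
tick_handler(uint64_t now_abs, void (*do_periodic_work)(void))
{
	uint64_t deadline = atomic_load_explicit(&periodic_deadline, memory_order_relaxed);

	if (deadline == 0 || now_abs < deadline)
		return;		/* not due yet, or another CPU has already claimed the work */

	/* Claim the work by swapping the expired deadline for 0. */
	if (atomic_compare_exchange_strong_explicit(&periodic_deadline, &deadline, 0,
	        memory_order_relaxed, memory_order_relaxed)) {
		do_periodic_work();
		atomic_store_explicit(&periodic_deadline, now_abs + periodic_interval,
		        memory_order_relaxed);
	}
}

A losing CPU's CAS fails because another CPU has already stored 0 (or a fresh deadline), so it simply returns and rechecks on its next pass, which matches the "let the next context switch check again" behavior noted in the arm64 branch above.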
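The per-CPU loops in sched_update_recommended_cores(), choose_processor_for_realtime_thread(), and all_available_primaries_are_running_realtime_threads() walk 64-bit CPU masks with lsb_first()/lsb_next(). As a rough illustration of that iteration idiom only, the helpers below approximate those kernel primitives with __builtin_ctzll; the example mask is made up and the processor_array lookup is omitted.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel's lsb_first()/lsb_next() bitmap helpers. */
static int
mask_lsb_first(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;
}

static int
mask_lsb_next(uint64_t mask, int prev)
{
	/* Clear bit 'prev' and everything below it, then find the next set bit. */
	uint64_t remaining = mask & ~((2ULL << prev) - 1);
	return remaining ? __builtin_ctzll(remaining) : -1;
}

int
main(void)
{
	uint64_t cpu_map = 0x2D;	/* example mask: CPUs 0, 2, 3, 5 */

	for (int cpuid = mask_lsb_first(cpu_map); cpuid >= 0;
	     cpuid = mask_lsb_next(cpu_map, cpuid)) {
		printf("would consider cpu %d\n", cpuid);
	}
	return 0;
}

Iterating only over the set bits keeps the cost proportional to the number of candidate processors, which matters on the hot paths where these masks (recommended, idle, pending-AST) are consulted.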