+sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
+{
+ /*
+ * Check whether a realtime thread is starving the system, and whether
+ * bringing up the non-recommended cores would help.
+ *
+ * TODO: Is this the correct check for recommended == possible cores?
+ * TODO: Validate that the checks done without holding the relevant lock are OK.
+ */
+
+ if (__improbable(perfcontrol_failsafe_active == TRUE)) {
+ /* keep track of how long the responsible thread runs */
+
+ simple_lock(&sched_recommended_cores_lock);
+
+ if (perfcontrol_failsafe_active == TRUE &&
+ cur_thread->thread_id == perfcontrol_failsafe_tid) {
+ perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
+ timer_grab(&cur_thread->system_timer);
+ }
+
+ simple_unlock(&sched_recommended_cores_lock);
+
+ /* we're already trying to solve the problem, so bail */
+ return;
+ }
+
+ /* The failsafe won't help if there are no more processors to enable */
+ if (__probable(perfcontrol_requested_recommended_core_count >= processor_count))
+ return;
+
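+ /*
+ * Anything last made runnable before this cutoff has been waiting to
+ * run for longer than the starvation threshold.
+ */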
+ uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
+
+ /* Use the maintenance thread as our canary in the coal mine */
+ thread_t m_thread = sched_maintenance_thread;
+
+ /* If it doesn't look bad, nothing to see here */
+ if (__probable(m_thread->last_made_runnable_time >= too_long_ago))
+ return;
+
+ /* It looks bad, take the lock to be sure */
+ thread_lock(m_thread);
+
+ if (m_thread->runq == PROCESSOR_NULL ||
+ (m_thread->state & (TH_RUN|TH_WAIT)) != TH_RUN ||
+ m_thread->last_made_runnable_time >= too_long_ago) {
+ /*
+ * Maintenance thread is either on cpu or blocked, and
+ * therefore wouldn't benefit from more cores
+ */
+ thread_unlock(m_thread);
+ return;
+ }
+
+ uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
+
+ thread_unlock(m_thread);
+
+ /*
+ * There are cores disabled at perfcontrol's recommendation, but the
+ * system is so overloaded that the maintenance thread can't run.
+ * That likely means that perfcontrol can't run either, so it can't fix
+ * the recommendation. We have to kick in a failsafe to keep from starving.
+ *
+ * When the maintenance thread has been starved for too long,
+ * ignore the recommendation from perfcontrol and light up all the cores.
+ *
+ * TODO: Consider weird states like boot, sleep, or debugger
+ */
+
+ simple_lock(&sched_recommended_cores_lock);
+
+ if (perfcontrol_failsafe_active == TRUE) {
+ simple_unlock(&sched_recommended_cores_lock);
+ return;
+ }
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
+ perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
+
+ perfcontrol_failsafe_active = TRUE;
+ perfcontrol_failsafe_activation_time = mach_absolute_time();
+ perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
+ perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
+
+ /* Capture some data about who screwed up (assuming that the thread on core is at fault) */
+ task_t task = cur_thread->task;
+ perfcontrol_failsafe_pid = task_pid(task);
+ strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));
+
+ perfcontrol_failsafe_tid = cur_thread->thread_id;
+
+ /* Blame the thread for time it has run recently */
+ uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
+
+ uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);
+
+ /* Compute the start time of the bad behavior in terms of the thread's on core time */
+ perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
+ perfcontrol_failsafe_thread_timer_last_seen = last_seen;
+
+ /* Ignore the previously recommended core configuration */
+ sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
+
+ simple_unlock(&sched_recommended_cores_lock);
+}
+
+/*
+ * Now that our bacon has been saved by the failsafe, consider whether to turn it off
+ *
+ * Runs in the context of the maintenance thread, no locks held
+ */
+static void
+sched_recommended_cores_maintenance(void)
+{
+ /* Common case - no failsafe, nothing to be done here */
+ if (__probable(perfcontrol_failsafe_active == FALSE))
+ return;
+
+ uint64_t ctime = mach_absolute_time();
+
+ boolean_t print_diagnostic = FALSE;
+ char p_name[FAILSAFE_NAME_LEN] = "";
+
+ spl_t s = splsched();
+ simple_lock(&sched_recommended_cores_lock);
+
+ /* Check again, under the lock, to avoid races */
+ if (perfcontrol_failsafe_active == FALSE)
+ goto out;
+
+ /*
+ * Ensure that the other cores get another few ticks to run some threads.
+ * Without this hysteresis, the maintenance thread would be the first
+ * to run, and it would then immediately kill the other cores again.
+ */
+ if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold)
+ goto out;
+
+ /* Capture some diagnostic state under the lock so we can print it out later */
+
+ int pid = perfcontrol_failsafe_pid;
+ uint64_t tid = perfcontrol_failsafe_tid;
+
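+ /* On-CPU time the blamed thread has accumulated since its bad behavior began */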
+ uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
+ perfcontrol_failsafe_thread_timer_at_start;
+ uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
+ uint32_t rec_cores_after = perfcontrol_requested_recommended_cores;
+ uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
+ strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
+
+ print_diagnostic = TRUE;
+
+ /* Deactivate the failsafe and reinstate the requested recommendation settings */
+
+ perfcontrol_failsafe_deactivation_time = ctime;
+ perfcontrol_failsafe_active = FALSE;
+
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
+ perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
+
+ sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
+
+out:
+ simple_unlock(&sched_recommended_cores_lock);
+ splx(s);
+
+ if (print_diagnostic) {
+ uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
+
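+ /* Convert from mach absolute time units to milliseconds for the log message */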
+ absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
+ failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
+
+ absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
+ thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
+
+ printf("recommended core failsafe kicked in for %lld ms "
+ "likely due to %s[%d] thread 0x%llx spending "
+ "%lld ms on cpu at realtime priority - "
+ "new recommendation: 0x%x -> 0x%x\n",
+ failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
+ rec_cores_before, rec_cores_after);
+ }
+}
+
+/*
+ * Apply a new recommended cores mask to the processors it affects
+ * Runs after considering failsafes and such
+ *
+ * Iterate over processors and update their ->is_recommended field.
+ * If a processor is running, we let it drain out at its next
+ * quantum expiration or blocking point. If a processor is idle, there
+ * may be more work for it to do, so IPI it.
+ *
+ * interrupts disabled, sched_recommended_cores_lock is held
+ */
+static void
+sched_update_recommended_cores(uint32_t recommended_cores)
+{
+ processor_set_t pset, nset;
+ processor_t processor;
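+ /* Idle processors that should be woken with an IPI once the pset locks are dropped */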
+ uint64_t needs_exit_idle_mask = 0x0;
+
+ processor = processor_list;
+ pset = processor->processor_set;
+
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
+ recommended_cores, perfcontrol_failsafe_active, 0, 0);
+
+ if (__builtin_popcount(recommended_cores) == 0) {
+ bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
+ }
+
+ /* First set recommended cores */
+ pset_lock(pset);
+ do {
+
+ nset = processor->processor_set;
+ if (nset != pset) {
+ pset_unlock(pset);
+ pset = nset;
+ pset_lock(pset);
+ }
+
+ if (bit_test(recommended_cores, processor->cpu_id)) {
+ processor->is_recommended = TRUE;
+ bit_set(pset->recommended_bitmask, processor->cpu_id);
+
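+ /* A newly recommended idle core may have work waiting; note it for a wakeup after the pset lock is dropped */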
+ if (processor->state == PROCESSOR_IDLE) {
+ if (processor != current_processor()) {
+ bit_set(needs_exit_idle_mask, processor->cpu_id);
+ }
+ }
+ }
+ } while ((processor = processor->processor_list) != NULL);
+ pset_unlock(pset);
+
+ /* Now shutdown not recommended cores */
+ processor = processor_list;
+ pset = processor->processor_set;
+
+ pset_lock(pset);
+ do {
+
+ nset = processor->processor_set;
+ if (nset != pset) {
+ pset_unlock(pset);
+ pset = nset;
+ pset_lock(pset);
+ }
+
+ if (!bit_test(recommended_cores, processor->cpu_id)) {
+ sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
+
+ processor->is_recommended = FALSE;
+ bit_clear(pset->recommended_bitmask, processor->cpu_id);
+
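+ /* A core that is running or dispatching needs an IPI so it notices it is no longer recommended */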
+ if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
+ ipi_type = SCHED_IPI_IMMEDIATE;
+ }
+ SCHED(processor_queue_shutdown)(processor);
+ /* pset unlocked */
+
+ SCHED(rt_queue_shutdown)(processor);
+
+ if (ipi_type != SCHED_IPI_NONE) {
+ if (processor == current_processor()) {
+ ast_on(AST_PREEMPT);
+ } else {
+ sched_ipi_perform(processor, ipi_type);
+ }
+ }
+
+ pset_lock(pset);
+ }
+ } while ((processor = processor->processor_list) != NULL);
+ pset_unlock(pset);
+
+ /* Issue all pending IPIs now that the pset lock has been dropped */
+ for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
+ processor = processor_array[cpuid];
+ machine_signal_idle(processor);
+ }
+
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
+ needs_exit_idle_mask, 0, 0, 0);
+}
+#endif /* __arm__ || __arm64__ */
+
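+/* OR the given option bits into the current thread's options, under the thread lock */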
+void thread_set_options(uint32_t thopt)
+{
+ spl_t x;
+ thread_t t = current_thread();
+
+ x = splsched();
+ thread_lock(t);
+
+ t->options |= thopt;
+
+ thread_unlock(t);
+ splx(x);
+}
+
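+/* Record a hint describing what the thread is about to block on */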
+void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
+{
+ thread->pending_block_hint = block_hint;
+}
+
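+/* Ask the active scheduler how many threads at this QoS can usefully run in parallel */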
+uint32_t qos_max_parallelism(int qos, uint64_t options)
+{
+ return SCHED(qos_max_parallelism)(qos, options);
+}
+
+uint32_t sched_qos_max_parallelism(__unused int qos, uint64_t options)
+{
+ host_basic_info_data_t hinfo;
+ mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+ /* Query the machine layer for core information */
+ __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
+ (host_info_t)&hinfo, &count);
+ assert(kret == KERN_SUCCESS);
+
+ /*
+ * We would not want multiple realtime threads running on the
+ * same physical core, even on SMT-capable machines.
+ */
+ if (options & QOS_PARALLELISM_REALTIME) {
+ return hinfo.physical_cpu;
+ }
+
+ if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
+ return hinfo.logical_cpu;
+ } else {
+ return hinfo.physical_cpu;
+ }
+}
+
+#if __arm64__
+
+/*
+ * Set up a new timer deadline, replacing any old one.
+ *
+ * Returns TRUE if it cancelled an old timer, FALSE if there was none to cancel.
+ */
+boolean_t
+sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
+{
+ /*
+ * Exchange the old deadline for the new deadline. If the old deadline
+ * was nonzero, we cancelled a pending callback; otherwise there was
+ * nothing to cancel.
+ */
+
+ uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline,
+ memory_order_relaxed);
+
+
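+ /* Retry until the exchange succeeds; a weak CAS may fail spuriously, and old_deadline is reloaded on each failure */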
+ while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline,
+ &old_deadline, new_deadline,
+ memory_order_relaxed, memory_order_relaxed));
+
+
+ /* now old_deadline contains previous value, which might not be the same if it raced */
+
+ return (old_deadline != 0) ? TRUE : FALSE;
+}
+
+#endif /* __arm64__ */