+ /* First set recommended cores */
+ pset_lock(pset);
+ avail_count = 0;
+ do {
+ nset = processor->processor_set;
+ if (nset != pset) {
+ pset_unlock(pset);
+ pset = nset;
+ pset_lock(pset);
+ }
+
+ if (bit_test(recommended_cores, processor->cpu_id)) {
+ processor->is_recommended = TRUE;
+ bit_set(pset->recommended_bitmask, processor->cpu_id);
+
+ if (processor->state == PROCESSOR_IDLE) {
+ if (processor != current_processor()) {
+ bit_set(needs_exit_idle_mask, processor->cpu_id);
+ }
+ }
+ if (processor->state != PROCESSOR_OFF_LINE) {
+ avail_count++;
+ SCHED(pset_made_schedulable)(processor, pset, false);
+ }
+ }
+ } while ((processor = processor->processor_list) != NULL);
+ pset_unlock(pset);
+
+ /* Now shutdown not recommended cores */
+ processor = processor_list;
+ pset = processor->processor_set;
+
+ pset_lock(pset);
+ do {
+ nset = processor->processor_set;
+ if (nset != pset) {
+ pset_unlock(pset);
+ pset = nset;
+ pset_lock(pset);
+ }
+
+ if (!bit_test(recommended_cores, processor->cpu_id)) {
+ sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
+
+ processor->is_recommended = FALSE;
+ bit_clear(pset->recommended_bitmask, processor->cpu_id);
+
+ if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
+ ipi_type = SCHED_IPI_IMMEDIATE;
+ }
+ SCHED(processor_queue_shutdown)(processor);
+ /* pset unlocked */
+
+ SCHED(rt_queue_shutdown)(processor);
+
+ if (ipi_type != SCHED_IPI_NONE) {
+ if (processor == current_processor()) {
+ ast_on(AST_PREEMPT);
+ } else {
+ sched_ipi_perform(processor, ipi_type);
+ }
+ }
+
+ pset_lock(pset);
+ }
+ } while ((processor = processor->processor_list) != NULL);
+
+ processor_avail_count_user = avail_count;
+#if defined(__x86_64__)
+ commpage_update_active_cpus();
+#endif
+
+ pset_unlock(pset);
+
+ /* Issue all pending IPIs now that the pset lock has been dropped */
+ for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
+ processor = processor_array[cpuid];
+ machine_signal_idle(processor);
+ }
+
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
+ needs_exit_idle_mask, 0, 0, 0);
+}
+
+void
+thread_set_options(uint32_t thopt)
+{
+ spl_t x;
+ thread_t t = current_thread();
+
+ x = splsched();
+ thread_lock(t);
+
+ t->options |= thopt;
+
+ thread_unlock(t);
+ splx(x);
+}
+
/*
 * thread_set_pending_block_hint()
 *
 * Record a hint describing why the thread is about to block.
 * NOTE(review): no locking here — presumably callers only set this on
 * the current thread (or otherwise serialize access); confirm at call sites.
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
+
/*
 * qos_max_parallelism()
 *
 * Thin wrapper: dispatch to the active scheduler's qos_max_parallelism
 * implementation via the SCHED() dispatch table.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
+
+uint32_t
+sched_qos_max_parallelism(__unused int qos, uint64_t options)
+{
+ host_basic_info_data_t hinfo;
+ mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+ /* Query the machine layer for core information */
+ __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
+ (host_info_t)&hinfo, &count);
+ assert(kret == KERN_SUCCESS);
+
+ if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
+ return hinfo.logical_cpu;
+ } else {
+ return hinfo.physical_cpu;
+ }
+}
+
+int sched_allow_NO_SMT_threads = 1;
+bool
+thread_no_smt(thread_t thread)
+{
+ return sched_allow_NO_SMT_threads && (thread->bound_processor == PROCESSOR_NULL) && ((thread->sched_flags & TH_SFLAG_NO_SMT) || (thread->task->t_flags & TF_NO_SMT));
+}
+
+bool
+processor_active_thread_no_smt(processor_t processor)
+{
+ return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
+}
+
+#if __arm64__
+
+/*
+ * Set up or replace old timer with new timer
+ *
+ * Returns true if canceled old timer, false if it did not
+ */
+boolean_t
+sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
+{
+ /*
+ * Exchange deadline for new deadline, if old deadline was nonzero,
+ * then I cancelled the callback, otherwise I didn't
+ */
+
+ return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
+ relaxed) != 0;
+}
+
+#endif /* __arm64__ */
+
+#if CONFIG_SCHED_EDGE
+
+#define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
+
+/*
+ * sched_edge_pset_running_higher_bucket()
+ *
+ * Routine to calculate cumulative running counts for each scheduling
+ * bucket. This effectively lets the load calculation calculate if a
+ * cluster is running any threads at a QoS lower than the thread being
+ * migrated etc.
+ */
+
+static void
+sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
+{
+ bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
+
+ /* Edge Scheduler Optimization */
+ for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
+ sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
+ for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
+ running_higher[bucket]++;
+ }
+ }
+}
+
+/*
+ * sched_update_pset_load_average()
+ *
+ * Updates the load average for each sched bucket for a cluster.
+ * This routine must be called with the pset lock held.
+ */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	if (pset->online_processor_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* Another CPU already published a newer update; skip this one. */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	/* Clamp so the 32.0 * 32-bit multiply below cannot overflow 64 bits. */
	if (__improbable(delta_nsecs > UINT32_MAX)) {
		delta_nsecs = UINT32_MAX;
	}

	/* Cumulative running-thread counts per bucket (see helper above). */
	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		/* Per-CPU depth: runnable (clutch + RT) plus running work at this bucket or lower. */
		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / pset->online_processor_count;

		/*
		 * For the new load average multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			/* Zero <-> non-zero transition: snap to the instantaneous value. */
			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
+
/*
 * sched_update_pset_avg_execution_time()
 *
 * Fold a thread's execution time (in abstime units) into the cluster's
 * per-bucket EWMA of average thread execution time (stored in usecs),
 * using a lock-free atomic read-modify-write loop on the packed
 * {average, last-update-timestamp} value.
 */
void
sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
{
	pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
	uint64_t avg_thread_execution_time = 0;

	os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
	    old_execution_time_packed.pset_execution_time_packed,
	    new_execution_time_packed.pset_execution_time_packed, relaxed, {
		uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
		int64_t delta_ticks = curtime - last_update;
		if (delta_ticks < 0) {
		        /*
		         * Its possible that another CPU came in and updated the pset_execution_time
		         * before this CPU could do it. Since the average execution time is meant to
		         * be an approximate measure per cluster, ignore the older update.
		         */
		        os_atomic_rmw_loop_give_up(return );
		}
		uint64_t delta_nsecs = 0;
		absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

		uint64_t nanotime = 0;
		absolutetime_to_nanoseconds(execution_time, &nanotime);
		uint64_t execution_time_us = nanotime / NSEC_PER_USEC;

		/* EWMA with time constant SCHED_PSET_LOAD_EWMA_TC_NSECS, weighted by elapsed time. */
		uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
		uint64_t new_execution_time = (execution_time_us * delta_nsecs);

		avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
		new_execution_time_packed.pset_execution_time_last_update = curtime;
	});
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
}
+
+#else /* CONFIG_SCHED_EDGE */
+
+void
+sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
+{
+ int non_rt_load = pset->pset_runq.count;
+ int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
+ int new_load_average = ((int)pset->load_average + load) >> 1;
+
+ pset->load_average = new_load_average;
+#if (DEVELOPMENT || DEBUG)
+#if __AMP__
+ if (pset->pset_cluster_type == PSET_AMP_P) {
+ KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
+ }
+#endif
+#endif
+}
+
/*
 * sched_update_pset_avg_execution_time() — default (non-Edge) variant.
 *
 * Intentional no-op: per-cluster execution-time tracking is only
 * maintained when CONFIG_SCHED_EDGE is enabled.
 */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
+#endif /* CONFIG_SCHED_EDGE */
+
+/* pset is locked */
+static bool
+processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
+{
+ int cpuid = processor->cpu_id;
+#if defined(__x86_64__)
+ if (sched_avoid_cpu0 && (cpuid == 0)) {
+ return false;
+ }
+#endif
+
+ cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
+
+ return bit_test(fasttrack_map, cpuid);
+}
+
/*
 * choose_processor_for_realtime_thread()
 *
 * Pick a processor in this pset to run a realtime thread, preferring
 * idle primaries, then (optionally) SMT secondaries, then any primary
 * not already running realtime work.
 *
 * pset is locked.  skip_processor, if non-NULL, is excluded from
 * consideration.  Returns PROCESSOR_NULL when no suitable CPU exists.
 */
static processor_t
choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries)
{
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif

	/* Candidates: available, no urgent AST pending, not already running RT. */
	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}

	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/*
		 * Rotate the map right by one so cpu0 lands in the highest bit,
		 * making lsb_first() prefer every other CPU over cpu0.
		 */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the real cpu id. */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	/* Consider secondary processors */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		/* Undo the two-bit rotation to recover the real cpu id. */
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	/* Same fallback, but now counting secondaries as well. */
	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
+
+/* pset is locked */
+static bool
+all_available_primaries_are_running_realtime_threads(processor_set_t pset)
+{
+ cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
+ return rt_runq_count(pset) > bit_count(cpu_map);
+}
+
+#if defined(__x86_64__)
+/* pset is locked */
+static bool
+these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map)
+{
+ cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
+ return rt_runq_count(pset) > bit_count(cpu_map);
+}
+#endif
+
+static bool
+sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor)
+{
+ bool ok_to_run_realtime_thread = true;
+#if defined(__x86_64__)
+ if (sched_avoid_cpu0 && processor->cpu_id == 0) {
+ ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1);
+ } else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
+ ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2);
+ } else if (processor->processor_primary != processor) {
+ ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset));
+ }
+#else
+ (void)pset;
+ (void)processor;
+#endif
+ return ok_to_run_realtime_thread;
+}
+
/*
 * sched_pset_made_schedulable()
 *
 * Default pset_made_schedulable hook: nothing to do beyond optionally
 * dropping the pset lock on the caller's behalf when drop_lock is set.
 */
void
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
{
	if (drop_lock) {
		pset_unlock(pset);
	}
}
+
+void
+thread_set_no_smt(bool set)
+{
+ if (!system_is_SMT) {
+ /* Not a machine that supports SMT */
+ return;
+ }
+
+ thread_t thread = current_thread();
+
+ spl_t s = splsched();