 * or can we get away with putting the entry in just one pset or the other?
*
* Consider the right way to handle runq count - I don't want to iterate groups.
- * Perhaps keep a global counter. sched_run_count will not work.
+ * Perhaps keep a global counter.
* Alternate option - remove it from choose_processor. It doesn't add much value
* now that we have global runq.
*
#endif
typedef struct sched_entry {
- queue_chain_t links;
+ queue_chain_t entry_links;
int16_t sched_pri; /* scheduled (current) priority */
int16_t runq;
int32_t pad;
queue_chain_t sched_groups;
};
-/* TODO: Turn this into an attribute in the sched dispatch struct */
-boolean_t sched_groups_enabled = FALSE;
-
/*
* Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * hits its quantum expiration.
*/
static boolean_t deep_drain = FALSE;
-/*
- * Don't favor the task when an urgent thread is present.
- */
-static boolean_t drain_urgent_first = TRUE;
-
/* Verify the consistency of the runq before touching it */
static boolean_t multiq_sanity_check = FALSE;
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t drain_depth_limit;
+/*
+ * Don't favor the task when there's something at or above this priority in another task.
+ */
+#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
+static integer_t drain_ceiling;
static struct zone *sched_group_zone;
sched_multiq_steal_thread(processor_set_t pset);
static void
-sched_multiq_thread_update_scan(void);
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor);
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
const struct sched_dispatch_table sched_multiq_dispatch = {
+ .sched_name = "multiq",
.init = sched_multiq_init,
- .timebase_init = sched_traditional_timebase_init,
+ .timebase_init = sched_timeshare_timebase_init,
.processor_init = sched_multiq_processor_init,
.pset_init = sched_multiq_pset_init,
- .maintenance_continuation = sched_traditional_maintenance_continue,
+ .maintenance_continuation = sched_timeshare_maintenance_continue,
.choose_thread = sched_multiq_choose_thread,
+ .steal_thread_enabled = FALSE,
.steal_thread = sched_multiq_steal_thread,
- .compute_priority = compute_priority,
+ .compute_timeshare_priority = sched_compute_timeshare_priority,
.choose_processor = choose_processor,
.processor_enqueue = sched_multiq_processor_enqueue,
.processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
.priority_is_urgent = priority_is_urgent,
.processor_csw_check = sched_multiq_processor_csw_check,
.processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
- .initial_quantum_size = sched_traditional_initial_quantum_size,
+ .initial_quantum_size = sched_timeshare_initial_quantum_size,
.initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
.can_update_priority = can_update_priority,
.update_priority = update_priority,
.lightweight_update_priority = lightweight_update_priority,
.quantum_expire = sched_multiq_quantum_expire,
- .should_current_thread_rechoose_processor = sched_multiq_should_current_thread_rechoose_processor,
.processor_runq_count = sched_multiq_runq_count,
.processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
- .fairshare_init = sched_traditional_fairshare_init,
- .fairshare_runq_count = sched_traditional_fairshare_runq_count,
- .fairshare_runq_stats_count_sum = sched_traditional_fairshare_runq_stats_count_sum,
- .fairshare_enqueue = sched_traditional_fairshare_enqueue,
- .fairshare_dequeue = sched_traditional_fairshare_dequeue,
- .fairshare_queue_remove = sched_traditional_fairshare_queue_remove,
.processor_bound_count = sched_multiq_processor_bound_count,
.thread_update_scan = sched_multiq_thread_update_scan,
.direct_dispatch_to_idle_processors = FALSE,
+ .multiple_psets_enabled = FALSE,
+ .sched_groups_enabled = TRUE,
+ .avoid_processor_enabled = TRUE,
+ .thread_avoid_processor = sched_multiq_thread_avoid_processor,
+ .processor_balance = sched_SMT_balance,
+
+ .rt_runq = sched_rtglobal_runq,
+ .rt_init = sched_rtglobal_init,
+ .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
+ .rt_runq_scan = sched_rtglobal_runq_scan,
+ .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
+
+ .qos_max_parallelism = sched_qos_max_parallelism,
+ .check_spill = sched_check_spill,
+ .ipi_policy = sched_ipi_policy,
+ .thread_should_yield = sched_thread_should_yield,
};
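+/*
+ * multiq reuses the global realtime run queue (sched_rtglobal_*) and the
+ * common QoS / spill / IPI / yield hooks rather than providing policy-specific ones.
+ */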
static void
sched_multiq_init(void)
{
- sched_groups_enabled = TRUE;
-
#if defined(MULTIQ_SANITY_CHECK)
PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif
PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
- PE_parse_boot_argn("multiq_drain_urgent_first", &drain_urgent_first, sizeof(drain_urgent_first));
+ if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
+ drain_ceiling = DEFAULT_DRAIN_CEILING;
+ }
if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
}
- printf("multiq scheduler config: deep-drain %d, urgent first %d, depth limit %d, band limit %d, sanity check %d\n",
- deep_drain, drain_urgent_first, drain_depth_limit, drain_band_limit, multiq_sanity_check);
+ printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
+ deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
sched_group_zone = zinit(
sizeof(struct sched_group),
lck_attr_setdefault(&sched_groups_lock_attr);
lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
- sched_traditional_init();
+ sched_timeshare_init();
}
static void
{
sched_group_t sched_group;
- if (!sched_groups_enabled)
+ if (!SCHED(sched_groups_enabled))
return SCHED_GROUP_NULL;
sched_group = (sched_group_t)zalloc(sched_group_zone);
void
sched_group_destroy(sched_group_t sched_group)
{
- if (!sched_groups_enabled) {
+ if (!SCHED(sched_groups_enabled)) {
assert(sched_group == SCHED_GROUP_NULL);
return;
}
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
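+	/*
+	 * Inverse of group_entry_for_pri(): relies on entries[] being the first
+	 * member of struct sched_group and on entries[i].sched_pri == i, so
+	 * stepping back sched_pri entries from this entry yields the owning group.
+	 */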
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-align"
sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
+#pragma clang diagnostic pop
return group;
}
{
assert(rq->count != 0);
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
- sched_entry_t entry = (sched_entry_t)queue_first(queue);
+ sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
assert(entry->sched_pri == rq->highq);
#if defined(MULTIQ_SANITY_CHECK)
+#if MACH_ASSERT
__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
return FALSE;
}
}
+#endif /* MACH_ASSERT */
static thread_t
group_first_thread(sched_group_t group)
assert(rq->count != 0);
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
- thread_t thread = (thread_t)(void*)queue_first(queue);
+ thread_t thread = qe_queue_first(queue, struct thread, runq_links);
assert(thread != THREAD_NULL);
+ assert_thread_magic(thread);
assert(thread->sched_group == group);
queue_t q;
sched_entry_t elem;
- assert(queue_chain_linked(&entry->links));
+ assert(queue_chain_linked(&entry->entry_links));
assert(entry->runq == MULTIQ_ERUNQ);
q = &runq->queues[expected_pri];
- queue_iterate(q, elem, sched_entry_t, links) {
+ qe_foreach_element(elem, q, entry_links) {
if (elem == entry)
return;
}
q = &group->runq.queues[pri];
- queue_iterate(q, elem, thread_t, links) {
+ qe_foreach_element(elem, q, runq_links) {
if (elem == thread)
return;
}
entry_queue_dequeue_entry(entry_queue_t rq)
{
sched_entry_t sched_entry;
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
assert(rq->count > 0);
assert(!queue_empty(queue));
- sched_entry = (sched_entry_t)dequeue_head(queue);
+ sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
if (queue_empty(queue)) {
- if (rq->highq != IDLEPRI)
- clrbit(MAXPRI - rq->highq, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, rq->highq);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
}
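+	/* Mark the entry as off the entry runqueue (runq is MULTIQ_ERUNQ while enqueued). */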
sched_entry->runq = 0;
integer_t options)
{
int sched_pri = entry->sched_pri;
- queue_t queue = rq->queues + sched_pri;
+ queue_t queue = &rq->queues[sched_pri];
boolean_t result = FALSE;
assert(entry->runq == 0);
if (queue_empty(queue)) {
- enqueue_tail(queue, (queue_entry_t)entry);
+ enqueue_tail(queue, &entry->entry_links);
- setbit(MAXPRI - sched_pri, rq->bitmap);
+ rq_bitmap_set(rq->bitmap, sched_pri);
if (sched_pri > rq->highq) {
rq->highq = sched_pri;
result = TRUE;
}
} else {
if (options & SCHED_TAILQ)
- enqueue_tail(queue, (queue_entry_t)entry);
+ enqueue_tail(queue, &entry->entry_links);
else
- enqueue_head(queue, (queue_entry_t)entry);
+ enqueue_head(queue, &entry->entry_links);
}
if (SCHED(priority_is_urgent)(sched_pri))
rq->urgency++;
}
#endif
- remqueue((queue_entry_t)entry);
+ remqueue(&entry->entry_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
- if (queue_empty(rq->queues + sched_pri)) {
+ if (queue_empty(&rq->queues[sched_pri])) {
/* update run queue status */
- if (sched_pri != IDLEPRI)
- clrbit(MAXPRI - sched_pri, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, sched_pri);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
}
entry->runq = 0;
}
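+/*
+ * Move an entry that is already on the entry runqueue to the head or tail of
+ * its priority bucket, without touching the count, urgency, or bitmap.
+ */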
+static void
+entry_queue_change_entry(
+ entry_queue_t rq,
+ sched_entry_t entry,
+ integer_t options)
+{
+ int sched_pri = entry->sched_pri;
+ queue_t queue = &rq->queues[sched_pri];
+
+#if defined(MULTIQ_SANITY_CHECK)
+ if (multiq_sanity_check) {
+ entry_queue_check_entry(rq, entry, sched_pri);
+ }
+#endif
+
+ if (options & SCHED_TAILQ)
+ re_queue_tail(queue, &entry->entry_links);
+ else
+ re_queue_head(queue, &entry->entry_links);
+}
/*
* The run queue must not be empty.
*
boolean_t *queue_empty)
{
thread_t thread;
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
assert(rq->count > 0);
assert(!queue_empty(queue));
*thread_pri = rq->highq;
- thread = (thread_t)(void*)dequeue_head(queue);
+ thread = qe_dequeue_head(queue, struct thread, runq_links);
+ assert_thread_magic(thread);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
if (queue_empty(queue)) {
- if (rq->highq != IDLEPRI)
- clrbit(MAXPRI - rq->highq, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, rq->highq);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
*queue_empty = TRUE;
} else {
*queue_empty = FALSE;
}
- return (thread);
+ return thread;
}
/*
integer_t thread_pri,
integer_t options)
{
- queue_t queue = rq->queues + thread_pri;
+ queue_t queue = &rq->queues[thread_pri];
boolean_t result = FALSE;
assert(thread->runq == PROCESSOR_NULL);
+ assert_thread_magic(thread);
if (queue_empty(queue)) {
- enqueue_tail(queue, (queue_entry_t)thread);
+ enqueue_tail(queue, &thread->runq_links);
- setbit(MAXPRI - thread_pri, rq->bitmap);
+ rq_bitmap_set(rq->bitmap, thread_pri);
if (thread_pri > rq->highq) {
rq->highq = thread_pri;
}
result = TRUE;
} else {
if (options & SCHED_TAILQ)
- enqueue_tail(queue, (queue_entry_t)thread);
+ enqueue_tail(queue, &thread->runq_links);
else
- enqueue_head(queue, (queue_entry_t)thread);
+ enqueue_head(queue, &thread->runq_links);
}
if (SCHED(priority_is_urgent)(thread_pri))
rq->urgency++;
{
boolean_t result = FALSE;
+ assert_thread_magic(thread);
assert(thread->runq != PROCESSOR_NULL);
- remqueue((queue_entry_t)thread);
+ remqueue(&thread->runq_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
- if (queue_empty(rq->queues + thread_pri)) {
+ if (queue_empty(&rq->queues[thread_pri])) {
/* update run queue status */
- if (thread_pri != IDLEPRI)
- clrbit(MAXPRI - thread_pri, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, thread_pri);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
result = TRUE;
}
* What effects would it have?
*/
entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
+ } else if (options & SCHED_HEADQ) {
+ /* The thread should be at the head of the line - move its entry to the front */
+ entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
}
}
* Should YIELD AST override drain limit?
*/
if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
- boolean_t drain_limit_hit = FALSE;
+ boolean_t favor_group = TRUE;
- if (main_entryq->highq > group->runq.highq) {
+ integer_t global_pri = main_entryq->highq;
+ integer_t group_pri = group->runq.highq;
+
+ /*
+ * Favor the current group if the group is still the globally highest.
+ *
+ * Otherwise, consider choosing a thread from the current group
+ * even if it's lower priority than the global highest priority.
+ */
+ if (global_pri > group_pri) {
/*
* If there's something elsewhere above the depth limit,
* don't pick a thread below the limit.
*/
- if (main_entryq->highq > drain_depth_limit &&
- group->runq.highq <= drain_depth_limit)
- drain_limit_hit = TRUE;
+ if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
+ favor_group = FALSE;
/*
- * Don't go more than X steps below the global highest
+ * If there's something at or above the ceiling,
+ * don't favor the group.
*/
- if ((main_entryq->highq - group->runq.highq) >= drain_band_limit)
- drain_limit_hit = TRUE;
+ if (global_pri >= drain_ceiling)
+ favor_group = FALSE;
- /* Don't favor the task when an urgent thread is present. */
- if (drain_urgent_first && main_entryq->urgency > 0)
- drain_limit_hit = TRUE;
+ /*
+ * Don't go more than X steps below the global highest
+ */
+ if ((global_pri - group_pri) >= drain_band_limit)
+ favor_group = FALSE;
}
- if (!drain_limit_hit) {
+ if (favor_group) {
/* Pull from local runq */
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
- MACH_MULTIQ_GROUP, main_entryq->highq, group->runq.highq, 0, 0);
+ MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
return sched_group_dequeue_thread(main_entryq, group);
}
sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
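+	/*
+	 * If the group's entry is on the main entry runqueue, cycle it to the back
+	 * of its priority bucket so entries from other tasks at this priority get a turn.
+	 */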
if (entry->runq == MULTIQ_ERUNQ) {
- entry_queue_remove_entry(entryq, entry);
- entry_queue_enqueue_entry(entryq, entry, SCHED_TAILQ);
+ entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
}
pset_unlock(pset);
boolean_t has_higher;
int pri;
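+	/*
+	 * If the running thread should no longer stay on this processor (see
+	 * sched_multiq_thread_avoid_processor below), request an urgent
+	 * preemption so it can move to another processor.
+	 */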
+ if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
+ return (AST_PREEMPT | AST_URGENT);
+ }
+
entry_queue_t main_entryq = multiq_main_entryq(processor);
- run_queue_t bound_runq = multiq_bound_runq(processor);
+ run_queue_t bound_runq = multiq_bound_runq(processor);
assert(processor->active_thread != NULL);
pri = MAX(main_entryq->highq, bound_runq->highq);
- if (first_timeslice(processor)) {
+ if (processor->first_timeslice) {
has_higher = (pri > processor->current_pri);
} else {
has_higher = (pri >= processor->current_pri);
if (bound_runq->urgency > 0)
return (AST_PREEMPT | AST_URGENT);
-
- if (processor->active_thread && thread_eager_preemption(processor->active_thread))
- return (AST_PREEMPT | AST_URGENT);
return AST_PREEMPT;
}
int priority,
boolean_t gte)
{
- int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);
+ run_queue_t main_runq = multiq_main_entryq(processor);
+ run_queue_t bound_runq = multiq_bound_runq(processor);
+
+ int qpri = MAX(main_runq->highq, bound_runq->highq);
if (gte)
return qpri >= priority;
return qpri > priority;
}
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor)
-{
- return (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
-}
-
static int
sched_multiq_runq_count(processor_t processor)
{
while (main_entryq->count > 0) {
thread = sched_global_dequeue_thread(main_entryq);
- enqueue_tail(&tqueue, (queue_entry_t)thread);
+ enqueue_tail(&tqueue, &thread->runq_links);
}
pset_unlock(pset);
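+	/*
+	 * Set each drained thread running again; the _safe iterator allows
+	 * unlinking the current element as we go.
+	 */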
- while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
+ qe_foreach_element_safe(thread, &tqueue, runq_links) {
+
+ remqueue(&thread->runq_links);
+
thread_lock(thread);
thread_setrun(thread, SCHED_TAILQ);
thread_t thread)
{
boolean_t removed = FALSE;
-
processor_set_t pset = processor->processor_set;
pset_lock(pset);
* Scan the global queue for candidate groups, and scan those groups for
* candidate threads.
*
+ * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2).
+ * Instead, iterate only the queue in the group runq matching the priority of the entry.
+ *
* Returns TRUE if retry is needed.
*/
static boolean_t
-group_scan(entry_queue_t runq) {
- int count;
- queue_t q;
- sched_group_t group;
- sched_entry_t entry;
-
- if ((count = runq->count) > 0) {
- q = runq->queues + runq->highq;
- while (count > 0) {
- queue_iterate(q, entry, sched_entry_t, links) {
- group = group_for_entry(entry);
- if (group->runq.count > 0) {
- if (runq_scan(&group->runq))
- return (TRUE);
- }
- count--;
+group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
+ int count = runq->count;
+ int queue_index;
+
+ assert(count >= 0);
+
+ if (count == 0)
+ return FALSE;
+
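+	/* Walk the occupied priority levels, highest first, via the runq bitmap. */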
+ for (queue_index = bitmap_first(runq->bitmap, NRQS);
+ queue_index >= 0;
+ queue_index = bitmap_next(runq->bitmap, queue_index)) {
+
+ sched_entry_t entry;
+
+ qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
+ assert(count > 0);
+
+ sched_group_t group = group_for_entry(entry);
+ if (group->runq.count > 0) {
+ if (runq_scan(&group->runq, scan_context))
+ return (TRUE);
			}
			count--;
		}
-			q--;
-		}
	}

	return (FALSE);
}
static void
-sched_multiq_thread_update_scan(void)
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
{
boolean_t restart_needed = FALSE;
processor_t processor = processor_list;
s = splsched();
pset_lock(pset);
- restart_needed = runq_scan(multiq_bound_runq(processor));
+ restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
pset_unlock(pset);
splx(s);
s = splsched();
pset_lock(pset);
- restart_needed = group_scan(&pset->pset_runq);
+ restart_needed = group_scan(&pset->pset_runq, scan_context);
pset_unlock(pset);
splx(s);
} while (restart_needed);
}
+extern int sched_allow_rt_smt;
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+ if (processor->processor_primary != processor) {
+ /*
+ * This is a secondary SMT processor. If the primary is running
+ * a realtime thread, only allow realtime threads on the secondary.
+ */
+ if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
+ return true;
+ }
+ }
+
+ return false;
+}