X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/fe8ab488e9161c46dd9885d58fc52996dc0249ff..a39ff7e25e19b3a8c3020042a3872ca9ec9659f1:/osfmk/kern/sched_multiq.c

diff --git a/osfmk/kern/sched_multiq.c b/osfmk/kern/sched_multiq.c
index 63519c677..fd945ba1c 100644
--- a/osfmk/kern/sched_multiq.c
+++ b/osfmk/kern/sched_multiq.c
@@ -119,7 +119,7 @@
  * or can we get away with putting the entry in either one or the other pset?
  *
  * Consider the right way to handle runq count - I don't want to iterate groups.
- * Perhaps keep a global counter. sched_run_count will not work.
+ * Perhaps keep a global counter.
  * Alternate option - remove it from choose_processor. It doesn't add much value
  * now that we have global runq.
  *
@@ -175,7 +175,7 @@
 #endif
 
 typedef struct sched_entry {
-	queue_chain_t	links;
+	queue_chain_t	entry_links;
 	int16_t		sched_pri;	/* scheduled (current) priority */
 	int16_t		runq;
 	int32_t		pad;
@@ -194,9 +194,6 @@ struct sched_group {
 	queue_chain_t	sched_groups;
 };
 
-/* TODO: Turn this into an attribute in the sched dispatch struct */
-boolean_t	sched_groups_enabled = FALSE;
-
 /*
  * Keep entry on the head of the runqueue while dequeueing threads.
  * Only cycle it to the end of the runqueue when a thread in the task
@@ -204,11 +201,6 @@ boolean_t sched_groups_enabled = FALSE;
  */
 static boolean_t	deep_drain = FALSE;
 
-/*
- * Don't favor the task when an urgent thread is present.
- */
-static boolean_t	drain_urgent_first = TRUE;
-
 /* Verify the consistency of the runq before touching it */
 static boolean_t	multiq_sanity_check = FALSE;
 
@@ -226,6 +218,11 @@ static integer_t drain_band_limit;
 #define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
 static integer_t	drain_depth_limit;
 
+/*
+ * Don't favor the task when there's something above this priority in another task.
+ */
+#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
+static integer_t	drain_ceiling;
 
 static struct zone	*sched_group_zone;
 
@@ -246,7 +243,7 @@ static thread_t
 sched_multiq_steal_thread(processor_set_t pset);
 
 static void
-sched_multiq_thread_update_scan(void);
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
 
 static boolean_t
 sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
@@ -290,18 +287,20 @@ sched_multiq_processor_queue_shutdown(processor_t processor);
 static sched_mode_t
 sched_multiq_initial_thread_sched_mode(task_t parent_task);
 
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor);
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
 
 const struct sched_dispatch_table sched_multiq_dispatch = {
+	.sched_name = "multiq",
 	.init = sched_multiq_init,
-	.timebase_init = sched_traditional_timebase_init,
+	.timebase_init = sched_timeshare_timebase_init,
 	.processor_init = sched_multiq_processor_init,
 	.pset_init = sched_multiq_pset_init,
-	.maintenance_continuation = sched_traditional_maintenance_continue,
+	.maintenance_continuation = sched_timeshare_maintenance_continue,
 	.choose_thread = sched_multiq_choose_thread,
+	.steal_thread_enabled = FALSE,
 	.steal_thread = sched_multiq_steal_thread,
-	.compute_priority = compute_priority,
+	.compute_timeshare_priority = sched_compute_timeshare_priority,
 	.choose_processor = choose_processor,
 	.processor_enqueue = sched_multiq_processor_enqueue,
 	.processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
@@ -310,39 +309,48 @@ const struct sched_dispatch_table sched_multiq_dispatch = {
 	.priority_is_urgent = priority_is_urgent,
 	.processor_csw_check = sched_multiq_processor_csw_check,
 	.processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
-	.initial_quantum_size = sched_traditional_initial_quantum_size,
+	.initial_quantum_size = sched_timeshare_initial_quantum_size,
 	.initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
 	.can_update_priority = can_update_priority,
 	.update_priority = update_priority,
 	.lightweight_update_priority = lightweight_update_priority,
 	.quantum_expire = sched_multiq_quantum_expire,
-	.should_current_thread_rechoose_processor = sched_multiq_should_current_thread_rechoose_processor,
 	.processor_runq_count = sched_multiq_runq_count,
 	.processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
-	.fairshare_init = sched_traditional_fairshare_init,
-	.fairshare_runq_count = sched_traditional_fairshare_runq_count,
-	.fairshare_runq_stats_count_sum = sched_traditional_fairshare_runq_stats_count_sum,
-	.fairshare_enqueue = sched_traditional_fairshare_enqueue,
-	.fairshare_dequeue = sched_traditional_fairshare_dequeue,
-	.fairshare_queue_remove = sched_traditional_fairshare_queue_remove,
 	.processor_bound_count = sched_multiq_processor_bound_count,
 	.thread_update_scan = sched_multiq_thread_update_scan,
 	.direct_dispatch_to_idle_processors = FALSE,
+	.multiple_psets_enabled = FALSE,
+	.sched_groups_enabled = TRUE,
+	.avoid_processor_enabled = TRUE,
+	.thread_avoid_processor = sched_multiq_thread_avoid_processor,
+	.processor_balance = sched_SMT_balance,
+
+	.rt_runq = sched_rtglobal_runq,
+	.rt_init = sched_rtglobal_init,
+	.rt_queue_shutdown = sched_rtglobal_queue_shutdown,
+	.rt_runq_scan = sched_rtglobal_runq_scan,
+	.rt_runq_count_sum = sched_rtglobal_runq_count_sum,
+
+	.qos_max_parallelism = sched_qos_max_parallelism,
+	.check_spill = sched_check_spill,
+	.ipi_policy = sched_ipi_policy,
+	.thread_should_yield = sched_thread_should_yield,
 };
 
 static void
 sched_multiq_init(void)
 {
-	sched_groups_enabled = TRUE;
-
 #if defined(MULTIQ_SANITY_CHECK)
 	PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
 #endif
 
 	PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
 
-	PE_parse_boot_argn("multiq_drain_urgent_first", &drain_urgent_first, sizeof(drain_urgent_first));
+	if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
+		drain_ceiling = DEFAULT_DRAIN_CEILING;
+	}
 
 	if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
 		drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
@@ -352,8 +360,8 @@ sched_multiq_init(void)
 		drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
 	}
 
-	printf("multiq scheduler config: deep-drain %d, urgent first %d, depth limit %d, band limit %d, sanity check %d\n",
-	       deep_drain, drain_urgent_first, drain_depth_limit, drain_band_limit, multiq_sanity_check);
+	printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
+	       deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
 
 	sched_group_zone = zinit(
 	                         sizeof(struct sched_group),
@@ -371,7 +379,7 @@ sched_multiq_init(void)
 	lck_attr_setdefault(&sched_groups_lock_attr);
 	lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
 
-	sched_traditional_init();
+	sched_timeshare_init();
 }
 
 static void
@@ -400,7 +408,7 @@ sched_group_create(void)
 {
 	sched_group_t	sched_group;
 
-	if (!sched_groups_enabled)
+	if (!SCHED(sched_groups_enabled))
 		return SCHED_GROUP_NULL;
 
 	sched_group = (sched_group_t)zalloc(sched_group_zone);
@@ -425,7 +433,7 @@ sched_group_create(void)
 void
 sched_group_destroy(sched_group_t sched_group)
 {
-	if (!sched_groups_enabled) {
+	if (!SCHED(sched_groups_enabled)) {
 		assert(sched_group == SCHED_GROUP_NULL);
 		return;
 	}
@@ -471,7 +479,10 @@ __attribute__((always_inline))
 static inline sched_group_t
 group_for_entry(sched_entry_t entry)
 {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-align"
 	sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
+#pragma clang diagnostic pop
 	return group;
 }
 
@@ -481,9 +492,9 @@ entry_queue_first_entry(entry_queue_t rq)
 {
 	assert(rq->count != 0);
 
-	queue_t queue = rq->queues + rq->highq;
+	queue_t queue = &rq->queues[rq->highq];
 
-	sched_entry_t entry = (sched_entry_t)queue_first(queue);
+	sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
 
 	assert(entry->sched_pri == rq->highq);
 
@@ -492,6 +503,7 @@ entry_queue_first_entry(entry_queue_t rq)
 
 #if defined(MULTIQ_SANITY_CHECK)
 
+#if MACH_ASSERT
 __attribute__((always_inline))
 static inline boolean_t
 queue_chain_linked(queue_chain_t* chain)
@@ -504,6 +516,7 @@ queue_chain_linked(queue_chain_t* chain)
 		return FALSE;
 	}
 }
+#endif /* MACH_ASSERT */
 
 static thread_t
 group_first_thread(sched_group_t group)
@@ -512,11 +525,12 @@ group_first_thread(sched_group_t group)
 
 	assert(rq->count != 0);
 
-	queue_t queue = rq->queues + rq->highq;
+	queue_t queue = &rq->queues[rq->highq];
 
-	thread_t thread = (thread_t)(void*)queue_first(queue);
+	thread_t thread = qe_queue_first(queue, struct thread, runq_links);
 
 	assert(thread != THREAD_NULL);
+	assert_thread_magic(thread);
 
 	assert(thread->sched_group == group);
 
@@ -533,12 +547,12 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr
 	queue_t q;
 	sched_entry_t elem;
 
-	assert(queue_chain_linked(&entry->links));
+	assert(queue_chain_linked(&entry->entry_links));
 	assert(entry->runq == MULTIQ_ERUNQ);
 
 	q = &runq->queues[expected_pri];
 
-	queue_iterate(q, elem, sched_entry_t, links) {
+	qe_foreach_element(elem, q, entry_links) {
 		if (elem == entry)
 			return;
 	}
@@ -558,7 +572,7 @@ sched_group_check_thread(sched_group_t group, thread_t thread)
 
 	q = &group->runq.queues[pri];
 
-	queue_iterate(q, elem, thread_t, links) {
+	qe_foreach_element(elem, q, runq_links) {
 		if (elem == thread)
 			return;
 	}
@@ -615,12 +629,12 @@ static sched_entry_t
 entry_queue_dequeue_entry(entry_queue_t rq)
 {
 	sched_entry_t	sched_entry;
-	queue_t		queue = rq->queues + rq->highq;
+	queue_t		queue = &rq->queues[rq->highq];
 
 	assert(rq->count > 0);
 	assert(!queue_empty(queue));
 
-	sched_entry = (sched_entry_t)dequeue_head(queue);
+	sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
 
 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 	rq->count--;
@@ -628,9 +642,8 @@ entry_queue_dequeue_entry(entry_queue_t rq)
 		rq->urgency--; assert(rq->urgency >= 0);
 	}
 	if (queue_empty(queue)) {
-		if (rq->highq != IDLEPRI)
-			clrbit(MAXPRI - rq->highq, rq->bitmap);
-		rq->highq = MAXPRI - ffsbit(rq->bitmap);
+		rq_bitmap_clear(rq->bitmap, rq->highq);
+		rq->highq = bitmap_first(rq->bitmap, NRQS);
 	}
 
 	sched_entry->runq = 0;
@@ -648,24 +661,24 @@ entry_queue_enqueue_entry(
                           integer_t     options)
 {
 	int		sched_pri = entry->sched_pri;
-	queue_t		queue = rq->queues + sched_pri;
+	queue_t		queue = &rq->queues[sched_pri];
 	boolean_t	result = FALSE;
 
 	assert(entry->runq == 0);
 
 	if (queue_empty(queue)) {
-		enqueue_tail(queue, (queue_entry_t)entry);
+		enqueue_tail(queue, &entry->entry_links);
 
-		setbit(MAXPRI - sched_pri, rq->bitmap);
+		rq_bitmap_set(rq->bitmap, sched_pri);
 		if (sched_pri > rq->highq) {
 			rq->highq = sched_pri;
 			result = TRUE;
 		}
 	} else {
 		if (options & SCHED_TAILQ)
-			enqueue_tail(queue, (queue_entry_t)entry);
+			enqueue_tail(queue, &entry->entry_links);
 		else
-			enqueue_head(queue, (queue_entry_t)entry);
+			enqueue_head(queue, &entry->entry_links);
 	}
 	if (SCHED(priority_is_urgent)(sched_pri))
 		rq->urgency++;
@@ -693,7 +706,7 @@ entry_queue_remove_entry(
 	}
 #endif
 
-	remqueue((queue_entry_t)entry);
+	remqueue(&entry->entry_links);
 
 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 	rq->count--;
@@ -701,16 +714,35 @@ entry_queue_remove_entry(
 		rq->urgency--; assert(rq->urgency >= 0);
 	}
 
-	if (queue_empty(rq->queues + sched_pri)) {
+	if (queue_empty(&rq->queues[sched_pri])) {
 		/* update run queue status */
-		if (sched_pri != IDLEPRI)
-			clrbit(MAXPRI - sched_pri, rq->bitmap);
-		rq->highq = MAXPRI - ffsbit(rq->bitmap);
+		rq_bitmap_clear(rq->bitmap, sched_pri);
+		rq->highq = bitmap_first(rq->bitmap, NRQS);
 	}
 
 	entry->runq = 0;
 }
 
+static void
+entry_queue_change_entry(
+                         entry_queue_t rq,
+                         sched_entry_t entry,
+                         integer_t     options)
+{
+	int		sched_pri = entry->sched_pri;
+	queue_t		queue = &rq->queues[sched_pri];
+
+#if defined(MULTIQ_SANITY_CHECK)
+	if (multiq_sanity_check) {
+		entry_queue_check_entry(rq, entry, sched_pri);
+	}
+#endif
+
+	if (options & SCHED_TAILQ)
+		re_queue_tail(queue, &entry->entry_links);
+	else
+		re_queue_head(queue, &entry->entry_links);
+}
+
 /*
  * The run queue must not be empty.
  *
@@ -723,14 +755,15 @@ group_run_queue_dequeue_thread(
                                boolean_t     *queue_empty)
 {
 	thread_t	thread;
-	queue_t		queue = rq->queues + rq->highq;
+	queue_t		queue = &rq->queues[rq->highq];
 
 	assert(rq->count > 0);
 	assert(!queue_empty(queue));
 
 	*thread_pri = rq->highq;
 
-	thread = (thread_t)(void*)dequeue_head(queue);
+	thread = qe_dequeue_head(queue, struct thread, runq_links);
+	assert_thread_magic(thread);
 
 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 	rq->count--;
@@ -738,15 +771,14 @@ group_run_queue_dequeue_thread(
 		rq->urgency--; assert(rq->urgency >= 0);
 	}
 	if (queue_empty(queue)) {
-		if (rq->highq != IDLEPRI)
-			clrbit(MAXPRI - rq->highq, rq->bitmap);
-		rq->highq = MAXPRI - ffsbit(rq->bitmap);
+		rq_bitmap_clear(rq->bitmap, rq->highq);
+		rq->highq = bitmap_first(rq->bitmap, NRQS);
 		*queue_empty = TRUE;
 	} else {
 		*queue_empty = FALSE;
 	}
 
-	return (thread);
+	return thread;
 }
 
 /*
@@ -760,24 +792,25 @@ group_run_queue_enqueue_thread(
                                integer_t      thread_pri,
                                integer_t      options)
 {
-	queue_t		queue = rq->queues + thread_pri;
+	queue_t		queue = &rq->queues[thread_pri];
 	boolean_t	result = FALSE;
 
 	assert(thread->runq == PROCESSOR_NULL);
+	assert_thread_magic(thread);
 
 	if (queue_empty(queue)) {
-		enqueue_tail(queue, (queue_entry_t)thread);
+		enqueue_tail(queue, &thread->runq_links);
 
-		setbit(MAXPRI - thread_pri, rq->bitmap);
+		rq_bitmap_set(rq->bitmap, thread_pri);
 		if (thread_pri > rq->highq) {
 			rq->highq = thread_pri;
 		}
 		result = TRUE;
 	} else {
 		if (options & SCHED_TAILQ)
-			enqueue_tail(queue, (queue_entry_t)thread);
+			enqueue_tail(queue, &thread->runq_links);
 		else
-			enqueue_head(queue, (queue_entry_t)thread);
+			enqueue_head(queue, &thread->runq_links);
 	}
 	if (SCHED(priority_is_urgent)(thread_pri))
 		rq->urgency++;
@@ -799,9 +832,10 @@ group_run_queue_remove_thread(
 {
 	boolean_t	result = FALSE;
 
+	assert_thread_magic(thread);
 	assert(thread->runq != PROCESSOR_NULL);
 
-	remqueue((queue_entry_t)thread);
+	remqueue(&thread->runq_links);
 
 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 	rq->count--;
@@ -809,11 +843,10 @@ group_run_queue_remove_thread(
 		rq->urgency--; assert(rq->urgency >= 0);
 	}
 
-	if (queue_empty(rq->queues + thread_pri)) {
+	if (queue_empty(&rq->queues[thread_pri])) {
 		/* update run queue status */
-		if (thread_pri != IDLEPRI)
-			clrbit(MAXPRI - thread_pri, rq->bitmap);
-		rq->highq = MAXPRI - ffsbit(rq->bitmap);
+		rq_bitmap_clear(rq->bitmap, thread_pri);
+		rq->highq = bitmap_first(rq->bitmap, NRQS);
 
 		result = TRUE;
 	}
@@ -963,6 +996,9 @@ sched_group_enqueue_thread(
 		 * What effects would it have?
 		 */
 		entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
+	} else if (options & SCHED_HEADQ) {
+		/* The thread should be at the head of the line - move its entry to the front */
+		entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
 	}
 }
 
@@ -1033,33 +1069,44 @@ sched_multiq_choose_thread(
 	 * Should YIELD AST override drain limit?
 	 */
 	if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
-		boolean_t	drain_limit_hit = FALSE;
+		boolean_t favor_group = TRUE;
 
-		if (main_entryq->highq > group->runq.highq) {
+		integer_t global_pri = main_entryq->highq;
+		integer_t group_pri  = group->runq.highq;
+
+		/*
+		 * Favor the current group if the group is still the globally highest.
+		 *
+		 * Otherwise, consider choosing a thread from the current group
+		 * even if it's lower priority than the global highest priority.
+		 */
+		if (global_pri > group_pri) {
 			/*
 			 * If there's something elsewhere above the depth limit,
 			 * don't pick a thread below the limit.
 			 */
-			if (main_entryq->highq > drain_depth_limit &&
-			    group->runq.highq <= drain_depth_limit)
-				drain_limit_hit = TRUE;
+			if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
+				favor_group = FALSE;
 
 			/*
-			 * Don't go more than X steps below the global highest
+			 * If there's something at or above the ceiling,
+			 * don't favor the group.
 			 */
-			if ((main_entryq->highq - group->runq.highq) >= drain_band_limit)
-				drain_limit_hit = TRUE;
+			if (global_pri >= drain_ceiling)
+				favor_group = FALSE;
 
-			/* Don't favor the task when an urgent thread is present. */
-			if (drain_urgent_first && main_entryq->urgency > 0)
-				drain_limit_hit = TRUE;
+			/*
+			 * Don't go more than X steps below the global highest
+			 */
+			if ((global_pri - group_pri) >= drain_band_limit)
+				favor_group = FALSE;
 		}
 
-		if (!drain_limit_hit) {
+		if (favor_group) {
 			/* Pull from local runq */
 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
-			    MACH_MULTIQ_GROUP, main_entryq->highq, group->runq.highq, 0, 0);
+			    MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
 
 			return sched_group_dequeue_thread(main_entryq, group);
 		}
@@ -1133,8 +1180,7 @@ sched_multiq_quantum_expire(thread_t thread)
 	sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
 
 	if (entry->runq == MULTIQ_ERUNQ) {
-		entry_queue_remove_entry(entryq, entry);
-		entry_queue_enqueue_entry(entryq, entry, SCHED_TAILQ);
+		entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
 	}
 
 	pset_unlock(pset);
@@ -1154,14 +1200,18 @@ sched_multiq_processor_csw_check(processor_t processor)
 	boolean_t	has_higher;
 	int		pri;
 
+	if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
+		return (AST_PREEMPT | AST_URGENT);
+	}
+
 	entry_queue_t main_entryq = multiq_main_entryq(processor);
-	run_queue_t bound_runq = multiq_bound_runq(processor);
+	run_queue_t   bound_runq  = multiq_bound_runq(processor);
 
 	assert(processor->active_thread != NULL);
 
 	pri = MAX(main_entryq->highq, bound_runq->highq);
 
-	if (first_timeslice(processor)) {
+	if (processor->first_timeslice) {
 		has_higher = (pri > processor->current_pri);
 	} else {
 		has_higher = (pri >= processor->current_pri);
 	}
@@ -1173,9 +1223,6 @@ sched_multiq_processor_csw_check(processor_t processor)
 
 		if (bound_runq->urgency > 0)
 			return (AST_PREEMPT | AST_URGENT);
-
-		if (processor->active_thread && thread_eager_preemption(processor->active_thread))
-			return (AST_PREEMPT | AST_URGENT);
 
 		return AST_PREEMPT;
 	}
@@ -1189,7 +1236,10 @@ sched_multiq_processor_queue_has_priority(
                                           int        priority,
                                           boolean_t  gte)
 {
-	int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);
+	run_queue_t main_runq  = multiq_main_entryq(processor);
+	run_queue_t bound_runq = multiq_bound_runq(processor);
+
+	int qpri = MAX(main_runq->highq, bound_runq->highq);
 
 	if (gte)
 		return qpri >= priority;
@@ -1197,12 +1247,6 @@ sched_multiq_processor_queue_has_priority(
 		return qpri > priority;
 }
 
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor)
-{
-	return (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
-}
-
 static int
 sched_multiq_runq_count(processor_t processor)
 {
@@ -1260,12 +1304,15 @@ sched_multiq_processor_queue_shutdown(processor_t processor)
 
 	while (main_entryq->count > 0) {
 		thread = sched_global_dequeue_thread(main_entryq);
-		enqueue_tail(&tqueue, (queue_entry_t)thread);
+		enqueue_tail(&tqueue, &thread->runq_links);
 	}
 
 	pset_unlock(pset);
 
-	while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
+	qe_foreach_element_safe(thread, &tqueue, runq_links) {
+
+		remqueue(&thread->runq_links);
+
 		thread_lock(thread);
 
 		thread_setrun(thread, SCHED_TAILQ);
@@ -1286,7 +1333,6 @@ sched_multiq_processor_queue_remove(
                                     thread_t thread)
 {
 	boolean_t removed = FALSE;
-
 	processor_set_t pset = processor->processor_set;
 
 	pset_lock(pset);
@@ -1329,27 +1375,36 @@ sched_multiq_steal_thread(processor_set_t pset)
  * Scan the global queue for candidate groups, and scan those groups for
  * candidate threads.
  *
+ * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
+ *       Instead, iterate only the queue in the group runq matching the priority of the entry.
+ *
  * Returns TRUE if retry is needed.
  */
 static boolean_t
-group_scan(entry_queue_t runq) {
-	int		count;
-	queue_t		q;
-	sched_group_t	group;
-	sched_entry_t	entry;
-
-	if ((count = runq->count) > 0) {
-		q = runq->queues + runq->highq;
-		while (count > 0) {
-			queue_iterate(q, entry, sched_entry_t, links) {
-				group = group_for_entry(entry);
-				if (group->runq.count > 0) {
-					if (runq_scan(&group->runq))
-						return (TRUE);
-				}
-				count--;
+group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
+	int count = runq->count;
+	int queue_index;
+
+	assert(count >= 0);
+
+	if (count == 0)
+		return FALSE;
+
+	for (queue_index = bitmap_first(runq->bitmap, NRQS);
+	     queue_index >= 0;
+	     queue_index = bitmap_next(runq->bitmap, queue_index)) {
+
+		sched_entry_t entry;
+
+		qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
+			assert(count > 0);
+
+			sched_group_t group = group_for_entry(entry);
+			if (group->runq.count > 0) {
+				if (runq_scan(&group->runq, scan_context))
+					return (TRUE);
 			}
-			q--;
+			count--;
 		}
 	}
 
@@ -1357,7 +1412,7 @@ group_scan(entry_queue_t runq) {
 }
 
 static void
-sched_multiq_thread_update_scan(void)
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
 {
 	boolean_t	restart_needed = FALSE;
 	processor_t	processor = processor_list;
@@ -1377,7 +1432,7 @@ sched_multiq_thread_update_scan(void)
 			s = splsched();
 			pset_lock(pset);
 
-			restart_needed = runq_scan(multiq_bound_runq(processor));
+			restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
 
 			pset_unlock(pset);
 			splx(s);
@@ -1406,7 +1461,7 @@ sched_multiq_thread_update_scan(void)
 			s = splsched();
 			pset_lock(pset);
 
-			restart_needed = group_scan(&pset->pset_runq);
+			restart_needed = group_scan(&pset->pset_runq, scan_context);
 
 			pset_unlock(pset);
 			splx(s);
@@ -1421,4 +1476,21 @@ sched_multiq_thread_update_scan(void)
 	} while (restart_needed);
 }
 
+extern int sched_allow_rt_smt;
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+	if (processor->processor_primary != processor) {
+		/*
+		 * This is a secondary SMT processor. If the primary is running
+		 * a realtime thread, only allow realtime threads on the secondary.
+		 */
+		if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
+			return true;
+		}
+	}
+
+	return false;
+}
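
/*
 * Illustrative sketch (not part of the diff above) of the group-favoring
 * drain policy that replaces drain_urgent_first in sched_multiq_choose_thread:
 * the current task's group is favored only while nothing in another task sits
 * above the ceiling, above the depth limit, or more than a band's width above
 * the group's highest runnable thread. The helper name and the SKETCH_*
 * constants below are hypothetical stand-ins for the kernel's drain_ceiling,
 * drain_depth_limit, and drain_band_limit globals, which sched_multiq_init()
 * fills from the multiq_drain_* boot-args shown in the diff.
 */
#include <stdbool.h>

#define SKETCH_DRAIN_CEILING      47   /* stand-in for BASEPRI_FOREGROUND default   */
#define SKETCH_DRAIN_DEPTH_LIMIT   4   /* stand-in for MAXPRI_THROTTLE default      */
#define SKETCH_DRAIN_BAND_LIMIT  127   /* stand-in for the band-limit default       */

bool
sketch_should_favor_group(int global_pri, int group_pri)
{
	bool favor_group = true;

	if (global_pri > group_pri) {
		/* Something elsewhere is above the depth limit while the group is below it. */
		if (global_pri > SKETCH_DRAIN_DEPTH_LIMIT && group_pri <= SKETCH_DRAIN_DEPTH_LIMIT)
			favor_group = false;

		/* Something elsewhere is at or above the ceiling. */
		if (global_pri >= SKETCH_DRAIN_CEILING)
			favor_group = false;

		/* The group has fallen too far below the global highest priority. */
		if ((global_pri - group_pri) >= SKETCH_DRAIN_BAND_LIMIT)
			favor_group = false;
	}

	return favor_group;
}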