diff --git a/osfmk/kern/sched_multiq.c b/osfmk/kern/sched_multiq.c
index 63519c677159f9e155dd86b280521257f19fee11..fd945ba1c8d9ef93b191d8f3cad09d95795400ce 100644
--- a/osfmk/kern/sched_multiq.c
+++ b/osfmk/kern/sched_multiq.c
  * or can we get away with putting the entry in either one or the other pset?
  *
  * Consider the right way to handle runq count - I don't want to iterate groups.
- * Perhaps keep a global counter.  sched_run_count will not work.
+ * Perhaps keep a global counter.
  * Alternate option - remove it from choose_processor. It doesn't add much value
  * now that we have global runq.
  *
 #endif
 
 typedef struct sched_entry {
-       queue_chain_t           links;
+       queue_chain_t           entry_links;
        int16_t                 sched_pri;      /* scheduled (current) priority */
        int16_t                 runq;
        int32_t                 pad;
@@ -194,9 +194,6 @@ struct sched_group {
        queue_chain_t           sched_groups;
 };
 
-/* TODO: Turn this into an attribute in the sched dispatch struct */
-boolean_t               sched_groups_enabled = FALSE;
-
 /*
  * Keep entry on the head of the runqueue while dequeueing threads.
  * Only cycle it to the end of the runqueue when a thread in the task
@@ -204,11 +201,6 @@ boolean_t               sched_groups_enabled = FALSE;
  */
 static boolean_t        deep_drain = FALSE;
 
-/*
- * Don't favor the task when an urgent thread is present.
- */
-static boolean_t        drain_urgent_first = TRUE;
-
 /* Verify the consistency of the runq before touching it */
 static boolean_t        multiq_sanity_check = FALSE;
 
@@ -226,6 +218,11 @@ static integer_t        drain_band_limit;
 #define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
 static integer_t        drain_depth_limit;
 
+/*
+ * Don't favor the task when there's something above this priority in another task.
+ */
+#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
+static integer_t        drain_ceiling;
 
 static struct zone      *sched_group_zone;
 
@@ -246,7 +243,7 @@ static thread_t
 sched_multiq_steal_thread(processor_set_t pset);
 
 static void
-sched_multiq_thread_update_scan(void);
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
 
 static boolean_t
 sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
@@ -290,18 +287,20 @@ sched_multiq_processor_queue_shutdown(processor_t processor);
 static sched_mode_t
 sched_multiq_initial_thread_sched_mode(task_t parent_task);
 
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor);
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
 
 const struct sched_dispatch_table sched_multiq_dispatch = {
+       .sched_name                                     = "multiq",
        .init                                           = sched_multiq_init,
-       .timebase_init                                  = sched_traditional_timebase_init,
+       .timebase_init                                  = sched_timeshare_timebase_init,
        .processor_init                                 = sched_multiq_processor_init,
        .pset_init                                      = sched_multiq_pset_init,
-       .maintenance_continuation                       = sched_traditional_maintenance_continue,
+       .maintenance_continuation                       = sched_timeshare_maintenance_continue,
        .choose_thread                                  = sched_multiq_choose_thread,
+       .steal_thread_enabled                           = FALSE,
        .steal_thread                                   = sched_multiq_steal_thread,
-       .compute_priority                               = compute_priority,
+       .compute_timeshare_priority                     = sched_compute_timeshare_priority,
        .choose_processor                               = choose_processor,
        .processor_enqueue                              = sched_multiq_processor_enqueue,
        .processor_queue_shutdown                       = sched_multiq_processor_queue_shutdown,
@@ -310,39 +309,48 @@ const struct sched_dispatch_table sched_multiq_dispatch = {
        .priority_is_urgent                             = priority_is_urgent,
        .processor_csw_check                            = sched_multiq_processor_csw_check,
        .processor_queue_has_priority                   = sched_multiq_processor_queue_has_priority,
-       .initial_quantum_size                           = sched_traditional_initial_quantum_size,
+       .initial_quantum_size                           = sched_timeshare_initial_quantum_size,
        .initial_thread_sched_mode                      = sched_multiq_initial_thread_sched_mode,
        .can_update_priority                            = can_update_priority,
        .update_priority                                = update_priority,
        .lightweight_update_priority                    = lightweight_update_priority,
        .quantum_expire                                 = sched_multiq_quantum_expire,
-       .should_current_thread_rechoose_processor       = sched_multiq_should_current_thread_rechoose_processor,
        .processor_runq_count                           = sched_multiq_runq_count,
        .processor_runq_stats_count_sum                 = sched_multiq_runq_stats_count_sum,
-       .fairshare_init                                 = sched_traditional_fairshare_init,
-       .fairshare_runq_count                           = sched_traditional_fairshare_runq_count,
-       .fairshare_runq_stats_count_sum                 = sched_traditional_fairshare_runq_stats_count_sum,
-       .fairshare_enqueue                              = sched_traditional_fairshare_enqueue,
-       .fairshare_dequeue                              = sched_traditional_fairshare_dequeue,
-       .fairshare_queue_remove                         = sched_traditional_fairshare_queue_remove,
        .processor_bound_count                          = sched_multiq_processor_bound_count,
        .thread_update_scan                             = sched_multiq_thread_update_scan,
        .direct_dispatch_to_idle_processors             = FALSE,
+       .multiple_psets_enabled                         = FALSE,
+       .sched_groups_enabled                           = TRUE,
+       .avoid_processor_enabled                        = TRUE,
+       .thread_avoid_processor                         = sched_multiq_thread_avoid_processor,
+       .processor_balance                              = sched_SMT_balance,
+
+       .rt_runq                                        = sched_rtglobal_runq,
+       .rt_init                                        = sched_rtglobal_init,
+       .rt_queue_shutdown                              = sched_rtglobal_queue_shutdown,
+       .rt_runq_scan                                   = sched_rtglobal_runq_scan,
+       .rt_runq_count_sum                              = sched_rtglobal_runq_count_sum,
+
+       .qos_max_parallelism                            = sched_qos_max_parallelism,
+       .check_spill                                    = sched_check_spill,
+       .ipi_policy                                     = sched_ipi_policy,
+       .thread_should_yield                            = sched_thread_should_yield,
 };
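
Callers reach these hooks through the SCHED() dispatch macro rather than directly; assuming the usual definition in osfmk/kern/sched_prim.h:

    #define SCHED(f) (sched_current_dispatch->f)

which is why the open-coded sched_groups_enabled checks below become SCHED(sched_groups_enabled).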
 
 
 static void
 sched_multiq_init(void)
 {
-       sched_groups_enabled = TRUE;
-
 #if defined(MULTIQ_SANITY_CHECK)
        PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
 #endif
 
        PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
 
-       PE_parse_boot_argn("multiq_drain_urgent_first", &drain_urgent_first, sizeof(drain_urgent_first));
+       if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
+               drain_ceiling = DEFAULT_DRAIN_CEILING;
+       }
 
        if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
                drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
@@ -352,8 +360,8 @@ sched_multiq_init(void)
                drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
        }
 
-       printf("multiq scheduler config: deep-drain %d, urgent first %d, depth limit %d, band limit %d, sanity check %d\n",
-              deep_drain, drain_urgent_first, drain_depth_limit, drain_band_limit, multiq_sanity_check);
+       printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
+              deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
 
        sched_group_zone = zinit(
                                 sizeof(struct sched_group),
@@ -371,7 +379,7 @@ sched_multiq_init(void)
        lck_attr_setdefault(&sched_groups_lock_attr);
        lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
 
-       sched_traditional_init();
+       sched_timeshare_init();
 }
 
 static void
@@ -400,7 +408,7 @@ sched_group_create(void)
 {
        sched_group_t       sched_group;
 
-       if (!sched_groups_enabled)
+       if (!SCHED(sched_groups_enabled))
                return SCHED_GROUP_NULL;
 
        sched_group = (sched_group_t)zalloc(sched_group_zone);
@@ -425,7 +433,7 @@ sched_group_create(void)
 void
 sched_group_destroy(sched_group_t sched_group)
 {
-       if (!sched_groups_enabled) {
+       if (!SCHED(sched_groups_enabled)) {
                assert(sched_group == SCHED_GROUP_NULL);
                return;
        }
@@ -471,7 +479,10 @@ __attribute__((always_inline))
 static inline sched_group_t
 group_for_entry(sched_entry_t entry)
 {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-align"
        sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
+#pragma clang diagnostic pop
        return group;
 }      
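A sketch of the invariant this pointer arithmetic relies on (layout and initialization as established elsewhere in this file: entries[] is the first member of struct sched_group, and sched_group_create sets entries[i].sched_pri = i):

    struct sched_group {
            struct sched_entry      entries[NRQS];  /* first member */
            struct run_queue        runq;
            queue_chain_t           sched_groups;
    };

    /* entry == &group->entries[pri] and entry->sched_pri == pri, so    */
    /* entry - entry->sched_pri == &group->entries[0] == (void *)group  */

The pragma pair silences clang's conservative -Wcast-align warning about casting between the two struct pointer types.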
 
@@ -481,9 +492,9 @@ entry_queue_first_entry(entry_queue_t rq)
 {
        assert(rq->count != 0);
 
-       queue_t queue = rq->queues + rq->highq;
+       queue_t queue = &rq->queues[rq->highq];
 
-       sched_entry_t entry = (sched_entry_t)queue_first(queue);
+       sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
 
        assert(entry->sched_pri == rq->highq);
 
@@ -492,6 +503,7 @@ entry_queue_first_entry(entry_queue_t rq)
 
 #if defined(MULTIQ_SANITY_CHECK)
 
+#if MACH_ASSERT
 __attribute__((always_inline))
 static inline boolean_t
 queue_chain_linked(queue_chain_t* chain)
@@ -504,6 +516,7 @@ queue_chain_linked(queue_chain_t* chain)
                return FALSE;
        }
 }
+#endif /* MACH_ASSERT */
 
 static thread_t
 group_first_thread(sched_group_t group)
@@ -512,11 +525,12 @@ group_first_thread(sched_group_t group)
 
        assert(rq->count != 0);
 
-       queue_t queue = rq->queues + rq->highq;
+       queue_t queue = &rq->queues[rq->highq];
 
-       thread_t thread = (thread_t)(void*)queue_first(queue);
+       thread_t thread = qe_queue_first(queue, struct thread, runq_links);
 
        assert(thread != THREAD_NULL);
+       assert_thread_magic(thread);
 
        assert(thread->sched_group == group);
 
@@ -533,12 +547,12 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr
        queue_t q;
        sched_entry_t elem;
 
-       assert(queue_chain_linked(&entry->links));
+       assert(queue_chain_linked(&entry->entry_links));
        assert(entry->runq == MULTIQ_ERUNQ);
 
        q = &runq->queues[expected_pri];
 
-       queue_iterate(q, elem, sched_entry_t, links) {
+       qe_foreach_element(elem, q, entry_links) {
                if (elem == entry)
                        return;
        }
@@ -558,7 +572,7 @@ sched_group_check_thread(sched_group_t group, thread_t thread)
 
        q = &group->runq.queues[pri];
 
-       queue_iterate(q, elem, thread_t, links) {
+       qe_foreach_element(elem, q, runq_links) {
                if (elem == thread)
                        return;
        }
@@ -615,12 +629,12 @@ static sched_entry_t
 entry_queue_dequeue_entry(entry_queue_t rq)
 {
        sched_entry_t   sched_entry;
-       queue_t         queue = rq->queues + rq->highq;
+       queue_t         queue = &rq->queues[rq->highq];
 
        assert(rq->count > 0);
        assert(!queue_empty(queue));
 
-       sched_entry = (sched_entry_t)dequeue_head(queue);
+       sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
@@ -628,9 +642,8 @@ entry_queue_dequeue_entry(entry_queue_t rq)
                rq->urgency--; assert(rq->urgency >= 0);
        }
        if (queue_empty(queue)) {
-               if (rq->highq != IDLEPRI)
-                       clrbit(MAXPRI - rq->highq, rq->bitmap);
-               rq->highq = MAXPRI - ffsbit(rq->bitmap);
+               rq_bitmap_clear(rq->bitmap, rq->highq);
+               rq->highq = bitmap_first(rq->bitmap, NRQS);
        }
 
        sched_entry->runq = 0;
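The switch to rq_bitmap_clear/bitmap_first retires the old inverted bit numbering (bits stored as MAXPRI - pri, scanned from the top with ffsbit) along with the IDLEPRI sentinel bit that kept the scan from underflowing. Side by side, assuming bitmap_first(map, NRQS) returns -1 on an empty map as in osfmk/kern/bits.h:

    /* before: inverted index, IDLEPRI bit pinned set as a floor */
    if (rq->highq != IDLEPRI)
            clrbit(MAXPRI - rq->highq, rq->bitmap);
    rq->highq = MAXPRI - ffsbit(rq->bitmap);

    /* after: direct index; highq becomes -1 when nothing is runnable */
    rq_bitmap_clear(rq->bitmap, rq->highq);
    rq->highq = bitmap_first(rq->bitmap, NRQS);
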
@@ -648,24 +661,24 @@ entry_queue_enqueue_entry(
                           integer_t     options)
 {
        int             sched_pri = entry->sched_pri;
-       queue_t         queue = rq->queues + sched_pri;
+       queue_t         queue = &rq->queues[sched_pri];
        boolean_t       result = FALSE;
 
        assert(entry->runq == 0);
 
        if (queue_empty(queue)) {
-               enqueue_tail(queue, (queue_entry_t)entry);
+               enqueue_tail(queue, &entry->entry_links);
 
-               setbit(MAXPRI - sched_pri, rq->bitmap);
+               rq_bitmap_set(rq->bitmap, sched_pri);
                if (sched_pri > rq->highq) {
                        rq->highq = sched_pri;
                        result = TRUE;
                }
        } else {
                if (options & SCHED_TAILQ)
-                       enqueue_tail(queue, (queue_entry_t)entry);
+                       enqueue_tail(queue, &entry->entry_links);
                else
-                       enqueue_head(queue, (queue_entry_t)entry);
+                       enqueue_head(queue, &entry->entry_links);
        }
        if (SCHED(priority_is_urgent)(sched_pri))
                rq->urgency++;
@@ -693,7 +706,7 @@ entry_queue_remove_entry(
        }
 #endif
 
-       remqueue((queue_entry_t)entry);
+       remqueue(&entry->entry_links);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
@@ -701,16 +714,35 @@ entry_queue_remove_entry(
                rq->urgency--; assert(rq->urgency >= 0);
        }
 
-       if (queue_empty(rq->queues + sched_pri)) {
+       if (queue_empty(&rq->queues[sched_pri])) {
                /* update run queue status */
-               if (sched_pri != IDLEPRI)
-                       clrbit(MAXPRI - sched_pri, rq->bitmap);
-               rq->highq = MAXPRI - ffsbit(rq->bitmap);
+               rq_bitmap_clear(rq->bitmap, sched_pri);
+               rq->highq = bitmap_first(rq->bitmap, NRQS);
        }
 
        entry->runq = 0;
 }
 
+static void
+entry_queue_change_entry(
+                          entry_queue_t rq,
+                          sched_entry_t entry,
+                          integer_t     options)
+{
+       int     sched_pri   = entry->sched_pri;
+       queue_t queue       = &rq->queues[sched_pri];
+
+#if defined(MULTIQ_SANITY_CHECK)
+       if (multiq_sanity_check) {
+               entry_queue_check_entry(rq, entry, sched_pri);
+       }
+#endif
+
+       if (options & SCHED_TAILQ)
+               re_queue_tail(queue, &entry->entry_links);
+       else
+               re_queue_head(queue, &entry->entry_links);
+}
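
entry_queue_change_entry repositions an entry that is already on the entry runq, so count, urgency, and the bitmap are deliberately untouched; only the link position changes. The re_queue_head/re_queue_tail helpers are, assuming the osfmk/kern/queue.h definitions:

    static inline void
    re_queue_tail(queue_t que, queue_entry_t elt)
    {
            remqueue(elt);          /* unlink from current position */
            enqueue_tail(que, elt); /* relink at the tail */
    }

This replaces the remove/enqueue pair in sched_multiq_quantum_expire below, and gives SCHED_HEADQ enqueues a way to pull an existing entry to the front (see sched_group_enqueue_thread).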
 /*
  * The run queue must not be empty.
  *
@@ -723,14 +755,15 @@ group_run_queue_dequeue_thread(
                          boolean_t     *queue_empty)
 {
        thread_t        thread;
-       queue_t         queue = rq->queues + rq->highq;
+       queue_t         queue = &rq->queues[rq->highq];
 
        assert(rq->count > 0);
        assert(!queue_empty(queue));
 
        *thread_pri = rq->highq;
 
-       thread = (thread_t)(void*)dequeue_head(queue);
+       thread = qe_dequeue_head(queue, struct thread, runq_links);
+       assert_thread_magic(thread);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
@@ -738,15 +771,14 @@ group_run_queue_dequeue_thread(
                rq->urgency--; assert(rq->urgency >= 0);
        }
        if (queue_empty(queue)) {
-               if (rq->highq != IDLEPRI)
-                       clrbit(MAXPRI - rq->highq, rq->bitmap);
-               rq->highq = MAXPRI - ffsbit(rq->bitmap);
+               rq_bitmap_clear(rq->bitmap, rq->highq);
+               rq->highq = bitmap_first(rq->bitmap, NRQS);
                *queue_empty = TRUE;
        } else {
                *queue_empty = FALSE;
        }
 
-       return (thread);
+       return thread;
 }
 
 /*
@@ -760,24 +792,25 @@ group_run_queue_enqueue_thread(
                          integer_t      thread_pri,
                          integer_t      options)
 {
-       queue_t         queue = rq->queues + thread_pri;
+       queue_t         queue = &rq->queues[thread_pri];
        boolean_t       result = FALSE;
 
        assert(thread->runq == PROCESSOR_NULL);
+       assert_thread_magic(thread);
 
        if (queue_empty(queue)) {
-               enqueue_tail(queue, (queue_entry_t)thread);
+               enqueue_tail(queue, &thread->runq_links);
 
-               setbit(MAXPRI - thread_pri, rq->bitmap);
+               rq_bitmap_set(rq->bitmap, thread_pri);
                if (thread_pri > rq->highq) {
                        rq->highq = thread_pri;
                }
                result = TRUE;
        } else {
                if (options & SCHED_TAILQ)
-                       enqueue_tail(queue, (queue_entry_t)thread);
+                       enqueue_tail(queue, &thread->runq_links);
                else
-                       enqueue_head(queue, (queue_entry_t)thread);
+                       enqueue_head(queue, &thread->runq_links);
        }
        if (SCHED(priority_is_urgent)(thread_pri))
                rq->urgency++;
@@ -799,9 +832,10 @@ group_run_queue_remove_thread(
 {
        boolean_t       result = FALSE;
 
+       assert_thread_magic(thread);
        assert(thread->runq != PROCESSOR_NULL);
 
-       remqueue((queue_entry_t)thread);
+       remqueue(&thread->runq_links);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
@@ -809,11 +843,10 @@ group_run_queue_remove_thread(
                rq->urgency--; assert(rq->urgency >= 0);
        }
 
-       if (queue_empty(rq->queues + thread_pri)) {
+       if (queue_empty(&rq->queues[thread_pri])) {
                /* update run queue status */
-               if (thread_pri != IDLEPRI)
-                       clrbit(MAXPRI - thread_pri, rq->bitmap);
-               rq->highq = MAXPRI - ffsbit(rq->bitmap);
+               rq_bitmap_clear(rq->bitmap, thread_pri);
+               rq->highq = bitmap_first(rq->bitmap, NRQS);
                result = TRUE;
        }
 
@@ -963,6 +996,9 @@ sched_group_enqueue_thread(
                 * What effects would it have?
                 */
                entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
+       } else if (options & SCHED_HEADQ) {
+               /* The thread should be at the head of the line - move its entry to the front */
+               entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
        }
 }
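
The new SCHED_HEADQ branch keeps the group's entry position consistent with the thread's: without it, a head-queued thread would sit at the front of its group's run queue while the group's entry could still be behind other groups' entries at the same priority, so the supposedly head-of-line thread would still wait its group's turn.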
 
@@ -1033,33 +1069,44 @@ sched_multiq_choose_thread(
         * Should YIELD AST override drain limit?
         */
        if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
-               boolean_t   drain_limit_hit = FALSE;
+               boolean_t favor_group = TRUE;
 
-               if (main_entryq->highq > group->runq.highq) {
+               integer_t global_pri = main_entryq->highq;
+               integer_t group_pri  = group->runq.highq;
+
+               /*
+                * Favor the current group if the group is still the globally highest.
+                *
+                * Otherwise, consider choosing a thread from the current group
+                * even if it's lower priority than the global highest priority.
+                */
+               if (global_pri > group_pri) {
                        /*
                         * If there's something elsewhere above the depth limit,
                         * don't pick a thread below the limit.
                         */
-                       if (main_entryq->highq > drain_depth_limit &&
-                           group->runq.highq <= drain_depth_limit)
-                               drain_limit_hit = TRUE;
+                       if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
+                               favor_group = FALSE;
 
                        /*
-                        * Don't go more than X steps below the global highest
+                        * If there's something at or above the ceiling,
+                        * don't favor the group.
                         */
-                       if ((main_entryq->highq - group->runq.highq) >= drain_band_limit)
-                               drain_limit_hit = TRUE;
+                       if (global_pri >= drain_ceiling)
+                               favor_group = FALSE;
 
-                       /* Don't favor the task when an urgent thread is present. */
-                       if (drain_urgent_first && main_entryq->urgency > 0)
-                               drain_limit_hit = TRUE;
+                       /*
+                        * Don't go more than X steps below the global highest
+                        */
+                       if ((global_pri - group_pri) >= drain_band_limit)
+                               favor_group = FALSE;
                }
 
-               if (!drain_limit_hit) {
+               if (favor_group) {
                        /* Pull from local runq */
                        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                            MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
-                           MACH_MULTIQ_GROUP, main_entryq->highq, group->runq.highq, 0, 0);
+                           MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
 
                        return sched_group_dequeue_thread(main_entryq, group);
                }
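
A worked example with the defaults above (ceiling 47, depth limit 4, and assuming the band limit is left at its large default): the group's best runnable thread is at priority 31 while the global highest entry is at 37. global_pri > group_pri, but 37 is below the ceiling, 31 is above the depth limit, and the 6-step gap is inside the band, so favor_group remains TRUE and the group keeps draining. If another task then gets a runnable thread at BASEPRI_FOREGROUND (47), the ceiling test flips favor_group to FALSE and the global highest is chosen instead.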
@@ -1133,8 +1180,7 @@ sched_multiq_quantum_expire(thread_t thread)
                sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
 
                if (entry->runq == MULTIQ_ERUNQ) {
-                       entry_queue_remove_entry(entryq, entry);
-                       entry_queue_enqueue_entry(entryq, entry, SCHED_TAILQ);
+                       entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
                }
 
                pset_unlock(pset);
@@ -1154,14 +1200,18 @@ sched_multiq_processor_csw_check(processor_t processor)
        boolean_t       has_higher;
        int             pri;
 
+       if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
+               return (AST_PREEMPT | AST_URGENT);
+       }
+
        entry_queue_t main_entryq = multiq_main_entryq(processor);
-       run_queue_t   bound_runq  = multiq_bound_runq(processor);       
+       run_queue_t   bound_runq  = multiq_bound_runq(processor);
 
        assert(processor->active_thread != NULL);
 
        pri = MAX(main_entryq->highq, bound_runq->highq);
 
-       if (first_timeslice(processor)) {
+       if (processor->first_timeslice) {
                has_higher = (pri > processor->current_pri);
        } else {
                has_higher = (pri >= processor->current_pri);
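
(The > versus >= split is what preserves round-robin behavior: a thread still inside its first timeslice is preempted only by strictly higher priority, while one whose quantum has expired also yields to queued work of equal priority.)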
@@ -1173,9 +1223,6 @@ sched_multiq_processor_csw_check(processor_t processor)
 
                if (bound_runq->urgency > 0)
                        return (AST_PREEMPT | AST_URGENT);
-               
-               if (processor->active_thread && thread_eager_preemption(processor->active_thread))
-                       return (AST_PREEMPT | AST_URGENT);
 
                return AST_PREEMPT;
        }
@@ -1189,7 +1236,10 @@ sched_multiq_processor_queue_has_priority(
                                           int           priority,
                                           boolean_t     gte)
 {
-       int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);
+       run_queue_t main_runq  = multiq_main_entryq(processor);
+       run_queue_t bound_runq = multiq_bound_runq(processor);
+
+       int qpri = MAX(main_runq->highq, bound_runq->highq);
 
        if (gte)
                return qpri >= priority;
@@ -1197,12 +1247,6 @@ sched_multiq_processor_queue_has_priority(
                return qpri > priority;
 }
 
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor)
-{
-       return (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
-}
-
 static int
 sched_multiq_runq_count(processor_t processor)
 {
@@ -1260,12 +1304,15 @@ sched_multiq_processor_queue_shutdown(processor_t processor)
 
        while (main_entryq->count > 0) {
                thread = sched_global_dequeue_thread(main_entryq);
-               enqueue_tail(&tqueue, (queue_entry_t)thread);
+               enqueue_tail(&tqueue, &thread->runq_links);
        }
 
        pset_unlock(pset);
 
-       while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
+       qe_foreach_element_safe(thread, &tqueue, runq_links) {
+
+               remqueue(&thread->runq_links);
+
                thread_lock(thread);
 
                thread_setrun(thread, SCHED_TAILQ);
@@ -1286,7 +1333,6 @@ sched_multiq_processor_queue_remove(
                                     thread_t    thread)
 {
        boolean_t removed = FALSE;
-
        processor_set_t pset = processor->processor_set;
 
        pset_lock(pset);
@@ -1329,27 +1375,36 @@ sched_multiq_steal_thread(processor_set_t pset)
  * Scan the global queue for candidate groups, and scan those groups for
  * candidate threads.
  *
+ * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
+ *       Instead, iterate only the queue in the group runq matching the priority of the entry.
+ *
  * Returns TRUE if retry is needed.
  */
 static boolean_t
-group_scan(entry_queue_t runq) {
-       int             count;
-       queue_t         q;
-       sched_group_t   group;
-       sched_entry_t   entry;
-
-       if ((count = runq->count) > 0) {
-               q = runq->queues + runq->highq;
-               while (count > 0) {
-                       queue_iterate(q, entry, sched_entry_t, links) {
-                               group = group_for_entry(entry);
-                               if (group->runq.count > 0) {
-                                       if (runq_scan(&group->runq))
-                                               return (TRUE);
-                               }
-                               count--;
+group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
+       int count       = runq->count;
+       int queue_index;
+
+       assert(count >= 0);
+
+       if (count == 0)
+               return FALSE;
+
+       for (queue_index = bitmap_first(runq->bitmap, NRQS);
+            queue_index >= 0;
+            queue_index = bitmap_next(runq->bitmap, queue_index)) {
+
+               sched_entry_t entry;
+
+               qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
+                       assert(count > 0);
+
+                       sched_group_t group = group_for_entry(entry);
+                       if (group->runq.count > 0) {
+                               if (runq_scan(&group->runq, scan_context))
+                                       return (TRUE);
                        }
-                       q--;
+                       count--;
                }
        }
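The bitmap walk visits only non-empty priority levels, replacing the old fragile pointer decrement (q--) across the queues array. The iteration idiom, assuming the bits.h helpers behave as used here (bitmap_next(map, i) returns the next set bit strictly below i, or -1):

    for (int i = bitmap_first(runq->bitmap, NRQS); i >= 0;
         i = bitmap_next(runq->bitmap, i)) {
            /* runq->queues[i] is guaranteed non-empty here */
    }

count is retained purely as a cross-check (the assert); loop termination now comes from the bitmap.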
 
@@ -1357,7 +1412,7 @@ group_scan(entry_queue_t runq) {
 }
 
 static void
-sched_multiq_thread_update_scan(void)
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
 {
        boolean_t               restart_needed = FALSE;
        processor_t             processor = processor_list;
@@ -1377,7 +1432,7 @@ sched_multiq_thread_update_scan(void)
                        s = splsched();
                        pset_lock(pset);
 
-                       restart_needed = runq_scan(multiq_bound_runq(processor));
+                       restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
 
                        pset_unlock(pset);
                        splx(s);
@@ -1406,7 +1461,7 @@ sched_multiq_thread_update_scan(void)
                        s = splsched();
                        pset_lock(pset);
 
-                       restart_needed = group_scan(&pset->pset_runq);
+                       restart_needed = group_scan(&pset->pset_runq, scan_context);
 
                        pset_unlock(pset);
                        splx(s);
@@ -1421,4 +1476,21 @@ sched_multiq_thread_update_scan(void)
        } while (restart_needed);
 }
 
+extern int sched_allow_rt_smt;
 
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+       if (processor->processor_primary != processor) {
+               /*
+                * This is a secondary SMT processor.  If the primary is running
+                * a realtime thread, only allow realtime threads on the secondary.
+                */
+               if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
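
As wired up in the dispatch table above (.avoid_processor_enabled and .thread_avoid_processor) and via the early check in sched_multiq_processor_csw_check, a true return here drives an AST_PREEMPT | AST_URGENT so the thread rechooses a processor. The condition, paraphrased with illustrative names (BASEPRI_RTQUEUES from osfmk/kern/sched.h; sched_allow_rt_smt is the global override declared above):

    avoid = primary_is_realtime
            && (!thread_is_realtime || !sched_allow_rt_smt);

i.e. a secondary SMT core whose primary sibling is running realtime work accepts only realtime threads, and only when sched_allow_rt_smt permits sharing the pair.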