 * or can we get away with putting the entry in just one pset or the other?
*
* Consider the right way to handle runq count - I don't want to iterate groups.
- * Perhaps keep a global counter. sched_run_count will not work.
+ * Perhaps keep a global counter.
* Alternate option - remove it from choose_processor. It doesn't add much value
* now that we have global runq.
*
#endif
typedef struct sched_entry {
- queue_chain_t links;
+ queue_chain_t entry_links;
int16_t sched_pri; /* scheduled (current) priority */
int16_t runq;
int32_t pad;
queue_chain_t sched_groups;
};
-/* TODO: Turn this into an attribute in the sched dispatch struct */
-boolean_t sched_groups_enabled = FALSE;
-
/*
* Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * hits its quantum expiration.
*/
static boolean_t deep_drain = FALSE;
-/*
- * Don't favor the task when an urgent thread is present.
- */
-static boolean_t drain_urgent_first = TRUE;
-
/* Verify the consistency of the runq before touching it */
static boolean_t multiq_sanity_check = FALSE;
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t drain_depth_limit;
+/*
+ * Don't favor the task when there's something at or above this priority in another task.
+ */
+#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
+static integer_t drain_ceiling;
static struct zone *sched_group_zone;
sched_multiq_steal_thread(processor_set_t pset);
static void
-sched_multiq_thread_update_scan(void);
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor);
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
const struct sched_dispatch_table sched_multiq_dispatch = {
+ .sched_name = "multiq",
.init = sched_multiq_init,
- .timebase_init = sched_traditional_timebase_init,
+ .timebase_init = sched_timeshare_timebase_init,
.processor_init = sched_multiq_processor_init,
.pset_init = sched_multiq_pset_init,
- .maintenance_continuation = sched_traditional_maintenance_continue,
+ .maintenance_continuation = sched_timeshare_maintenance_continue,
.choose_thread = sched_multiq_choose_thread,
+ .steal_thread_enabled = FALSE,
.steal_thread = sched_multiq_steal_thread,
- .compute_priority = compute_priority,
+ .compute_timeshare_priority = sched_compute_timeshare_priority,
.choose_processor = choose_processor,
.processor_enqueue = sched_multiq_processor_enqueue,
.processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
.priority_is_urgent = priority_is_urgent,
.processor_csw_check = sched_multiq_processor_csw_check,
.processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
- .initial_quantum_size = sched_traditional_initial_quantum_size,
+ .initial_quantum_size = sched_timeshare_initial_quantum_size,
.initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
.can_update_priority = can_update_priority,
.update_priority = update_priority,
.lightweight_update_priority = lightweight_update_priority,
.quantum_expire = sched_multiq_quantum_expire,
- .should_current_thread_rechoose_processor = sched_multiq_should_current_thread_rechoose_processor,
.processor_runq_count = sched_multiq_runq_count,
.processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
- .fairshare_init = sched_traditional_fairshare_init,
- .fairshare_runq_count = sched_traditional_fairshare_runq_count,
- .fairshare_runq_stats_count_sum = sched_traditional_fairshare_runq_stats_count_sum,
- .fairshare_enqueue = sched_traditional_fairshare_enqueue,
- .fairshare_dequeue = sched_traditional_fairshare_dequeue,
- .fairshare_queue_remove = sched_traditional_fairshare_queue_remove,
.processor_bound_count = sched_multiq_processor_bound_count,
.thread_update_scan = sched_multiq_thread_update_scan,
.direct_dispatch_to_idle_processors = FALSE,
+ .multiple_psets_enabled = FALSE,
+ .sched_groups_enabled = TRUE,
+ .avoid_processor_enabled = TRUE,
+ .thread_avoid_processor = sched_multiq_thread_avoid_processor,
+ .processor_balance = sched_SMT_balance,
+
+ .rt_runq = sched_rtglobal_runq,
+ .rt_init = sched_rtglobal_init,
+ .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
+ .rt_runq_scan = sched_rtglobal_runq_scan,
+ .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
+
+ .qos_max_parallelism = sched_qos_max_parallelism,
+ .check_spill = sched_check_spill,
+ .ipi_policy = sched_ipi_policy,
+ .thread_should_yield = sched_thread_should_yield,
};
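+/*
+ * multiq reuses the global realtime run queue (sched_rtglobal_*) and the
+ * common QoS / spill / IPI / yield hooks rather than providing policy-specific ones.
+ */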
static void
sched_multiq_init(void)
{
- sched_groups_enabled = TRUE;
-
#if defined(MULTIQ_SANITY_CHECK)
PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif
PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
- PE_parse_boot_argn("multiq_drain_urgent_first", &drain_urgent_first, sizeof(drain_urgent_first));
+ if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
+ drain_ceiling = DEFAULT_DRAIN_CEILING;
+ }
if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
}
- printf("multiq scheduler config: deep-drain %d, urgent first %d, depth limit %d, band limit %d, sanity check %d\n",
- deep_drain, drain_urgent_first, drain_depth_limit, drain_band_limit, multiq_sanity_check);
+ printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
+ deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
sched_group_zone = zinit(
sizeof(struct sched_group),
lck_attr_setdefault(&sched_groups_lock_attr);
lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
- sched_traditional_init();
+ sched_timeshare_init();
}
static void
{
sched_group_t sched_group;
- if (!sched_groups_enabled)
+ if (!SCHED(sched_groups_enabled))
return SCHED_GROUP_NULL;
sched_group = (sched_group_t)zalloc(sched_group_zone);
void
sched_group_destroy(sched_group_t sched_group)
{
- if (!sched_groups_enabled) {
+ if (!SCHED(sched_groups_enabled)) {
assert(sched_group == SCHED_GROUP_NULL);
return;
}
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
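+	/*
+	 * Inverse of group_entry_for_pri(): relies on entries[] being the first
+	 * member of struct sched_group and on entries[i].sched_pri == i, so
+	 * stepping back sched_pri entries from this entry yields the owning group.
+	 */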
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-align"
sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
+#pragma clang diagnostic pop
return group;
}
{
assert(rq->count != 0);
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
- sched_entry_t entry = (sched_entry_t)queue_first(queue);
+ sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
assert(entry->sched_pri == rq->highq);
#if defined(MULTIQ_SANITY_CHECK)
+#if MACH_ASSERT
__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
return FALSE;
}
}
+#endif /* MACH_ASSERT */
static thread_t
group_first_thread(sched_group_t group)
assert(rq->count != 0);
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
- thread_t thread = (thread_t)(void*)queue_first(queue);
+ thread_t thread = qe_queue_first(queue, struct thread, runq_links);
assert(thread != THREAD_NULL);
+ assert_thread_magic(thread);
assert(thread->sched_group == group);
queue_t q;
sched_entry_t elem;
- assert(queue_chain_linked(&entry->links));
+ assert(queue_chain_linked(&entry->entry_links));
assert(entry->runq == MULTIQ_ERUNQ);
q = &runq->queues[expected_pri];
- queue_iterate(q, elem, sched_entry_t, links) {
+ qe_foreach_element(elem, q, entry_links) {
if (elem == entry)
return;
}
q = &group->runq.queues[pri];
- queue_iterate(q, elem, thread_t, links) {
+ qe_foreach_element(elem, q, runq_links) {
if (elem == thread)
return;
}
entry_queue_dequeue_entry(entry_queue_t rq)
{
sched_entry_t sched_entry;
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
assert(rq->count > 0);
assert(!queue_empty(queue));
- sched_entry = (sched_entry_t)dequeue_head(queue);
+ sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
if (queue_empty(queue)) {
- if (rq->highq != IDLEPRI)
- clrbit(MAXPRI - rq->highq, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, rq->highq);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
}
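+	/* Mark the entry as off the entry runqueue (runq is MULTIQ_ERUNQ while enqueued). */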
sched_entry->runq = 0;
integer_t options)
{
int sched_pri = entry->sched_pri;
- queue_t queue = rq->queues + sched_pri;
+ queue_t queue = &rq->queues[sched_pri];
boolean_t result = FALSE;
assert(entry->runq == 0);
if (queue_empty(queue)) {
- enqueue_tail(queue, (queue_entry_t)entry);
+ enqueue_tail(queue, &entry->entry_links);
- setbit(MAXPRI - sched_pri, rq->bitmap);
+ rq_bitmap_set(rq->bitmap, sched_pri);
if (sched_pri > rq->highq) {
rq->highq = sched_pri;
result = TRUE;
}
} else {
if (options & SCHED_TAILQ)
- enqueue_tail(queue, (queue_entry_t)entry);
+ enqueue_tail(queue, &entry->entry_links);
else
- enqueue_head(queue, (queue_entry_t)entry);
+ enqueue_head(queue, &entry->entry_links);
}
if (SCHED(priority_is_urgent)(sched_pri))
rq->urgency++;
}
#endif
- remqueue((queue_entry_t)entry);
+ remqueue(&entry->entry_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
- if (queue_empty(rq->queues + sched_pri)) {
+ if (queue_empty(&rq->queues[sched_pri])) {
/* update run queue status */
- if (sched_pri != IDLEPRI)
- clrbit(MAXPRI - sched_pri, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, sched_pri);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
}
entry->runq = 0;
}
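+/*
+ * Move an entry that is already on the entry runqueue to the head or tail of
+ * its priority bucket, without touching the count, urgency, or bitmap.
+ */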
+static void
+entry_queue_change_entry(
+ entry_queue_t rq,
+ sched_entry_t entry,
+ integer_t options)
+{
+ int sched_pri = entry->sched_pri;
+ queue_t queue = &rq->queues[sched_pri];
+
+#if defined(MULTIQ_SANITY_CHECK)
+ if (multiq_sanity_check) {
+ entry_queue_check_entry(rq, entry, sched_pri);
+ }
+#endif
+
+ if (options & SCHED_TAILQ)
+ re_queue_tail(queue, &entry->entry_links);
+ else
+ re_queue_head(queue, &entry->entry_links);
+}
/*
* The run queue must not be empty.
*
boolean_t *queue_empty)
{
thread_t thread;
- queue_t queue = rq->queues + rq->highq;
+ queue_t queue = &rq->queues[rq->highq];
assert(rq->count > 0);
assert(!queue_empty(queue));
*thread_pri = rq->highq;
- thread = (thread_t)(void*)dequeue_head(queue);
+ thread = qe_dequeue_head(queue, struct thread, runq_links);
+ assert_thread_magic(thread);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
if (queue_empty(queue)) {
- if (rq->highq != IDLEPRI)
- clrbit(MAXPRI - rq->highq, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, rq->highq);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
*queue_empty = TRUE;
} else {
*queue_empty = FALSE;
}
- return (thread);
+ return thread;
}
/*
integer_t thread_pri,
integer_t options)
{
- queue_t queue = rq->queues + thread_pri;
+ queue_t queue = &rq->queues[thread_pri];
boolean_t result = FALSE;
assert(thread->runq == PROCESSOR_NULL);
+ assert_thread_magic(thread);
if (queue_empty(queue)) {
- enqueue_tail(queue, (queue_entry_t)thread);
+ enqueue_tail(queue, &thread->runq_links);
- setbit(MAXPRI - thread_pri, rq->bitmap);
+ rq_bitmap_set(rq->bitmap, thread_pri);
if (thread_pri > rq->highq) {
rq->highq = thread_pri;
}
result = TRUE;
} else {
if (options & SCHED_TAILQ)
- enqueue_tail(queue, (queue_entry_t)thread);
+ enqueue_tail(queue, &thread->runq_links);
else
- enqueue_head(queue, (queue_entry_t)thread);
+ enqueue_head(queue, &thread->runq_links);
}
if (SCHED(priority_is_urgent)(thread_pri))
rq->urgency++;
{
boolean_t result = FALSE;
+ assert_thread_magic(thread);
assert(thread->runq != PROCESSOR_NULL);
- remqueue((queue_entry_t)thread);
+ remqueue(&thread->runq_links);
SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
rq->count--;
rq->urgency--; assert(rq->urgency >= 0);
}
- if (queue_empty(rq->queues + thread_pri)) {
+ if (queue_empty(&rq->queues[thread_pri])) {
/* update run queue status */
- if (thread_pri != IDLEPRI)
- clrbit(MAXPRI - thread_pri, rq->bitmap);
- rq->highq = MAXPRI - ffsbit(rq->bitmap);
+ rq_bitmap_clear(rq->bitmap, thread_pri);
+ rq->highq = bitmap_first(rq->bitmap, NRQS);
result = TRUE;
}
* What effects would it have?
*/
entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
+ } else if (options & SCHED_HEADQ) {
+ /* The thread should be at the head of the line - move its entry to the front */
+ entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
}
}
* Should YIELD AST override drain limit?
*/
if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
- boolean_t drain_limit_hit = FALSE;
+ boolean_t favor_group = TRUE;
- if (main_entryq->highq > group->runq.highq) {
+ integer_t global_pri = main_entryq->highq;
+ integer_t group_pri = group->runq.highq;
+
+ /*
+ * Favor the current group if the group is still the globally highest.
+ *
+ * Otherwise, consider choosing a thread from the current group
+ * even if it's lower priority than the global highest priority.
+ */
+ if (global_pri > group_pri) {
/*
* If there's something elsewhere above the depth limit,
* don't pick a thread below the limit.
*/
- if (main_entryq->highq > drain_depth_limit &&
- group->runq.highq <= drain_depth_limit)
- drain_limit_hit = TRUE;
+ if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
+ favor_group = FALSE;
/*
- * Don't go more than X steps below the global highest
+ * If there's something at or above the ceiling,
+ * don't favor the group.
*/
- if ((main_entryq->highq - group->runq.highq) >= drain_band_limit)
- drain_limit_hit = TRUE;
+ if (global_pri >= drain_ceiling)
+ favor_group = FALSE;
- /* Don't favor the task when an urgent thread is present. */
- if (drain_urgent_first && main_entryq->urgency > 0)
- drain_limit_hit = TRUE;
+ /*
+ * Don't go more than X steps below the global highest
+ */
+ if ((global_pri - group_pri) >= drain_band_limit)
+ favor_group = FALSE;
}
- if (!drain_limit_hit) {
+ if (favor_group) {
/* Pull from local runq */
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
- MACH_MULTIQ_GROUP, main_entryq->highq, group->runq.highq, 0, 0);
+ MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
return sched_group_dequeue_thread(main_entryq, group);
}
sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
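+	/*
+	 * If the group's entry is on the main entry runqueue, cycle it to the back
+	 * of its priority bucket so entries from other tasks at this priority get a turn.
+	 */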
if (entry->runq == MULTIQ_ERUNQ) {
- entry_queue_remove_entry(entryq, entry);
- entry_queue_enqueue_entry(entryq, entry, SCHED_TAILQ);
+ entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
}
pset_unlock(pset);
boolean_t has_higher;
int pri;
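+	/*
+	 * If the running thread should no longer stay on this processor (see
+	 * sched_multiq_thread_avoid_processor below), request an urgent
+	 * preemption so it can move to another processor.
+	 */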
+ if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
+ return (AST_PREEMPT | AST_URGENT);
+ }
+
entry_queue_t main_entryq = multiq_main_entryq(processor);
- run_queue_t bound_runq = multiq_bound_runq(processor);
+ run_queue_t bound_runq = multiq_bound_runq(processor);
assert(processor->active_thread != NULL);
pri = MAX(main_entryq->highq, bound_runq->highq);
- if (first_timeslice(processor)) {
+ if (processor->first_timeslice) {
has_higher = (pri > processor->current_pri);
} else {
has_higher = (pri >= processor->current_pri);
if (bound_runq->urgency > 0)
return (AST_PREEMPT | AST_URGENT);
-
- if (processor->active_thread && thread_eager_preemption(processor->active_thread))
- return (AST_PREEMPT | AST_URGENT);
return AST_PREEMPT;
}
int priority,
boolean_t gte)
{
- int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);
+ run_queue_t main_runq = multiq_main_entryq(processor);
+ run_queue_t bound_runq = multiq_bound_runq(processor);
+
+ int qpri = MAX(main_runq->highq, bound_runq->highq);
if (gte)
return qpri >= priority;
return qpri > priority;
}
-static boolean_t
-sched_multiq_should_current_thread_rechoose_processor(processor_t processor)
-{
- return (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
-}
-
static int
sched_multiq_runq_count(processor_t processor)
{
while (main_entryq->count > 0) {
thread = sched_global_dequeue_thread(main_entryq);
- enqueue_tail(&tqueue, (queue_entry_t)thread);
+ enqueue_tail(&tqueue, &thread->runq_links);
}
pset_unlock(pset);
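+	/*
+	 * Set each drained thread running again; the _safe iterator allows
+	 * unlinking the current element as we go.
+	 */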
- while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
+ qe_foreach_element_safe(thread, &tqueue, runq_links) {
+
+ remqueue(&thread->runq_links);
+
thread_lock(thread);
thread_setrun(thread, SCHED_TAILQ);
thread_t thread)
{
boolean_t removed = FALSE;
-
processor_set_t pset = processor->processor_set;
pset_lock(pset);
* Scan the global queue for candidate groups, and scan those groups for
* candidate threads.
*
+ * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2).
+ * Instead, iterate only the queue in the group runq matching the priority of the entry.
+ *
* Returns TRUE if retry is needed.
*/
static boolean_t
-group_scan(entry_queue_t runq) {
- int count;
- queue_t q;
- sched_group_t group;
- sched_entry_t entry;
-
- if ((count = runq->count) > 0) {
- q = runq->queues + runq->highq;
- while (count > 0) {
- queue_iterate(q, entry, sched_entry_t, links) {
- group = group_for_entry(entry);
- if (group->runq.count > 0) {
- if (runq_scan(&group->runq))
- return (TRUE);
- }
- count--;
+group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
+ int count = runq->count;
+ int queue_index;
+
+ assert(count >= 0);
+
+ if (count == 0)
+ return FALSE;
+
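+	/* Walk the occupied priority levels, highest first, via the runq bitmap. */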
+ for (queue_index = bitmap_first(runq->bitmap, NRQS);
+ queue_index >= 0;
+ queue_index = bitmap_next(runq->bitmap, queue_index)) {
+
+ sched_entry_t entry;
+
+ qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
+ assert(count > 0);
+
+ sched_group_t group = group_for_entry(entry);
+ if (group->runq.count > 0) {
+ if (runq_scan(&group->runq, scan_context))
+ return (TRUE);
			}
			count--;
		}
-			q--;
-		}
	}

	return (FALSE);
}
static void
-sched_multiq_thread_update_scan(void)
+sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
{
boolean_t restart_needed = FALSE;
processor_t processor = processor_list;
s = splsched();
pset_lock(pset);
- restart_needed = runq_scan(multiq_bound_runq(processor));
+ restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
pset_unlock(pset);
splx(s);
s = splsched();
pset_lock(pset);
- restart_needed = group_scan(&pset->pset_runq);
+ restart_needed = group_scan(&pset->pset_runq, scan_context);
pset_unlock(pset);
splx(s);
} while (restart_needed);
}
+extern int sched_allow_rt_smt;
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+ if (processor->processor_primary != processor) {
+ /*
+ * This is a secondary SMT processor. If the primary is running
+ * a realtime thread, only allow realtime threads on the secondary.
+ */
+ if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
+ return true;
+ }
+ }
+
+ return false;
+}