osfmk/kern/sched_multiq.c

   1 /*
   2  * Copyright (c) 2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <mach/mach_types.h>
  30 #include <mach/machine.h>
  31
  32 #include <machine/machine_routines.h>
  33 #include <machine/sched_param.h>
  34 #include <machine/machine_cpu.h>
  35
  36 #include <kern/kern_types.h>
  37 #include <kern/debug.h>
  38 #include <kern/mach_param.h>
  39 #include <kern/machine.h>
  40 #include <kern/misc_protos.h>
  41 #include <kern/processor.h>
  42 #include <kern/queue.h>
  43 #include <kern/sched.h>
  44 #include <kern/sched_prim.h>
  45 #include <kern/task.h>
  46 #include <kern/thread.h>
  47
  48 #include <sys/kdebug.h>
  49
  50 /*
  51  * Theory Statement
  52  *
  53  * How does the task scheduler work?
  54  *
  55  * It schedules threads across a few levels.
  56  *
  57  * RT threads are dealt with above us
  58  * Bound threads go into the per-processor runq
  59  * Non-bound threads are linked on their task's sched_group's runq
  60  * sched_groups' sched_entries are linked on the pset's runq
  61  *
  62  * TODO: make this explicit - bound threads should have a different enqueue fxn
  63  *
  64  * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
  65  * or the current group's runqueue, then dequeue the next thread in that runqueue.
  66  *
  67  * We then manipulate the sched_entries to reflect the invariant that:
  68  * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
  69  * runqueue.
  70  *
  71  * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
  72  * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
  73  * as compared to processes which do the same amount of work under fewer threads.
  74  *
  75  * NOTE: Currently, the multiq scheduler only supports one pset.
  76  *
  77  * NOTE ABOUT thread->sched_pri:
  78  *
  79  * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
  80  * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
  81  *
  82  * TODO: Future features:
  83  *
  84  * Decouple the task priority from the sched_entry priority, allowing for:
  85  *      fast task priority change without having to iterate and re-dispatch all threads in the task.
  86  *              i.e. task-wide priority, task-wide boosting
  87  *      fancier group decay features
  88  *
  89  * Group (or task) decay:
  90  *      Decay is used for a few different things:
  91  *              Prioritizing latency-needing threads over throughput-needing threads for time-to-running
  92  *              Balancing work between threads in a process
  93  *              Balancing work done at the same priority between different processes
  94  *              Recovering from priority inversions between two threads in the same process
  95  *              Recovering from priority inversions between two threads in different processes
  96  *              Simulating a proportional share scheduler by allowing lower priority threads
  97  *                to run for a certain percentage of the time
  98  *
  99  *      Task decay lets us separately address the 'same process' and 'different process' needs,
 100  *      which will allow us to make smarter tradeoffs in different cases.
 101  *      For example, we could resolve priority inversion in the same process by reordering threads without dropping the
 102  *      process below low priority threads in other processes.
 103  *
 104  * One lock to rule them all (or at least all the runqueues) instead of the pset locks
 105  *
 106  * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
 107  * The entries array is 5K currently so it'd be really great to reduce.
 108  * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 109  *
 110  * When preempting a processor, store a flag saying if the preemption
 111  * was from a thread in the same group or different group,
 112  * and tell choose_thread about it.
 113  *
 114  * When choosing a processor, bias towards those running in the same
 115  * group as I am running (at the same priority, or within a certain band?).
 116  *
 117  * Decide if we need to support psets.
 118  * Decide how to support psets - do we need duplicate entries for each pset,
 119  * or can we get away with putting the entry in either one or the other pset?
 120  *
 121  * Consider the right way to handle runq count - I don't want to iterate groups.
 122  * Perhaps keep a global counter.
 123  * Alternate option - remove it from choose_processor. It doesn't add much value
 124  * now that we have global runq.
 125  *
 126  * Need a better way of finding group to target instead of looking at current_task.
 127  * Perhaps choose_thread could pass in the current thread?
 128  *
 129  * Consider unifying runq copy-pastes.
 130  *
 131  * Thoughts on having a group central quantum bucket:
 132  *
 133  * I see two algorithms to decide quanta:
 134  * A) Hand off only when switching thread to thread in the same group
 135  * B) Allocate and return quanta to the group's pool
 136  *
 137  * Issues:
 138  * If a task blocks completely, should it come back with the leftover quanta
 139  * or brand new quanta?
 140  *
  141  * Should I put a flag saying 'zero out a quanta you grab when you're dispatched'?
 142  *
 143  * Resolution:
 144  * Handing off quanta between threads will help with jumping around in the current task
 145  * but will not help when a thread from a different task is involved.
 146  * Need an algorithm that works with round robin-ing between threads in different tasks
 147  *
 148  * But wait - round robining can only be triggered by quantum expire or blocking.
 149  * We need something that works with preemption or yielding - that's the more interesting idea.
 150  *
 151  * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
 152  * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
 153  *
 154  * New algorithm -
 155  * Hand off quanta when hopping between threads with same sched_group
  156  * Even if the thread was blocked, it uses the last thread's remaining quanta when it starts.
 157  *
  158  * If we use the 'only cycle entry at quantum' algorithm, then the quantum pool starts getting
 159  * interesting.
 160  *
 161  * A thought - perhaps the handoff approach doesn't work so well in the presence of
 162  * non-handoff wakeups i.e. wake other thread then wait then block - doesn't mean that
 163  * woken thread will be what I switch to - other processor may have stolen it.
 164  * What do we do there?
 165  *
 166  * Conclusions:
 167  * We currently don't know of a scenario where quantum buckets on the task is beneficial.
 168  * We will instead handoff quantum between threads in the task, and keep quantum
 169  * on the preempted thread if it's preempted by something outside the task.
 170  *
 171  */
 172
  173 #if DEBUG || DEVELOPMENT /* the run queue consistency checkers are only compiled into these kernels */
  174 #define MULTIQ_SANITY_CHECK
  175 #endif
  176
      /*
       * One slot of a sched_group's entries[] array. It is the element that gets
       * linked onto an entry queue through entry_links.
       */
  177 typedef struct sched_entry {
  178         queue_chain_t           entry_links;
  179         int16_t                 sched_pri;      /* scheduled (current) priority; sched_group_create() sets it to the slot's index */
  180         int16_t                 runq;           /* MULTIQ_ERUNQ while linked on an entry queue, 0 otherwise */
  181         int32_t                 pad;
  182 } *sched_entry_t;
  183
  184 typedef run_queue_t entry_queue_t;                      /* A run queue that holds sched_entries instead of threads */
  185 typedef run_queue_t group_runq_t;                       /* A run queue that is part of a sched_group */
  186
  187 #define SCHED_ENTRY_NULL        ((sched_entry_t) 0)
  188 #define MULTIQ_ERUNQ            (-4)                    /* Indicates entry is on the main runq */
  189
  190 /*
      * Each level in the run queue corresponds to one entry in the entries array.
      *
      * entries must remain the first member: group_for_entry() recovers the
      * group by casting the address of entries[0] to a sched_group_t.
      */
  191 struct sched_group {
  192         struct sched_entry      entries[NRQS];  /* one entry per priority level, indexed by priority */
  193         struct run_queue        runq;           /* the group's own run queue; holds threads linked by runq_links */
  194         queue_chain_t           sched_groups;   /* linkage on the file-scope sched_groups list */
  195 };
 196
  197 /*
  198  * Keep entry on the head of the runqueue while dequeueing threads.
  199  * Only cycle it to the end of the runqueue when a thread in the task
  200  * hits its quantum.
      *
      * Set from the "-multiq-deep-drain" boot-arg in sched_multiq_init().
  201  */
  202 static boolean_t        deep_drain = FALSE;
  203
  204 /*
      * Verify the consistency of the runq before touching it.
      * Set from the "-multiq-sanity-check" boot-arg, which is only parsed
      * when MULTIQ_SANITY_CHECK is defined.
      */
  205 static boolean_t        multiq_sanity_check = FALSE;
  206
  207 /*
  208  * Draining threads from the current task is preferred
  209  * when they're less than X steps below the current
  210  * global highest priority
      *
      * Overridable with the "multiq_drain_band_limit" boot-arg.
  211  */
  212 #define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
  213 static integer_t        drain_band_limit;
  214
  215 /*
  216  * Don't go below this priority level if there is something above it in another task
      *
      * Overridable with the "multiq_drain_depth_limit" boot-arg.
  217  */
  218 #define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
  219 static integer_t        drain_depth_limit;
  220
  221 /*
  222  * Don't favor the task when there's something above this priority in another task.
      *
      * Overridable with the "multiq_drain_ceiling" boot-arg.
  223  */
  224 #define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
  225 static integer_t        drain_ceiling;
  226
  227 static struct zone      *sched_group_zone;      /* zone that sched_group_create()/sched_group_destroy() allocate from and free to */
  228
  229 static uint64_t         num_sched_groups = 0;   /* number of groups on sched_groups; updated under sched_groups_lock */
  230 static queue_head_t     sched_groups;           /* list of all sched_groups; modified under sched_groups_lock */
  231
  232 static lck_attr_t       sched_groups_lock_attr;
  233 static lck_grp_t        sched_groups_lock_grp;
  234 static lck_grp_attr_t   sched_groups_lock_grp_attr;
  235
  236 static lck_mtx_t        sched_groups_lock;      /* taken around sched_groups / num_sched_groups updates */
 237
  238
      /*
       * Forward declarations of the multiq callbacks that are installed in
       * the sched_multiq_dispatch table.
       */
  239 static void
  240 sched_multiq_init(void);
  241
  242 static thread_t
  243 sched_multiq_steal_thread(processor_set_t pset);
  244
  245 static void
  246 sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
  247
  248 static boolean_t
  249 sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
  250
  251 static boolean_t
  252 sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
  253
      /* Declared without static, unlike the other callbacks in this list */
  254 void
  255 sched_multiq_quantum_expire(thread_t thread);
  256
  257 static ast_t
  258 sched_multiq_processor_csw_check(processor_t processor);
  259
  260 static boolean_t
  261 sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
  262
  263 static int
  264 sched_multiq_runq_count(processor_t processor);
  265
  266 static boolean_t
  267 sched_multiq_processor_queue_empty(processor_t processor);
  268
  269 static uint64_t
  270 sched_multiq_runq_stats_count_sum(processor_t processor);
  271
  272 static int
  273 sched_multiq_processor_bound_count(processor_t processor);
  274
  275 static void
  276 sched_multiq_pset_init(processor_set_t pset);
  277
  278 static void
  279 sched_multiq_processor_init(processor_t processor);
  280
  281 static thread_t
  282 sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);
  283
  284 static void
  285 sched_multiq_processor_queue_shutdown(processor_t processor);
  286
  287 static sched_mode_t
  288 sched_multiq_initial_thread_sched_mode(task_t parent_task);
  289
  290 static bool
  291 sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
  292
      /*
       * Dispatch table for the "multiq" scheduler.
       *
       * Entries named sched_multiq_* are the callbacks forward-declared above.
       * The remaining entries (sched_timeshare_*, sched_rtglobal_*,
       * choose_processor, ...) are not declared in this part of the file;
       * presumably they are shared scheduler routines provided by the included
       * kern headers -- confirm against kern/sched_prim.h.
       */
  293 const struct sched_dispatch_table sched_multiq_dispatch = {
  294         .sched_name                                     = "multiq",
  295         .init                                           = sched_multiq_init,
  296         .timebase_init                                  = sched_timeshare_timebase_init,
  297         .processor_init                                 = sched_multiq_processor_init,
  298         .pset_init                                      = sched_multiq_pset_init,
  299         .maintenance_continuation                       = sched_timeshare_maintenance_continue,
  300         .choose_thread                                  = sched_multiq_choose_thread,
  301         .steal_thread_enabled                           = FALSE, /* steal_thread is still populated on the next line */
  302         .steal_thread                                   = sched_multiq_steal_thread,
  303         .compute_timeshare_priority                     = sched_compute_timeshare_priority,
  304         .choose_processor                               = choose_processor,
  305         .processor_enqueue                              = sched_multiq_processor_enqueue,
  306         .processor_queue_shutdown                       = sched_multiq_processor_queue_shutdown,
  307         .processor_queue_remove                         = sched_multiq_processor_queue_remove,
  308         .processor_queue_empty                          = sched_multiq_processor_queue_empty,
  309         .priority_is_urgent                             = priority_is_urgent,
  310         .processor_csw_check                            = sched_multiq_processor_csw_check,
  311         .processor_queue_has_priority                   = sched_multiq_processor_queue_has_priority,
  312         .initial_quantum_size                           = sched_timeshare_initial_quantum_size,
  313         .initial_thread_sched_mode                      = sched_multiq_initial_thread_sched_mode,
  314         .can_update_priority                            = can_update_priority,
  315         .update_priority                                = update_priority,
  316         .lightweight_update_priority                    = lightweight_update_priority,
  317         .quantum_expire                                 = sched_multiq_quantum_expire,
  318         .processor_runq_count                           = sched_multiq_runq_count,
  319         .processor_runq_stats_count_sum                 = sched_multiq_runq_stats_count_sum,
  320         .processor_bound_count                          = sched_multiq_processor_bound_count,
  321         .thread_update_scan                             = sched_multiq_thread_update_scan,
  322         .direct_dispatch_to_idle_processors             = FALSE,
  323         .multiple_psets_enabled                         = FALSE, /* the theory statement notes multiq only supports one pset */
  324         .sched_groups_enabled                           = TRUE, /* tested via SCHED(sched_groups_enabled) in sched_group_create()/sched_group_destroy() */
  325         .avoid_processor_enabled                        = TRUE,
  326         .thread_avoid_processor                         = sched_multiq_thread_avoid_processor,
  327         .processor_balance                              = sched_SMT_balance,
  328
  329         .rt_runq                                        = sched_rtglobal_runq,
  330         .rt_init                                        = sched_rtglobal_init,
  331         .rt_queue_shutdown                              = sched_rtglobal_queue_shutdown,
  332         .rt_runq_scan                                   = sched_rtglobal_runq_scan,
  333         .rt_runq_count_sum                              = sched_rtglobal_runq_count_sum,
  334
  335         .qos_max_parallelism                            = sched_qos_max_parallelism,
  336         .check_spill                                    = sched_check_spill,
  337         .ipi_policy                                     = sched_ipi_policy,
  338         .thread_should_yield                            = sched_thread_should_yield,
  339 };
 340
 341
 342 static void
 343 sched_multiq_init(void)
 344 {
 345 #if defined(MULTIQ_SANITY_CHECK)
 346         PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
 347 #endif
 348
 349         PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
 350
 351         if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
 352                 drain_ceiling = DEFAULT_DRAIN_CEILING;
 353         }
 354
 355         if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
 356                 drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
 357         }
 358
 359         if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
 360                 drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
 361         }
 362
 363         printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
 364                deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
 365
 366         sched_group_zone = zinit(
 367                                  sizeof(struct sched_group),
 368                                  task_max * sizeof(struct sched_group),
 369                                  PAGE_SIZE,
 370                                  "sched groups");
 371
 372         zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
 373         zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);
 374
 375         queue_init(&sched_groups);
 376
 377         lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
 378         lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
 379         lck_attr_setdefault(&sched_groups_lock_attr);
 380         lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
 381
 382         sched_timeshare_init();
 383 }
 384
 385 static void
 386 sched_multiq_processor_init(processor_t processor)
 387 {
 388         run_queue_init(&processor->runq);
 389 }
 390
 391 static void
 392 sched_multiq_pset_init(processor_set_t pset)
 393 {
 394         run_queue_init(&pset->pset_runq);
 395 }
 396
 397 static sched_mode_t
 398 sched_multiq_initial_thread_sched_mode(task_t parent_task)
 399 {
 400         if (parent_task == kernel_task)
 401                 return TH_MODE_FIXED;
 402         else
 403                 return TH_MODE_TIMESHARE;
 404 }
 405
 406 sched_group_t
 407 sched_group_create(void)
 408 {
 409         sched_group_t       sched_group;
 410
 411         if (!SCHED(sched_groups_enabled))
 412                 return SCHED_GROUP_NULL;
 413
 414         sched_group = (sched_group_t)zalloc(sched_group_zone);
 415
 416         bzero(sched_group, sizeof(struct sched_group));
 417
 418         run_queue_init(&sched_group->runq);
 419
 420         for (int i = 0; i < NRQS; i++) {
 421                 sched_group->entries[i].runq = 0;
 422                 sched_group->entries[i].sched_pri = i;
 423         }
 424
 425         lck_mtx_lock(&sched_groups_lock);
 426         queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
 427         num_sched_groups++;
 428         lck_mtx_unlock(&sched_groups_lock);
 429
 430         return (sched_group);
 431 }
 432
 433 void
 434 sched_group_destroy(sched_group_t sched_group)
 435 {
 436         if (!SCHED(sched_groups_enabled)) {
 437                 assert(sched_group == SCHED_GROUP_NULL);
 438                 return;
 439         }
 440
 441         assert(sched_group != SCHED_GROUP_NULL);
 442         assert(sched_group->runq.count == 0);
 443
 444         for (int i = 0; i < NRQS; i++) {
 445                 assert(sched_group->entries[i].runq == 0);
 446                 assert(sched_group->entries[i].sched_pri == i);
 447         }
 448
 449         lck_mtx_lock(&sched_groups_lock);
 450         queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
 451         num_sched_groups--;
 452         lck_mtx_unlock(&sched_groups_lock);
 453
 454         zfree(sched_group_zone, sched_group);
 455 }
 456
 457 __attribute__((always_inline))
 458 static inline entry_queue_t
 459 multiq_main_entryq(processor_t processor)
 460 {
 461         return (entry_queue_t)&processor->processor_set->pset_runq;
 462 }
 463
 464 __attribute__((always_inline))
 465 static inline run_queue_t
 466 multiq_bound_runq(processor_t processor)
 467 {
 468         return &processor->runq;
 469 }
 470
 471 __attribute__((always_inline))
 472 static inline sched_entry_t
 473 group_entry_for_pri(sched_group_t group, integer_t pri)
 474 {
 475         return &group->entries[pri];
 476 }
 477
 478 __attribute__((always_inline))
 479 static inline sched_group_t
 480 group_for_entry(sched_entry_t entry)
 481 {
 482 #pragma clang diagnostic push
 483 #pragma clang diagnostic ignored "-Wcast-align"
 484         sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
 485 #pragma clang diagnostic pop
 486         return group;
 487 }
 488
 489 /* Peek at the head of the runqueue */
 490 static sched_entry_t
 491 entry_queue_first_entry(entry_queue_t rq)
 492 {
 493         assert(rq->count != 0);
 494
 495         queue_t queue = &rq->queues[rq->highq];
 496
 497         sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
 498
 499         assert(entry->sched_pri == rq->highq);
 500
 501         return entry;
 502 }
 503
 504 #if defined(MULTIQ_SANITY_CHECK)
 505
 506 #if MACH_ASSERT
 507 __attribute__((always_inline))
 508 static inline boolean_t
 509 queue_chain_linked(queue_chain_t* chain)
 510 {
 511         if (chain->next != NULL) {
 512                 assert(chain->prev != NULL);
 513                 return TRUE;
 514         } else {
 515                 assert(chain->prev == NULL);
 516                 return FALSE;
 517         }
 518 }
 519 #endif /* MACH_ASSERT */
 520
 521 static thread_t
 522 group_first_thread(sched_group_t group)
 523 {
 524         group_runq_t rq = &group->runq;
 525
 526         assert(rq->count != 0);
 527
 528         queue_t queue = &rq->queues[rq->highq];
 529
 530         thread_t thread = qe_queue_first(queue, struct thread, runq_links);
 531
 532         assert(thread != THREAD_NULL);
 533         assert_thread_magic(thread);
 534
 535         assert(thread->sched_group == group);
 536
 537         /* TODO: May not be safe */
 538         assert(thread->sched_pri == rq->highq);
 539
 540         return thread;
 541 }
 542
 543 /* Asserts if entry is not in entry runq at pri */
 544 static void
 545 entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
 546 {
 547         queue_t q;
 548         sched_entry_t elem;
 549
 550         assert(queue_chain_linked(&entry->entry_links));
 551         assert(entry->runq == MULTIQ_ERUNQ);
 552
 553         q = &runq->queues[expected_pri];
 554
 555         qe_foreach_element(elem, q, entry_links) {
 556                 if (elem == entry)
 557                         return;
 558         }
 559
 560         panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
 561 }
 562
 563 /* Asserts if thread is not in group at its priority */
 564 static void
 565 sched_group_check_thread(sched_group_t group, thread_t thread)
 566 {
 567         queue_t q;
 568         thread_t elem;
 569         int pri = thread->sched_pri;
 570
 571         assert(thread->runq != PROCESSOR_NULL);
 572
 573         q = &group->runq.queues[pri];
 574
 575         qe_foreach_element(elem, q, runq_links) {
 576                 if (elem == thread)
 577                         return;
 578         }
 579
 580         panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
 581 }
 582
 583 static void
 584 global_check_entry_queue(entry_queue_t main_entryq)
 585 {
 586         if (main_entryq->count == 0)
 587                 return;
 588
 589         sched_entry_t entry = entry_queue_first_entry(main_entryq);
 590
 591         assert(entry->runq == MULTIQ_ERUNQ);
 592
 593         sched_group_t group = group_for_entry(entry);
 594
 595         thread_t thread = group_first_thread(group);
 596
 597         __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
 598
 599         assert(entry->sched_pri == group->runq.highq);
 600
 601         assert(entry == thread_entry);
 602         assert(thread->runq != PROCESSOR_NULL);
 603 }
 604
 605 static void
 606 group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
 607 {
 608         if (group->runq.count == 0)
 609                 return;
 610
 611         thread_t thread = group_first_thread(group);
 612
 613         assert(thread->runq != PROCESSOR_NULL);
 614
 615         sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
 616
 617         entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);
 618
 619         assert(sched_entry->sched_pri == thread->sched_pri);
 620         assert(sched_entry->runq == MULTIQ_ERUNQ);
 621 }
 622
 623 #endif /* defined(MULTIQ_SANITY_CHECK) */
 624
 625 /*
 626  * The run queue must not be empty.
 627  */
 628 static sched_entry_t
 629 entry_queue_dequeue_entry(entry_queue_t rq)
 630 {
 631         sched_entry_t   sched_entry;
 632         queue_t         queue = &rq->queues[rq->highq];
 633
 634         assert(rq->count > 0);
 635         assert(!queue_empty(queue));
 636
 637         sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
 638
 639         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 640         rq->count--;
 641         if (SCHED(priority_is_urgent)(rq->highq)) {
 642                 rq->urgency--; assert(rq->urgency >= 0);
 643         }
 644         if (queue_empty(queue)) {
 645                 rq_bitmap_clear(rq->bitmap, rq->highq);
 646                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 647         }
 648
 649         sched_entry->runq = 0;
 650
 651         return (sched_entry);
 652 }
 653
 654 /*
 655  * The run queue must not be empty.
 656  */
 657 static boolean_t
 658 entry_queue_enqueue_entry(
 659                           entry_queue_t rq,
 660                           sched_entry_t entry,
 661                           integer_t     options)
 662 {
 663         int             sched_pri = entry->sched_pri;
 664         queue_t         queue = &rq->queues[sched_pri];
 665         boolean_t       result = FALSE;
 666
 667         assert(entry->runq == 0);
 668
 669         if (queue_empty(queue)) {
 670                 enqueue_tail(queue, &entry->entry_links);
 671
 672                 rq_bitmap_set(rq->bitmap, sched_pri);
 673                 if (sched_pri > rq->highq) {
 674                         rq->highq = sched_pri;
 675                         result = TRUE;
 676                 }
 677         } else {
 678                 if (options & SCHED_TAILQ)
 679                         enqueue_tail(queue, &entry->entry_links);
 680                 else
 681                         enqueue_head(queue, &entry->entry_links);
 682         }
 683         if (SCHED(priority_is_urgent)(sched_pri))
 684                 rq->urgency++;
 685         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 686         rq->count++;
 687
 688         entry->runq = MULTIQ_ERUNQ;
 689
 690         return (result);
 691 }
 692
 693 /*
 694  * The entry must be in this runqueue.
 695  */
 696 static void
 697 entry_queue_remove_entry(
 698                          entry_queue_t  rq,
 699                          sched_entry_t  entry)
 700 {
 701         int sched_pri = entry->sched_pri;
 702
 703 #if defined(MULTIQ_SANITY_CHECK)
 704         if (multiq_sanity_check) {
 705                 entry_queue_check_entry(rq, entry, sched_pri);
 706         }
 707 #endif
 708
 709         remqueue(&entry->entry_links);
 710
 711         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 712         rq->count--;
 713         if (SCHED(priority_is_urgent)(sched_pri)) {
 714                 rq->urgency--; assert(rq->urgency >= 0);
 715         }
 716
 717         if (queue_empty(&rq->queues[sched_pri])) {
 718                 /* update run queue status */
 719                 rq_bitmap_clear(rq->bitmap, sched_pri);
 720                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 721         }
 722
 723         entry->runq = 0;
 724 }
 725
 726 static void
 727 entry_queue_change_entry(
 728                           entry_queue_t rq,
 729                           sched_entry_t entry,
 730                           integer_t     options)
 731 {
 732         int     sched_pri   = entry->sched_pri;
 733         queue_t queue       = &rq->queues[sched_pri];
 734
 735 #if defined(MULTIQ_SANITY_CHECK)
 736         if (multiq_sanity_check) {
 737                 entry_queue_check_entry(rq, entry, sched_pri);
 738         }
 739 #endif
 740
 741         if (options & SCHED_TAILQ)
 742                 re_queue_tail(queue, &entry->entry_links);
 743         else
 744                 re_queue_head(queue, &entry->entry_links);
 745 }
 746 /*
 747  * The run queue must not be empty.
 748  *
 749  * sets queue_empty to TRUE if queue is now empty at thread_pri
 750  */
 751 static thread_t
 752 group_run_queue_dequeue_thread(
 753                          group_runq_t   rq,
 754                          integer_t     *thread_pri,
 755                          boolean_t     *queue_empty)
 756 {
 757         thread_t        thread;
 758         queue_t         queue = &rq->queues[rq->highq];
 759
 760         assert(rq->count > 0);
 761         assert(!queue_empty(queue));
 762
 763         *thread_pri = rq->highq;
 764
 765         thread = qe_dequeue_head(queue, struct thread, runq_links);
 766         assert_thread_magic(thread);
 767
 768         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 769         rq->count--;
 770         if (SCHED(priority_is_urgent)(rq->highq)) {
 771                 rq->urgency--; assert(rq->urgency >= 0);
 772         }
 773         if (queue_empty(queue)) {
 774                 rq_bitmap_clear(rq->bitmap, rq->highq);
 775                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 776                 *queue_empty = TRUE;
 777         } else {
 778                 *queue_empty = FALSE;
 779         }
 780
 781         return thread;
 782 }
 783
 784 /*
 785  * The run queue must not be empty.
 786  * returns TRUE if queue was empty at thread_pri
 787  */
 788 static boolean_t
 789 group_run_queue_enqueue_thread(
 790                          group_runq_t   rq,
 791                          thread_t       thread,
 792                          integer_t      thread_pri,
 793                          integer_t      options)
 794 {
 795         queue_t         queue = &rq->queues[thread_pri];
 796         boolean_t       result = FALSE;
 797
 798         assert(thread->runq == PROCESSOR_NULL);
 799         assert_thread_magic(thread);
 800
 801         if (queue_empty(queue)) {
 802                 enqueue_tail(queue, &thread->runq_links);
 803
 804                 rq_bitmap_set(rq->bitmap, thread_pri);
 805                 if (thread_pri > rq->highq) {
 806                         rq->highq = thread_pri;
 807                 }
 808                 result = TRUE;
 809         } else {
 810                 if (options & SCHED_TAILQ)
 811                         enqueue_tail(queue, &thread->runq_links);
 812                 else
 813                         enqueue_head(queue, &thread->runq_links);
 814         }
 815         if (SCHED(priority_is_urgent)(thread_pri))
 816                 rq->urgency++;
 817         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 818         rq->count++;
 819
 820         return (result);
 821 }
 822
 823 /*
 824  * The thread must be in this runqueue.
 825  * returns TRUE if queue is now empty at thread_pri
 826  */
 827 static boolean_t
 828 group_run_queue_remove_thread(
 829                         group_runq_t    rq,
 830                         thread_t        thread,
 831                         integer_t       thread_pri)
 832 {
 833         boolean_t       result = FALSE;
 834
 835         assert_thread_magic(thread);
 836         assert(thread->runq != PROCESSOR_NULL);
 837
 838         remqueue(&thread->runq_links);
 839
 840         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 841         rq->count--;
 842         if (SCHED(priority_is_urgent)(thread_pri)) {
 843                 rq->urgency--; assert(rq->urgency >= 0);
 844         }
 845
 846         if (queue_empty(&rq->queues[thread_pri])) {
 847                 /* update run queue status */
 848                 rq_bitmap_clear(rq->bitmap, thread_pri);
 849                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 850                 result = TRUE;
 851         }
 852
 853         thread->runq = PROCESSOR_NULL;
 854
 855         return result;
 856 }
 857
 858 /*
 859  * A thread's sched pri may change out from under us because
 860  * we're clearing thread->runq here without the thread locked.
 861  * Do not rely on it to be the same as when we enqueued.
 862  */
 863 static thread_t
 864 sched_global_dequeue_thread(entry_queue_t main_entryq)
 865 {
 866         boolean_t pri_level_empty = FALSE;
 867         sched_entry_t entry;
 868         group_runq_t group_runq;
 869         thread_t thread;
 870         integer_t thread_pri;
 871         sched_group_t group;
 872
 873         assert(main_entryq->count > 0);
 874
 875         entry = entry_queue_dequeue_entry(main_entryq);
 876
 877         group = group_for_entry(entry);
 878         group_runq = &group->runq;
 879
 880         thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
 881
 882         thread->runq = PROCESSOR_NULL;
 883
 884         if (!pri_level_empty) {
 885                 entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
 886         }
 887
 888         return thread;
 889 }
 890
 891 /* Dequeue a thread from the global runq without moving the entry */
 892 static thread_t
 893 sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
 894 {
 895         boolean_t pri_level_empty = FALSE;
 896         sched_entry_t entry;
 897         group_runq_t group_runq;
 898         thread_t thread;
 899         integer_t thread_pri;
 900         sched_group_t group;
 901
 902         assert(main_entryq->count > 0);
 903
 904         entry = entry_queue_first_entry(main_entryq);
 905
 906         group = group_for_entry(entry);
 907         group_runq = &group->runq;
 908
 909         thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
 910
 911         thread->runq = PROCESSOR_NULL;
 912
 913         if (pri_level_empty) {
 914                 entry_queue_remove_entry(main_entryq, entry);
 915         }
 916
 917         return thread;
 918 }
 919
 920
 921 static thread_t
 922 sched_group_dequeue_thread(
 923                            entry_queue_t main_entryq,
 924                            sched_group_t group)
 925 {
 926         group_runq_t group_runq = &group->runq;
 927         boolean_t pri_level_empty = FALSE;
 928         thread_t thread;
 929         integer_t thread_pri;
 930
 931         thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
 932
 933         thread->runq = PROCESSOR_NULL;
 934
 935         if (pri_level_empty) {
 936                 entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
 937         }
 938
 939         return thread;
 940 }
 941
 942 static void
 943 sched_group_remove_thread(
 944                           entry_queue_t main_entryq,
 945                           sched_group_t group,
 946                           thread_t thread)
 947 {
 948         integer_t thread_pri = thread->sched_pri;
 949         sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
 950
 951 #if defined(MULTIQ_SANITY_CHECK)
 952         if (multiq_sanity_check) {
 953                 global_check_entry_queue(main_entryq);
 954                 group_check_run_queue(main_entryq, group);
 955
 956                 sched_group_check_thread(group, thread);
 957                 entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
 958         }
 959 #endif
 960
 961         boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);
 962
 963         if (pri_level_empty) {
 964                 entry_queue_remove_entry(main_entryq, sched_entry);
 965         }
 966
 967 #if defined(MULTIQ_SANITY_CHECK)
 968         if (multiq_sanity_check) {
 969                 global_check_entry_queue(main_entryq);
 970                 group_check_run_queue(main_entryq, group);
 971         }
 972 #endif
 973 }
 974
 975 static void
 976 sched_group_enqueue_thread(
 977                            entry_queue_t        main_entryq,
 978                            sched_group_t        group,
 979                            thread_t             thread,
 980                            integer_t            options)
 981 {
 982 #if defined(MULTIQ_SANITY_CHECK)
 983         if (multiq_sanity_check) {
 984                 global_check_entry_queue(main_entryq);
 985                 group_check_run_queue(main_entryq, group);
 986         }
 987 #endif
 988
 989         int sched_pri = thread->sched_pri;
 990
 991         boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
 992
 993         if (pri_level_was_empty) {
 994                 /*
 995                  * TODO: Need to figure out if passing options here is a good idea or not
 996                  * What effects would it have?
 997                  */
 998                 entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
 999         } else if (options & SCHED_HEADQ) {
1000                 /* The thread should be at the head of the line - move its entry to the front */
1001                 entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
1002         }
1003 }
1004
1005 /*
1006  *  Locate a thread to execute from the run queue and return it.
1007  *  Only choose a thread with greater or equal priority.
1008  *
1009  *  pset is locked, thread is not locked.
1010  *
1011  *  Returns THREAD_NULL if it cannot find a valid thread.
1012  *
1013  *  Note: we cannot rely on the value of thread->sched_pri in this path because
1014  *  we don't have the thread locked.
1015  *
1016  *  TODO: Remove tracepoints
1017  */
1018 static thread_t
1019 sched_multiq_choose_thread(
1020                            processor_t      processor,
1021                            int              priority,
1022                            ast_t            reason)
1023 {
1024         entry_queue_t   main_entryq = multiq_main_entryq(processor);
1025         run_queue_t     bound_runq  = multiq_bound_runq(processor);
1026
1027         boolean_t choose_bound_runq = FALSE;
1028
1029         if (bound_runq->highq  < priority &&
1030             main_entryq->highq < priority)
1031                 return THREAD_NULL;
1032
1033         if (bound_runq->count && main_entryq->count) {
1034                 if (bound_runq->highq >= main_entryq->highq) {
1035                         choose_bound_runq = TRUE;
1036                 } else {
1037                         /* Use main runq */
1038                 }
1039         } else if (bound_runq->count) {
1040                 choose_bound_runq = TRUE;
1041         } else if (main_entryq->count) {
1042                 /* Use main runq */
1043         } else {
1044                 return (THREAD_NULL);
1045         }
1046
1047         if (choose_bound_runq) {
1048                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1049                     MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1050                     MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);
1051
1052                 return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1053         }
1054
1055         sched_group_t group = current_thread()->sched_group;
1056
1057 #if defined(MULTIQ_SANITY_CHECK)
1058         if (multiq_sanity_check) {
1059                 global_check_entry_queue(main_entryq);
1060                 group_check_run_queue(main_entryq, group);
1061         }
1062 #endif
1063
1064         /*
1065          * Determine if we should look at the group or the global queue
1066          *
1067          * TODO:
1068          * Perhaps pass reason as a 'should look inside' argument to choose_thread
1069          * Should YIELD AST override drain limit?
1070          */
1071         if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
1072                 boolean_t favor_group = TRUE;
1073
1074                 integer_t global_pri = main_entryq->highq;
1075                 integer_t group_pri  = group->runq.highq;
1076
1077                 /*
1078                  * Favor the current group if the group is still the globally highest.
1079                  *
1080                  * Otherwise, consider choosing a thread from the current group
1081                  * even if it's lower priority than the global highest priority.
1082                  */
1083                 if (global_pri > group_pri) {
1084                         /*
1085                          * If there's something elsewhere above the depth limit,
1086                          * don't pick a thread below the limit.
1087                          */
1088                         if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
1089                                 favor_group = FALSE;
1090
1091                         /*
1092                          * If there's something at or above the ceiling,
1093                          * don't favor the group.
1094                          */
1095                         if (global_pri >= drain_ceiling)
1096                                 favor_group = FALSE;
1097
1098                         /*
1099                          * Don't go more than X steps below the global highest
1100                          */
1101                         if ((global_pri - group_pri) >= drain_band_limit)
1102                                 favor_group = FALSE;
1103                 }
1104
1105                 if (favor_group) {
1106                         /* Pull from local runq */
1107                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1108                             MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1109                             MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
1110
1111                         return sched_group_dequeue_thread(main_entryq, group);
1112                 }
1113         }
1114
1115         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1116             MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1117             MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);
1118
1119         /* Couldn't pull from local runq, pull from global runq instead */
1120         if (deep_drain) {
1121                 return sched_global_deep_drain_dequeue_thread(main_entryq);
1122         } else {
1123                 return sched_global_dequeue_thread(main_entryq);
1124         }
1125 }
1126
1127
1128 /*
1129  * Thread must be locked, and not already be on a run queue.
1130  * pset is locked.
1131  */
1132 static boolean_t
1133 sched_multiq_processor_enqueue(
1134                                processor_t      processor,
1135                                thread_t         thread,
1136                                integer_t        options)
1137 {
1138         boolean_t       result;
1139
1140         assert(processor == thread->chosen_processor);
1141
1142         if (thread->bound_processor != PROCESSOR_NULL) {
1143                 assert(thread->bound_processor == processor);
1144
1145                 result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
1146                 thread->runq = processor;
1147
1148                 return result;
1149         }
1150
1151         sched_group_enqueue_thread(multiq_main_entryq(processor),
1152                                    thread->sched_group,
1153                                    thread, options);
1154
1155         thread->runq = processor;
1156
1157         return (FALSE);
1158 }
1159
1160 /*
1161  * Called in the context of thread with thread and pset unlocked,
1162  * after updating thread priority but before propagating that priority
1163  * to the processor
1164  */
1165 void
1166 sched_multiq_quantum_expire(thread_t thread)
1167 {
1168         if (deep_drain) {
1169                 /*
1170                  * Move the entry at this priority to the end of the queue,
1171                  * to allow the next task a shot at running.
1172                  */
1173
1174                 processor_t processor = thread->last_processor;
1175                 processor_set_t pset = processor->processor_set;
1176                 entry_queue_t entryq = multiq_main_entryq(processor);
1177
1178                 pset_lock(pset);
1179
1180                 sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
1181
1182                 if (entry->runq == MULTIQ_ERUNQ) {
1183                         entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
1184                 }
1185
1186                 pset_unlock(pset);
1187         }
1188 }
1189
1190 static boolean_t
1191 sched_multiq_processor_queue_empty(processor_t processor)
1192 {
1193         return multiq_main_entryq(processor)->count == 0 &&
1194                multiq_bound_runq(processor)->count  == 0;
1195 }
1196
1197 static ast_t
1198 sched_multiq_processor_csw_check(processor_t processor)
1199 {
1200         boolean_t       has_higher;
1201         int             pri;
1202
1203         if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
1204                 return (AST_PREEMPT | AST_URGENT);
1205         }
1206
1207         entry_queue_t main_entryq = multiq_main_entryq(processor);
1208         run_queue_t   bound_runq  = multiq_bound_runq(processor);
1209
1210         assert(processor->active_thread != NULL);
1211
1212         pri = MAX(main_entryq->highq, bound_runq->highq);
1213
1214         if (processor->first_timeslice) {
1215                 has_higher = (pri > processor->current_pri);
1216         } else {
1217                 has_higher = (pri >= processor->current_pri);
1218         }
1219
1220         if (has_higher) {
1221                 if (main_entryq->urgency > 0)
1222                         return (AST_PREEMPT | AST_URGENT);
1223
1224                 if (bound_runq->urgency > 0)
1225                         return (AST_PREEMPT | AST_URGENT);
1226
1227                 return AST_PREEMPT;
1228         }
1229
1230         return AST_NONE;
1231 }
1232
1233 static boolean_t
1234 sched_multiq_processor_queue_has_priority(
1235                                           processor_t   processor,
1236                                           int           priority,
1237                                           boolean_t     gte)
1238 {
1239         run_queue_t main_runq  = multiq_main_entryq(processor);
1240         run_queue_t bound_runq = multiq_bound_runq(processor);
1241
1242         int qpri = MAX(main_runq->highq, bound_runq->highq);
1243
1244         if (gte)
1245                 return qpri >= priority;
1246         else
1247                 return qpri > priority;
1248 }
1249
1250 static int
1251 sched_multiq_runq_count(processor_t processor)
1252 {
1253         /*
1254          *  TODO: Decide whether to keep a count of runnable threads in the pset
1255          *  or just return something less than the true count.
1256          *
1257          *  This needs to be fast, so no iterating the whole runq.
1258          *
1259          *  Another possible decision is to remove this - with global runq
1260          *  it doesn't make much sense.
1261          */
1262         return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
1263 }
1264
1265 static uint64_t
1266 sched_multiq_runq_stats_count_sum(processor_t processor)
1267 {
1268         /*
1269          * TODO: This one does need to go through all the runqueues, but it's only needed for
1270          * the sched stats tool
1271          */
1272
1273         uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;
1274
1275         if (processor->cpu_id == processor->processor_set->cpu_set_low)
1276                 return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
1277         else
1278                 return bound_sum;
1279 }
1280
1281 static int
1282 sched_multiq_processor_bound_count(processor_t processor)
1283 {
1284         return multiq_bound_runq(processor)->count;
1285 }
1286
1287 static void
1288 sched_multiq_processor_queue_shutdown(processor_t processor)
1289 {
1290         processor_set_t pset = processor->processor_set;
1291         entry_queue_t   main_entryq = multiq_main_entryq(processor);
1292         thread_t        thread;
1293         queue_head_t    tqueue;
1294
1295         /* We only need to migrate threads if this is the last active processor in the pset */
1296         if (pset->online_processor_count > 0) {
1297                 pset_unlock(pset);
1298                 return;
1299         }
1300
1301         queue_init(&tqueue);
1302
1303         /* Note that we do not remove bound threads from the queues here */
1304
1305         while (main_entryq->count > 0) {
1306                 thread = sched_global_dequeue_thread(main_entryq);
1307                 enqueue_tail(&tqueue, &thread->runq_links);
1308         }
1309
1310         pset_unlock(pset);
1311
1312         qe_foreach_element_safe(thread, &tqueue, runq_links) {
1313
1314                 remqueue(&thread->runq_links);
1315
1316                 thread_lock(thread);
1317
1318                 thread_setrun(thread, SCHED_TAILQ);
1319
1320                 thread_unlock(thread);
1321         }
1322 }
1323
1324 /*
1325  * Thread is locked
1326  *
1327  * This is why we can never read sched_pri unless we have the thread locked.
1328  * Which we do in the enqueue and remove cases, but not the dequeue case.
1329  */
1330 static boolean_t
1331 sched_multiq_processor_queue_remove(
1332                                     processor_t processor,
1333                                     thread_t    thread)
1334 {
1335         boolean_t removed = FALSE;
1336         processor_set_t pset = processor->processor_set;
1337
1338         pset_lock(pset);
1339
1340         if (thread->runq != PROCESSOR_NULL) {
1341                 /*
1342                  * Thread is on a run queue and we have a lock on
1343                  * that run queue.
1344                  */
1345
1346                 assert(thread->runq == processor);
1347
1348                 if (thread->bound_processor != PROCESSOR_NULL) {
1349                         assert(processor == thread->bound_processor);
1350                         run_queue_remove(multiq_bound_runq(processor), thread);
1351                         thread->runq = PROCESSOR_NULL;
1352                 } else {
1353                         sched_group_remove_thread(multiq_main_entryq(processor),
1354                                                   thread->sched_group,
1355                                                   thread);
1356                 }
1357
1358                 removed = TRUE;
1359         }
1360
1361         pset_unlock(pset);
1362
1363         return removed;
1364 }
1365
1366 /* pset is locked, returned unlocked */
1367 static thread_t
1368 sched_multiq_steal_thread(processor_set_t pset)
1369 {
1370         pset_unlock(pset);
1371         return (THREAD_NULL);
1372 }
1373
1374 /*
1375  * Scan the global queue for candidate groups, and scan those groups for
1376  * candidate threads.
1377  *
1378  * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
1379  *       Instead, iterate only the queue in the group runq matching the priority of the entry.
1380  *
1381  * Returns TRUE if retry is needed.
1382  */
1383 static boolean_t
1384 group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
1385         int count       = runq->count;
1386         int queue_index;
1387
1388         assert(count >= 0);
1389
1390         if (count == 0)
1391                 return FALSE;
1392
1393         for (queue_index = bitmap_first(runq->bitmap, NRQS);
1394              queue_index >= 0;
1395              queue_index = bitmap_next(runq->bitmap, queue_index)) {
1396
1397                 sched_entry_t entry;
1398
1399                 qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
1400                         assert(count > 0);
1401
1402                         sched_group_t group = group_for_entry(entry);
1403                         if (group->runq.count > 0) {
1404                                 if (runq_scan(&group->runq, scan_context))
1405                                         return (TRUE);
1406                         }
1407                         count--;
1408                 }
1409         }
1410
1411         return (FALSE);
1412 }
1413
1414 static void
1415 sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
1416 {
1417         boolean_t               restart_needed = FALSE;
1418         processor_t             processor = processor_list;
1419         processor_set_t         pset;
1420         thread_t                thread;
1421         spl_t                   s;
1422
1423         /*
1424          *  We update the threads associated with each processor (bound and idle threads)
1425          *  and then update the threads in each pset runqueue.
1426          */
1427
1428         do {
1429                 do {
1430                         pset = processor->processor_set;
1431
1432                         s = splsched();
1433                         pset_lock(pset);
1434
1435                         restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
1436
1437                         pset_unlock(pset);
1438                         splx(s);
1439
1440                         if (restart_needed)
1441                                 break;
1442
1443                         thread = processor->idle_thread;
1444                         if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1445                                 if (thread_update_add_thread(thread) == FALSE) {
1446                                         restart_needed = TRUE;
1447                                         break;
1448                                 }
1449                         }
1450                 } while ((processor = processor->processor_list) != NULL);
1451
1452                 /* Ok, we now have a collection of candidates -- fix them. */
1453                 thread_update_process_threads();
1454
1455         } while (restart_needed);
1456
1457         pset = &pset0;
1458
1459         do {
1460                 do {
1461                         s = splsched();
1462                         pset_lock(pset);
1463
1464                         restart_needed = group_scan(&pset->pset_runq, scan_context);
1465
1466                         pset_unlock(pset);
1467                         splx(s);
1468
1469                         if (restart_needed)
1470                                 break;
1471                 } while ((pset = pset->pset_list) != NULL);
1472
1473                 /* Ok, we now have a collection of candidates -- fix them. */
1474                 thread_update_process_threads();
1475
1476         } while (restart_needed);
1477 }
1478
1479 extern int sched_allow_rt_smt;
1480
1481 /* Return true if this thread should not continue running on this processor */
1482 static bool
1483 sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
1484 {
1485         if (processor->processor_primary != processor) {
1486                 /*
1487                  * This is a secondary SMT processor.  If the primary is running
1488                  * a realtime thread, only allow realtime threads on the secondary.
1489                  */
1490                 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
1491                         return true;
1492                 }
1493         }
1494
1495         return false;
1496 }