/*
 * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>

#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>

#include <sys/kdebug.h>
/*
 * How does the task scheduler work?
 *
 * It schedules threads across a few levels.
 *
 * RT threads are dealt with above us
 * Bound threads go into the per-processor runq
 * Non-bound threads are linked on their task's sched_group's runq
 * sched_groups' sched_entries are linked on the pset's runq
 *
 * TODO: make this explicit - bound threads should have a different enqueue fxn
 *
 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
 * or the current group's runqueue, then dequeue the next thread in that runqueue.
 *
 * We then manipulate the sched_entries to reflect the invariant that:
 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
 * runq.
 *
 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
 * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
 * as compared to processes which do the same amount of work under fewer threads.
 *
 * NOTE: Currently, the multiq scheduler only supports one pset.
 *
 * NOTE ABOUT thread->sched_pri:
 *
 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
 *
 * TODO: Future features:
 *
 * Decouple the task priority from the sched_entry priority, allowing for:
 *      fast task priority change without having to iterate and re-dispatch all threads in the task.
 *              i.e. task-wide priority, task-wide boosting
 *      fancier group decay features
 *
 * Group (or task) decay:
 *      Decay is used for a few different things:
 *              Prioritizing latency-needing threads over throughput-needing threads for time-to-running
 *              Balancing work between threads in a process
 *              Balancing work done at the same priority between different processes
 *              Recovering from priority inversions between two threads in the same process
 *              Recovering from priority inversions between two threads in different processes
 *              Simulating a proportional share scheduler by allowing lower priority threads
 *                to run for a certain percentage of the time
 *
 *      Task decay lets us separately address the 'same process' and 'different process' needs,
 *      which will allow us to make smarter tradeoffs in different cases.
 *      For example, we could resolve priority inversion in the same process by reordering threads without dropping the
 *      process below low priority threads in other processes.
 *
 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
 *
 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
 * The entries array is 5K currently so it'd be really great to reduce.
 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 *
 * When preempting a processor, store a flag saying if the preemption
 * was from a thread in the same group or different group,
 * and tell choose_thread about it.
 *
 * When choosing a processor, bias towards those running in the same
 * group as I am running (at the same priority, or within a certain band?).
 *
 * Decide if we need to support psets.
 * Decide how to support psets - do we need duplicate entries for each pset,
 * or can we get away with putting the entry in either one or the other pset?
 *
 * Consider the right way to handle runq count - I don't want to iterate groups.
 * Perhaps keep a global counter.
 * Alternate option - remove it from choose_processor.  It doesn't add much value
 * now that we have a global runq.
 *
 * Need a better way of finding the group to target instead of looking at current_task.
 * Perhaps choose_thread could pass in the current thread?
 *
 * Consider unifying runq copy-pastes.
 *
 * Thoughts on having a group central quantum bucket:
 *
 * I see two algorithms to decide quanta:
 * A) Hand off only when switching thread to thread in the same group
 * B) Allocate and return quanta to the group's pool
 *
 * If a task blocks completely, should it come back with the leftover quanta
 * or brand new quanta?
 *
 * Should I put a flag saying 'zero out a quantum you grab when you're dispatched'?
 *
 * Handing off quanta between threads will help with jumping around in the current task
 * but will not help when a thread from a different task is involved.
 * Need an algorithm that works with round-robining between threads in different tasks.
 *
 * But wait - round-robining can only be triggered by quantum expire or blocking.
 * We need something that works with preemption or yielding - that's the more interesting idea.
 *
 * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
 * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
 *
 * Hand off quanta when hopping between threads with the same sched_group:
 * Even if a thread was blocked, it uses the last thread's remaining quantum when it starts.
 *
 * If we use the only cycle entry at quantum algorithm, then the quantum pool starts getting
 *
 * A thought - perhaps the handoff approach doesn't work so well in the presence of
 * non-handoff wakeups, i.e. wake other thread then wait then block - it doesn't mean that the
 * woken thread will be what I switch to - another processor may have stolen it.
 * What do we do there?
 *
 * We currently don't know of a scenario where quantum buckets on the task is beneficial.
 * We will instead hand off quantum between threads in the task, and keep quantum
 * on the preempted thread if it's preempted by something outside the task.
 */
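
/*
 * Rough sketch of the two-level dequeue described above (see
 * sched_global_dequeue_thread() below for the actual implementation):
 *
 *      entry  = highest-priority sched_entry on the pset's entry runq
 *      group  = group_for_entry(entry)
 *      thread = highest-priority thread on group->runq
 *      if that priority level in group->runq is now empty:
 *              leave the entry off the entry runq
 *      else:
 *              re-enqueue the entry at the tail of its priority level
 */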
#if DEBUG || DEVELOPMENT
#define MULTIQ_SANITY_CHECK
#endif
typedef struct sched_entry {
    queue_chain_t   entry_links;
    int16_t         sched_pri;      /* scheduled (current) priority */
    int16_t         runq;
} *sched_entry_t;

typedef run_queue_t entry_queue_t;  /* A run queue that holds sched_entries instead of threads */
typedef run_queue_t group_runq_t;   /* A run queue that is part of a sched_group */

#define SCHED_ENTRY_NULL        ((sched_entry_t) 0)
#define MULTIQ_ERUNQ            (-4)    /* Indicates entry is on the main runq */
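
/*
 * Convention for the runq fields: sched_entry.runq is 0 when the entry is not
 * enqueued and MULTIQ_ERUNQ when it is on the pset's entry runq; thread.runq
 * is PROCESSOR_NULL when the thread is not enqueued and points at the
 * enqueuing processor otherwise.
 */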
/* Each level in the run queue corresponds to one entry in the entries array */
struct sched_group {
    struct sched_entry      entries[NRQS];
    struct run_queue        runq;
    queue_chain_t           sched_groups;
};
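
/*
 * Note: entries[] is indexed by sched_pri and must remain the first member of
 * struct sched_group, because group_for_entry() recovers the owning group from
 * an entry pointer by arithmetic alone.
 */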
/*
 * Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * goes through the quantum expire path.
 */
static boolean_t deep_drain = FALSE;
/* Verify the consistency of the runq before touching it */
static boolean_t multiq_sanity_check = FALSE;

/*
 * Draining threads from the current task is preferred
 * when they're less than X steps below the current
 * global highest priority
 */
#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
static integer_t drain_band_limit;

/*
 * Don't go below this priority level if there is something above it in another task
 */
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t drain_depth_limit;

/*
 * Don't favor the task when there's something above this priority in another task.
 */
#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
static integer_t drain_ceiling;
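
/*
 * These three knobs are applied together in sched_multiq_choose_thread():
 * the current group is favored over the global highest priority only while
 * nothing elsewhere is at or above drain_ceiling, the group is not drained
 * below drain_depth_limit while higher-priority work exists in another task,
 * and the gap to the global highest priority stays under drain_band_limit.
 */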
static ZONE_DECLARE(sched_group_zone, "sched groups",
    sizeof(struct sched_group), ZC_NOENCRYPT | ZC_NOCALLOUT);

static uint64_t num_sched_groups = 0;
static queue_head_t sched_groups;

static LCK_GRP_DECLARE(sched_groups_lock_grp, "sched_groups");
static LCK_MTX_DECLARE(sched_groups_lock, &sched_groups_lock_grp);
static void
sched_multiq_init(void);

static thread_t
sched_multiq_steal_thread(processor_set_t pset);

static void
sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);

static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread,
    sched_options_t options);

static boolean_t
sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);

static void
sched_multiq_quantum_expire(thread_t thread);

static ast_t
sched_multiq_processor_csw_check(processor_t processor);

static boolean_t
sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static int
sched_multiq_runq_count(processor_t processor);

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor);

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor);

static int
sched_multiq_processor_bound_count(processor_t processor);

static void
sched_multiq_pset_init(processor_set_t pset);

static void
sched_multiq_processor_init(processor_t processor);

static thread_t
sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_multiq_processor_queue_shutdown(processor_t processor);

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);

static bool
sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
const struct sched_dispatch_table sched_multiq_dispatch = {
    .sched_name                     = "multiq",
    .init                           = sched_multiq_init,
    .timebase_init                  = sched_timeshare_timebase_init,
    .processor_init                 = sched_multiq_processor_init,
    .pset_init                      = sched_multiq_pset_init,
    .maintenance_continuation       = sched_timeshare_maintenance_continue,
    .choose_thread                  = sched_multiq_choose_thread,
    .steal_thread_enabled           = sched_steal_thread_DISABLED,
    .steal_thread                   = sched_multiq_steal_thread,
    .compute_timeshare_priority     = sched_compute_timeshare_priority,
    .choose_node                    = sched_choose_node,
    .choose_processor               = choose_processor,
    .processor_enqueue              = sched_multiq_processor_enqueue,
    .processor_queue_shutdown       = sched_multiq_processor_queue_shutdown,
    .processor_queue_remove         = sched_multiq_processor_queue_remove,
    .processor_queue_empty          = sched_multiq_processor_queue_empty,
    .priority_is_urgent             = priority_is_urgent,
    .processor_csw_check            = sched_multiq_processor_csw_check,
    .processor_queue_has_priority   = sched_multiq_processor_queue_has_priority,
    .initial_quantum_size           = sched_timeshare_initial_quantum_size,
    .initial_thread_sched_mode      = sched_multiq_initial_thread_sched_mode,
    .can_update_priority            = can_update_priority,
    .update_priority                = update_priority,
    .lightweight_update_priority    = lightweight_update_priority,
    .quantum_expire                 = sched_multiq_quantum_expire,
    .processor_runq_count           = sched_multiq_runq_count,
    .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
    .processor_bound_count          = sched_multiq_processor_bound_count,
    .thread_update_scan             = sched_multiq_thread_update_scan,
    .multiple_psets_enabled         = FALSE,
    .sched_groups_enabled           = TRUE,
    .avoid_processor_enabled        = TRUE,
    .thread_avoid_processor         = sched_multiq_thread_avoid_processor,
    .processor_balance              = sched_SMT_balance,

    .rt_runq                        = sched_rtlocal_runq,
    .rt_init                        = sched_rtlocal_init,
    .rt_queue_shutdown              = sched_rtlocal_queue_shutdown,
    .rt_runq_scan                   = sched_rtlocal_runq_scan,
    .rt_runq_count_sum              = sched_rtlocal_runq_count_sum,

    .qos_max_parallelism            = sched_qos_max_parallelism,
    .check_spill                    = sched_check_spill,
    .ipi_policy                     = sched_ipi_policy,
    .thread_should_yield            = sched_thread_should_yield,
    .run_count_incr                 = sched_run_incr,
    .run_count_decr                 = sched_run_decr,
    .update_thread_bucket           = sched_update_thread_bucket,
    .pset_made_schedulable          = sched_pset_made_schedulable,
};
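
/*
 * Boot-args consumed by sched_multiq_init() below: the flags
 * -multiq-sanity-check and -multiq-deep-drain, plus multiq_drain_ceiling,
 * multiq_drain_depth_limit and multiq_drain_band_limit, which override the
 * DEFAULT_DRAIN_* values above (e.g., hypothetically,
 * boot-args="-multiq-deep-drain multiq_drain_ceiling=63").
 */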
static void
sched_multiq_init(void)
{
#if defined(MULTIQ_SANITY_CHECK)
    PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif /* MULTIQ_SANITY_CHECK */

    PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));

    if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
        drain_ceiling = DEFAULT_DRAIN_CEILING;
    }

    if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
        drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
    }

    if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
        drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
    }

    printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
        deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);

    queue_init(&sched_groups);

    sched_timeshare_init();
}
static void
sched_multiq_processor_init(processor_t processor)
{
    run_queue_init(&processor->runq);
}

static void
sched_multiq_pset_init(processor_set_t pset)
{
    run_queue_init(&pset->pset_runq);
}

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task)
{
    if (parent_task == kernel_task) {
        return TH_MODE_FIXED;
    } else {
        return TH_MODE_TIMESHARE;
    }
}
sched_group_t
sched_group_create(void)
{
    sched_group_t sched_group;

    if (!SCHED(sched_groups_enabled)) {
        return SCHED_GROUP_NULL;
    }

    sched_group = (sched_group_t)zalloc(sched_group_zone);

    bzero(sched_group, sizeof(struct sched_group));

    run_queue_init(&sched_group->runq);

    for (size_t i = 0; i < NRQS; i++) {
        sched_group->entries[i].runq = 0;
        sched_group->entries[i].sched_pri = (int16_t)i;
    }

    lck_mtx_lock(&sched_groups_lock);
    queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
    num_sched_groups++;
    lck_mtx_unlock(&sched_groups_lock);

    return sched_group;
}
void
sched_group_destroy(sched_group_t sched_group)
{
    if (!SCHED(sched_groups_enabled)) {
        assert(sched_group == SCHED_GROUP_NULL);
        return;
    }

    assert(sched_group != SCHED_GROUP_NULL);
    assert(sched_group->runq.count == 0);

    for (int i = 0; i < NRQS; i++) {
        assert(sched_group->entries[i].runq == 0);
        assert(sched_group->entries[i].sched_pri == i);
    }

    lck_mtx_lock(&sched_groups_lock);
    queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
    num_sched_groups--;
    lck_mtx_unlock(&sched_groups_lock);

    zfree(sched_group_zone, sched_group);
}
__attribute__((always_inline))
static inline entry_queue_t
multiq_main_entryq(processor_t processor)
{
    return (entry_queue_t)&processor->processor_set->pset_runq;
}

__attribute__((always_inline))
static inline run_queue_t
multiq_bound_runq(processor_t processor)
{
    return &processor->runq;
}

__attribute__((always_inline))
static inline sched_entry_t
group_entry_for_pri(sched_group_t group, integer_t pri)
{
    return &group->entries[pri];
}
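
/*
 * group_for_entry() below relies on two invariants: entries[] is the first
 * member of struct sched_group, and entries[i].sched_pri == i (established in
 * sched_group_create()).  Stepping the entry pointer back by sched_pri
 * elements therefore lands on &group->entries[0], which aliases the group.
 */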
__attribute__((always_inline))
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
    sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
#pragma clang diagnostic pop
    return group;
}
/* Peek at the head of the runqueue */
static sched_entry_t
entry_queue_first_entry(entry_queue_t rq)
{
    assert(rq->count != 0);

    circle_queue_t queue = &rq->queues[rq->highq];

    sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links);

    assert(entry->sched_pri == rq->highq);

    return entry;
}
#if defined(MULTIQ_SANITY_CHECK)

#if MACH_ASSERT
__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
{
    if (chain->next != NULL) {
        assert(chain->prev != NULL);
        return TRUE;
    } else {
        assert(chain->prev == NULL);
        return FALSE;
    }
}
#endif /* MACH_ASSERT */
static thread_t
group_first_thread(sched_group_t group)
{
    group_runq_t rq = &group->runq;

    assert(rq->count != 0);

    circle_queue_t queue = &rq->queues[rq->highq];

    thread_t thread = cqe_queue_first(queue, struct thread, runq_links);

    assert(thread != THREAD_NULL);
    assert_thread_magic(thread);

    assert(thread->sched_group == group);

    /* TODO: May not be safe */
    assert(thread->sched_pri == rq->highq);

    return thread;
}
/* Asserts if entry is not in entry runq at pri */
static void
entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
{
    circle_queue_t q;
    sched_entry_t elem;

    assert(queue_chain_linked(&entry->entry_links));
    assert(entry->runq == MULTIQ_ERUNQ);

    q = &runq->queues[expected_pri];

    cqe_foreach_element(elem, q, entry_links) {
        if (elem == entry) {
            return;
        }
    }

    panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
}
/* Asserts if thread is not in group at its priority */
static void
sched_group_check_thread(sched_group_t group, thread_t thread)
{
    circle_queue_t q;
    thread_t elem;
    int pri = thread->sched_pri;

    assert(thread->runq != PROCESSOR_NULL);

    q = &group->runq.queues[pri];

    cqe_foreach_element(elem, q, runq_links) {
        if (elem == thread) {
            return;
        }
    }

    panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
}
static void
global_check_entry_queue(entry_queue_t main_entryq)
{
    if (main_entryq->count == 0) {
        return;
    }

    sched_entry_t entry = entry_queue_first_entry(main_entryq);

    assert(entry->runq == MULTIQ_ERUNQ);

    sched_group_t group = group_for_entry(entry);

    thread_t thread = group_first_thread(group);

    __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

    assert(entry->sched_pri == group->runq.highq);

    assert(entry == thread_entry);
    assert(thread->runq != PROCESSOR_NULL);
}
static void
group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
{
    if (group->runq.count == 0) {
        return;
    }

    thread_t thread = group_first_thread(group);

    assert(thread->runq != PROCESSOR_NULL);

    sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

    entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);

    assert(sched_entry->sched_pri == thread->sched_pri);
    assert(sched_entry->runq == MULTIQ_ERUNQ);
}

#endif /* defined(MULTIQ_SANITY_CHECK) */
/*
 * The run queue must not be empty.
 */
static sched_entry_t
entry_queue_dequeue_entry(entry_queue_t rq)
{
    sched_entry_t sched_entry;
    circle_queue_t queue = &rq->queues[rq->highq];

    assert(rq->count > 0);
    assert(!circle_queue_empty(queue));

    sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(rq->highq)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }
    if (circle_queue_empty(queue)) {
        rq_bitmap_clear(rq->bitmap, rq->highq);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
    }

    sched_entry->runq = 0;

    return sched_entry;
}
/*
 * The run queue must not be empty.
 */
static boolean_t
entry_queue_enqueue_entry(
    entry_queue_t   rq,
    sched_entry_t   entry,
    sched_options_t options)
{
    int sched_pri = entry->sched_pri;
    circle_queue_t queue = &rq->queues[sched_pri];
    boolean_t result = FALSE;

    assert(entry->runq == 0);

    if (circle_queue_empty(queue)) {
        circle_enqueue_tail(queue, &entry->entry_links);

        rq_bitmap_set(rq->bitmap, sched_pri);
        if (sched_pri > rq->highq) {
            rq->highq = sched_pri;
            result = TRUE;
        }
    } else {
        if (options & SCHED_TAILQ) {
            circle_enqueue_tail(queue, &entry->entry_links);
        } else {
            circle_enqueue_head(queue, &entry->entry_links);
        }
    }
    if (SCHED(priority_is_urgent)(sched_pri)) {
        rq->urgency++;
    }
    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count++;

    entry->runq = MULTIQ_ERUNQ;

    return result;
}
/*
 * The entry must be in this runqueue.
 */
static void
entry_queue_remove_entry(
    entry_queue_t rq,
    sched_entry_t entry)
{
    int sched_pri = entry->sched_pri;

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        entry_queue_check_entry(rq, entry, sched_pri);
    }
#endif

    remqueue(&entry->entry_links);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(sched_pri)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }

    if (circle_queue_empty(&rq->queues[sched_pri])) {
        /* update run queue status */
        rq_bitmap_clear(rq->bitmap, sched_pri);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
    }

    entry->runq = 0;
}
static void
entry_queue_change_entry(
    entry_queue_t   rq,
    sched_entry_t   entry,
    sched_options_t options)
{
    int sched_pri = entry->sched_pri;
    circle_queue_t queue = &rq->queues[sched_pri];

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        entry_queue_check_entry(rq, entry, sched_pri);
    }
#endif

    circle_dequeue(queue, &entry->entry_links);
    if (options & SCHED_TAILQ) {
        circle_enqueue_tail(queue, &entry->entry_links);
    } else {
        circle_enqueue_head(queue, &entry->entry_links);
    }
}
/*
 * The run queue must not be empty.
 *
 * sets queue_empty to TRUE if queue is now empty at thread_pri
 */
static thread_t
group_run_queue_dequeue_thread(
    group_runq_t rq,
    integer_t    *thread_pri,
    boolean_t    *queue_empty)
{
    thread_t thread;
    circle_queue_t queue = &rq->queues[rq->highq];

    assert(rq->count > 0);
    assert(!circle_queue_empty(queue));

    *thread_pri = rq->highq;

    thread = cqe_dequeue_head(queue, struct thread, runq_links);
    assert_thread_magic(thread);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(rq->highq)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }
    if (circle_queue_empty(queue)) {
        rq_bitmap_clear(rq->bitmap, rq->highq);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
        *queue_empty = TRUE;
    } else {
        *queue_empty = FALSE;
    }

    return thread;
}
/*
 * The run queue must not be empty.
 * returns TRUE if queue was empty at thread_pri
 */
static boolean_t
group_run_queue_enqueue_thread(
    group_runq_t    rq,
    thread_t        thread,
    integer_t       thread_pri,
    sched_options_t options)
{
    circle_queue_t queue = &rq->queues[thread_pri];
    boolean_t result = FALSE;

    assert(thread->runq == PROCESSOR_NULL);
    assert_thread_magic(thread);

    if (circle_queue_empty(queue)) {
        circle_enqueue_tail(queue, &thread->runq_links);

        rq_bitmap_set(rq->bitmap, thread_pri);
        if (thread_pri > rq->highq) {
            rq->highq = thread_pri;
        }
        result = TRUE;
    } else {
        if (options & SCHED_TAILQ) {
            circle_enqueue_tail(queue, &thread->runq_links);
        } else {
            circle_enqueue_head(queue, &thread->runq_links);
        }
    }
    if (SCHED(priority_is_urgent)(thread_pri)) {
        rq->urgency++;
    }
    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count++;

    return result;
}
/*
 * The thread must be in this runqueue.
 * returns TRUE if queue is now empty at thread_pri
 */
static boolean_t
group_run_queue_remove_thread(
    group_runq_t rq,
    thread_t     thread,
    integer_t    thread_pri)
{
    circle_queue_t queue = &rq->queues[thread_pri];
    boolean_t result = FALSE;

    assert_thread_magic(thread);
    assert(thread->runq != PROCESSOR_NULL);

    circle_dequeue(queue, &thread->runq_links);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(thread_pri)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }

    if (circle_queue_empty(queue)) {
        /* update run queue status */
        rq_bitmap_clear(rq->bitmap, thread_pri);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
        result = TRUE;
    }

    thread->runq = PROCESSOR_NULL;

    return result;
}
/*
 * A thread's sched pri may change out from under us because
 * we're clearing thread->runq here without the thread locked.
 * Do not rely on it to be the same as when we enqueued.
 */
static thread_t
sched_global_dequeue_thread(entry_queue_t main_entryq)
{
    boolean_t pri_level_empty = FALSE;
    sched_entry_t entry;
    group_runq_t group_runq;
    thread_t thread;
    integer_t thread_pri;
    sched_group_t group;

    assert(main_entryq->count > 0);

    entry = entry_queue_dequeue_entry(main_entryq);

    group = group_for_entry(entry);
    group_runq = &group->runq;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (!pri_level_empty) {
        entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
    }

    return thread;
}
/* Dequeue a thread from the global runq without moving the entry */
static thread_t
sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
{
    boolean_t pri_level_empty = FALSE;
    sched_entry_t entry;
    group_runq_t group_runq;
    thread_t thread;
    integer_t thread_pri;
    sched_group_t group;

    assert(main_entryq->count > 0);

    entry = entry_queue_first_entry(main_entryq);

    group = group_for_entry(entry);
    group_runq = &group->runq;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, entry);
    }

    return thread;
}
static thread_t
sched_group_dequeue_thread(
    entry_queue_t main_entryq,
    sched_group_t group)
{
    group_runq_t group_runq = &group->runq;
    boolean_t pri_level_empty = FALSE;
    thread_t thread;
    integer_t thread_pri;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
    }

    return thread;
}
static void
sched_group_remove_thread(
    entry_queue_t main_entryq,
    sched_group_t group,
    thread_t      thread)
{
    integer_t thread_pri = thread->sched_pri;
    sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);

        sched_group_check_thread(group, thread);
        entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
    }
#endif

    boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, sched_entry);
    }

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif
}
static void
sched_group_enqueue_thread(
    entry_queue_t   main_entryq,
    sched_group_t   group,
    thread_t        thread,
    sched_options_t options)
{
#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif

    int sched_pri = thread->sched_pri;

    boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);

    if (pri_level_was_empty) {
        /*
         * TODO: Need to figure out if passing options here is a good idea or not
         * What effects would it have?
         */
        entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
    } else if (options & SCHED_HEADQ) {
        /* The thread should be at the head of the line - move its entry to the front */
        entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
    }
}
/*
 * Locate a thread to execute from the run queue and return it.
 * Only choose a thread with greater or equal priority.
 *
 * pset is locked, thread is not locked.
 *
 * Returns THREAD_NULL if it cannot find a valid thread.
 *
 * Note: we cannot rely on the value of thread->sched_pri in this path because
 * we don't have the thread locked.
 *
 * TODO: Remove tracepoints
 */
static thread_t
sched_multiq_choose_thread(
    processor_t processor,
    int         priority,
    ast_t       reason)
{
    entry_queue_t main_entryq = multiq_main_entryq(processor);
    run_queue_t   bound_runq  = multiq_bound_runq(processor);

    boolean_t choose_bound_runq = FALSE;

    if (bound_runq->highq < priority &&
        main_entryq->highq < priority) {
        return THREAD_NULL;
    }

    if (bound_runq->count && main_entryq->count) {
        if (bound_runq->highq >= main_entryq->highq) {
            choose_bound_runq = TRUE;
        } else {
            /* Use the main entry queue */
        }
    } else if (bound_runq->count) {
        choose_bound_runq = TRUE;
    } else if (main_entryq->count) {
        /* Use the main entry queue */
    } else {
        return THREAD_NULL;
    }

    if (choose_bound_runq) {
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
            MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);

        return run_queue_dequeue(bound_runq, SCHED_HEADQ);
    }

    sched_group_t group = current_thread()->sched_group;

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif

    /*
     * Determine if we should look at the group or the global queue
     *
     * TODO:
     * Perhaps pass reason as a 'should look inside' argument to choose_thread
     * Should YIELD AST override drain limit?
     */
    if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
        boolean_t favor_group = TRUE;

        integer_t global_pri = main_entryq->highq;
        integer_t group_pri  = group->runq.highq;

        /*
         * Favor the current group if the group is still the globally highest.
         *
         * Otherwise, consider choosing a thread from the current group
         * even if it's lower priority than the global highest priority.
         */
        if (global_pri > group_pri) {
            /*
             * If there's something elsewhere above the depth limit,
             * don't pick a thread below the limit.
             */
            if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit) {
                favor_group = FALSE;
            }

            /*
             * If there's something at or above the ceiling,
             * don't favor the group.
             */
            if (global_pri >= drain_ceiling) {
                favor_group = FALSE;
            }

            /*
             * Don't go more than X steps below the global highest
             */
            if ((global_pri - group_pri) >= drain_band_limit) {
                favor_group = FALSE;
            }
        }

        if (favor_group) {
            /* Pull from local runq */
            KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
                MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);

            return sched_group_dequeue_thread(main_entryq, group);
        }
    }

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
        MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);

    /* Couldn't pull from local runq, pull from global runq instead */
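    /*
     * With -multiq-deep-drain, the entry is deliberately left at the head of
     * the entry runq (see sched_global_deep_drain_dequeue_thread()), so we
     * keep draining the same group; the normal path cycles the dequeued
     * entry to the tail of its priority level.
     */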
    if (deep_drain) {
        return sched_global_deep_drain_dequeue_thread(main_entryq);
    } else {
        return sched_global_dequeue_thread(main_entryq);
    }
}
/*
 * Thread must be locked, and not already be on a run queue.
 */
static boolean_t
sched_multiq_processor_enqueue(
    processor_t     processor,
    thread_t        thread,
    sched_options_t options)
{
    boolean_t result;

    assert(processor == thread->chosen_processor);

    if (thread->bound_processor != PROCESSOR_NULL) {
        assert(thread->bound_processor == processor);

        result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
        thread->runq = processor;

        return result;
    }

    sched_group_enqueue_thread(multiq_main_entryq(processor),
        thread->sched_group,
        thread, options);

    thread->runq = processor;

    return FALSE;
}
/*
 * Called in the context of thread with thread and pset unlocked,
 * after updating thread priority but before propagating that priority
 * to the processor.
 */
static void
sched_multiq_quantum_expire(thread_t thread)
{
    /*
     * Move the entry at this priority to the end of the queue,
     * to allow the next task a shot at running.
     */

    processor_t processor = thread->last_processor;
    processor_set_t pset = processor->processor_set;
    entry_queue_t entryq = multiq_main_entryq(processor);

    pset_lock(pset);

    sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);

    if (entry->runq == MULTIQ_ERUNQ) {
        entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
    }

    pset_unlock(pset);
}
static boolean_t
sched_multiq_processor_queue_empty(processor_t processor)
{
    return multiq_main_entryq(processor)->count == 0 &&
           multiq_bound_runq(processor)->count == 0;
}
static ast_t
sched_multiq_processor_csw_check(processor_t processor)
{
    boolean_t has_higher;
    int pri;

    if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
        return AST_PREEMPT | AST_URGENT;
    }

    entry_queue_t main_entryq = multiq_main_entryq(processor);
    run_queue_t   bound_runq  = multiq_bound_runq(processor);

    assert(processor->active_thread != NULL);

    pri = MAX(main_entryq->highq, bound_runq->highq);

    if (processor->first_timeslice) {
        has_higher = (pri > processor->current_pri);
    } else {
        has_higher = (pri >= processor->current_pri);
    }

    if (has_higher) {
        if (main_entryq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        if (bound_runq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        return AST_PREEMPT;
    }

    return AST_NONE;
}
static boolean_t
sched_multiq_processor_queue_has_priority(
    processor_t processor,
    int         priority,
    boolean_t   gte)
{
    run_queue_t main_runq  = multiq_main_entryq(processor);
    run_queue_t bound_runq = multiq_bound_runq(processor);

    int qpri = MAX(main_runq->highq, bound_runq->highq);

    if (gte) {
        return qpri >= priority;
    } else {
        return qpri > priority;
    }
}
static int
sched_multiq_runq_count(processor_t processor)
{
    /*
     * TODO: Decide whether to keep a count of runnable threads in the pset
     * or just return something less than the true count.
     *
     * This needs to be fast, so no iterating the whole runq.
     *
     * Another possible decision is to remove this - with global runq
     * it doesn't make much sense.
     */
    return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
}
static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor)
{
    /*
     * TODO: This one does need to go through all the runqueues, but it's only needed for
     * the sched stats tool.
     */

    uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;

    if (processor->cpu_id == processor->processor_set->cpu_set_low) {
        return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
    } else {
        return bound_sum;
    }
}
static int
sched_multiq_processor_bound_count(processor_t processor)
{
    return multiq_bound_runq(processor)->count;
}
static void
sched_multiq_processor_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    entry_queue_t main_entryq = multiq_main_entryq(processor);
    thread_t thread;
    queue_head_t tqueue;

    /* We only need to migrate threads if this is the last active processor in the pset */
    if (pset->online_processor_count > 0) {
        pset_unlock(pset);
        return;
    }

    queue_init(&tqueue);

    /* Note that we do not remove bound threads from the queues here */

    while (main_entryq->count > 0) {
        thread = sched_global_dequeue_thread(main_entryq);
        enqueue_tail(&tqueue, &thread->runq_links);
    }

    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);

        thread_lock(thread);

        thread_setrun(thread, SCHED_TAILQ);

        thread_unlock(thread);
    }
}
/*
 * This is why we can never read sched_pri unless we have the thread locked.
 * Which we do in the enqueue and remove cases, but not the dequeue case.
 */
static boolean_t
sched_multiq_processor_queue_remove(
    processor_t processor,
    thread_t    thread)
{
    boolean_t removed = FALSE;
    processor_set_t pset = processor->processor_set;

    pset_lock(pset);

    if (thread->runq != PROCESSOR_NULL) {
        /*
         * Thread is on a run queue and we have a lock on
         * that run queue.
         */

        assert(thread->runq == processor);

        if (thread->bound_processor != PROCESSOR_NULL) {
            assert(processor == thread->bound_processor);
            run_queue_remove(multiq_bound_runq(processor), thread);
            thread->runq = PROCESSOR_NULL;
        } else {
            sched_group_remove_thread(multiq_main_entryq(processor),
                thread->sched_group,
                thread);
        }

        removed = TRUE;
    }

    pset_unlock(pset);

    return removed;
}
/* pset is locked, returned unlocked */
static thread_t
sched_multiq_steal_thread(processor_set_t pset)
{
    pset_unlock(pset);
    return THREAD_NULL;
}

/*
 * Scan the global queue for candidate groups, and scan those groups for
 * candidate threads.
 *
 * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
 * Instead, iterate only the queue in the group runq matching the priority of the entry.
 *
 * Returns TRUE if retry is needed.
 */
static boolean_t
group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context)
{
    int count = runq->count;
    int queue_index;

    if (count == 0) {
        return FALSE;
    }

    for (queue_index = bitmap_first(runq->bitmap, NRQS);
        queue_index >= 0;
        queue_index = bitmap_next(runq->bitmap, queue_index)) {
        sched_entry_t entry;

        cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
            sched_group_t group = group_for_entry(entry);
            if (group->runq.count > 0) {
                if (runq_scan(&group->runq, scan_context)) {
                    return TRUE;
                }
            }
        }
    }

    return FALSE;
}
static void
sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
{
    boolean_t restart_needed = FALSE;
    processor_t processor = processor_list;
    processor_set_t pset;
    thread_t thread;
    spl_t s;

    /*
     * We update the threads associated with each processor (bound and idle threads)
     * and then update the threads in each pset runqueue.
     */

    do {
        do {
            pset = processor->processor_set;

            s = splsched();
            pset_lock(pset);

            restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);

            pset_unlock(pset);
            splx(s);

            if (restart_needed) {
                break;
            }

            thread = processor->idle_thread;
            if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
                if (thread_update_add_thread(thread) == FALSE) {
                    restart_needed = TRUE;
                    break;
                }
            }
        } while ((processor = processor->processor_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);

    pset = &pset0;

    do {
        do {
            s = splsched();
            pset_lock(pset);

            restart_needed = group_scan(&pset->pset_runq, scan_context);

            pset_unlock(pset);
            splx(s);

            if (restart_needed) {
                break;
            }
        } while ((pset = pset->pset_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);
}
extern int sched_allow_rt_smt;

/* Return true if this thread should not continue running on this processor */
static bool
sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
{
    if (processor->processor_primary != processor) {
        /*
         * This is a secondary SMT processor.  If the primary is running
         * a realtime thread, only allow realtime threads on the secondary.
         */
        if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
            return true;
        }
    }

    return false;
}