osfmk/kern/sched_multiq.c

   1 /*
   2  * Copyright (c) 2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <mach/mach_types.h>
  30 #include <mach/machine.h>
  31
  32 #include <machine/machine_routines.h>
  33 #include <machine/sched_param.h>
  34 #include <machine/machine_cpu.h>
  35
  36 #include <kern/kern_types.h>
  37 #include <kern/debug.h>
  38 #include <kern/mach_param.h>
  39 #include <kern/machine.h>
  40 #include <kern/misc_protos.h>
  41 #include <kern/processor.h>
  42 #include <kern/queue.h>
  43 #include <kern/sched.h>
  44 #include <kern/sched_prim.h>
  45 #include <kern/task.h>
  46 #include <kern/thread.h>
  47
  48 #include <sys/kdebug.h>
  49
  50 /*
  51  * Theory Statement
  52  *
  53  * How does the task scheduler work?
  54  *
  55  * It schedules threads across a few levels.
  56  *
  57  * RT threads are dealt with above us
  58  * Bound threads go into the per-processor runq
  59  * Non-bound threads are linked on their task's sched_group's runq
  60  * sched_groups' sched_entries are linked on the pset's runq
  61  *
  62  * TODO: make this explicit - bound threads should have a different enqueue fxn
  63  *
  64  * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
  65  * or the current group's runqueue, then dequeue the next thread in that runqueue.
  66  *
  67  * We then manipulate the sched_entries to reflect the invariant that:
  68  * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
  69  * runqueue.
  70  *
  71  * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
  72  * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
  73  * as compared to processes which do the same amount of work under fewer threads.
  74  *
  75  * NOTE: Currently, the multiq scheduler only supports one pset.
  76  *
  77  * NOTE ABOUT thread->sched_pri:
  78  *
  79  * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
  80  * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
  81  *
  82  * TODO: Future features:
  83  *
  84  * Decouple the task priority from the sched_entry priority, allowing for:
  85  *      fast task priority change without having to iterate and re-dispatch all threads in the task.
  86  *              i.e. task-wide priority, task-wide boosting
  87  *      fancier group decay features
  88  *
  89  * Group (or task) decay:
  90  *      Decay is used for a few different things:
  91  *              Prioritizing latency-needing threads over throughput-needing threads for time-to-running
  92  *              Balancing work between threads in a process
  93  *              Balancing work done at the same priority between different processes
  94  *              Recovering from priority inversions between two threads in the same process
  95  *              Recovering from priority inversions between two threads in different processes
  96  *              Simulating a proportional share scheduler by allowing lower priority threads
  97  *                to run for a certain percentage of the time
  98  *
  99  *      Task decay lets us separately address the 'same process' and 'different process' needs,
 100  *      which will allow us to make smarter tradeoffs in different cases.
 101  *      For example, we could resolve priority inversion in the same process by reordering threads without dropping the
 102  *      process below low priority threads in other processes.
 103  *
 104  * One lock to rule them all (or at least all the runqueues) instead of the pset locks
 105  *
 106  * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
 107  * The entries array is 5K currently so it'd be really great to reduce.
 108  * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 109  *
 110  * When preempting a processor, store a flag saying if the preemption
 111  * was from a thread in the same group or different group,
 112  * and tell choose_thread about it.
 113  *
 114  * When choosing a processor, bias towards those running in the same
 115  * group as I am running (at the same priority, or within a certain band?).
 116  *
 117  * Decide if we need to support psets.
 118  * Decide how to support psets - do we need duplicate entries for each pset,
 119  * or can we get away with putting the entry in either one or the other pset?
 120  *
 121  * Consider the right way to handle runq count - I don't want to iterate groups.
 122  * Perhaps keep a global counter.
 123  * Alternate option - remove it from choose_processor. It doesn't add much value
 124  * now that we have global runq.
 125  *
 126  * Need a better way of finding group to target instead of looking at current_task.
 127  * Perhaps choose_thread could pass in the current thread?
 128  *
 129  * Consider unifying runq copy-pastes.
 130  *
 131  * Thoughts on having a group central quantum bucket:
 132  *
 133  * I see two algorithms to decide quanta:
 134  * A) Hand off only when switching thread to thread in the same group
 135  * B) Allocate and return quanta to the group's pool
 136  *
 137  * Issues:
 138  * If a task blocks completely, should it come back with the leftover quanta
 139  * or brand new quanta?
 140  *
 141  * Should I put a flag saying zero out a quanta you grab when youre dispatched'?
 142  *
 143  * Resolution:
 144  * Handing off quanta between threads will help with jumping around in the current task
 145  * but will not help when a thread from a different task is involved.
 146  * Need an algorithm that works with round robin-ing between threads in different tasks
 147  *
 148  * But wait - round robining can only be triggered by quantum expire or blocking.
 149  * We need something that works with preemption or yielding - that's the more interesting idea.
 150  *
 151  * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
 152  * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
 153  *
 154  * New algorithm -
 155  * Hand off quanta when hopping between threads with same sched_group
 156  * Even if thread was blocked it uses last thread remaining quanta when it starts.
 157  *
 158  * If we use the only cycle entry at quantum algorithm, then the quantum pool starts getting
 159  * interesting.
 160  *
 161  * A thought - perhaps the handoff approach doesn't work so well in the presence of
 162  * non-handoff wakeups i.e. wake other thread then wait then block - doesn't mean that
 163  * woken thread will be what I switch to - other processor may have stolen it.
 164  * What do we do there?
 165  *
 166  * Conclusions:
 167  * We currently don't know of a scenario where quantum buckets on the task is beneficial.
 168  * We will instead handoff quantum between threads in the task, and keep quantum
 169  * on the preempted thread if it's preempted by something outside the task.
 170  *
 171  */
 172
 173 #if DEBUG || DEVELOPMENT
 174 #define MULTIQ_SANITY_CHECK
 175 #endif
 176
 177 typedef struct sched_entry {
 178         queue_chain_t           entry_links;
 179         int16_t                 sched_pri;      /* scheduled (current) priority */
 180         int16_t                 runq;
 181         int32_t                 pad;
 182 } *sched_entry_t;
 183
 184 typedef run_queue_t entry_queue_t;                      /* A run queue that holds sched_entries instead of threads */
 185 typedef run_queue_t group_runq_t;                       /* A run queue that is part of a sched_group */
 186
 187 #define SCHED_ENTRY_NULL        ((sched_entry_t) 0)
 188 #define MULTIQ_ERUNQ            (-4)                    /* Indicates entry is on the main runq */
 189
 190 /* Each level in the run queue corresponds to one entry in the entries array */
 191 struct sched_group {
 192         struct sched_entry      entries[NRQS];
 193         struct run_queue        runq;
 194         queue_chain_t           sched_groups;
 195 };
 196
 197 /*
 198  * Keep entry on the head of the runqueue while dequeueing threads.
 199  * Only cycle it to the end of the runqueue when a thread in the task
 200  * hits its quantum.
 201  */
 202 static boolean_t        deep_drain = FALSE;
 203
 204 /* Verify the consistency of the runq before touching it */
 205 static boolean_t        multiq_sanity_check = FALSE;
 206
 207 /*
 208  * Draining threads from the current task is preferred
 209  * when they're less than X steps below the current
 210  * global highest priority
 211  */
 212 #define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
 213 static integer_t        drain_band_limit;
 214
 215 /*
 216  * Don't go below this priority level if there is something above it in another task
 217  */
 218 #define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
 219 static integer_t        drain_depth_limit;
 220
 221 /*
 222  * Don't favor the task when there's something above this priority in another task.
 223  */
 224 #define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
 225 static integer_t        drain_ceiling;
 226
 227 static struct zone      *sched_group_zone;
 228
 229 static uint64_t         num_sched_groups = 0;
 230 static queue_head_t     sched_groups;
 231
 232 static lck_attr_t       sched_groups_lock_attr;
 233 static lck_grp_t        sched_groups_lock_grp;
 234 static lck_grp_attr_t   sched_groups_lock_grp_attr;
 235
 236 static lck_mtx_t        sched_groups_lock;
 237
 238
 239 static void
 240 sched_multiq_init(void);
 241
 242 static thread_t
 243 sched_multiq_steal_thread(processor_set_t pset);
 244
 245 static void
 246 sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
 247
 248 static boolean_t
 249 sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
 250
 251 static boolean_t
 252 sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
 253
 254 void
 255 sched_multiq_quantum_expire(thread_t thread);
 256
 257 static ast_t
 258 sched_multiq_processor_csw_check(processor_t processor);
 259
 260 static boolean_t
 261 sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
 262
 263 static int
 264 sched_multiq_runq_count(processor_t processor);
 265
 266 static boolean_t
 267 sched_multiq_processor_queue_empty(processor_t processor);
 268
 269 static uint64_t
 270 sched_multiq_runq_stats_count_sum(processor_t processor);
 271
 272 static int
 273 sched_multiq_processor_bound_count(processor_t processor);
 274
 275 static void
 276 sched_multiq_pset_init(processor_set_t pset);
 277
 278 static void
 279 sched_multiq_processor_init(processor_t processor);
 280
 281 static thread_t
 282 sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);
 283
 284 static void
 285 sched_multiq_processor_queue_shutdown(processor_t processor);
 286
 287 static sched_mode_t
 288 sched_multiq_initial_thread_sched_mode(task_t parent_task);
 289
 290 const struct sched_dispatch_table sched_multiq_dispatch = {
 291         .sched_name                                     = "multiq",
 292         .init                                           = sched_multiq_init,
 293         .timebase_init                                  = sched_timeshare_timebase_init,
 294         .processor_init                                 = sched_multiq_processor_init,
 295         .pset_init                                      = sched_multiq_pset_init,
 296         .maintenance_continuation                       = sched_timeshare_maintenance_continue,
 297         .choose_thread                                  = sched_multiq_choose_thread,
 298         .steal_thread_enabled                           = FALSE,
 299         .steal_thread                                   = sched_multiq_steal_thread,
 300         .compute_timeshare_priority                     = sched_compute_timeshare_priority,
 301         .choose_processor                               = choose_processor,
 302         .processor_enqueue                              = sched_multiq_processor_enqueue,
 303         .processor_queue_shutdown                       = sched_multiq_processor_queue_shutdown,
 304         .processor_queue_remove                         = sched_multiq_processor_queue_remove,
 305         .processor_queue_empty                          = sched_multiq_processor_queue_empty,
 306         .priority_is_urgent                             = priority_is_urgent,
 307         .processor_csw_check                            = sched_multiq_processor_csw_check,
 308         .processor_queue_has_priority                   = sched_multiq_processor_queue_has_priority,
 309         .initial_quantum_size                           = sched_timeshare_initial_quantum_size,
 310         .initial_thread_sched_mode                      = sched_multiq_initial_thread_sched_mode,
 311         .can_update_priority                            = can_update_priority,
 312         .update_priority                                = update_priority,
 313         .lightweight_update_priority                    = lightweight_update_priority,
 314         .quantum_expire                                 = sched_multiq_quantum_expire,
 315         .processor_runq_count                           = sched_multiq_runq_count,
 316         .processor_runq_stats_count_sum                 = sched_multiq_runq_stats_count_sum,
 317         .processor_bound_count                          = sched_multiq_processor_bound_count,
 318         .thread_update_scan                             = sched_multiq_thread_update_scan,
 319         .direct_dispatch_to_idle_processors             = FALSE,
 320         .multiple_psets_enabled                         = FALSE,
 321         .sched_groups_enabled                           = TRUE,
 322         .avoid_processor_enabled                        = FALSE,
 323         .thread_avoid_processor                         = NULL,
 324         .processor_balance                              = sched_SMT_balance,
 325
 326         .rt_runq                                        = sched_rtglobal_runq,
 327         .rt_init                                        = sched_rtglobal_init,
 328         .rt_queue_shutdown                              = sched_rtglobal_queue_shutdown,
 329         .rt_runq_scan                                   = sched_rtglobal_runq_scan,
 330         .rt_runq_count_sum                              = sched_rtglobal_runq_count_sum,
 331
 332         .qos_max_parallelism                            = sched_qos_max_parallelism,
 333         .check_spill                                    = sched_check_spill,
 334         .ipi_policy                                     = sched_ipi_policy,
 335         .thread_should_yield                            = sched_thread_should_yield,
 336 };
 337
 338
 339 static void
 340 sched_multiq_init(void)
 341 {
 342 #if defined(MULTIQ_SANITY_CHECK)
 343         PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
 344 #endif
 345
 346         PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
 347
 348         if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
 349                 drain_ceiling = DEFAULT_DRAIN_CEILING;
 350         }
 351
 352         if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
 353                 drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
 354         }
 355
 356         if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
 357                 drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
 358         }
 359
 360         printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
 361                deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
 362
 363         sched_group_zone = zinit(
 364                                  sizeof(struct sched_group),
 365                                  task_max * sizeof(struct sched_group),
 366                                  PAGE_SIZE,
 367                                  "sched groups");
 368
 369         zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
 370         zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);
 371
 372         queue_init(&sched_groups);
 373
 374         lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
 375         lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
 376         lck_attr_setdefault(&sched_groups_lock_attr);
 377         lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
 378
 379         sched_timeshare_init();
 380 }
 381
 382 static void
 383 sched_multiq_processor_init(processor_t processor)
 384 {
 385         run_queue_init(&processor->runq);
 386 }
 387
 388 static void
 389 sched_multiq_pset_init(processor_set_t pset)
 390 {
 391         run_queue_init(&pset->pset_runq);
 392 }
 393
 394 static sched_mode_t
 395 sched_multiq_initial_thread_sched_mode(task_t parent_task)
 396 {
 397         if (parent_task == kernel_task)
 398                 return TH_MODE_FIXED;
 399         else
 400                 return TH_MODE_TIMESHARE;
 401 }
 402
 403 sched_group_t
 404 sched_group_create(void)
 405 {
 406         sched_group_t       sched_group;
 407
 408         if (!SCHED(sched_groups_enabled))
 409                 return SCHED_GROUP_NULL;
 410
 411         sched_group = (sched_group_t)zalloc(sched_group_zone);
 412
 413         bzero(sched_group, sizeof(struct sched_group));
 414
 415         run_queue_init(&sched_group->runq);
 416
 417         for (int i = 0; i < NRQS; i++) {
 418                 sched_group->entries[i].runq = 0;
 419                 sched_group->entries[i].sched_pri = i;
 420         }
 421
 422         lck_mtx_lock(&sched_groups_lock);
 423         queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
 424         num_sched_groups++;
 425         lck_mtx_unlock(&sched_groups_lock);
 426
 427         return (sched_group);
 428 }
 429
 430 void
 431 sched_group_destroy(sched_group_t sched_group)
 432 {
 433         if (!SCHED(sched_groups_enabled)) {
 434                 assert(sched_group == SCHED_GROUP_NULL);
 435                 return;
 436         }
 437
 438         assert(sched_group != SCHED_GROUP_NULL);
 439         assert(sched_group->runq.count == 0);
 440
 441         for (int i = 0; i < NRQS; i++) {
 442                 assert(sched_group->entries[i].runq == 0);
 443                 assert(sched_group->entries[i].sched_pri == i);
 444         }
 445
 446         lck_mtx_lock(&sched_groups_lock);
 447         queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
 448         num_sched_groups--;
 449         lck_mtx_unlock(&sched_groups_lock);
 450
 451         zfree(sched_group_zone, sched_group);
 452 }
 453
 454 __attribute__((always_inline))
 455 static inline entry_queue_t
 456 multiq_main_entryq(processor_t processor)
 457 {
 458         return (entry_queue_t)&processor->processor_set->pset_runq;
 459 }
 460
 461 __attribute__((always_inline))
 462 static inline run_queue_t
 463 multiq_bound_runq(processor_t processor)
 464 {
 465         return &processor->runq;
 466 }
 467
 468 __attribute__((always_inline))
 469 static inline sched_entry_t
 470 group_entry_for_pri(sched_group_t group, integer_t pri)
 471 {
 472         return &group->entries[pri];
 473 }
 474
 475 __attribute__((always_inline))
 476 static inline sched_group_t
 477 group_for_entry(sched_entry_t entry)
 478 {
 479 #pragma clang diagnostic push
 480 #pragma clang diagnostic ignored "-Wcast-align"
 481         sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
 482 #pragma clang diagnostic pop
 483         return group;
 484 }
 485
 486 /* Peek at the head of the runqueue */
 487 static sched_entry_t
 488 entry_queue_first_entry(entry_queue_t rq)
 489 {
 490         assert(rq->count != 0);
 491
 492         queue_t queue = &rq->queues[rq->highq];
 493
 494         sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
 495
 496         assert(entry->sched_pri == rq->highq);
 497
 498         return entry;
 499 }
 500
 501 #if defined(MULTIQ_SANITY_CHECK)
 502
 503 #if MACH_ASSERT
 504 __attribute__((always_inline))
 505 static inline boolean_t
 506 queue_chain_linked(queue_chain_t* chain)
 507 {
 508         if (chain->next != NULL) {
 509                 assert(chain->prev != NULL);
 510                 return TRUE;
 511         } else {
 512                 assert(chain->prev == NULL);
 513                 return FALSE;
 514         }
 515 }
 516 #endif /* MACH_ASSERT */
 517
 518 static thread_t
 519 group_first_thread(sched_group_t group)
 520 {
 521         group_runq_t rq = &group->runq;
 522
 523         assert(rq->count != 0);
 524
 525         queue_t queue = &rq->queues[rq->highq];
 526
 527         thread_t thread = qe_queue_first(queue, struct thread, runq_links);
 528
 529         assert(thread != THREAD_NULL);
 530         assert_thread_magic(thread);
 531
 532         assert(thread->sched_group == group);
 533
 534         /* TODO: May not be safe */
 535         assert(thread->sched_pri == rq->highq);
 536
 537         return thread;
 538 }
 539
 540 /* Asserts if entry is not in entry runq at pri */
 541 static void
 542 entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
 543 {
 544         queue_t q;
 545         sched_entry_t elem;
 546
 547         assert(queue_chain_linked(&entry->entry_links));
 548         assert(entry->runq == MULTIQ_ERUNQ);
 549
 550         q = &runq->queues[expected_pri];
 551
 552         qe_foreach_element(elem, q, entry_links) {
 553                 if (elem == entry)
 554                         return;
 555         }
 556
 557         panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
 558 }
 559
 560 /* Asserts if thread is not in group at its priority */
 561 static void
 562 sched_group_check_thread(sched_group_t group, thread_t thread)
 563 {
 564         queue_t q;
 565         thread_t elem;
 566         int pri = thread->sched_pri;
 567
 568         assert(thread->runq != PROCESSOR_NULL);
 569
 570         q = &group->runq.queues[pri];
 571
 572         qe_foreach_element(elem, q, runq_links) {
 573                 if (elem == thread)
 574                         return;
 575         }
 576
 577         panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
 578 }
 579
 580 static void
 581 global_check_entry_queue(entry_queue_t main_entryq)
 582 {
 583         if (main_entryq->count == 0)
 584                 return;
 585
 586         sched_entry_t entry = entry_queue_first_entry(main_entryq);
 587
 588         assert(entry->runq == MULTIQ_ERUNQ);
 589
 590         sched_group_t group = group_for_entry(entry);
 591
 592         thread_t thread = group_first_thread(group);
 593
 594         __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
 595
 596         assert(entry->sched_pri == group->runq.highq);
 597
 598         assert(entry == thread_entry);
 599         assert(thread->runq != PROCESSOR_NULL);
 600 }
 601
 602 static void
 603 group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
 604 {
 605         if (group->runq.count == 0)
 606                 return;
 607
 608         thread_t thread = group_first_thread(group);
 609
 610         assert(thread->runq != PROCESSOR_NULL);
 611
 612         sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
 613
 614         entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);
 615
 616         assert(sched_entry->sched_pri == thread->sched_pri);
 617         assert(sched_entry->runq == MULTIQ_ERUNQ);
 618 }
 619
 620 #endif /* defined(MULTIQ_SANITY_CHECK) */
 621
 622 /*
 623  * The run queue must not be empty.
 624  */
 625 static sched_entry_t
 626 entry_queue_dequeue_entry(entry_queue_t rq)
 627 {
 628         sched_entry_t   sched_entry;
 629         queue_t         queue = &rq->queues[rq->highq];
 630
 631         assert(rq->count > 0);
 632         assert(!queue_empty(queue));
 633
 634         sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
 635
 636         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 637         rq->count--;
 638         if (SCHED(priority_is_urgent)(rq->highq)) {
 639                 rq->urgency--; assert(rq->urgency >= 0);
 640         }
 641         if (queue_empty(queue)) {
 642                 rq_bitmap_clear(rq->bitmap, rq->highq);
 643                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 644         }
 645
 646         sched_entry->runq = 0;
 647
 648         return (sched_entry);
 649 }
 650
 651 /*
 652  * The run queue must not be empty.
 653  */
 654 static boolean_t
 655 entry_queue_enqueue_entry(
 656                           entry_queue_t rq,
 657                           sched_entry_t entry,
 658                           integer_t     options)
 659 {
 660         int             sched_pri = entry->sched_pri;
 661         queue_t         queue = &rq->queues[sched_pri];
 662         boolean_t       result = FALSE;
 663
 664         assert(entry->runq == 0);
 665
 666         if (queue_empty(queue)) {
 667                 enqueue_tail(queue, &entry->entry_links);
 668
 669                 rq_bitmap_set(rq->bitmap, sched_pri);
 670                 if (sched_pri > rq->highq) {
 671                         rq->highq = sched_pri;
 672                         result = TRUE;
 673                 }
 674         } else {
 675                 if (options & SCHED_TAILQ)
 676                         enqueue_tail(queue, &entry->entry_links);
 677                 else
 678                         enqueue_head(queue, &entry->entry_links);
 679         }
 680         if (SCHED(priority_is_urgent)(sched_pri))
 681                 rq->urgency++;
 682         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 683         rq->count++;
 684
 685         entry->runq = MULTIQ_ERUNQ;
 686
 687         return (result);
 688 }
 689
 690 /*
 691  * The entry must be in this runqueue.
 692  */
 693 static void
 694 entry_queue_remove_entry(
 695                          entry_queue_t  rq,
 696                          sched_entry_t  entry)
 697 {
 698         int sched_pri = entry->sched_pri;
 699
 700 #if defined(MULTIQ_SANITY_CHECK)
 701         if (multiq_sanity_check) {
 702                 entry_queue_check_entry(rq, entry, sched_pri);
 703         }
 704 #endif
 705
 706         remqueue(&entry->entry_links);
 707
 708         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 709         rq->count--;
 710         if (SCHED(priority_is_urgent)(sched_pri)) {
 711                 rq->urgency--; assert(rq->urgency >= 0);
 712         }
 713
 714         if (queue_empty(&rq->queues[sched_pri])) {
 715                 /* update run queue status */
 716                 rq_bitmap_clear(rq->bitmap, sched_pri);
 717                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 718         }
 719
 720         entry->runq = 0;
 721 }
 722
 723 static void
 724 entry_queue_change_entry(
 725                           entry_queue_t rq,
 726                           sched_entry_t entry,
 727                           integer_t     options)
 728 {
 729         int     sched_pri   = entry->sched_pri;
 730         queue_t queue       = &rq->queues[sched_pri];
 731
 732 #if defined(MULTIQ_SANITY_CHECK)
 733         if (multiq_sanity_check) {
 734                 entry_queue_check_entry(rq, entry, sched_pri);
 735         }
 736 #endif
 737
 738         if (options & SCHED_TAILQ)
 739                 re_queue_tail(queue, &entry->entry_links);
 740         else
 741                 re_queue_head(queue, &entry->entry_links);
 742 }
 743 /*
 744  * The run queue must not be empty.
 745  *
 746  * sets queue_empty to TRUE if queue is now empty at thread_pri
 747  */
 748 static thread_t
 749 group_run_queue_dequeue_thread(
 750                          group_runq_t   rq,
 751                          integer_t     *thread_pri,
 752                          boolean_t     *queue_empty)
 753 {
 754         thread_t        thread;
 755         queue_t         queue = &rq->queues[rq->highq];
 756
 757         assert(rq->count > 0);
 758         assert(!queue_empty(queue));
 759
 760         *thread_pri = rq->highq;
 761
 762         thread = qe_dequeue_head(queue, struct thread, runq_links);
 763         assert_thread_magic(thread);
 764
 765         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 766         rq->count--;
 767         if (SCHED(priority_is_urgent)(rq->highq)) {
 768                 rq->urgency--; assert(rq->urgency >= 0);
 769         }
 770         if (queue_empty(queue)) {
 771                 rq_bitmap_clear(rq->bitmap, rq->highq);
 772                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 773                 *queue_empty = TRUE;
 774         } else {
 775                 *queue_empty = FALSE;
 776         }
 777
 778         return thread;
 779 }
 780
 781 /*
 782  * The run queue must not be empty.
 783  * returns TRUE if queue was empty at thread_pri
 784  */
 785 static boolean_t
 786 group_run_queue_enqueue_thread(
 787                          group_runq_t   rq,
 788                          thread_t       thread,
 789                          integer_t      thread_pri,
 790                          integer_t      options)
 791 {
 792         queue_t         queue = &rq->queues[thread_pri];
 793         boolean_t       result = FALSE;
 794
 795         assert(thread->runq == PROCESSOR_NULL);
 796         assert_thread_magic(thread);
 797
 798         if (queue_empty(queue)) {
 799                 enqueue_tail(queue, &thread->runq_links);
 800
 801                 rq_bitmap_set(rq->bitmap, thread_pri);
 802                 if (thread_pri > rq->highq) {
 803                         rq->highq = thread_pri;
 804                 }
 805                 result = TRUE;
 806         } else {
 807                 if (options & SCHED_TAILQ)
 808                         enqueue_tail(queue, &thread->runq_links);
 809                 else
 810                         enqueue_head(queue, &thread->runq_links);
 811         }
 812         if (SCHED(priority_is_urgent)(thread_pri))
 813                 rq->urgency++;
 814         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 815         rq->count++;
 816
 817         return (result);
 818 }
 819
 820 /*
 821  * The thread must be in this runqueue.
 822  * returns TRUE if queue is now empty at thread_pri
 823  */
 824 static boolean_t
 825 group_run_queue_remove_thread(
 826                         group_runq_t    rq,
 827                         thread_t        thread,
 828                         integer_t       thread_pri)
 829 {
 830         boolean_t       result = FALSE;
 831
 832         assert_thread_magic(thread);
 833         assert(thread->runq != PROCESSOR_NULL);
 834
 835         remqueue(&thread->runq_links);
 836
 837         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
 838         rq->count--;
 839         if (SCHED(priority_is_urgent)(thread_pri)) {
 840                 rq->urgency--; assert(rq->urgency >= 0);
 841         }
 842
 843         if (queue_empty(&rq->queues[thread_pri])) {
 844                 /* update run queue status */
 845                 rq_bitmap_clear(rq->bitmap, thread_pri);
 846                 rq->highq = bitmap_first(rq->bitmap, NRQS);
 847                 result = TRUE;
 848         }
 849
 850         thread->runq = PROCESSOR_NULL;
 851
 852         return result;
 853 }
 854
 855 /*
 856  * A thread's sched pri may change out from under us because
 857  * we're clearing thread->runq here without the thread locked.
 858  * Do not rely on it to be the same as when we enqueued.
 859  */
 860 static thread_t
 861 sched_global_dequeue_thread(entry_queue_t main_entryq)
 862 {
 863         boolean_t pri_level_empty = FALSE;
 864         sched_entry_t entry;
 865         group_runq_t group_runq;
 866         thread_t thread;
 867         integer_t thread_pri;
 868         sched_group_t group;
 869
 870         assert(main_entryq->count > 0);
 871
 872         entry = entry_queue_dequeue_entry(main_entryq);
 873
 874         group = group_for_entry(entry);
 875         group_runq = &group->runq;
 876
 877         thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
 878
 879         thread->runq = PROCESSOR_NULL;
 880
 881         if (!pri_level_empty) {
 882                 entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
 883         }
 884
 885         return thread;
 886 }
 887
 888 /* Dequeue a thread from the global runq without moving the entry */
 889 static thread_t
 890 sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
 891 {
 892         boolean_t pri_level_empty = FALSE;
 893         sched_entry_t entry;
 894         group_runq_t group_runq;
 895         thread_t thread;
 896         integer_t thread_pri;
 897         sched_group_t group;
 898
 899         assert(main_entryq->count > 0);
 900
 901         entry = entry_queue_first_entry(main_entryq);
 902
 903         group = group_for_entry(entry);
 904         group_runq = &group->runq;
 905
 906         thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
 907
 908         thread->runq = PROCESSOR_NULL;
 909
 910         if (pri_level_empty) {
 911                 entry_queue_remove_entry(main_entryq, entry);
 912         }
 913
 914         return thread;
 915 }
 916
 917
 918 static thread_t
 919 sched_group_dequeue_thread(
 920                            entry_queue_t main_entryq,
 921                            sched_group_t group)
 922 {
 923         group_runq_t group_runq = &group->runq;
 924         boolean_t pri_level_empty = FALSE;
 925         thread_t thread;
 926         integer_t thread_pri;
 927
 928         thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
 929
 930         thread->runq = PROCESSOR_NULL;
 931
 932         if (pri_level_empty) {
 933                 entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
 934         }
 935
 936         return thread;
 937 }
 938
 939 static void
 940 sched_group_remove_thread(
 941                           entry_queue_t main_entryq,
 942                           sched_group_t group,
 943                           thread_t thread)
 944 {
 945         integer_t thread_pri = thread->sched_pri;
 946         sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
 947
 948 #if defined(MULTIQ_SANITY_CHECK)
 949         if (multiq_sanity_check) {
 950                 global_check_entry_queue(main_entryq);
 951                 group_check_run_queue(main_entryq, group);
 952
 953                 sched_group_check_thread(group, thread);
 954                 entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
 955         }
 956 #endif
 957
 958         boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);
 959
 960         if (pri_level_empty) {
 961                 entry_queue_remove_entry(main_entryq, sched_entry);
 962         }
 963
 964 #if defined(MULTIQ_SANITY_CHECK)
 965         if (multiq_sanity_check) {
 966                 global_check_entry_queue(main_entryq);
 967                 group_check_run_queue(main_entryq, group);
 968         }
 969 #endif
 970 }
 971
 972 static void
 973 sched_group_enqueue_thread(
 974                            entry_queue_t        main_entryq,
 975                            sched_group_t        group,
 976                            thread_t             thread,
 977                            integer_t            options)
 978 {
 979 #if defined(MULTIQ_SANITY_CHECK)
 980         if (multiq_sanity_check) {
 981                 global_check_entry_queue(main_entryq);
 982                 group_check_run_queue(main_entryq, group);
 983         }
 984 #endif
 985
 986         int sched_pri = thread->sched_pri;
 987
 988         boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
 989
 990         if (pri_level_was_empty) {
 991                 /*
 992                  * TODO: Need to figure out if passing options here is a good idea or not
 993                  * What effects would it have?
 994                  */
 995                 entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
 996         } else if (options & SCHED_HEADQ) {
 997                 /* The thread should be at the head of the line - move its entry to the front */
 998                 entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
 999         }
1000 }
1001
1002 /*
1003  *  Locate a thread to execute from the run queue and return it.
1004  *  Only choose a thread with greater or equal priority.
1005  *
1006  *  pset is locked, thread is not locked.
1007  *
1008  *  Returns THREAD_NULL if it cannot find a valid thread.
1009  *
1010  *  Note: we cannot rely on the value of thread->sched_pri in this path because
1011  *  we don't have the thread locked.
1012  *
1013  *  TODO: Remove tracepoints
1014  */
1015 static thread_t
1016 sched_multiq_choose_thread(
1017                            processor_t      processor,
1018                            int              priority,
1019                            ast_t            reason)
1020 {
1021         entry_queue_t   main_entryq = multiq_main_entryq(processor);
1022         run_queue_t     bound_runq  = multiq_bound_runq(processor);
1023
1024         boolean_t choose_bound_runq = FALSE;
1025
1026         if (bound_runq->highq  < priority &&
1027             main_entryq->highq < priority)
1028                 return THREAD_NULL;
1029
1030         if (bound_runq->count && main_entryq->count) {
1031                 if (bound_runq->highq >= main_entryq->highq) {
1032                         choose_bound_runq = TRUE;
1033                 } else {
1034                         /* Use main runq */
1035                 }
1036         } else if (bound_runq->count) {
1037                 choose_bound_runq = TRUE;
1038         } else if (main_entryq->count) {
1039                 /* Use main runq */
1040         } else {
1041                 return (THREAD_NULL);
1042         }
1043
1044         if (choose_bound_runq) {
1045                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1046                     MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1047                     MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);
1048
1049                 return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1050         }
1051
1052         sched_group_t group = current_thread()->sched_group;
1053
1054 #if defined(MULTIQ_SANITY_CHECK)
1055         if (multiq_sanity_check) {
1056                 global_check_entry_queue(main_entryq);
1057                 group_check_run_queue(main_entryq, group);
1058         }
1059 #endif
1060
1061         /*
1062          * Determine if we should look at the group or the global queue
1063          *
1064          * TODO:
1065          * Perhaps pass reason as a 'should look inside' argument to choose_thread
1066          * Should YIELD AST override drain limit?
1067          */
1068         if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
1069                 boolean_t favor_group = TRUE;
1070
1071                 integer_t global_pri = main_entryq->highq;
1072                 integer_t group_pri  = group->runq.highq;
1073
1074                 /*
1075                  * Favor the current group if the group is still the globally highest.
1076                  *
1077                  * Otherwise, consider choosing a thread from the current group
1078                  * even if it's lower priority than the global highest priority.
1079                  */
1080                 if (global_pri > group_pri) {
1081                         /*
1082                          * If there's something elsewhere above the depth limit,
1083                          * don't pick a thread below the limit.
1084                          */
1085                         if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
1086                                 favor_group = FALSE;
1087
1088                         /*
1089                          * If there's something at or above the ceiling,
1090                          * don't favor the group.
1091                          */
1092                         if (global_pri >= drain_ceiling)
1093                                 favor_group = FALSE;
1094
1095                         /*
1096                          * Don't go more than X steps below the global highest
1097                          */
1098                         if ((global_pri - group_pri) >= drain_band_limit)
1099                                 favor_group = FALSE;
1100                 }
1101
1102                 if (favor_group) {
1103                         /* Pull from local runq */
1104                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1105                             MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1106                             MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
1107
1108                         return sched_group_dequeue_thread(main_entryq, group);
1109                 }
1110         }
1111
1112         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1113             MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1114             MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);
1115
1116         /* Couldn't pull from local runq, pull from global runq instead */
1117         if (deep_drain) {
1118                 return sched_global_deep_drain_dequeue_thread(main_entryq);
1119         } else {
1120                 return sched_global_dequeue_thread(main_entryq);
1121         }
1122 }
1123
1124
1125 /*
1126  * Thread must be locked, and not already be on a run queue.
1127  * pset is locked.
1128  */
1129 static boolean_t
1130 sched_multiq_processor_enqueue(
1131                                processor_t      processor,
1132                                thread_t         thread,
1133                                integer_t        options)
1134 {
1135         boolean_t       result;
1136
1137         assert(processor == thread->chosen_processor);
1138
1139         if (thread->bound_processor != PROCESSOR_NULL) {
1140                 assert(thread->bound_processor == processor);
1141
1142                 result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
1143                 thread->runq = processor;
1144
1145                 return result;
1146         }
1147
1148         sched_group_enqueue_thread(multiq_main_entryq(processor),
1149                                    thread->sched_group,
1150                                    thread, options);
1151
1152         thread->runq = processor;
1153
1154         return (FALSE);
1155 }
1156
1157 /*
1158  * Called in the context of thread with thread and pset unlocked,
1159  * after updating thread priority but before propagating that priority
1160  * to the processor
1161  */
1162 void
1163 sched_multiq_quantum_expire(thread_t thread)
1164 {
1165         if (deep_drain) {
1166                 /*
1167                  * Move the entry at this priority to the end of the queue,
1168                  * to allow the next task a shot at running.
1169                  */
1170
1171                 processor_t processor = thread->last_processor;
1172                 processor_set_t pset = processor->processor_set;
1173                 entry_queue_t entryq = multiq_main_entryq(processor);
1174
1175                 pset_lock(pset);
1176
1177                 sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
1178
1179                 if (entry->runq == MULTIQ_ERUNQ) {
1180                         entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
1181                 }
1182
1183                 pset_unlock(pset);
1184         }
1185 }
1186
1187 static boolean_t
1188 sched_multiq_processor_queue_empty(processor_t processor)
1189 {
1190         return multiq_main_entryq(processor)->count == 0 &&
1191                multiq_bound_runq(processor)->count  == 0;
1192 }
1193
1194 static ast_t
1195 sched_multiq_processor_csw_check(processor_t processor)
1196 {
1197         boolean_t       has_higher;
1198         int             pri;
1199
1200         entry_queue_t main_entryq = multiq_main_entryq(processor);
1201         run_queue_t   bound_runq  = multiq_bound_runq(processor);
1202
1203         assert(processor->active_thread != NULL);
1204
1205         pri = MAX(main_entryq->highq, bound_runq->highq);
1206
1207         if (processor->first_timeslice) {
1208                 has_higher = (pri > processor->current_pri);
1209         } else {
1210                 has_higher = (pri >= processor->current_pri);
1211         }
1212
1213         if (has_higher) {
1214                 if (main_entryq->urgency > 0)
1215                         return (AST_PREEMPT | AST_URGENT);
1216
1217                 if (bound_runq->urgency > 0)
1218                         return (AST_PREEMPT | AST_URGENT);
1219
1220                 return AST_PREEMPT;
1221         }
1222
1223         return AST_NONE;
1224 }
1225
1226 static boolean_t
1227 sched_multiq_processor_queue_has_priority(
1228                                           processor_t   processor,
1229                                           int           priority,
1230                                           boolean_t     gte)
1231 {
1232         run_queue_t main_runq  = multiq_main_entryq(processor);
1233         run_queue_t bound_runq = multiq_bound_runq(processor);
1234
1235         int qpri = MAX(main_runq->highq, bound_runq->highq);
1236
1237         if (gte)
1238                 return qpri >= priority;
1239         else
1240                 return qpri > priority;
1241 }
1242
1243 static int
1244 sched_multiq_runq_count(processor_t processor)
1245 {
1246         /*
1247          *  TODO: Decide whether to keep a count of runnable threads in the pset
1248          *  or just return something less than the true count.
1249          *
1250          *  This needs to be fast, so no iterating the whole runq.
1251          *
1252          *  Another possible decision is to remove this - with global runq
1253          *  it doesn't make much sense.
1254          */
1255         return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
1256 }
1257
1258 static uint64_t
1259 sched_multiq_runq_stats_count_sum(processor_t processor)
1260 {
1261         /*
1262          * TODO: This one does need to go through all the runqueues, but it's only needed for
1263          * the sched stats tool
1264          */
1265
1266         uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;
1267
1268         if (processor->cpu_id == processor->processor_set->cpu_set_low)
1269                 return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
1270         else
1271                 return bound_sum;
1272 }
1273
1274 static int
1275 sched_multiq_processor_bound_count(processor_t processor)
1276 {
1277         return multiq_bound_runq(processor)->count;
1278 }
1279
1280 static void
1281 sched_multiq_processor_queue_shutdown(processor_t processor)
1282 {
1283         processor_set_t pset = processor->processor_set;
1284         entry_queue_t   main_entryq = multiq_main_entryq(processor);
1285         thread_t        thread;
1286         queue_head_t    tqueue;
1287
1288         /* We only need to migrate threads if this is the last active processor in the pset */
1289         if (pset->online_processor_count > 0) {
1290                 pset_unlock(pset);
1291                 return;
1292         }
1293
1294         queue_init(&tqueue);
1295
1296         /* Note that we do not remove bound threads from the queues here */
1297
1298         while (main_entryq->count > 0) {
1299                 thread = sched_global_dequeue_thread(main_entryq);
1300                 enqueue_tail(&tqueue, &thread->runq_links);
1301         }
1302
1303         pset_unlock(pset);
1304
1305         qe_foreach_element_safe(thread, &tqueue, runq_links) {
1306
1307                 remqueue(&thread->runq_links);
1308
1309                 thread_lock(thread);
1310
1311                 thread_setrun(thread, SCHED_TAILQ);
1312
1313                 thread_unlock(thread);
1314         }
1315 }
1316
1317 /*
1318  * Thread is locked
1319  *
1320  * This is why we can never read sched_pri unless we have the thread locked.
1321  * Which we do in the enqueue and remove cases, but not the dequeue case.
1322  */
1323 static boolean_t
1324 sched_multiq_processor_queue_remove(
1325                                     processor_t processor,
1326                                     thread_t    thread)
1327 {
1328         boolean_t removed = FALSE;
1329         processor_set_t pset = processor->processor_set;
1330
1331         pset_lock(pset);
1332
1333         if (thread->runq != PROCESSOR_NULL) {
1334                 /*
1335                  * Thread is on a run queue and we have a lock on
1336                  * that run queue.
1337                  */
1338
1339                 assert(thread->runq == processor);
1340
1341                 if (thread->bound_processor != PROCESSOR_NULL) {
1342                         assert(processor == thread->bound_processor);
1343                         run_queue_remove(multiq_bound_runq(processor), thread);
1344                         thread->runq = PROCESSOR_NULL;
1345                 } else {
1346                         sched_group_remove_thread(multiq_main_entryq(processor),
1347                                                   thread->sched_group,
1348                                                   thread);
1349                 }
1350
1351                 removed = TRUE;
1352         }
1353
1354         pset_unlock(pset);
1355
1356         return removed;
1357 }
1358
1359 /* pset is locked, returned unlocked */
1360 static thread_t
1361 sched_multiq_steal_thread(processor_set_t pset)
1362 {
1363         pset_unlock(pset);
1364         return (THREAD_NULL);
1365 }
1366
1367 /*
1368  * Scan the global queue for candidate groups, and scan those groups for
1369  * candidate threads.
1370  *
1371  * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
1372  *       Instead, iterate only the queue in the group runq matching the priority of the entry.
1373  *
1374  * Returns TRUE if retry is needed.
1375  */
1376 static boolean_t
1377 group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
1378         int count       = runq->count;
1379         int queue_index;
1380
1381         assert(count >= 0);
1382
1383         if (count == 0)
1384                 return FALSE;
1385
1386         for (queue_index = bitmap_first(runq->bitmap, NRQS);
1387              queue_index >= 0;
1388              queue_index = bitmap_next(runq->bitmap, queue_index)) {
1389
1390                 sched_entry_t entry;
1391
1392                 qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
1393                         assert(count > 0);
1394
1395                         sched_group_t group = group_for_entry(entry);
1396                         if (group->runq.count > 0) {
1397                                 if (runq_scan(&group->runq, scan_context))
1398                                         return (TRUE);
1399                         }
1400                         count--;
1401                 }
1402         }
1403
1404         return (FALSE);
1405 }
1406
1407 static void
1408 sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
1409 {
1410         boolean_t               restart_needed = FALSE;
1411         processor_t             processor = processor_list;
1412         processor_set_t         pset;
1413         thread_t                thread;
1414         spl_t                   s;
1415
1416         /*
1417          *  We update the threads associated with each processor (bound and idle threads)
1418          *  and then update the threads in each pset runqueue.
1419          */
1420
1421         do {
1422                 do {
1423                         pset = processor->processor_set;
1424
1425                         s = splsched();
1426                         pset_lock(pset);
1427
1428                         restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
1429
1430                         pset_unlock(pset);
1431                         splx(s);
1432
1433                         if (restart_needed)
1434                                 break;
1435
1436                         thread = processor->idle_thread;
1437                         if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1438                                 if (thread_update_add_thread(thread) == FALSE) {
1439                                         restart_needed = TRUE;
1440                                         break;
1441                                 }
1442                         }
1443                 } while ((processor = processor->processor_list) != NULL);
1444
1445                 /* Ok, we now have a collection of candidates -- fix them. */
1446                 thread_update_process_threads();
1447
1448         } while (restart_needed);
1449
1450         pset = &pset0;
1451
1452         do {
1453                 do {
1454                         s = splsched();
1455                         pset_lock(pset);
1456
1457                         restart_needed = group_scan(&pset->pset_runq, scan_context);
1458
1459                         pset_unlock(pset);
1460                         splx(s);
1461
1462                         if (restart_needed)
1463                                 break;
1464                 } while ((pset = pset->pset_list) != NULL);
1465
1466                 /* Ok, we now have a collection of candidates -- fix them. */
1467                 thread_update_process_threads();
1468
1469         } while (restart_needed);
1470 }