[apple/xnu.git] / osfmk / kern / sched_multiq.c (xnu-7195.101.1)
fe8ab488 1/*
f427ee49 2 * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31
32#include <machine/machine_routines.h>
33#include <machine/sched_param.h>
34#include <machine/machine_cpu.h>
35
36#include <kern/kern_types.h>
37#include <kern/debug.h>
38#include <kern/mach_param.h>
39#include <kern/machine.h>
40#include <kern/misc_protos.h>
41#include <kern/processor.h>
42#include <kern/queue.h>
43#include <kern/sched.h>
44#include <kern/sched_prim.h>
45#include <kern/task.h>
46#include <kern/thread.h>
47
48#include <sys/kdebug.h>
49
50/*
51 * Theory Statement
52 *
53 * How does the task scheduler work?
54 *
55 * It schedules threads across a few levels.
56 *
57 * RT threads are dealt with above us
58 * Bound threads go into the per-processor runq
59 * Non-bound threads are linked on their task's sched_group's runq
60 * sched_groups' sched_entries are linked on the pset's runq
61 *
62 * TODO: make this explicit - bound threads should have a different enqueue fxn
63 *
64 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
65 * or the current group's runqueue, then dequeue the next thread in that runqueue.
66 *
67 * We then manipulate the sched_entries to reflect the invariant that:
68 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
69 * runqueue.
70 *
71 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
72 * to run. This reduces the excess contention bonus given to processes which have work spread among many threads
73 * as compared to processes which do the same amount of work under fewer threads.
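 *
 * Worked example of that invariant (illustrative numbers): if task A has three
 * runnable threads at priority 31 and task B has one runnable thread at priority 31,
 * the global runqueue holds exactly one sched_entry per non-empty (group, priority)
 * level - one for A and one for B - so the two tasks alternate picks at that level
 * instead of A getting three times as many.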
74 *
75 * NOTE: Currently, the multiq scheduler only supports one pset.
76 *
77 * NOTE ABOUT thread->sched_pri:
78 *
79 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
80 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
81 *
82 * TODO: Future features:
83 *
84 * Decouple the task priority from the sched_entry priority, allowing for:
85 * fast task priority change without having to iterate and re-dispatch all threads in the task.
86 * i.e. task-wide priority, task-wide boosting
87 * fancier group decay features
88 *
89 * Group (or task) decay:
90 * Decay is used for a few different things:
91 * Prioritizing latency-needing threads over throughput-needing threads for time-to-running
92 * Balancing work between threads in a process
93 * Balancing work done at the same priority between different processes
94 * Recovering from priority inversions between two threads in the same process
95 * Recovering from priority inversions between two threads in different processes
96 * Simulating a proportional share scheduler by allowing lower priority threads
97 * to run for a certain percentage of the time
98 *
99 * Task decay lets us separately address the 'same process' and 'different process' needs,
100 * which will allow us to make smarter tradeoffs in different cases.
101 * For example, we could resolve priority inversion in the same process by reordering threads without dropping the
102 * process below low priority threads in other processes.
103 *
104 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
105 *
106 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
107 * The entries array is 5K currently so it'd be really great to reduce.
108 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
109 *
110 * When preempting a processor, store a flag saying if the preemption
111 * was from a thread in the same group or different group,
112 * and tell choose_thread about it.
113 *
114 * When choosing a processor, bias towards those running in the same
115 * group as I am running (at the same priority, or within a certain band?).
116 *
117 * Decide if we need to support psets.
118 * Decide how to support psets - do we need duplicate entries for each pset,
119 * or can we get away with putting the entry in either one or the other pset?
120 *
121 * Consider the right way to handle runq count - I don't want to iterate groups.
39037602 122 * Perhaps keep a global counter.
123 * Alternate option - remove it from choose_processor. It doesn't add much value
124 * now that we have global runq.
125 *
126 * Need a better way of finding group to target instead of looking at current_task.
127 * Perhaps choose_thread could pass in the current thread?
128 *
129 * Consider unifying runq copy-pastes.
130 *
131 * Thoughts on having a group central quantum bucket:
132 *
133 * I see two algorithms to decide quanta:
134 * A) Hand off only when switching thread to thread in the same group
135 * B) Allocate and return quanta to the group's pool
136 *
137 * Issues:
138 * If a task blocks completely, should it come back with the leftover quanta
139 * or brand new quanta?
140 *
 141 * Should I put a flag saying 'zero out a quantum you grab when you're dispatched'?
142 *
143 * Resolution:
144 * Handing off quanta between threads will help with jumping around in the current task
145 * but will not help when a thread from a different task is involved.
 146 * Need an algorithm that works with round-robining between threads in different tasks.
 147 *
 148 * But wait - round-robining can only be triggered by quantum expire or blocking.
149 * We need something that works with preemption or yielding - that's the more interesting idea.
150 *
151 * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
152 * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
153 *
154 * New algorithm -
 155 * Hand off quanta when hopping between threads with the same sched_group.
 156 * Even if a thread was blocked, it uses the last thread's remaining quantum when it starts.
157 *
 158 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum
 159 * pool starts getting interesting.
160 *
 161 * A thought - perhaps the handoff approach doesn't work so well in the presence of
 162 * non-handoff wakeups, i.e. wake another thread, then wait, then block - the woken
 163 * thread isn't necessarily what I switch to, since another processor may have stolen it.
164 * What do we do there?
165 *
166 * Conclusions:
167 * We currently don't know of a scenario where quantum buckets on the task is beneficial.
168 * We will instead handoff quantum between threads in the task, and keep quantum
169 * on the preempted thread if it's preempted by something outside the task.
170 *
171 */
172
173#if DEBUG || DEVELOPMENT
174#define MULTIQ_SANITY_CHECK
175#endif
176
177typedef struct sched_entry {
39037602 178 queue_chain_t entry_links;
179 int16_t sched_pri; /* scheduled (current) priority */
180 int16_t runq;
0a7de745 181 int32_t pad;
182} *sched_entry_t;
183
184typedef run_queue_t entry_queue_t; /* A run queue that holds sched_entries instead of threads */
185typedef run_queue_t group_runq_t; /* A run queue that is part of a sched_group */
186
187#define SCHED_ENTRY_NULL ((sched_entry_t) 0)
0a7de745 188#define MULTIQ_ERUNQ (-4) /* Indicates entry is on the main runq */
189
190/* Each level in the run queue corresponds to one entry in the entries array */
191struct sched_group {
192 struct sched_entry entries[NRQS];
193 struct run_queue runq;
194 queue_chain_t sched_groups;
195};
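/*
 * Note on the layout (restating the comment above): entries[i] stands for priority
 * level i of this group's runq, so a group contributes at most one entry per priority
 * level to the pset-wide entry queue. entries[] is the first member of the struct,
 * which group_for_entry() below depends on.
 */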
196
197/*
198 * Keep entry on the head of the runqueue while dequeueing threads.
199 * Only cycle it to the end of the runqueue when a thread in the task
200 * hits its quantum.
201 */
202static boolean_t deep_drain = FALSE;
203
204/* Verify the consistency of the runq before touching it */
205static boolean_t multiq_sanity_check = FALSE;
206
207/*
208 * Draining threads from the current task is preferred
209 * when they're less than X steps below the current
210 * global highest priority
211 */
212#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
213static integer_t drain_band_limit;
214
215/*
216 * Don't go below this priority level if there is something above it in another task
217 */
218#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
219static integer_t drain_depth_limit;
220
221/*
222 * Don't favor the task when there's something above this priority in another task.
223 */
224#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
225static integer_t drain_ceiling;
fe8ab488 226
227static ZONE_DECLARE(sched_group_zone, "sched groups",
228 sizeof(struct sched_group), ZC_NOENCRYPT | ZC_NOCALLOUT);
229
230static uint64_t num_sched_groups = 0;
231static queue_head_t sched_groups;
232
233static LCK_GRP_DECLARE(sched_groups_lock_grp, "sched_groups");
234static LCK_MTX_DECLARE(sched_groups_lock, &sched_groups_lock_grp);
235
236static void
237sched_multiq_init(void);
238
239static thread_t
240sched_multiq_steal_thread(processor_set_t pset);
241
242static void
3e170ce0 243sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
244
245static boolean_t
246sched_multiq_processor_enqueue(processor_t processor, thread_t thread,
247 sched_options_t options);
248
249static boolean_t
250sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
251
252void
253sched_multiq_quantum_expire(thread_t thread);
254
255static ast_t
256sched_multiq_processor_csw_check(processor_t processor);
257
258static boolean_t
259sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
260
261static int
262sched_multiq_runq_count(processor_t processor);
263
264static boolean_t
265sched_multiq_processor_queue_empty(processor_t processor);
266
267static uint64_t
268sched_multiq_runq_stats_count_sum(processor_t processor);
269
270static int
271sched_multiq_processor_bound_count(processor_t processor);
272
273static void
274sched_multiq_pset_init(processor_set_t pset);
275
276static void
277sched_multiq_processor_init(processor_t processor);
278
279static thread_t
280sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);
281
282static void
283sched_multiq_processor_queue_shutdown(processor_t processor);
284
285static sched_mode_t
286sched_multiq_initial_thread_sched_mode(task_t parent_task);
287
288static bool
289sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
290
fe8ab488 291const struct sched_dispatch_table sched_multiq_dispatch = {
3e170ce0 292 .sched_name = "multiq",
fe8ab488 293 .init = sched_multiq_init,
3e170ce0 294 .timebase_init = sched_timeshare_timebase_init,
295 .processor_init = sched_multiq_processor_init,
296 .pset_init = sched_multiq_pset_init,
3e170ce0 297 .maintenance_continuation = sched_timeshare_maintenance_continue,
fe8ab488 298 .choose_thread = sched_multiq_choose_thread,
0a7de745 299 .steal_thread_enabled = sched_steal_thread_DISABLED,
fe8ab488 300 .steal_thread = sched_multiq_steal_thread,
3e170ce0 301 .compute_timeshare_priority = sched_compute_timeshare_priority,
f427ee49 302 .choose_node = sched_choose_node,
303 .choose_processor = choose_processor,
304 .processor_enqueue = sched_multiq_processor_enqueue,
305 .processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
306 .processor_queue_remove = sched_multiq_processor_queue_remove,
307 .processor_queue_empty = sched_multiq_processor_queue_empty,
308 .priority_is_urgent = priority_is_urgent,
309 .processor_csw_check = sched_multiq_processor_csw_check,
310 .processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
3e170ce0 311 .initial_quantum_size = sched_timeshare_initial_quantum_size,
312 .initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
313 .can_update_priority = can_update_priority,
314 .update_priority = update_priority,
315 .lightweight_update_priority = lightweight_update_priority,
316 .quantum_expire = sched_multiq_quantum_expire,
317 .processor_runq_count = sched_multiq_runq_count,
318 .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
319 .processor_bound_count = sched_multiq_processor_bound_count,
320 .thread_update_scan = sched_multiq_thread_update_scan,
321 .multiple_psets_enabled = FALSE,
322 .sched_groups_enabled = TRUE,
323 .avoid_processor_enabled = TRUE,
324 .thread_avoid_processor = sched_multiq_thread_avoid_processor,
325 .processor_balance = sched_SMT_balance,
326
327 .rt_runq = sched_rtlocal_runq,
328 .rt_init = sched_rtlocal_init,
329 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
330 .rt_runq_scan = sched_rtlocal_runq_scan,
331 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
332
333 .qos_max_parallelism = sched_qos_max_parallelism,
334 .check_spill = sched_check_spill,
335 .ipi_policy = sched_ipi_policy,
336 .thread_should_yield = sched_thread_should_yield,
337 .run_count_incr = sched_run_incr,
338 .run_count_decr = sched_run_decr,
339 .update_thread_bucket = sched_update_thread_bucket,
340 .pset_made_schedulable = sched_pset_made_schedulable,
341};
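/*
 * Illustrative note: the scheduler core reaches these methods through the SCHED()
 * dispatch macro rather than by name (this file itself uses SCHED(priority_is_urgent)
 * and SCHED(sched_groups_enabled)). A sketch of a call through the table:
 *
 *	if (SCHED(processor_enqueue)(processor, thread, SCHED_TAILQ))
 *		... the caller may need to cause a preemption ...
 *
 * The real call sites live in the scheduler core (sched_prim.c); the snippet above
 * only shows the indirection.
 */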
342
343
344static void
345sched_multiq_init(void)
346{
347#if defined(MULTIQ_SANITY_CHECK)
348 PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
349#endif
350
351 PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
352
353 if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
354 drain_ceiling = DEFAULT_DRAIN_CEILING;
355 }
356
357 if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
358 drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
359 }
360
361 if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
362 drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
363 }
364
3e170ce0 365 printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
0a7de745 366 deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
fe8ab488 367
368 queue_init(&sched_groups);
369
3e170ce0 370 sched_timeshare_init();
371}
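/*
 * Example (for experimentation only; the names come from the PE_parse_boot_argn()
 * calls above and the values are arbitrary): the drain behavior can be tuned from
 * the boot-args string, e.g.
 *
 *	-multiq-deep-drain multiq_drain_ceiling=64 multiq_drain_depth_limit=4
 *
 * which enables deep drain and overrides the ceiling and depth limits reported by
 * the printf above.
 */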
372
373static void
374sched_multiq_processor_init(processor_t processor)
375{
376 run_queue_init(&processor->runq);
377}
378
379static void
380sched_multiq_pset_init(processor_set_t pset)
381{
382 run_queue_init(&pset->pset_runq);
383}
384
385static sched_mode_t
386sched_multiq_initial_thread_sched_mode(task_t parent_task)
387{
0a7de745 388 if (parent_task == kernel_task) {
fe8ab488 389 return TH_MODE_FIXED;
0a7de745 390 } else {
fe8ab488 391 return TH_MODE_TIMESHARE;
0a7de745 392 }
393}
394
395sched_group_t
396sched_group_create(void)
397{
398 sched_group_t sched_group;
399
0a7de745 400 if (!SCHED(sched_groups_enabled)) {
fe8ab488 401 return SCHED_GROUP_NULL;
0a7de745 402 }
403
404 sched_group = (sched_group_t)zalloc(sched_group_zone);
405
406 bzero(sched_group, sizeof(struct sched_group));
407
408 run_queue_init(&sched_group->runq);
409
f427ee49 410 for (size_t i = 0; i < NRQS; i++) {
fe8ab488 411 sched_group->entries[i].runq = 0;
f427ee49 412 sched_group->entries[i].sched_pri = (int16_t)i;
413 }
414
415 lck_mtx_lock(&sched_groups_lock);
416 queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
417 num_sched_groups++;
418 lck_mtx_unlock(&sched_groups_lock);
419
0a7de745 420 return sched_group;
421}
422
423void
424sched_group_destroy(sched_group_t sched_group)
425{
3e170ce0 426 if (!SCHED(sched_groups_enabled)) {
427 assert(sched_group == SCHED_GROUP_NULL);
428 return;
429 }
430
431 assert(sched_group != SCHED_GROUP_NULL);
432 assert(sched_group->runq.count == 0);
433
434 for (int i = 0; i < NRQS; i++) {
435 assert(sched_group->entries[i].runq == 0);
436 assert(sched_group->entries[i].sched_pri == i);
437 }
438
439 lck_mtx_lock(&sched_groups_lock);
440 queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
441 num_sched_groups--;
442 lck_mtx_unlock(&sched_groups_lock);
443
444 zfree(sched_group_zone, sched_group);
445}
446
447__attribute__((always_inline))
448static inline entry_queue_t
449multiq_main_entryq(processor_t processor)
450{
451 return (entry_queue_t)&processor->processor_set->pset_runq;
452}
453
454__attribute__((always_inline))
455static inline run_queue_t
456multiq_bound_runq(processor_t processor)
457{
458 return &processor->runq;
459}
460
461__attribute__((always_inline))
462static inline sched_entry_t
463group_entry_for_pri(sched_group_t group, integer_t pri)
464{
465 return &group->entries[pri];
466}
467
468__attribute__((always_inline))
469static inline sched_group_t
470group_for_entry(sched_entry_t entry)
471{
472#pragma clang diagnostic push
473#pragma clang diagnostic ignored "-Wcast-align"
fe8ab488 474 sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
39037602 475#pragma clang diagnostic pop
fe8ab488 476 return group;
0a7de745 477}
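/*
 * Why the arithmetic above is safe (sketch): entries[] is the first member of
 * struct sched_group and sched_group_create() sets entries[i].sched_pri = i, so for
 * an entry pointer e taken from some group's array:
 *
 *	e - e->sched_pri == &group->entries[0]
 *
 * and &group->entries[0] has the same address as the group itself, which is what
 * the cast returns.
 */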
478
479/* Peek at the head of the runqueue */
480static sched_entry_t
481entry_queue_first_entry(entry_queue_t rq)
482{
483 assert(rq->count != 0);
484
cb323159 485 circle_queue_t queue = &rq->queues[rq->highq];
fe8ab488 486
cb323159 487 sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links);
488
489 assert(entry->sched_pri == rq->highq);
490
491 return entry;
492}
493
494#if defined(MULTIQ_SANITY_CHECK)
495
3e170ce0 496#if MACH_ASSERT
497__attribute__((always_inline))
498static inline boolean_t
499queue_chain_linked(queue_chain_t* chain)
500{
501 if (chain->next != NULL) {
502 assert(chain->prev != NULL);
503 return TRUE;
504 } else {
505 assert(chain->prev == NULL);
506 return FALSE;
507 }
508}
3e170ce0 509#endif /* MACH_ASSERT */
510
511static thread_t
512group_first_thread(sched_group_t group)
513{
514 group_runq_t rq = &group->runq;
515
516 assert(rq->count != 0);
517
cb323159 518 circle_queue_t queue = &rq->queues[rq->highq];
fe8ab488 519
cb323159 520 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
521
522 assert(thread != THREAD_NULL);
39037602 523 assert_thread_magic(thread);
524
525 assert(thread->sched_group == group);
526
527 /* TODO: May not be safe */
528 assert(thread->sched_pri == rq->highq);
529
530 return thread;
531}
532
533/* Asserts if entry is not in entry runq at pri */
534static void
535entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
536{
cb323159 537 circle_queue_t q;
538 sched_entry_t elem;
539
39037602 540 assert(queue_chain_linked(&entry->entry_links));
541 assert(entry->runq == MULTIQ_ERUNQ);
542
543 q = &runq->queues[expected_pri];
544
cb323159 545 cqe_foreach_element(elem, q, entry_links) {
0a7de745 546 if (elem == entry) {
fe8ab488 547 return;
0a7de745 548 }
549 }
550
551 panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
552}
553
554/* Asserts if thread is not in group at its priority */
555static void
556sched_group_check_thread(sched_group_t group, thread_t thread)
557{
cb323159 558 circle_queue_t q;
559 thread_t elem;
560 int pri = thread->sched_pri;
561
562 assert(thread->runq != PROCESSOR_NULL);
563
564 q = &group->runq.queues[pri];
565
cb323159 566 cqe_foreach_element(elem, q, runq_links) {
0a7de745 567 if (elem == thread) {
fe8ab488 568 return;
0a7de745 569 }
570 }
571
572 panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
573}
574
575static void
576global_check_entry_queue(entry_queue_t main_entryq)
577{
0a7de745 578 if (main_entryq->count == 0) {
fe8ab488 579 return;
0a7de745 580 }
581
582 sched_entry_t entry = entry_queue_first_entry(main_entryq);
583
584 assert(entry->runq == MULTIQ_ERUNQ);
585
586 sched_group_t group = group_for_entry(entry);
587
588 thread_t thread = group_first_thread(group);
589
590 __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
591
592 assert(entry->sched_pri == group->runq.highq);
593
594 assert(entry == thread_entry);
595 assert(thread->runq != PROCESSOR_NULL);
596}
597
598static void
599group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
600{
0a7de745 601 if (group->runq.count == 0) {
fe8ab488 602 return;
0a7de745 603 }
604
605 thread_t thread = group_first_thread(group);
606
607 assert(thread->runq != PROCESSOR_NULL);
608
609 sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
610
611 entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);
612
613 assert(sched_entry->sched_pri == thread->sched_pri);
614 assert(sched_entry->runq == MULTIQ_ERUNQ);
615}
616
617#endif /* defined(MULTIQ_SANITY_CHECK) */
618
619/*
620 * The run queue must not be empty.
621 */
622static sched_entry_t
623entry_queue_dequeue_entry(entry_queue_t rq)
624{
625 sched_entry_t sched_entry;
cb323159 626 circle_queue_t queue = &rq->queues[rq->highq];
627
628 assert(rq->count > 0);
cb323159 629 assert(!circle_queue_empty(queue));
fe8ab488 630
cb323159 631 sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links);
632
633 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
634 rq->count--;
635 if (SCHED(priority_is_urgent)(rq->highq)) {
636 rq->urgency--; assert(rq->urgency >= 0);
637 }
cb323159 638 if (circle_queue_empty(queue)) {
639 rq_bitmap_clear(rq->bitmap, rq->highq);
640 rq->highq = bitmap_first(rq->bitmap, NRQS);
641 }
642
643 sched_entry->runq = 0;
644
0a7de745 645 return sched_entry;
646}
647
648/*
649 * The run queue must not be empty.
650 */
651static boolean_t
652entry_queue_enqueue_entry(
653 entry_queue_t rq,
654 sched_entry_t entry,
655 integer_t options)
656{
657 int sched_pri = entry->sched_pri;
cb323159 658 circle_queue_t queue = &rq->queues[sched_pri];
659 boolean_t result = FALSE;
660
661 assert(entry->runq == 0);
662
663 if (circle_queue_empty(queue)) {
664 circle_enqueue_tail(queue, &entry->entry_links);
fe8ab488 665
39037602 666 rq_bitmap_set(rq->bitmap, sched_pri);
667 if (sched_pri > rq->highq) {
668 rq->highq = sched_pri;
669 result = TRUE;
670 }
671 } else {
0a7de745 672 if (options & SCHED_TAILQ) {
cb323159 673 circle_enqueue_tail(queue, &entry->entry_links);
0a7de745 674 } else {
cb323159 675 circle_enqueue_head(queue, &entry->entry_links);
0a7de745 676 }
fe8ab488 677 }
0a7de745 678 if (SCHED(priority_is_urgent)(sched_pri)) {
fe8ab488 679 rq->urgency++;
0a7de745 680 }
681 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
682 rq->count++;
683
684 entry->runq = MULTIQ_ERUNQ;
685
0a7de745 686 return result;
687}
688
689/*
690 * The entry must be in this runqueue.
691 */
692static void
693entry_queue_remove_entry(
694 entry_queue_t rq,
695 sched_entry_t entry)
696{
697 int sched_pri = entry->sched_pri;
698
699#if defined(MULTIQ_SANITY_CHECK)
700 if (multiq_sanity_check) {
701 entry_queue_check_entry(rq, entry, sched_pri);
702 }
703#endif
704
39037602 705 remqueue(&entry->entry_links);
706
707 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
708 rq->count--;
709 if (SCHED(priority_is_urgent)(sched_pri)) {
710 rq->urgency--; assert(rq->urgency >= 0);
711 }
712
cb323159 713 if (circle_queue_empty(&rq->queues[sched_pri])) {
fe8ab488 714 /* update run queue status */
715 rq_bitmap_clear(rq->bitmap, sched_pri);
716 rq->highq = bitmap_first(rq->bitmap, NRQS);
717 }
718
719 entry->runq = 0;
720}
721
722static void
723entry_queue_change_entry(
724 entry_queue_t rq,
725 sched_entry_t entry,
726 integer_t options)
3e170ce0 727{
728 int sched_pri = entry->sched_pri;
729 circle_queue_t queue = &rq->queues[sched_pri];
730
731#if defined(MULTIQ_SANITY_CHECK)
732 if (multiq_sanity_check) {
733 entry_queue_check_entry(rq, entry, sched_pri);
734 }
735#endif
3e170ce0 736
cb323159 737 circle_dequeue(queue, &entry->entry_links);
0a7de745 738 if (options & SCHED_TAILQ) {
cb323159 739 circle_enqueue_tail(queue, &entry->entry_links);
0a7de745 740 } else {
cb323159 741 circle_enqueue_head(queue, &entry->entry_links);
0a7de745 742 }
3e170ce0 743}
744/*
745 * The run queue must not be empty.
746 *
747 * sets queue_empty to TRUE if queue is now empty at thread_pri
748 */
749static thread_t
750group_run_queue_dequeue_thread(
751 group_runq_t rq,
752 integer_t *thread_pri,
753 boolean_t *queue_empty)
754{
755 thread_t thread;
cb323159 756 circle_queue_t queue = &rq->queues[rq->highq];
757
758 assert(rq->count > 0);
cb323159 759 assert(!circle_queue_empty(queue));
760
761 *thread_pri = rq->highq;
762
cb323159 763 thread = cqe_dequeue_head(queue, struct thread, runq_links);
39037602 764 assert_thread_magic(thread);
765
766 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
767 rq->count--;
768 if (SCHED(priority_is_urgent)(rq->highq)) {
769 rq->urgency--; assert(rq->urgency >= 0);
770 }
cb323159 771 if (circle_queue_empty(queue)) {
772 rq_bitmap_clear(rq->bitmap, rq->highq);
773 rq->highq = bitmap_first(rq->bitmap, NRQS);
774 *queue_empty = TRUE;
775 } else {
776 *queue_empty = FALSE;
777 }
778
39037602 779 return thread;
780}
781
782/*
783 * The run queue must not be empty.
784 * returns TRUE if queue was empty at thread_pri
785 */
786static boolean_t
787group_run_queue_enqueue_thread(
788 group_runq_t rq,
789 thread_t thread,
790 integer_t thread_pri,
791 integer_t options)
fe8ab488 792{
cb323159 793 circle_queue_t queue = &rq->queues[thread_pri];
794 boolean_t result = FALSE;
795
796 assert(thread->runq == PROCESSOR_NULL);
39037602 797 assert_thread_magic(thread);
fe8ab488 798
799 if (circle_queue_empty(queue)) {
800 circle_enqueue_tail(queue, &thread->runq_links);
fe8ab488 801
39037602 802 rq_bitmap_set(rq->bitmap, thread_pri);
803 if (thread_pri > rq->highq) {
804 rq->highq = thread_pri;
805 }
806 result = TRUE;
807 } else {
0a7de745 808 if (options & SCHED_TAILQ) {
cb323159 809 circle_enqueue_tail(queue, &thread->runq_links);
0a7de745 810 } else {
cb323159 811 circle_enqueue_head(queue, &thread->runq_links);
0a7de745 812 }
fe8ab488 813 }
0a7de745 814 if (SCHED(priority_is_urgent)(thread_pri)) {
fe8ab488 815 rq->urgency++;
0a7de745 816 }
817 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
818 rq->count++;
819
0a7de745 820 return result;
821}
822
823/*
824 * The thread must be in this runqueue.
825 * returns TRUE if queue is now empty at thread_pri
826 */
827static boolean_t
828group_run_queue_remove_thread(
829 group_runq_t rq,
830 thread_t thread,
831 integer_t thread_pri)
fe8ab488 832{
cb323159 833 circle_queue_t queue = &rq->queues[thread_pri];
834 boolean_t result = FALSE;
835
39037602 836 assert_thread_magic(thread);
837 assert(thread->runq != PROCESSOR_NULL);
838
cb323159 839 circle_dequeue(queue, &thread->runq_links);
840
841 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
842 rq->count--;
843 if (SCHED(priority_is_urgent)(thread_pri)) {
844 rq->urgency--; assert(rq->urgency >= 0);
845 }
846
cb323159 847 if (circle_queue_empty(queue)) {
fe8ab488 848 /* update run queue status */
849 rq_bitmap_clear(rq->bitmap, thread_pri);
850 rq->highq = bitmap_first(rq->bitmap, NRQS);
851 result = TRUE;
852 }
853
854 thread->runq = PROCESSOR_NULL;
855
856 return result;
857}
858
859/*
860 * A thread's sched pri may change out from under us because
861 * we're clearing thread->runq here without the thread locked.
862 * Do not rely on it to be the same as when we enqueued.
863 */
864static thread_t
865sched_global_dequeue_thread(entry_queue_t main_entryq)
866{
867 boolean_t pri_level_empty = FALSE;
868 sched_entry_t entry;
869 group_runq_t group_runq;
870 thread_t thread;
871 integer_t thread_pri;
872 sched_group_t group;
873
874 assert(main_entryq->count > 0);
875
876 entry = entry_queue_dequeue_entry(main_entryq);
877
878 group = group_for_entry(entry);
879 group_runq = &group->runq;
880
881 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
882
883 thread->runq = PROCESSOR_NULL;
884
885 if (!pri_level_empty) {
886 entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
887 }
888
889 return thread;
890}
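/*
 * Illustrative walk-through of the above: if group G holds two threads at priority 31
 * and its entry is the global highest, the entry is dequeued, G's first thread at 31
 * is popped, and because that level is still non-empty the entry is re-enqueued at
 * the tail of the global priority 31 queue - so other groups queued at 31 get a pick
 * before G does again.
 */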
891
892/* Dequeue a thread from the global runq without moving the entry */
893static thread_t
894sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
895{
896 boolean_t pri_level_empty = FALSE;
897 sched_entry_t entry;
898 group_runq_t group_runq;
899 thread_t thread;
900 integer_t thread_pri;
901 sched_group_t group;
902
903 assert(main_entryq->count > 0);
904
905 entry = entry_queue_first_entry(main_entryq);
906
907 group = group_for_entry(entry);
908 group_runq = &group->runq;
909
910 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
911
912 thread->runq = PROCESSOR_NULL;
913
914 if (pri_level_empty) {
915 entry_queue_remove_entry(main_entryq, entry);
916 }
917
918 return thread;
919}
920
921
922static thread_t
923sched_group_dequeue_thread(
924 entry_queue_t main_entryq,
925 sched_group_t group)
926{
927 group_runq_t group_runq = &group->runq;
928 boolean_t pri_level_empty = FALSE;
929 thread_t thread;
930 integer_t thread_pri;
931
932 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
933
934 thread->runq = PROCESSOR_NULL;
935
936 if (pri_level_empty) {
937 entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
938 }
939
940 return thread;
941}
942
943static void
944sched_group_remove_thread(
945 entry_queue_t main_entryq,
946 sched_group_t group,
947 thread_t thread)
948{
949 integer_t thread_pri = thread->sched_pri;
950 sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
951
952#if defined(MULTIQ_SANITY_CHECK)
953 if (multiq_sanity_check) {
954 global_check_entry_queue(main_entryq);
955 group_check_run_queue(main_entryq, group);
956
957 sched_group_check_thread(group, thread);
958 entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
959 }
960#endif
961
962 boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);
963
964 if (pri_level_empty) {
965 entry_queue_remove_entry(main_entryq, sched_entry);
966 }
967
968#if defined(MULTIQ_SANITY_CHECK)
969 if (multiq_sanity_check) {
970 global_check_entry_queue(main_entryq);
971 group_check_run_queue(main_entryq, group);
972 }
973#endif
974}
975
976static void
977sched_group_enqueue_thread(
978 entry_queue_t main_entryq,
979 sched_group_t group,
980 thread_t thread,
981 integer_t options)
982{
983#if defined(MULTIQ_SANITY_CHECK)
984 if (multiq_sanity_check) {
985 global_check_entry_queue(main_entryq);
986 group_check_run_queue(main_entryq, group);
987 }
988#endif
989
990 int sched_pri = thread->sched_pri;
991
992 boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
993
994 if (pri_level_was_empty) {
995 /*
 996 * TODO: Need to figure out if passing options here is a good idea or not.
997 * What effects would it have?
998 */
999 entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
1000 } else if (options & SCHED_HEADQ) {
1001 /* The thread should be at the head of the line - move its entry to the front */
1002 entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
1003 }
1004}
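/*
 * Example of the entry maintenance above: enqueueing a thread at priority P into
 * group G either publishes a new (G, P) entry in the main entry queue (when G had
 * nothing queued at P), or, for SCHED_HEADQ, moves the existing entry to the head of
 * the main queue's P level so the freshly enqueued thread is the next one found there.
 */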
1005
1006/*
1007 * Locate a thread to execute from the run queue and return it.
1008 * Only choose a thread with greater or equal priority.
1009 *
1010 * pset is locked, thread is not locked.
1011 *
1012 * Returns THREAD_NULL if it cannot find a valid thread.
1013 *
1014 * Note: we cannot rely on the value of thread->sched_pri in this path because
1015 * we don't have the thread locked.
1016 *
1017 * TODO: Remove tracepoints
1018 */
1019static thread_t
1020sched_multiq_choose_thread(
1021 processor_t processor,
1022 int priority,
1023 ast_t reason)
1024{
1025 entry_queue_t main_entryq = multiq_main_entryq(processor);
1026 run_queue_t bound_runq = multiq_bound_runq(processor);
1027
1028 boolean_t choose_bound_runq = FALSE;
1029
1030 if (bound_runq->highq < priority &&
1031 main_entryq->highq < priority) {
fe8ab488 1032 return THREAD_NULL;
0a7de745 1033 }
1034
1035 if (bound_runq->count && main_entryq->count) {
1036 if (bound_runq->highq >= main_entryq->highq) {
1037 choose_bound_runq = TRUE;
1038 } else {
1039 /* Use main runq */
1040 }
1041 } else if (bound_runq->count) {
1042 choose_bound_runq = TRUE;
1043 } else if (main_entryq->count) {
1044 /* Use main runq */
1045 } else {
0a7de745 1046 return THREAD_NULL;
1047 }
1048
1049 if (choose_bound_runq) {
1050 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1051 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1052 MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);
1053
1054 return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1055 }
1056
1057 sched_group_t group = current_thread()->sched_group;
1058
1059#if defined(MULTIQ_SANITY_CHECK)
1060 if (multiq_sanity_check) {
1061 global_check_entry_queue(main_entryq);
1062 group_check_run_queue(main_entryq, group);
1063 }
1064#endif
1065
1066 /*
1067 * Determine if we should look at the group or the global queue
1068 *
1069 * TODO:
1070 * Perhaps pass reason as a 'should look inside' argument to choose_thread
1071 * Should YIELD AST override drain limit?
1072 */
1073 if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
1074 boolean_t favor_group = TRUE;
1075
1076 integer_t global_pri = main_entryq->highq;
1077 integer_t group_pri = group->runq.highq;
fe8ab488 1078
1079 /*
1080 * Favor the current group if the group is still the globally highest.
1081 *
1082 * Otherwise, consider choosing a thread from the current group
1083 * even if it's lower priority than the global highest priority.
1084 */
1085 if (global_pri > group_pri) {
1086 /*
1087 * If there's something elsewhere above the depth limit,
1088 * don't pick a thread below the limit.
1089 */
0a7de745 1090 if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit) {
3e170ce0 1091 favor_group = FALSE;
0a7de745 1092 }
1093
1094 /*
1095 * If there's something at or above the ceiling,
1096 * don't favor the group.
fe8ab488 1097 */
0a7de745 1098 if (global_pri >= drain_ceiling) {
3e170ce0 1099 favor_group = FALSE;
0a7de745 1100 }
fe8ab488 1101
1102 /*
1103 * Don't go more than X steps below the global highest
1104 */
0a7de745 1105 if ((global_pri - group_pri) >= drain_band_limit) {
3e170ce0 1106 favor_group = FALSE;
0a7de745 1107 }
1108 }
1109
3e170ce0 1110 if (favor_group) {
1111 /* Pull from local runq */
1112 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1113 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
3e170ce0 1114 MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
1115
1116 return sched_group_dequeue_thread(main_entryq, group);
1117 }
1118 }
1119
1120 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1121 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1122 MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);
1123
1124 /* Couldn't pull from local runq, pull from global runq instead */
1125 if (deep_drain) {
1126 return sched_global_deep_drain_dequeue_thread(main_entryq);
1127 } else {
1128 return sched_global_dequeue_thread(main_entryq);
1129 }
1130}
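/*
 * Worked example of the drain checks above, assuming the current group has runnable
 * threads, the reason is not AST_PREEMPTION, and the default limits are in effect
 * (drain_ceiling = BASEPRI_FOREGROUND, drain_depth_limit = MAXPRI_THROTTLE,
 * drain_band_limit = MAXPRI; the priorities below are made up for illustration):
 *
 *	global_pri = 60, group_pri = 50: 60 is at or above the foreground ceiling,
 *	    so favor_group is cleared and we pull from the global runq.
 *	global_pri = 40, group_pri = 37: below the ceiling, above the depth limit,
 *	    and within the band, so the current group is favored even though a
 *	    higher-priority thread exists in another task.
 */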
1131
1132
1133/*
1134 * Thread must be locked, and not already be on a run queue.
1135 * pset is locked.
1136 */
1137static boolean_t
1138sched_multiq_processor_enqueue(
1139 processor_t processor,
1140 thread_t thread,
cb323159 1141 sched_options_t options)
1142{
1143 boolean_t result;
1144
1145 assert(processor == thread->chosen_processor);
1146
1147 if (thread->bound_processor != PROCESSOR_NULL) {
1148 assert(thread->bound_processor == processor);
1149
1150 result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
1151 thread->runq = processor;
1152
1153 return result;
1154 }
1155
1156 sched_group_enqueue_thread(multiq_main_entryq(processor),
1157 thread->sched_group,
1158 thread, options);
1159
1160 thread->runq = processor;
1161
0a7de745 1162 return FALSE;
1163}
1164
1165/*
1166 * Called in the context of thread with thread and pset unlocked,
1167 * after updating thread priority but before propagating that priority
1168 * to the processor
1169 */
1170void
1171sched_multiq_quantum_expire(thread_t thread)
1172{
1173 if (deep_drain) {
1174 /*
1175 * Move the entry at this priority to the end of the queue,
1176 * to allow the next task a shot at running.
1177 */
1178
1179 processor_t processor = thread->last_processor;
1180 processor_set_t pset = processor->processor_set;
1181 entry_queue_t entryq = multiq_main_entryq(processor);
1182
1183 pset_lock(pset);
1184
1185 sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
1186
1187 if (entry->runq == MULTIQ_ERUNQ) {
3e170ce0 1188 entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
1189 }
1190
1191 pset_unlock(pset);
1192 }
1193}
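/*
 * Illustrative effect of the cycling above: with deep_drain enabled, when a thread of
 * group G uses up its quantum at priority P, the (G, P) entry is moved to the tail of
 * the main entry queue's P level, so the next selection at that priority goes to a
 * different group before G is considered again.
 */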
1194
1195static boolean_t
1196sched_multiq_processor_queue_empty(processor_t processor)
1197{
1198 return multiq_main_entryq(processor)->count == 0 &&
0a7de745 1199 multiq_bound_runq(processor)->count == 0;
1200}
1201
1202static ast_t
1203sched_multiq_processor_csw_check(processor_t processor)
1204{
1205 boolean_t has_higher;
1206 int pri;
1207
a39ff7e2 1208 if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
0a7de745 1209 return AST_PREEMPT | AST_URGENT;
1210 }
1211
fe8ab488 1212 entry_queue_t main_entryq = multiq_main_entryq(processor);
3e170ce0 1213 run_queue_t bound_runq = multiq_bound_runq(processor);
1214
1215 assert(processor->active_thread != NULL);
1216
1217 pri = MAX(main_entryq->highq, bound_runq->highq);
1218
3e170ce0 1219 if (processor->first_timeslice) {
1220 has_higher = (pri > processor->current_pri);
1221 } else {
1222 has_higher = (pri >= processor->current_pri);
1223 }
1224
1225 if (has_higher) {
1226 if (main_entryq->urgency > 0) {
1227 return AST_PREEMPT | AST_URGENT;
1228 }
fe8ab488 1229
1230 if (bound_runq->urgency > 0) {
1231 return AST_PREEMPT | AST_URGENT;
1232 }
1233
1234 return AST_PREEMPT;
1235 }
1236
1237 return AST_NONE;
1238}
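/*
 * Illustrative numbers for the decision above: with current_pri = 31 and an
 * unexpired first timeslice, a pending priority of 31 does not trigger preemption
 * (strict >) but 32 does; once the first timeslice is gone, an equal 31 is enough.
 * The returned AST is AST_PREEMPT | AST_URGENT when either runqueue reports urgent
 * work (per priority_is_urgent()), plain AST_PREEMPT otherwise.
 */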
1239
1240static boolean_t
1241sched_multiq_processor_queue_has_priority(
1242 processor_t processor,
1243 int priority,
1244 boolean_t gte)
fe8ab488 1245{
1246 run_queue_t main_runq = multiq_main_entryq(processor);
1247 run_queue_t bound_runq = multiq_bound_runq(processor);
1248
39037602 1249 int qpri = MAX(main_runq->highq, bound_runq->highq);
fe8ab488 1250
0a7de745 1251 if (gte) {
fe8ab488 1252 return qpri >= priority;
0a7de745 1253 } else {
fe8ab488 1254 return qpri > priority;
0a7de745 1255 }
1256}
1257
1258static int
1259sched_multiq_runq_count(processor_t processor)
1260{
1261 /*
1262 * TODO: Decide whether to keep a count of runnable threads in the pset
1263 * or just return something less than the true count.
1264 *
1265 * This needs to be fast, so no iterating the whole runq.
1266 *
1267 * Another possible decision is to remove this - with global runq
1268 * it doesn't make much sense.
1269 */
1270 return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
1271}
1272
1273static uint64_t
1274sched_multiq_runq_stats_count_sum(processor_t processor)
1275{
1276 /*
1277 * TODO: This one does need to go through all the runqueues, but it's only needed for
1278 * the sched stats tool
1279 */
1280
1281 uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;
1282
0a7de745 1283 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
fe8ab488 1284 return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
0a7de745 1285 } else {
fe8ab488 1286 return bound_sum;
0a7de745 1287 }
1288}
1289
1290static int
1291sched_multiq_processor_bound_count(processor_t processor)
1292{
1293 return multiq_bound_runq(processor)->count;
1294}
1295
1296static void
1297sched_multiq_processor_queue_shutdown(processor_t processor)
1298{
1299 processor_set_t pset = processor->processor_set;
1300 entry_queue_t main_entryq = multiq_main_entryq(processor);
1301 thread_t thread;
1302 queue_head_t tqueue;
1303
1304 /* We only need to migrate threads if this is the last active processor in the pset */
1305 if (pset->online_processor_count > 0) {
1306 pset_unlock(pset);
1307 return;
1308 }
1309
1310 queue_init(&tqueue);
1311
1312 /* Note that we do not remove bound threads from the queues here */
1313
1314 while (main_entryq->count > 0) {
1315 thread = sched_global_dequeue_thread(main_entryq);
39037602 1316 enqueue_tail(&tqueue, &thread->runq_links);
1317 }
1318
1319 pset_unlock(pset);
1320
39037602 1321 qe_foreach_element_safe(thread, &tqueue, runq_links) {
1322 remqueue(&thread->runq_links);
1323
1324 thread_lock(thread);
1325
1326 thread_setrun(thread, SCHED_TAILQ);
1327
1328 thread_unlock(thread);
1329 }
1330}
1331
1332/*
1333 * Thread is locked
1334 *
 1335 * This is why we can never read sched_pri unless we have the thread locked -
 1336 * which we do in the enqueue and remove cases, but not the dequeue case.
1337 */
1338static boolean_t
1339sched_multiq_processor_queue_remove(
1340 processor_t processor,
1341 thread_t thread)
1342{
1343 boolean_t removed = FALSE;
1344 processor_set_t pset = processor->processor_set;
1345
1346 pset_lock(pset);
1347
1348 if (thread->runq != PROCESSOR_NULL) {
1349 /*
1350 * Thread is on a run queue and we have a lock on
1351 * that run queue.
1352 */
1353
1354 assert(thread->runq == processor);
1355
1356 if (thread->bound_processor != PROCESSOR_NULL) {
1357 assert(processor == thread->bound_processor);
1358 run_queue_remove(multiq_bound_runq(processor), thread);
1359 thread->runq = PROCESSOR_NULL;
1360 } else {
1361 sched_group_remove_thread(multiq_main_entryq(processor),
1362 thread->sched_group,
1363 thread);
1364 }
1365
1366 removed = TRUE;
1367 }
1368
1369 pset_unlock(pset);
1370
1371 return removed;
1372}
1373
1374/* pset is locked, returned unlocked */
1375static thread_t
1376sched_multiq_steal_thread(processor_set_t pset)
1377{
1378 pset_unlock(pset);
0a7de745 1379 return THREAD_NULL;
1380}
1381
1382/*
1383 * Scan the global queue for candidate groups, and scan those groups for
1384 * candidate threads.
1385 *
1386 * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
1387 * Instead, iterate only the queue in the group runq matching the priority of the entry.
1388 *
1389 * Returns TRUE if retry is needed.
1390 */
1391static boolean_t
1392group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context)
1393{
1394 int count = runq->count;
1395 int queue_index;
1396
1397 assert(count >= 0);
1398
0a7de745 1399 if (count == 0) {
39037602 1400 return FALSE;
0a7de745 1401 }
1402
1403 for (queue_index = bitmap_first(runq->bitmap, NRQS);
1404 queue_index >= 0;
1405 queue_index = bitmap_next(runq->bitmap, queue_index)) {
1406 sched_entry_t entry;
1407
cb323159 1408 cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
1409 assert(count > 0);
1410
1411 sched_group_t group = group_for_entry(entry);
1412 if (group->runq.count > 0) {
1413 if (runq_scan(&group->runq, scan_context)) {
1414 return TRUE;
1415 }
fe8ab488 1416 }
39037602 1417 count--;
1418 }
1419 }
1420
0a7de745 1421 return FALSE;
1422}
1423
1424static void
3e170ce0 1425sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
1426{
1427 boolean_t restart_needed = FALSE;
1428 processor_t processor = processor_list;
1429 processor_set_t pset;
1430 thread_t thread;
1431 spl_t s;
1432
1433 /*
1434 * We update the threads associated with each processor (bound and idle threads)
1435 * and then update the threads in each pset runqueue.
1436 */
1437
1438 do {
1439 do {
1440 pset = processor->processor_set;
1441
1442 s = splsched();
1443 pset_lock(pset);
1444
3e170ce0 1445 restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
1446
1447 pset_unlock(pset);
1448 splx(s);
1449
0a7de745 1450 if (restart_needed) {
fe8ab488 1451 break;
0a7de745 1452 }
1453
1454 thread = processor->idle_thread;
1455 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1456 if (thread_update_add_thread(thread) == FALSE) {
1457 restart_needed = TRUE;
1458 break;
1459 }
1460 }
1461 } while ((processor = processor->processor_list) != NULL);
1462
1463 /* Ok, we now have a collection of candidates -- fix them. */
1464 thread_update_process_threads();
1465 } while (restart_needed);
1466
1467 pset = &pset0;
1468
1469 do {
1470 do {
1471 s = splsched();
1472 pset_lock(pset);
1473
3e170ce0 1474 restart_needed = group_scan(&pset->pset_runq, scan_context);
1475
1476 pset_unlock(pset);
1477 splx(s);
1478
0a7de745 1479 if (restart_needed) {
fe8ab488 1480 break;
0a7de745 1481 }
1482 } while ((pset = pset->pset_list) != NULL);
1483
1484 /* Ok, we now have a collection of candidates -- fix them. */
1485 thread_update_process_threads();
1486 } while (restart_needed);
1487}
1488
1489extern int sched_allow_rt_smt;
1490
1491/* Return true if this thread should not continue running on this processor */
1492static bool
1493sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
1494{
1495 if (processor->processor_primary != processor) {
1496 /*
1497 * This is a secondary SMT processor. If the primary is running
1498 * a realtime thread, only allow realtime threads on the secondary.
1499 */
1500 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
1501 return true;
1502 }
1503 }
1504
1505 return false;
1506}
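/*
 * Illustrative example of the check above: on an SMT pair whose primary is running a
 * realtime thread (current_pri >= BASEPRI_RTQUEUES), a timeshare thread on the
 * secondary reports "avoid" and is redirected elsewhere; a realtime thread may stay
 * on the secondary only when the sched_allow_rt_smt override is set.
 */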