]> git.saurik.com Git - apple/xnu.git/blame - osfmk/kern/sched_multiq.c
xnu-4903.221.2.tar.gz
[apple/xnu.git] / osfmk / kern / sched_multiq.c
CommitLineData
fe8ab488
A
1/*
2 * Copyright (c) 2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31
32#include <machine/machine_routines.h>
33#include <machine/sched_param.h>
34#include <machine/machine_cpu.h>
35
36#include <kern/kern_types.h>
37#include <kern/debug.h>
38#include <kern/mach_param.h>
39#include <kern/machine.h>
40#include <kern/misc_protos.h>
41#include <kern/processor.h>
42#include <kern/queue.h>
43#include <kern/sched.h>
44#include <kern/sched_prim.h>
45#include <kern/task.h>
46#include <kern/thread.h>
47
48#include <sys/kdebug.h>
49
50/*
51 * Theory Statement
52 *
53 * How does the task scheduler work?
54 *
55 * It schedules threads across a few levels.
56 *
57 * RT threads are dealt with above us
58 * Bound threads go into the per-processor runq
59 * Non-bound threads are linked on their task's sched_group's runq
60 * sched_groups' sched_entries are linked on the pset's runq
61 *
62 * TODO: make this explicit - bound threads should have a different enqueue fxn
63 *
64 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
65 * or the current group's runqueue, then dequeue the next thread in that runqueue.
66 *
67 * We then manipulate the sched_entries to reflect the invariant that:
68 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
69 * runqueue.
70 *
71 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
72 * to run. This reduces the excess contention bonus given to processes which have work spread among many threads
73 * as compared to processes which do the same amount of work under fewer threads.
74 *
75 * NOTE: Currently, the multiq scheduler only supports one pset.
76 *
77 * NOTE ABOUT thread->sched_pri:
78 *
79 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
80 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
81 *
82 * TODO: Future features:
83 *
84 * Decouple the task priority from the sched_entry priority, allowing for:
85 * fast task priority change without having to iterate and re-dispatch all threads in the task.
86 * i.e. task-wide priority, task-wide boosting
87 * fancier group decay features
88 *
89 * Group (or task) decay:
90 * Decay is used for a few different things:
91 * Prioritizing latency-needing threads over throughput-needing threads for time-to-running
92 * Balancing work between threads in a process
93 * Balancing work done at the same priority between different processes
94 * Recovering from priority inversions between two threads in the same process
95 * Recovering from priority inversions between two threads in different processes
96 * Simulating a proportional share scheduler by allowing lower priority threads
97 * to run for a certain percentage of the time
98 *
99 * Task decay lets us separately address the 'same process' and 'different process' needs,
100 * which will allow us to make smarter tradeoffs in different cases.
101 * For example, we could resolve priority inversion in the same process by reordering threads without dropping the
102 * process below low priority threads in other processes.
103 *
104 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
105 *
106 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
107 * The entries array is 5K currently so it'd be really great to reduce.
108 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
109 *
110 * When preempting a processor, store a flag saying if the preemption
111 * was from a thread in the same group or different group,
112 * and tell choose_thread about it.
113 *
114 * When choosing a processor, bias towards those running in the same
115 * group as I am running (at the same priority, or within a certain band?).
116 *
117 * Decide if we need to support psets.
118 * Decide how to support psets - do we need duplicate entries for each pset,
119 * or can we get away with putting the entry in either one or the other pset?
120 *
121 * Consider the right way to handle runq count - I don't want to iterate groups.
39037602 122 * Perhaps keep a global counter.
fe8ab488
A
123 * Alternate option - remove it from choose_processor. It doesn't add much value
124 * now that we have global runq.
125 *
126 * Need a better way of finding group to target instead of looking at current_task.
127 * Perhaps choose_thread could pass in the current thread?
128 *
129 * Consider unifying runq copy-pastes.
130 *
131 * Thoughts on having a group central quantum bucket:
132 *
133 * I see two algorithms to decide quanta:
134 * A) Hand off only when switching thread to thread in the same group
135 * B) Allocate and return quanta to the group's pool
136 *
137 * Issues:
138 * If a task blocks completely, should it come back with the leftover quanta
139 * or brand new quanta?
140 *
141 * Should I put a flag saying zero out a quanta you grab when youre dispatched'?
142 *
143 * Resolution:
144 * Handing off quanta between threads will help with jumping around in the current task
145 * but will not help when a thread from a different task is involved.
146 * Need an algorithm that works with round robin-ing between threads in different tasks
147 *
148 * But wait - round robining can only be triggered by quantum expire or blocking.
149 * We need something that works with preemption or yielding - that's the more interesting idea.
150 *
151 * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
152 * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
153 *
154 * New algorithm -
155 * Hand off quanta when hopping between threads with same sched_group
156 * Even if thread was blocked it uses last thread remaining quanta when it starts.
157 *
158 * If we use the only cycle entry at quantum algorithm, then the quantum pool starts getting
159 * interesting.
160 *
161 * A thought - perhaps the handoff approach doesn't work so well in the presence of
162 * non-handoff wakeups i.e. wake other thread then wait then block - doesn't mean that
163 * woken thread will be what I switch to - other processor may have stolen it.
164 * What do we do there?
165 *
166 * Conclusions:
167 * We currently don't know of a scenario where quantum buckets on the task is beneficial.
168 * We will instead handoff quantum between threads in the task, and keep quantum
169 * on the preempted thread if it's preempted by something outside the task.
170 *
171 */
172
173#if DEBUG || DEVELOPMENT
174#define MULTIQ_SANITY_CHECK
175#endif
176
177typedef struct sched_entry {
39037602 178 queue_chain_t entry_links;
fe8ab488
A
179 int16_t sched_pri; /* scheduled (current) priority */
180 int16_t runq;
181 int32_t pad;
182} *sched_entry_t;
183
184typedef run_queue_t entry_queue_t; /* A run queue that holds sched_entries instead of threads */
185typedef run_queue_t group_runq_t; /* A run queue that is part of a sched_group */
186
187#define SCHED_ENTRY_NULL ((sched_entry_t) 0)
188#define MULTIQ_ERUNQ (-4) /* Indicates entry is on the main runq */
189
190/* Each level in the run queue corresponds to one entry in the entries array */
191struct sched_group {
192 struct sched_entry entries[NRQS];
193 struct run_queue runq;
194 queue_chain_t sched_groups;
195};
196
fe8ab488
A
197/*
198 * Keep entry on the head of the runqueue while dequeueing threads.
199 * Only cycle it to the end of the runqueue when a thread in the task
200 * hits its quantum.
201 */
202static boolean_t deep_drain = FALSE;
203
fe8ab488
A
204/* Verify the consistency of the runq before touching it */
205static boolean_t multiq_sanity_check = FALSE;
206
207/*
208 * Draining threads from the current task is preferred
209 * when they're less than X steps below the current
210 * global highest priority
211 */
212#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
213static integer_t drain_band_limit;
214
215/*
216 * Don't go below this priority level if there is something above it in another task
217 */
218#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
219static integer_t drain_depth_limit;
220
3e170ce0
A
221/*
222 * Don't favor the task when there's something above this priority in another task.
223 */
224#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
225static integer_t drain_ceiling;
fe8ab488
A
226
227static struct zone *sched_group_zone;
228
229static uint64_t num_sched_groups = 0;
230static queue_head_t sched_groups;
231
232static lck_attr_t sched_groups_lock_attr;
233static lck_grp_t sched_groups_lock_grp;
234static lck_grp_attr_t sched_groups_lock_grp_attr;
235
236static lck_mtx_t sched_groups_lock;
237
238
239static void
240sched_multiq_init(void);
241
242static thread_t
243sched_multiq_steal_thread(processor_set_t pset);
244
245static void
3e170ce0 246sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
fe8ab488
A
247
248static boolean_t
249sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
250
251static boolean_t
252sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
253
254void
255sched_multiq_quantum_expire(thread_t thread);
256
257static ast_t
258sched_multiq_processor_csw_check(processor_t processor);
259
260static boolean_t
261sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
262
263static int
264sched_multiq_runq_count(processor_t processor);
265
266static boolean_t
267sched_multiq_processor_queue_empty(processor_t processor);
268
269static uint64_t
270sched_multiq_runq_stats_count_sum(processor_t processor);
271
272static int
273sched_multiq_processor_bound_count(processor_t processor);
274
275static void
276sched_multiq_pset_init(processor_set_t pset);
277
278static void
279sched_multiq_processor_init(processor_t processor);
280
281static thread_t
282sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);
283
284static void
285sched_multiq_processor_queue_shutdown(processor_t processor);
286
287static sched_mode_t
288sched_multiq_initial_thread_sched_mode(task_t parent_task);
289
a39ff7e2
A
290static bool
291sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
292
fe8ab488 293const struct sched_dispatch_table sched_multiq_dispatch = {
3e170ce0 294 .sched_name = "multiq",
fe8ab488 295 .init = sched_multiq_init,
3e170ce0 296 .timebase_init = sched_timeshare_timebase_init,
fe8ab488
A
297 .processor_init = sched_multiq_processor_init,
298 .pset_init = sched_multiq_pset_init,
3e170ce0 299 .maintenance_continuation = sched_timeshare_maintenance_continue,
fe8ab488 300 .choose_thread = sched_multiq_choose_thread,
3e170ce0 301 .steal_thread_enabled = FALSE,
fe8ab488 302 .steal_thread = sched_multiq_steal_thread,
3e170ce0 303 .compute_timeshare_priority = sched_compute_timeshare_priority,
fe8ab488
A
304 .choose_processor = choose_processor,
305 .processor_enqueue = sched_multiq_processor_enqueue,
306 .processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
307 .processor_queue_remove = sched_multiq_processor_queue_remove,
308 .processor_queue_empty = sched_multiq_processor_queue_empty,
309 .priority_is_urgent = priority_is_urgent,
310 .processor_csw_check = sched_multiq_processor_csw_check,
311 .processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
3e170ce0 312 .initial_quantum_size = sched_timeshare_initial_quantum_size,
fe8ab488
A
313 .initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
314 .can_update_priority = can_update_priority,
315 .update_priority = update_priority,
316 .lightweight_update_priority = lightweight_update_priority,
317 .quantum_expire = sched_multiq_quantum_expire,
fe8ab488
A
318 .processor_runq_count = sched_multiq_runq_count,
319 .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
fe8ab488
A
320 .processor_bound_count = sched_multiq_processor_bound_count,
321 .thread_update_scan = sched_multiq_thread_update_scan,
322 .direct_dispatch_to_idle_processors = FALSE,
3e170ce0
A
323 .multiple_psets_enabled = FALSE,
324 .sched_groups_enabled = TRUE,
a39ff7e2
A
325 .avoid_processor_enabled = TRUE,
326 .thread_avoid_processor = sched_multiq_thread_avoid_processor,
5ba3f43e
A
327 .processor_balance = sched_SMT_balance,
328
329 .rt_runq = sched_rtglobal_runq,
330 .rt_init = sched_rtglobal_init,
331 .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
332 .rt_runq_scan = sched_rtglobal_runq_scan,
333 .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
334
335 .qos_max_parallelism = sched_qos_max_parallelism,
336 .check_spill = sched_check_spill,
337 .ipi_policy = sched_ipi_policy,
338 .thread_should_yield = sched_thread_should_yield,
fe8ab488
A
339};
340
341
342static void
343sched_multiq_init(void)
344{
fe8ab488
A
345#if defined(MULTIQ_SANITY_CHECK)
346 PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
347#endif
348
349 PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
350
3e170ce0
A
351 if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
352 drain_ceiling = DEFAULT_DRAIN_CEILING;
353 }
fe8ab488
A
354
355 if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
356 drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
357 }
358
359 if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
360 drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
361 }
362
3e170ce0
A
363 printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
364 deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
fe8ab488
A
365
366 sched_group_zone = zinit(
367 sizeof(struct sched_group),
368 task_max * sizeof(struct sched_group),
369 PAGE_SIZE,
370 "sched groups");
371
372 zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
373 zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);
374
375 queue_init(&sched_groups);
376
377 lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
378 lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
379 lck_attr_setdefault(&sched_groups_lock_attr);
380 lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
381
3e170ce0 382 sched_timeshare_init();
fe8ab488
A
383}
384
385static void
386sched_multiq_processor_init(processor_t processor)
387{
388 run_queue_init(&processor->runq);
389}
390
391static void
392sched_multiq_pset_init(processor_set_t pset)
393{
394 run_queue_init(&pset->pset_runq);
395}
396
397static sched_mode_t
398sched_multiq_initial_thread_sched_mode(task_t parent_task)
399{
400 if (parent_task == kernel_task)
401 return TH_MODE_FIXED;
402 else
403 return TH_MODE_TIMESHARE;
404}
405
406sched_group_t
407sched_group_create(void)
408{
409 sched_group_t sched_group;
410
3e170ce0 411 if (!SCHED(sched_groups_enabled))
fe8ab488
A
412 return SCHED_GROUP_NULL;
413
414 sched_group = (sched_group_t)zalloc(sched_group_zone);
415
416 bzero(sched_group, sizeof(struct sched_group));
417
418 run_queue_init(&sched_group->runq);
419
420 for (int i = 0; i < NRQS; i++) {
421 sched_group->entries[i].runq = 0;
422 sched_group->entries[i].sched_pri = i;
423 }
424
425 lck_mtx_lock(&sched_groups_lock);
426 queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
427 num_sched_groups++;
428 lck_mtx_unlock(&sched_groups_lock);
429
430 return (sched_group);
431}
432
433void
434sched_group_destroy(sched_group_t sched_group)
435{
3e170ce0 436 if (!SCHED(sched_groups_enabled)) {
fe8ab488
A
437 assert(sched_group == SCHED_GROUP_NULL);
438 return;
439 }
440
441 assert(sched_group != SCHED_GROUP_NULL);
442 assert(sched_group->runq.count == 0);
443
444 for (int i = 0; i < NRQS; i++) {
445 assert(sched_group->entries[i].runq == 0);
446 assert(sched_group->entries[i].sched_pri == i);
447 }
448
449 lck_mtx_lock(&sched_groups_lock);
450 queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
451 num_sched_groups--;
452 lck_mtx_unlock(&sched_groups_lock);
453
454 zfree(sched_group_zone, sched_group);
455}
456
457__attribute__((always_inline))
458static inline entry_queue_t
459multiq_main_entryq(processor_t processor)
460{
461 return (entry_queue_t)&processor->processor_set->pset_runq;
462}
463
464__attribute__((always_inline))
465static inline run_queue_t
466multiq_bound_runq(processor_t processor)
467{
468 return &processor->runq;
469}
470
471__attribute__((always_inline))
472static inline sched_entry_t
473group_entry_for_pri(sched_group_t group, integer_t pri)
474{
475 return &group->entries[pri];
476}
477
478__attribute__((always_inline))
479static inline sched_group_t
480group_for_entry(sched_entry_t entry)
481{
39037602
A
482#pragma clang diagnostic push
483#pragma clang diagnostic ignored "-Wcast-align"
fe8ab488 484 sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
39037602 485#pragma clang diagnostic pop
fe8ab488
A
486 return group;
487}
488
489/* Peek at the head of the runqueue */
490static sched_entry_t
491entry_queue_first_entry(entry_queue_t rq)
492{
493 assert(rq->count != 0);
494
39037602 495 queue_t queue = &rq->queues[rq->highq];
fe8ab488 496
39037602 497 sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
fe8ab488
A
498
499 assert(entry->sched_pri == rq->highq);
500
501 return entry;
502}
503
504#if defined(MULTIQ_SANITY_CHECK)
505
3e170ce0 506#if MACH_ASSERT
fe8ab488
A
507__attribute__((always_inline))
508static inline boolean_t
509queue_chain_linked(queue_chain_t* chain)
510{
511 if (chain->next != NULL) {
512 assert(chain->prev != NULL);
513 return TRUE;
514 } else {
515 assert(chain->prev == NULL);
516 return FALSE;
517 }
518}
3e170ce0 519#endif /* MACH_ASSERT */
fe8ab488
A
520
521static thread_t
522group_first_thread(sched_group_t group)
523{
524 group_runq_t rq = &group->runq;
525
526 assert(rq->count != 0);
527
39037602 528 queue_t queue = &rq->queues[rq->highq];
fe8ab488 529
39037602 530 thread_t thread = qe_queue_first(queue, struct thread, runq_links);
fe8ab488
A
531
532 assert(thread != THREAD_NULL);
39037602 533 assert_thread_magic(thread);
fe8ab488
A
534
535 assert(thread->sched_group == group);
536
537 /* TODO: May not be safe */
538 assert(thread->sched_pri == rq->highq);
539
540 return thread;
541}
542
543/* Asserts if entry is not in entry runq at pri */
544static void
545entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
546{
547 queue_t q;
548 sched_entry_t elem;
549
39037602 550 assert(queue_chain_linked(&entry->entry_links));
fe8ab488
A
551 assert(entry->runq == MULTIQ_ERUNQ);
552
553 q = &runq->queues[expected_pri];
554
39037602 555 qe_foreach_element(elem, q, entry_links) {
fe8ab488
A
556 if (elem == entry)
557 return;
558 }
559
560 panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
561}
562
563/* Asserts if thread is not in group at its priority */
564static void
565sched_group_check_thread(sched_group_t group, thread_t thread)
566{
567 queue_t q;
568 thread_t elem;
569 int pri = thread->sched_pri;
570
571 assert(thread->runq != PROCESSOR_NULL);
572
573 q = &group->runq.queues[pri];
574
39037602 575 qe_foreach_element(elem, q, runq_links) {
fe8ab488
A
576 if (elem == thread)
577 return;
578 }
579
580 panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
581}
582
583static void
584global_check_entry_queue(entry_queue_t main_entryq)
585{
586 if (main_entryq->count == 0)
587 return;
588
589 sched_entry_t entry = entry_queue_first_entry(main_entryq);
590
591 assert(entry->runq == MULTIQ_ERUNQ);
592
593 sched_group_t group = group_for_entry(entry);
594
595 thread_t thread = group_first_thread(group);
596
597 __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
598
599 assert(entry->sched_pri == group->runq.highq);
600
601 assert(entry == thread_entry);
602 assert(thread->runq != PROCESSOR_NULL);
603}
604
605static void
606group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
607{
608 if (group->runq.count == 0)
609 return;
610
611 thread_t thread = group_first_thread(group);
612
613 assert(thread->runq != PROCESSOR_NULL);
614
615 sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
616
617 entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);
618
619 assert(sched_entry->sched_pri == thread->sched_pri);
620 assert(sched_entry->runq == MULTIQ_ERUNQ);
621}
622
623#endif /* defined(MULTIQ_SANITY_CHECK) */
624
625/*
626 * The run queue must not be empty.
627 */
628static sched_entry_t
629entry_queue_dequeue_entry(entry_queue_t rq)
630{
631 sched_entry_t sched_entry;
39037602 632 queue_t queue = &rq->queues[rq->highq];
fe8ab488
A
633
634 assert(rq->count > 0);
635 assert(!queue_empty(queue));
636
39037602 637 sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
fe8ab488
A
638
639 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
640 rq->count--;
641 if (SCHED(priority_is_urgent)(rq->highq)) {
642 rq->urgency--; assert(rq->urgency >= 0);
643 }
644 if (queue_empty(queue)) {
39037602
A
645 rq_bitmap_clear(rq->bitmap, rq->highq);
646 rq->highq = bitmap_first(rq->bitmap, NRQS);
fe8ab488
A
647 }
648
649 sched_entry->runq = 0;
650
651 return (sched_entry);
652}
653
654/*
655 * The run queue must not be empty.
656 */
657static boolean_t
658entry_queue_enqueue_entry(
659 entry_queue_t rq,
660 sched_entry_t entry,
661 integer_t options)
662{
663 int sched_pri = entry->sched_pri;
39037602 664 queue_t queue = &rq->queues[sched_pri];
fe8ab488
A
665 boolean_t result = FALSE;
666
667 assert(entry->runq == 0);
668
669 if (queue_empty(queue)) {
39037602 670 enqueue_tail(queue, &entry->entry_links);
fe8ab488 671
39037602 672 rq_bitmap_set(rq->bitmap, sched_pri);
fe8ab488
A
673 if (sched_pri > rq->highq) {
674 rq->highq = sched_pri;
675 result = TRUE;
676 }
677 } else {
678 if (options & SCHED_TAILQ)
39037602 679 enqueue_tail(queue, &entry->entry_links);
fe8ab488 680 else
39037602 681 enqueue_head(queue, &entry->entry_links);
fe8ab488
A
682 }
683 if (SCHED(priority_is_urgent)(sched_pri))
684 rq->urgency++;
685 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
686 rq->count++;
687
688 entry->runq = MULTIQ_ERUNQ;
689
690 return (result);
691}
692
693/*
694 * The entry must be in this runqueue.
695 */
696static void
697entry_queue_remove_entry(
698 entry_queue_t rq,
699 sched_entry_t entry)
700{
701 int sched_pri = entry->sched_pri;
702
703#if defined(MULTIQ_SANITY_CHECK)
704 if (multiq_sanity_check) {
705 entry_queue_check_entry(rq, entry, sched_pri);
706 }
707#endif
708
39037602 709 remqueue(&entry->entry_links);
fe8ab488
A
710
711 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
712 rq->count--;
713 if (SCHED(priority_is_urgent)(sched_pri)) {
714 rq->urgency--; assert(rq->urgency >= 0);
715 }
716
39037602 717 if (queue_empty(&rq->queues[sched_pri])) {
fe8ab488 718 /* update run queue status */
39037602
A
719 rq_bitmap_clear(rq->bitmap, sched_pri);
720 rq->highq = bitmap_first(rq->bitmap, NRQS);
fe8ab488
A
721 }
722
723 entry->runq = 0;
724}
725
3e170ce0
A
726static void
727entry_queue_change_entry(
728 entry_queue_t rq,
729 sched_entry_t entry,
730 integer_t options)
731{
732 int sched_pri = entry->sched_pri;
39037602 733 queue_t queue = &rq->queues[sched_pri];
3e170ce0
A
734
735#if defined(MULTIQ_SANITY_CHECK)
736 if (multiq_sanity_check) {
737 entry_queue_check_entry(rq, entry, sched_pri);
738 }
739#endif
3e170ce0
A
740
741 if (options & SCHED_TAILQ)
39037602 742 re_queue_tail(queue, &entry->entry_links);
3e170ce0 743 else
39037602 744 re_queue_head(queue, &entry->entry_links);
3e170ce0 745}
fe8ab488
A
746/*
747 * The run queue must not be empty.
748 *
749 * sets queue_empty to TRUE if queue is now empty at thread_pri
750 */
751static thread_t
752group_run_queue_dequeue_thread(
753 group_runq_t rq,
754 integer_t *thread_pri,
755 boolean_t *queue_empty)
756{
757 thread_t thread;
39037602 758 queue_t queue = &rq->queues[rq->highq];
fe8ab488
A
759
760 assert(rq->count > 0);
761 assert(!queue_empty(queue));
762
763 *thread_pri = rq->highq;
764
39037602
A
765 thread = qe_dequeue_head(queue, struct thread, runq_links);
766 assert_thread_magic(thread);
fe8ab488
A
767
768 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
769 rq->count--;
770 if (SCHED(priority_is_urgent)(rq->highq)) {
771 rq->urgency--; assert(rq->urgency >= 0);
772 }
773 if (queue_empty(queue)) {
39037602
A
774 rq_bitmap_clear(rq->bitmap, rq->highq);
775 rq->highq = bitmap_first(rq->bitmap, NRQS);
fe8ab488
A
776 *queue_empty = TRUE;
777 } else {
778 *queue_empty = FALSE;
779 }
780
39037602 781 return thread;
fe8ab488
A
782}
783
784/*
785 * The run queue must not be empty.
786 * returns TRUE if queue was empty at thread_pri
787 */
788static boolean_t
789group_run_queue_enqueue_thread(
790 group_runq_t rq,
791 thread_t thread,
792 integer_t thread_pri,
793 integer_t options)
794{
39037602 795 queue_t queue = &rq->queues[thread_pri];
fe8ab488
A
796 boolean_t result = FALSE;
797
798 assert(thread->runq == PROCESSOR_NULL);
39037602 799 assert_thread_magic(thread);
fe8ab488
A
800
801 if (queue_empty(queue)) {
39037602 802 enqueue_tail(queue, &thread->runq_links);
fe8ab488 803
39037602 804 rq_bitmap_set(rq->bitmap, thread_pri);
fe8ab488
A
805 if (thread_pri > rq->highq) {
806 rq->highq = thread_pri;
807 }
808 result = TRUE;
809 } else {
810 if (options & SCHED_TAILQ)
39037602 811 enqueue_tail(queue, &thread->runq_links);
fe8ab488 812 else
39037602 813 enqueue_head(queue, &thread->runq_links);
fe8ab488
A
814 }
815 if (SCHED(priority_is_urgent)(thread_pri))
816 rq->urgency++;
817 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
818 rq->count++;
819
820 return (result);
821}
822
823/*
824 * The thread must be in this runqueue.
825 * returns TRUE if queue is now empty at thread_pri
826 */
827static boolean_t
828group_run_queue_remove_thread(
829 group_runq_t rq,
830 thread_t thread,
831 integer_t thread_pri)
832{
833 boolean_t result = FALSE;
834
39037602 835 assert_thread_magic(thread);
fe8ab488
A
836 assert(thread->runq != PROCESSOR_NULL);
837
39037602 838 remqueue(&thread->runq_links);
fe8ab488
A
839
840 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
841 rq->count--;
842 if (SCHED(priority_is_urgent)(thread_pri)) {
843 rq->urgency--; assert(rq->urgency >= 0);
844 }
845
39037602 846 if (queue_empty(&rq->queues[thread_pri])) {
fe8ab488 847 /* update run queue status */
39037602
A
848 rq_bitmap_clear(rq->bitmap, thread_pri);
849 rq->highq = bitmap_first(rq->bitmap, NRQS);
fe8ab488
A
850 result = TRUE;
851 }
852
853 thread->runq = PROCESSOR_NULL;
854
855 return result;
856}
857
858/*
859 * A thread's sched pri may change out from under us because
860 * we're clearing thread->runq here without the thread locked.
861 * Do not rely on it to be the same as when we enqueued.
862 */
863static thread_t
864sched_global_dequeue_thread(entry_queue_t main_entryq)
865{
866 boolean_t pri_level_empty = FALSE;
867 sched_entry_t entry;
868 group_runq_t group_runq;
869 thread_t thread;
870 integer_t thread_pri;
871 sched_group_t group;
872
873 assert(main_entryq->count > 0);
874
875 entry = entry_queue_dequeue_entry(main_entryq);
876
877 group = group_for_entry(entry);
878 group_runq = &group->runq;
879
880 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
881
882 thread->runq = PROCESSOR_NULL;
883
884 if (!pri_level_empty) {
885 entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
886 }
887
888 return thread;
889}
890
891/* Dequeue a thread from the global runq without moving the entry */
892static thread_t
893sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
894{
895 boolean_t pri_level_empty = FALSE;
896 sched_entry_t entry;
897 group_runq_t group_runq;
898 thread_t thread;
899 integer_t thread_pri;
900 sched_group_t group;
901
902 assert(main_entryq->count > 0);
903
904 entry = entry_queue_first_entry(main_entryq);
905
906 group = group_for_entry(entry);
907 group_runq = &group->runq;
908
909 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
910
911 thread->runq = PROCESSOR_NULL;
912
913 if (pri_level_empty) {
914 entry_queue_remove_entry(main_entryq, entry);
915 }
916
917 return thread;
918}
919
920
921static thread_t
922sched_group_dequeue_thread(
923 entry_queue_t main_entryq,
924 sched_group_t group)
925{
926 group_runq_t group_runq = &group->runq;
927 boolean_t pri_level_empty = FALSE;
928 thread_t thread;
929 integer_t thread_pri;
930
931 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
932
933 thread->runq = PROCESSOR_NULL;
934
935 if (pri_level_empty) {
936 entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
937 }
938
939 return thread;
940}
941
942static void
943sched_group_remove_thread(
944 entry_queue_t main_entryq,
945 sched_group_t group,
946 thread_t thread)
947{
948 integer_t thread_pri = thread->sched_pri;
949 sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
950
951#if defined(MULTIQ_SANITY_CHECK)
952 if (multiq_sanity_check) {
953 global_check_entry_queue(main_entryq);
954 group_check_run_queue(main_entryq, group);
955
956 sched_group_check_thread(group, thread);
957 entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
958 }
959#endif
960
961 boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);
962
963 if (pri_level_empty) {
964 entry_queue_remove_entry(main_entryq, sched_entry);
965 }
966
967#if defined(MULTIQ_SANITY_CHECK)
968 if (multiq_sanity_check) {
969 global_check_entry_queue(main_entryq);
970 group_check_run_queue(main_entryq, group);
971 }
972#endif
973}
974
975static void
976sched_group_enqueue_thread(
977 entry_queue_t main_entryq,
978 sched_group_t group,
979 thread_t thread,
980 integer_t options)
981{
982#if defined(MULTIQ_SANITY_CHECK)
983 if (multiq_sanity_check) {
984 global_check_entry_queue(main_entryq);
985 group_check_run_queue(main_entryq, group);
986 }
987#endif
988
989 int sched_pri = thread->sched_pri;
990
991 boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
992
993 if (pri_level_was_empty) {
994 /*
995 * TODO: Need to figure out if passing options here is a good idea or not
996 * What effects would it have?
997 */
998 entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
3e170ce0
A
999 } else if (options & SCHED_HEADQ) {
1000 /* The thread should be at the head of the line - move its entry to the front */
1001 entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
fe8ab488
A
1002 }
1003}
1004
1005/*
1006 * Locate a thread to execute from the run queue and return it.
1007 * Only choose a thread with greater or equal priority.
1008 *
1009 * pset is locked, thread is not locked.
1010 *
1011 * Returns THREAD_NULL if it cannot find a valid thread.
1012 *
1013 * Note: we cannot rely on the value of thread->sched_pri in this path because
1014 * we don't have the thread locked.
1015 *
1016 * TODO: Remove tracepoints
1017 */
1018static thread_t
1019sched_multiq_choose_thread(
1020 processor_t processor,
1021 int priority,
1022 ast_t reason)
1023{
1024 entry_queue_t main_entryq = multiq_main_entryq(processor);
1025 run_queue_t bound_runq = multiq_bound_runq(processor);
1026
1027 boolean_t choose_bound_runq = FALSE;
1028
1029 if (bound_runq->highq < priority &&
1030 main_entryq->highq < priority)
1031 return THREAD_NULL;
1032
1033 if (bound_runq->count && main_entryq->count) {
1034 if (bound_runq->highq >= main_entryq->highq) {
1035 choose_bound_runq = TRUE;
1036 } else {
1037 /* Use main runq */
1038 }
1039 } else if (bound_runq->count) {
1040 choose_bound_runq = TRUE;
1041 } else if (main_entryq->count) {
1042 /* Use main runq */
1043 } else {
1044 return (THREAD_NULL);
1045 }
1046
1047 if (choose_bound_runq) {
1048 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1049 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1050 MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);
1051
1052 return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1053 }
1054
1055 sched_group_t group = current_thread()->sched_group;
1056
1057#if defined(MULTIQ_SANITY_CHECK)
1058 if (multiq_sanity_check) {
1059 global_check_entry_queue(main_entryq);
1060 group_check_run_queue(main_entryq, group);
1061 }
1062#endif
1063
1064 /*
1065 * Determine if we should look at the group or the global queue
1066 *
1067 * TODO:
1068 * Perhaps pass reason as a 'should look inside' argument to choose_thread
1069 * Should YIELD AST override drain limit?
1070 */
1071 if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
3e170ce0
A
1072 boolean_t favor_group = TRUE;
1073
1074 integer_t global_pri = main_entryq->highq;
1075 integer_t group_pri = group->runq.highq;
fe8ab488 1076
3e170ce0
A
1077 /*
1078 * Favor the current group if the group is still the globally highest.
1079 *
1080 * Otherwise, consider choosing a thread from the current group
1081 * even if it's lower priority than the global highest priority.
1082 */
1083 if (global_pri > group_pri) {
fe8ab488
A
1084 /*
1085 * If there's something elsewhere above the depth limit,
1086 * don't pick a thread below the limit.
1087 */
3e170ce0
A
1088 if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
1089 favor_group = FALSE;
fe8ab488
A
1090
1091 /*
3e170ce0
A
1092 * If there's something at or above the ceiling,
1093 * don't favor the group.
fe8ab488 1094 */
3e170ce0
A
1095 if (global_pri >= drain_ceiling)
1096 favor_group = FALSE;
fe8ab488 1097
3e170ce0
A
1098 /*
1099 * Don't go more than X steps below the global highest
1100 */
1101 if ((global_pri - group_pri) >= drain_band_limit)
1102 favor_group = FALSE;
fe8ab488
A
1103 }
1104
3e170ce0 1105 if (favor_group) {
fe8ab488
A
1106 /* Pull from local runq */
1107 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1108 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
3e170ce0 1109 MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
fe8ab488
A
1110
1111 return sched_group_dequeue_thread(main_entryq, group);
1112 }
1113 }
1114
1115 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1116 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1117 MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);
1118
1119 /* Couldn't pull from local runq, pull from global runq instead */
1120 if (deep_drain) {
1121 return sched_global_deep_drain_dequeue_thread(main_entryq);
1122 } else {
1123 return sched_global_dequeue_thread(main_entryq);
1124 }
1125}
1126
1127
1128/*
1129 * Thread must be locked, and not already be on a run queue.
1130 * pset is locked.
1131 */
1132static boolean_t
1133sched_multiq_processor_enqueue(
1134 processor_t processor,
1135 thread_t thread,
1136 integer_t options)
1137{
1138 boolean_t result;
1139
1140 assert(processor == thread->chosen_processor);
1141
1142 if (thread->bound_processor != PROCESSOR_NULL) {
1143 assert(thread->bound_processor == processor);
1144
1145 result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
1146 thread->runq = processor;
1147
1148 return result;
1149 }
1150
1151 sched_group_enqueue_thread(multiq_main_entryq(processor),
1152 thread->sched_group,
1153 thread, options);
1154
1155 thread->runq = processor;
1156
1157 return (FALSE);
1158}
1159
1160/*
1161 * Called in the context of thread with thread and pset unlocked,
1162 * after updating thread priority but before propagating that priority
1163 * to the processor
1164 */
1165void
1166sched_multiq_quantum_expire(thread_t thread)
1167{
1168 if (deep_drain) {
1169 /*
1170 * Move the entry at this priority to the end of the queue,
1171 * to allow the next task a shot at running.
1172 */
1173
1174 processor_t processor = thread->last_processor;
1175 processor_set_t pset = processor->processor_set;
1176 entry_queue_t entryq = multiq_main_entryq(processor);
1177
1178 pset_lock(pset);
1179
1180 sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
1181
1182 if (entry->runq == MULTIQ_ERUNQ) {
3e170ce0 1183 entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
fe8ab488
A
1184 }
1185
1186 pset_unlock(pset);
1187 }
1188}
1189
1190static boolean_t
1191sched_multiq_processor_queue_empty(processor_t processor)
1192{
1193 return multiq_main_entryq(processor)->count == 0 &&
1194 multiq_bound_runq(processor)->count == 0;
1195}
1196
1197static ast_t
1198sched_multiq_processor_csw_check(processor_t processor)
1199{
1200 boolean_t has_higher;
1201 int pri;
1202
a39ff7e2
A
1203 if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
1204 return (AST_PREEMPT | AST_URGENT);
1205 }
1206
fe8ab488 1207 entry_queue_t main_entryq = multiq_main_entryq(processor);
3e170ce0 1208 run_queue_t bound_runq = multiq_bound_runq(processor);
fe8ab488
A
1209
1210 assert(processor->active_thread != NULL);
1211
1212 pri = MAX(main_entryq->highq, bound_runq->highq);
1213
3e170ce0 1214 if (processor->first_timeslice) {
fe8ab488
A
1215 has_higher = (pri > processor->current_pri);
1216 } else {
1217 has_higher = (pri >= processor->current_pri);
1218 }
1219
1220 if (has_higher) {
1221 if (main_entryq->urgency > 0)
1222 return (AST_PREEMPT | AST_URGENT);
1223
1224 if (bound_runq->urgency > 0)
1225 return (AST_PREEMPT | AST_URGENT);
fe8ab488
A
1226
1227 return AST_PREEMPT;
1228 }
1229
1230 return AST_NONE;
1231}
1232
1233static boolean_t
1234sched_multiq_processor_queue_has_priority(
1235 processor_t processor,
1236 int priority,
1237 boolean_t gte)
1238{
39037602
A
1239 run_queue_t main_runq = multiq_main_entryq(processor);
1240 run_queue_t bound_runq = multiq_bound_runq(processor);
1241
39037602 1242 int qpri = MAX(main_runq->highq, bound_runq->highq);
fe8ab488
A
1243
1244 if (gte)
1245 return qpri >= priority;
1246 else
1247 return qpri > priority;
1248}
1249
fe8ab488
A
1250static int
1251sched_multiq_runq_count(processor_t processor)
1252{
1253 /*
1254 * TODO: Decide whether to keep a count of runnable threads in the pset
1255 * or just return something less than the true count.
1256 *
1257 * This needs to be fast, so no iterating the whole runq.
1258 *
1259 * Another possible decision is to remove this - with global runq
1260 * it doesn't make much sense.
1261 */
1262 return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
1263}
1264
1265static uint64_t
1266sched_multiq_runq_stats_count_sum(processor_t processor)
1267{
1268 /*
1269 * TODO: This one does need to go through all the runqueues, but it's only needed for
1270 * the sched stats tool
1271 */
1272
1273 uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;
1274
1275 if (processor->cpu_id == processor->processor_set->cpu_set_low)
1276 return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
1277 else
1278 return bound_sum;
1279}
1280
1281static int
1282sched_multiq_processor_bound_count(processor_t processor)
1283{
1284 return multiq_bound_runq(processor)->count;
1285}
1286
1287static void
1288sched_multiq_processor_queue_shutdown(processor_t processor)
1289{
1290 processor_set_t pset = processor->processor_set;
1291 entry_queue_t main_entryq = multiq_main_entryq(processor);
1292 thread_t thread;
1293 queue_head_t tqueue;
1294
1295 /* We only need to migrate threads if this is the last active processor in the pset */
1296 if (pset->online_processor_count > 0) {
1297 pset_unlock(pset);
1298 return;
1299 }
1300
1301 queue_init(&tqueue);
1302
1303 /* Note that we do not remove bound threads from the queues here */
1304
1305 while (main_entryq->count > 0) {
1306 thread = sched_global_dequeue_thread(main_entryq);
39037602 1307 enqueue_tail(&tqueue, &thread->runq_links);
fe8ab488
A
1308 }
1309
1310 pset_unlock(pset);
1311
39037602
A
1312 qe_foreach_element_safe(thread, &tqueue, runq_links) {
1313
1314 remqueue(&thread->runq_links);
1315
fe8ab488
A
1316 thread_lock(thread);
1317
1318 thread_setrun(thread, SCHED_TAILQ);
1319
1320 thread_unlock(thread);
1321 }
1322}
1323
1324/*
1325 * Thread is locked
1326 *
1327 * This is why we can never read sched_pri unless we have the thread locked.
1328 * Which we do in the enqueue and remove cases, but not the dequeue case.
1329 */
1330static boolean_t
1331sched_multiq_processor_queue_remove(
1332 processor_t processor,
1333 thread_t thread)
1334{
1335 boolean_t removed = FALSE;
fe8ab488
A
1336 processor_set_t pset = processor->processor_set;
1337
1338 pset_lock(pset);
1339
1340 if (thread->runq != PROCESSOR_NULL) {
1341 /*
1342 * Thread is on a run queue and we have a lock on
1343 * that run queue.
1344 */
1345
1346 assert(thread->runq == processor);
1347
1348 if (thread->bound_processor != PROCESSOR_NULL) {
1349 assert(processor == thread->bound_processor);
1350 run_queue_remove(multiq_bound_runq(processor), thread);
1351 thread->runq = PROCESSOR_NULL;
1352 } else {
1353 sched_group_remove_thread(multiq_main_entryq(processor),
1354 thread->sched_group,
1355 thread);
1356 }
1357
1358 removed = TRUE;
1359 }
1360
1361 pset_unlock(pset);
1362
1363 return removed;
1364}
1365
1366/* pset is locked, returned unlocked */
1367static thread_t
1368sched_multiq_steal_thread(processor_set_t pset)
1369{
1370 pset_unlock(pset);
1371 return (THREAD_NULL);
1372}
1373
1374/*
1375 * Scan the global queue for candidate groups, and scan those groups for
1376 * candidate threads.
1377 *
39037602
A
1378 * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
1379 * Instead, iterate only the queue in the group runq matching the priority of the entry.
1380 *
fe8ab488
A
1381 * Returns TRUE if retry is needed.
1382 */
1383static boolean_t
3e170ce0 1384group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
39037602
A
1385 int count = runq->count;
1386 int queue_index;
1387
1388 assert(count >= 0);
1389
1390 if (count == 0)
1391 return FALSE;
1392
1393 for (queue_index = bitmap_first(runq->bitmap, NRQS);
1394 queue_index >= 0;
1395 queue_index = bitmap_next(runq->bitmap, queue_index)) {
1396
1397 sched_entry_t entry;
1398
1399 qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
1400 assert(count > 0);
1401
1402 sched_group_t group = group_for_entry(entry);
1403 if (group->runq.count > 0) {
1404 if (runq_scan(&group->runq, scan_context))
1405 return (TRUE);
fe8ab488 1406 }
39037602 1407 count--;
fe8ab488
A
1408 }
1409 }
1410
1411 return (FALSE);
1412}
1413
1414static void
3e170ce0 1415sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
fe8ab488
A
1416{
1417 boolean_t restart_needed = FALSE;
1418 processor_t processor = processor_list;
1419 processor_set_t pset;
1420 thread_t thread;
1421 spl_t s;
1422
1423 /*
1424 * We update the threads associated with each processor (bound and idle threads)
1425 * and then update the threads in each pset runqueue.
1426 */
1427
1428 do {
1429 do {
1430 pset = processor->processor_set;
1431
1432 s = splsched();
1433 pset_lock(pset);
1434
3e170ce0 1435 restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
fe8ab488
A
1436
1437 pset_unlock(pset);
1438 splx(s);
1439
1440 if (restart_needed)
1441 break;
1442
1443 thread = processor->idle_thread;
1444 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1445 if (thread_update_add_thread(thread) == FALSE) {
1446 restart_needed = TRUE;
1447 break;
1448 }
1449 }
1450 } while ((processor = processor->processor_list) != NULL);
1451
1452 /* Ok, we now have a collection of candidates -- fix them. */
1453 thread_update_process_threads();
1454
1455 } while (restart_needed);
1456
1457 pset = &pset0;
1458
1459 do {
1460 do {
1461 s = splsched();
1462 pset_lock(pset);
1463
3e170ce0 1464 restart_needed = group_scan(&pset->pset_runq, scan_context);
fe8ab488
A
1465
1466 pset_unlock(pset);
1467 splx(s);
1468
1469 if (restart_needed)
1470 break;
1471 } while ((pset = pset->pset_list) != NULL);
1472
1473 /* Ok, we now have a collection of candidates -- fix them. */
1474 thread_update_process_threads();
1475
1476 } while (restart_needed);
1477}
a39ff7e2
A
1478
1479extern int sched_allow_rt_smt;
1480
1481/* Return true if this thread should not continue running on this processor */
1482static bool
1483sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
1484{
1485 if (processor->processor_primary != processor) {
1486 /*
1487 * This is a secondary SMT processor. If the primary is running
1488 * a realtime thread, only allow realtime threads on the secondary.
1489 */
1490 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
1491 return true;
1492 }
1493 }
1494
1495 return false;
1496}