[apple/xnu.git] osfmk/kern/sched_multiq.c (xnu-6153.61.1)
1/*
2 * Copyright (c) 2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31
32#include <machine/machine_routines.h>
33#include <machine/sched_param.h>
34#include <machine/machine_cpu.h>
35
36#include <kern/kern_types.h>
37#include <kern/debug.h>
38#include <kern/mach_param.h>
39#include <kern/machine.h>
40#include <kern/misc_protos.h>
41#include <kern/processor.h>
42#include <kern/queue.h>
43#include <kern/sched.h>
44#include <kern/sched_prim.h>
45#include <kern/task.h>
46#include <kern/thread.h>
47
48#include <sys/kdebug.h>
49
50/*
51 * Theory Statement
52 *
53 * How does the task scheduler work?
54 *
55 * It schedules threads across a few levels.
56 *
57 * RT threads are dealt with above us
58 * Bound threads go into the per-processor runq
59 * Non-bound threads are linked on their task's sched_group's runq
60 * sched_groups' sched_entries are linked on the pset's runq
61 *
62 * TODO: make this explicit - bound threads should have a different enqueue fxn
63 *
64 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
65 * or the current group's runqueue, then dequeue the next thread in that runqueue.
66 *
67 * We then manipulate the sched_entries to reflect the invariant that:
68 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
69 * runqueue.
70 *
71 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
72 * to run. This reduces the excess contention bonus given to processes which have work spread among many threads
73 * as compared to processes which do the same amount of work under fewer threads.
74 *
75 * NOTE: Currently, the multiq scheduler only supports one pset.
76 *
77 * NOTE ABOUT thread->sched_pri:
78 *
79 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
80 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
81 *
82 * TODO: Future features:
83 *
84 * Decouple the task priority from the sched_entry priority, allowing for:
85 * fast task priority change without having to iterate and re-dispatch all threads in the task.
86 * i.e. task-wide priority, task-wide boosting
87 * fancier group decay features
88 *
89 * Group (or task) decay:
90 * Decay is used for a few different things:
91 * Prioritizing latency-needing threads over throughput-needing threads for time-to-running
92 * Balancing work between threads in a process
93 * Balancing work done at the same priority between different processes
94 * Recovering from priority inversions between two threads in the same process
95 * Recovering from priority inversions between two threads in different processes
96 * Simulating a proportional share scheduler by allowing lower priority threads
97 * to run for a certain percentage of the time
98 *
99 * Task decay lets us separately address the 'same process' and 'different process' needs,
100 * which will allow us to make smarter tradeoffs in different cases.
101 * For example, we could resolve priority inversion in the same process by reordering threads without dropping the
102 * process below low priority threads in other processes.
103 *
104 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
105 *
106 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
107 * The entries array is 5K currently so it'd be really great to reduce.
108 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
109 *
110 * When preempting a processor, store a flag saying if the preemption
111 * was from a thread in the same group or different group,
112 * and tell choose_thread about it.
113 *
114 * When choosing a processor, bias towards those running in the same
115 * group as I am running (at the same priority, or within a certain band?).
116 *
117 * Decide if we need to support psets.
118 * Decide how to support psets - do we need duplicate entries for each pset,
119 * or can we get away with putting the entry in either one or the other pset?
120 *
121 * Consider the right way to handle runq count - I don't want to iterate groups.
122 * Perhaps keep a global counter.
123 * Alternate option - remove it from choose_processor. It doesn't add much value
124 * now that we have global runq.
125 *
126 * Need a better way of finding group to target instead of looking at current_task.
127 * Perhaps choose_thread could pass in the current thread?
128 *
129 * Consider unifying runq copy-pastes.
130 *
131 * Thoughts on having a group central quantum bucket:
132 *
133 * I see two algorithms to decide quanta:
134 * A) Hand off only when switching thread to thread in the same group
135 * B) Allocate and return quanta to the group's pool
136 *
137 * Issues:
138 * If a task blocks completely, should it come back with the leftover quanta
139 * or brand new quanta?
140 *
141 * Should I put a flag saying 'zero out the quantum you grab when you're dispatched'?
142 *
143 * Resolution:
144 * Handing off quanta between threads will help with jumping around in the current task
145 * but will not help when a thread from a different task is involved.
146 * Need an algorithm that works with round robin-ing between threads in different tasks
147 *
148 * But wait - round robining can only be triggered by quantum expire or blocking.
149 * We need something that works with preemption or yielding - that's the more interesting idea.
150 *
151 * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
152 * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
153 *
154 * New algorithm -
155 * Hand off quanta when hopping between threads with same sched_group
156 * Even if the thread was blocked, it uses the last thread's remaining quantum when it starts.
157 *
158 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum pool starts getting
159 * interesting.
160 *
161 * A thought - perhaps the handoff approach doesn't work so well in the presence of
162 * non-handoff wakeups, i.e. wake another thread, then wait, then block - that doesn't mean the
163 * woken thread is what I will switch to - another processor may have stolen it.
164 * What do we do there?
165 *
166 * Conclusions:
167 * We currently don't know of a scenario where quantum buckets on the task is beneficial.
168 * We will instead handoff quantum between threads in the task, and keep quantum
169 * on the preempted thread if it's preempted by something outside the task.
170 *
171 */
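/*
 * Illustration of the hierarchy described above, for a single pset:
 *
 *   processor->runq                      pset->pset_runq (entry_queue_t)
 *     [bound threads only]                 [one sched_entry per non-empty
 *                                           priority level of each group]
 *                                                      |
 *                                                      |  group_for_entry()
 *                                                      v
 *                                          sched_group->runq (group_runq_t)
 *                                            [unbound threads of that task]
 *
 * Invariant: a group's entry for priority P is enqueued on the pset runq
 * exactly when the group's own runq holds at least one thread at priority P.
 */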
172
173#if DEBUG || DEVELOPMENT
174#define MULTIQ_SANITY_CHECK
175#endif
176
177typedef struct sched_entry {
178 queue_chain_t entry_links;
179 int16_t sched_pri; /* scheduled (current) priority */
180 int16_t runq;
181 int32_t pad;
182} *sched_entry_t;
183
184typedef run_queue_t entry_queue_t; /* A run queue that holds sched_entries instead of threads */
185typedef run_queue_t group_runq_t; /* A run queue that is part of a sched_group */
186
187#define SCHED_ENTRY_NULL ((sched_entry_t) 0)
188#define MULTIQ_ERUNQ (-4) /* Indicates entry is on the main runq */
189
190/* Each level in the run queue corresponds to one entry in the entries array */
191struct sched_group {
192 struct sched_entry entries[NRQS];
193 struct run_queue runq;
194 queue_chain_t sched_groups;
195};
196
197/*
198 * Keep entry on the head of the runqueue while dequeueing threads.
199 * Only cycle it to the end of the runqueue when a thread in the task
200 * hits its quantum.
201 */
202static boolean_t deep_drain = FALSE;
203
204/* Verify the consistency of the runq before touching it */
205static boolean_t multiq_sanity_check = FALSE;
206
207/*
208 * Draining threads from the current task is preferred
209 * when they're less than X steps below the current
210 * global highest priority
211 */
212#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
213static integer_t drain_band_limit;
214
215/*
216 * Don't go below this priority level if there is something above it in another task
217 */
218#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
219static integer_t drain_depth_limit;
220
221/*
222 * Don't favor the task when there's something above this priority in another task.
223 */
224#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
225static integer_t drain_ceiling;
226
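/*
 * How the three limits above combine (see sched_multiq_choose_thread below):
 * when the current group is non-empty and the wakeup is not a preemption, the
 * group is drained in preference to the global runq unless the global highest
 * priority is strictly above the group's highest AND any of the following hold:
 *   global_pri >= drain_ceiling
 *   global_pri >  drain_depth_limit while group_pri <= drain_depth_limit
 *   (global_pri - group_pri) >= drain_band_limit
 */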
227static struct zone *sched_group_zone;
228
229static uint64_t num_sched_groups = 0;
230static queue_head_t sched_groups;
231
232static lck_attr_t sched_groups_lock_attr;
233static lck_grp_t sched_groups_lock_grp;
234static lck_grp_attr_t sched_groups_lock_grp_attr;
235
236static lck_mtx_t sched_groups_lock;
237
238
239static void
240sched_multiq_init(void);
241
242static thread_t
243sched_multiq_steal_thread(processor_set_t pset);
244
245static void
246sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
247
248static boolean_t
249sched_multiq_processor_enqueue(processor_t processor, thread_t thread,
250 sched_options_t options);
251
252static boolean_t
253sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
254
255void
256sched_multiq_quantum_expire(thread_t thread);
257
258static ast_t
259sched_multiq_processor_csw_check(processor_t processor);
260
261static boolean_t
262sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
263
264static int
265sched_multiq_runq_count(processor_t processor);
266
267static boolean_t
268sched_multiq_processor_queue_empty(processor_t processor);
269
270static uint64_t
271sched_multiq_runq_stats_count_sum(processor_t processor);
272
273static int
274sched_multiq_processor_bound_count(processor_t processor);
275
276static void
277sched_multiq_pset_init(processor_set_t pset);
278
279static void
280sched_multiq_processor_init(processor_t processor);
281
282static thread_t
283sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);
284
285static void
286sched_multiq_processor_queue_shutdown(processor_t processor);
287
288static sched_mode_t
289sched_multiq_initial_thread_sched_mode(task_t parent_task);
290
291static bool
292sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
293
294const struct sched_dispatch_table sched_multiq_dispatch = {
295 .sched_name = "multiq",
296 .init = sched_multiq_init,
297 .timebase_init = sched_timeshare_timebase_init,
298 .processor_init = sched_multiq_processor_init,
299 .pset_init = sched_multiq_pset_init,
300 .maintenance_continuation = sched_timeshare_maintenance_continue,
301 .choose_thread = sched_multiq_choose_thread,
302 .steal_thread_enabled = sched_steal_thread_DISABLED,
303 .steal_thread = sched_multiq_steal_thread,
304 .compute_timeshare_priority = sched_compute_timeshare_priority,
305 .choose_processor = choose_processor,
306 .processor_enqueue = sched_multiq_processor_enqueue,
307 .processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
308 .processor_queue_remove = sched_multiq_processor_queue_remove,
309 .processor_queue_empty = sched_multiq_processor_queue_empty,
310 .priority_is_urgent = priority_is_urgent,
311 .processor_csw_check = sched_multiq_processor_csw_check,
312 .processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
313 .initial_quantum_size = sched_timeshare_initial_quantum_size,
314 .initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
315 .can_update_priority = can_update_priority,
316 .update_priority = update_priority,
317 .lightweight_update_priority = lightweight_update_priority,
318 .quantum_expire = sched_multiq_quantum_expire,
319 .processor_runq_count = sched_multiq_runq_count,
320 .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
321 .processor_bound_count = sched_multiq_processor_bound_count,
322 .thread_update_scan = sched_multiq_thread_update_scan,
323 .multiple_psets_enabled = FALSE,
324 .sched_groups_enabled = TRUE,
325 .avoid_processor_enabled = TRUE,
326 .thread_avoid_processor = sched_multiq_thread_avoid_processor,
327 .processor_balance = sched_SMT_balance,
328
329 .rt_runq = sched_rtglobal_runq,
330 .rt_init = sched_rtglobal_init,
331 .rt_queue_shutdown = sched_rtglobal_queue_shutdown,
332 .rt_runq_scan = sched_rtglobal_runq_scan,
333 .rt_runq_count_sum = sched_rtglobal_runq_count_sum,
334
335 .qos_max_parallelism = sched_qos_max_parallelism,
336 .check_spill = sched_check_spill,
337 .ipi_policy = sched_ipi_policy,
338 .thread_should_yield = sched_thread_should_yield,
339 .run_count_incr = sched_run_incr,
340 .run_count_decr = sched_run_decr,
341 .update_thread_bucket = sched_update_thread_bucket,
342 .pset_made_schedulable = sched_pset_made_schedulable,
343};
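/*
 * This table is how the multiq policy plugs into the generic scheduler
 * framework: only the runqueue operations are provided locally, while
 * timeshare priority decay, the realtime queues, and the IPI/spill policy
 * are reused from the common implementations. Selection of the policy by
 * its .sched_name happens outside this file.
 */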
344
345
346static void
347sched_multiq_init(void)
348{
349#if defined(MULTIQ_SANITY_CHECK)
350 PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
351#endif
352
353 PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
354
355 if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
356 drain_ceiling = DEFAULT_DRAIN_CEILING;
357 }
358
359 if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
360 drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
361 }
362
363 if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
364 drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
365 }
366
367 printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
368 deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
369
370 sched_group_zone = zinit(
371 sizeof(struct sched_group),
372 task_max * sizeof(struct sched_group),
373 PAGE_SIZE,
374 "sched groups");
375
376 zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
377 zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);
378
379 queue_init(&sched_groups);
380
381 lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
382 lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
383 lck_attr_setdefault(&sched_groups_lock_attr);
384 lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
385
386 sched_timeshare_init();
387}
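/*
 * The tunables parsed above can be overridden from the boot-args string,
 * for example (illustrative values, not the defaults):
 *
 *	-multiq-deep-drain multiq_drain_ceiling=64 multiq_drain_band_limit=16
 *
 * which would enable deep drain, raise the ceiling, and tighten the band limit.
 */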
388
389static void
390sched_multiq_processor_init(processor_t processor)
391{
392 run_queue_init(&processor->runq);
393}
394
395static void
396sched_multiq_pset_init(processor_set_t pset)
397{
398 run_queue_init(&pset->pset_runq);
399}
400
401static sched_mode_t
402sched_multiq_initial_thread_sched_mode(task_t parent_task)
403{
404 if (parent_task == kernel_task) {
405 return TH_MODE_FIXED;
406 } else {
407 return TH_MODE_TIMESHARE;
408 }
409}
410
411sched_group_t
412sched_group_create(void)
413{
414 sched_group_t sched_group;
415
416 if (!SCHED(sched_groups_enabled)) {
417 return SCHED_GROUP_NULL;
418 }
419
420 sched_group = (sched_group_t)zalloc(sched_group_zone);
421
422 bzero(sched_group, sizeof(struct sched_group));
423
424 run_queue_init(&sched_group->runq);
425
426 for (int i = 0; i < NRQS; i++) {
427 sched_group->entries[i].runq = 0;
428 sched_group->entries[i].sched_pri = i;
429 }
430
431 lck_mtx_lock(&sched_groups_lock);
432 queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
433 num_sched_groups++;
434 lck_mtx_unlock(&sched_groups_lock);
435
436 return sched_group;
437}
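/*
 * Usage sketch (hypothetical caller, for illustration only): the theory
 * statement assumes one group per task, created at task setup and destroyed
 * once the group's runq has drained:
 *
 *	sched_group_t group = sched_group_create();	// SCHED_GROUP_NULL if groups are disabled
 *	// ...unbound runnable threads of the task are linked on group->runq...
 *	sched_group_destroy(group);			// requires group->runq.count == 0
 */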
438
439void
440sched_group_destroy(sched_group_t sched_group)
441{
442 if (!SCHED(sched_groups_enabled)) {
443 assert(sched_group == SCHED_GROUP_NULL);
444 return;
445 }
446
447 assert(sched_group != SCHED_GROUP_NULL);
448 assert(sched_group->runq.count == 0);
449
450 for (int i = 0; i < NRQS; i++) {
451 assert(sched_group->entries[i].runq == 0);
452 assert(sched_group->entries[i].sched_pri == i);
453 }
454
455 lck_mtx_lock(&sched_groups_lock);
456 queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
457 num_sched_groups--;
458 lck_mtx_unlock(&sched_groups_lock);
459
460 zfree(sched_group_zone, sched_group);
461}
462
463__attribute__((always_inline))
464static inline entry_queue_t
465multiq_main_entryq(processor_t processor)
466{
467 return (entry_queue_t)&processor->processor_set->pset_runq;
468}
469
470__attribute__((always_inline))
471static inline run_queue_t
472multiq_bound_runq(processor_t processor)
473{
474 return &processor->runq;
475}
476
477__attribute__((always_inline))
478static inline sched_entry_t
479group_entry_for_pri(sched_group_t group, integer_t pri)
480{
481 return &group->entries[pri];
482}
483
484__attribute__((always_inline))
485static inline sched_group_t
486group_for_entry(sched_entry_t entry)
487{
488#pragma clang diagnostic push
489#pragma clang diagnostic ignored "-Wcast-align"
490 sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
491#pragma clang diagnostic pop
492 return group;
493}
494
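/*
 * Worked example of the pointer arithmetic in group_for_entry(): entries[] is
 * the first member of struct sched_group and sched_group_create() sets
 * entries[i].sched_pri = i, so for an entry at priority 5,
 * 'entry - entry->sched_pri' steps back 5 array slots to &entries[0], which is
 * also the address of the enclosing sched_group.
 */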
495/* Peek at the head of the runqueue */
496static sched_entry_t
497entry_queue_first_entry(entry_queue_t rq)
498{
499 assert(rq->count != 0);
500
501 circle_queue_t queue = &rq->queues[rq->highq];
502
503 sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links);
504
505 assert(entry->sched_pri == rq->highq);
506
507 return entry;
508}
509
510#if defined(MULTIQ_SANITY_CHECK)
511
512#if MACH_ASSERT
513__attribute__((always_inline))
514static inline boolean_t
515queue_chain_linked(queue_chain_t* chain)
516{
517 if (chain->next != NULL) {
518 assert(chain->prev != NULL);
519 return TRUE;
520 } else {
521 assert(chain->prev == NULL);
522 return FALSE;
523 }
524}
525#endif /* MACH_ASSERT */
526
527static thread_t
528group_first_thread(sched_group_t group)
529{
530 group_runq_t rq = &group->runq;
531
532 assert(rq->count != 0);
533
534 circle_queue_t queue = &rq->queues[rq->highq];
535
536 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
537
538 assert(thread != THREAD_NULL);
539 assert_thread_magic(thread);
540
541 assert(thread->sched_group == group);
542
543 /* TODO: May not be safe */
544 assert(thread->sched_pri == rq->highq);
545
546 return thread;
547}
548
549/* Asserts if entry is not in entry runq at pri */
550static void
551entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
552{
553 circle_queue_t q;
554 sched_entry_t elem;
555
556 assert(queue_chain_linked(&entry->entry_links));
557 assert(entry->runq == MULTIQ_ERUNQ);
558
559 q = &runq->queues[expected_pri];
560
561 cqe_foreach_element(elem, q, entry_links) {
562 if (elem == entry) {
563 return;
564 }
565 }
566
567 panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
568}
569
570/* Asserts if thread is not in group at its priority */
571static void
572sched_group_check_thread(sched_group_t group, thread_t thread)
573{
574 circle_queue_t q;
575 thread_t elem;
576 int pri = thread->sched_pri;
577
578 assert(thread->runq != PROCESSOR_NULL);
579
580 q = &group->runq.queues[pri];
581
582 cqe_foreach_element(elem, q, runq_links) {
583 if (elem == thread) {
584 return;
585 }
586 }
587
588 panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
589}
590
591static void
592global_check_entry_queue(entry_queue_t main_entryq)
593{
594 if (main_entryq->count == 0) {
595 return;
596 }
597
598 sched_entry_t entry = entry_queue_first_entry(main_entryq);
599
600 assert(entry->runq == MULTIQ_ERUNQ);
601
602 sched_group_t group = group_for_entry(entry);
603
604 thread_t thread = group_first_thread(group);
605
606 __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
607
608 assert(entry->sched_pri == group->runq.highq);
609
610 assert(entry == thread_entry);
611 assert(thread->runq != PROCESSOR_NULL);
612}
613
614static void
615group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
616{
617 if (group->runq.count == 0) {
618 return;
619 }
620
621 thread_t thread = group_first_thread(group);
622
623 assert(thread->runq != PROCESSOR_NULL);
624
625 sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
626
627 entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);
628
629 assert(sched_entry->sched_pri == thread->sched_pri);
630 assert(sched_entry->runq == MULTIQ_ERUNQ);
631}
632
633#endif /* defined(MULTIQ_SANITY_CHECK) */
634
635/*
636 * The run queue must not be empty.
637 */
638static sched_entry_t
639entry_queue_dequeue_entry(entry_queue_t rq)
640{
641 sched_entry_t sched_entry;
642 circle_queue_t queue = &rq->queues[rq->highq];
643
644 assert(rq->count > 0);
645 assert(!circle_queue_empty(queue));
646
647 sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links);
648
649 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
650 rq->count--;
651 if (SCHED(priority_is_urgent)(rq->highq)) {
652 rq->urgency--; assert(rq->urgency >= 0);
653 }
654 if (circle_queue_empty(queue)) {
655 rq_bitmap_clear(rq->bitmap, rq->highq);
656 rq->highq = bitmap_first(rq->bitmap, NRQS);
657 }
658
659 sched_entry->runq = 0;
660
661 return sched_entry;
662}
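/*
 * Note: as in the other runq helpers below, rq->highq is never decremented
 * directly; when a priority level empties, its bit is cleared and highq is
 * recomputed as the first set bit in rq->bitmap.
 */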
663
664/*
665 * The run queue must not be empty.
666 */
667static boolean_t
668entry_queue_enqueue_entry(
669 entry_queue_t rq,
670 sched_entry_t entry,
671 integer_t options)
672{
673 int sched_pri = entry->sched_pri;
674 circle_queue_t queue = &rq->queues[sched_pri];
675 boolean_t result = FALSE;
676
677 assert(entry->runq == 0);
678
679 if (circle_queue_empty(queue)) {
680 circle_enqueue_tail(queue, &entry->entry_links);
681
682 rq_bitmap_set(rq->bitmap, sched_pri);
683 if (sched_pri > rq->highq) {
684 rq->highq = sched_pri;
685 result = TRUE;
686 }
687 } else {
688 if (options & SCHED_TAILQ) {
689 circle_enqueue_tail(queue, &entry->entry_links);
690 } else {
691 circle_enqueue_head(queue, &entry->entry_links);
692 }
693 }
694 if (SCHED(priority_is_urgent)(sched_pri)) {
695 rq->urgency++;
696 }
697 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
698 rq->count++;
699
700 entry->runq = MULTIQ_ERUNQ;
701
702 return result;
703}
704
705/*
706 * The entry must be in this runqueue.
707 */
708static void
709entry_queue_remove_entry(
710 entry_queue_t rq,
711 sched_entry_t entry)
712{
713 int sched_pri = entry->sched_pri;
714
715#if defined(MULTIQ_SANITY_CHECK)
716 if (multiq_sanity_check) {
717 entry_queue_check_entry(rq, entry, sched_pri);
718 }
719#endif
720
721 remqueue(&entry->entry_links);
722
723 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
724 rq->count--;
725 if (SCHED(priority_is_urgent)(sched_pri)) {
726 rq->urgency--; assert(rq->urgency >= 0);
727 }
728
729 if (circle_queue_empty(&rq->queues[sched_pri])) {
730 /* update run queue status */
731 rq_bitmap_clear(rq->bitmap, sched_pri);
732 rq->highq = bitmap_first(rq->bitmap, NRQS);
733 }
734
735 entry->runq = 0;
736}
737
738static void
739entry_queue_change_entry(
740 entry_queue_t rq,
741 sched_entry_t entry,
742 integer_t options)
743{
744 int sched_pri = entry->sched_pri;
745 circle_queue_t queue = &rq->queues[sched_pri];
746
747#if defined(MULTIQ_SANITY_CHECK)
748 if (multiq_sanity_check) {
749 entry_queue_check_entry(rq, entry, sched_pri);
750 }
751#endif
752
753 circle_dequeue(queue, &entry->entry_links);
754 if (options & SCHED_TAILQ) {
755 circle_enqueue_tail(queue, &entry->entry_links);
756 } else {
757 circle_enqueue_head(queue, &entry->entry_links);
758 }
759}
760/*
761 * The run queue must not be empty.
762 *
763 * sets queue_empty to TRUE if queue is now empty at thread_pri
764 */
765static thread_t
766group_run_queue_dequeue_thread(
767 group_runq_t rq,
768 integer_t *thread_pri,
769 boolean_t *queue_empty)
770{
771 thread_t thread;
772 circle_queue_t queue = &rq->queues[rq->highq];
773
774 assert(rq->count > 0);
775 assert(!circle_queue_empty(queue));
776
777 *thread_pri = rq->highq;
778
779 thread = cqe_dequeue_head(queue, struct thread, runq_links);
780 assert_thread_magic(thread);
781
782 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
783 rq->count--;
784 if (SCHED(priority_is_urgent)(rq->highq)) {
785 rq->urgency--; assert(rq->urgency >= 0);
786 }
787 if (circle_queue_empty(queue)) {
788 rq_bitmap_clear(rq->bitmap, rq->highq);
789 rq->highq = bitmap_first(rq->bitmap, NRQS);
790 *queue_empty = TRUE;
791 } else {
792 *queue_empty = FALSE;
793 }
794
795 return thread;
796}
797
798/*
799 * The run queue must not be empty.
800 * returns TRUE if queue was empty at thread_pri
801 */
802static boolean_t
803group_run_queue_enqueue_thread(
804 group_runq_t rq,
805 thread_t thread,
806 integer_t thread_pri,
807 integer_t options)
808{
809 circle_queue_t queue = &rq->queues[thread_pri];
810 boolean_t result = FALSE;
811
812 assert(thread->runq == PROCESSOR_NULL);
813 assert_thread_magic(thread);
814
815 if (circle_queue_empty(queue)) {
816 circle_enqueue_tail(queue, &thread->runq_links);
817
818 rq_bitmap_set(rq->bitmap, thread_pri);
819 if (thread_pri > rq->highq) {
820 rq->highq = thread_pri;
821 }
822 result = TRUE;
823 } else {
824 if (options & SCHED_TAILQ) {
825 circle_enqueue_tail(queue, &thread->runq_links);
826 } else {
827 circle_enqueue_head(queue, &thread->runq_links);
828 }
829 }
830 if (SCHED(priority_is_urgent)(thread_pri)) {
831 rq->urgency++;
832 }
833 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
834 rq->count++;
835
836 return result;
837}
838
839/*
840 * The thread must be in this runqueue.
841 * returns TRUE if queue is now empty at thread_pri
842 */
843static boolean_t
844group_run_queue_remove_thread(
845 group_runq_t rq,
846 thread_t thread,
847 integer_t thread_pri)
848{
849 circle_queue_t queue = &rq->queues[thread_pri];
850 boolean_t result = FALSE;
851
852 assert_thread_magic(thread);
853 assert(thread->runq != PROCESSOR_NULL);
854
855 circle_dequeue(queue, &thread->runq_links);
856
857 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
858 rq->count--;
859 if (SCHED(priority_is_urgent)(thread_pri)) {
860 rq->urgency--; assert(rq->urgency >= 0);
861 }
862
863 if (circle_queue_empty(queue)) {
864 /* update run queue status */
865 rq_bitmap_clear(rq->bitmap, thread_pri);
866 rq->highq = bitmap_first(rq->bitmap, NRQS);
867 result = TRUE;
868 }
869
870 thread->runq = PROCESSOR_NULL;
871
872 return result;
873}
874
875/*
876 * A thread's sched pri may change out from under us because
877 * we're clearing thread->runq here without the thread locked.
878 * Do not rely on it to be the same as when we enqueued.
879 */
880static thread_t
881sched_global_dequeue_thread(entry_queue_t main_entryq)
882{
883 boolean_t pri_level_empty = FALSE;
884 sched_entry_t entry;
885 group_runq_t group_runq;
886 thread_t thread;
887 integer_t thread_pri;
888 sched_group_t group;
889
890 assert(main_entryq->count > 0);
891
892 entry = entry_queue_dequeue_entry(main_entryq);
893
894 group = group_for_entry(entry);
895 group_runq = &group->runq;
896
897 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
898
899 thread->runq = PROCESSOR_NULL;
900
901 if (!pri_level_empty) {
902 entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
903 }
904
905 return thread;
906}
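/*
 * Because the entry is dequeued and then re-enqueued at the tail whenever its
 * priority level still has threads, groups sharing the global highest priority
 * take turns: one thread is taken from each group in rotation.
 */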
907
908/* Dequeue a thread from the global runq without moving the entry */
909static thread_t
910sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
911{
912 boolean_t pri_level_empty = FALSE;
913 sched_entry_t entry;
914 group_runq_t group_runq;
915 thread_t thread;
916 integer_t thread_pri;
917 sched_group_t group;
918
919 assert(main_entryq->count > 0);
920
921 entry = entry_queue_first_entry(main_entryq);
922
923 group = group_for_entry(entry);
924 group_runq = &group->runq;
925
926 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
927
928 thread->runq = PROCESSOR_NULL;
929
930 if (pri_level_empty) {
931 entry_queue_remove_entry(main_entryq, entry);
932 }
933
934 return thread;
935}
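/*
 * Contrast with sched_global_dequeue_thread() above: in deep-drain mode the
 * entry stays at the head of its level, so the same group keeps supplying
 * threads until a quantum expires and sched_multiq_quantum_expire() cycles
 * the entry to the tail.
 */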
936
937
938static thread_t
939sched_group_dequeue_thread(
940 entry_queue_t main_entryq,
941 sched_group_t group)
942{
943 group_runq_t group_runq = &group->runq;
944 boolean_t pri_level_empty = FALSE;
945 thread_t thread;
946 integer_t thread_pri;
947
948 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
949
950 thread->runq = PROCESSOR_NULL;
951
952 if (pri_level_empty) {
953 entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
954 }
955
956 return thread;
957}
958
959static void
960sched_group_remove_thread(
961 entry_queue_t main_entryq,
962 sched_group_t group,
963 thread_t thread)
964{
965 integer_t thread_pri = thread->sched_pri;
966 sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
967
968#if defined(MULTIQ_SANITY_CHECK)
969 if (multiq_sanity_check) {
970 global_check_entry_queue(main_entryq);
971 group_check_run_queue(main_entryq, group);
972
973 sched_group_check_thread(group, thread);
974 entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
975 }
976#endif
977
978 boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);
979
980 if (pri_level_empty) {
981 entry_queue_remove_entry(main_entryq, sched_entry);
982 }
983
984#if defined(MULTIQ_SANITY_CHECK)
985 if (multiq_sanity_check) {
986 global_check_entry_queue(main_entryq);
987 group_check_run_queue(main_entryq, group);
988 }
989#endif
990}
991
992static void
993sched_group_enqueue_thread(
994 entry_queue_t main_entryq,
995 sched_group_t group,
996 thread_t thread,
997 integer_t options)
998{
999#if defined(MULTIQ_SANITY_CHECK)
1000 if (multiq_sanity_check) {
1001 global_check_entry_queue(main_entryq);
1002 group_check_run_queue(main_entryq, group);
1003 }
1004#endif
1005
1006 int sched_pri = thread->sched_pri;
1007
1008 boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
1009
1010 if (pri_level_was_empty) {
1011 /*
1012 * TODO: Need to figure out if passing options here is a good idea or not
1013 * What effects would it have?
1014 */
1015 entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
1016 } else if (options & SCHED_HEADQ) {
1017 /* The thread should be at the head of the line - move its entry to the front */
1018 entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
1019 }
1020}
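/*
 * This is where the entry invariant is maintained on the enqueue side: the
 * thread always goes into its group's runq, and the group's entry for that
 * priority is added to the main entry queue only when that level was empty
 * (or moved to the head for SCHED_HEADQ), so each non-empty level is
 * represented exactly once.
 */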
1021
1022/*
1023 * Locate a thread to execute from the run queue and return it.
1024 * Only choose a thread with greater or equal priority.
1025 *
1026 * pset is locked, thread is not locked.
1027 *
1028 * Returns THREAD_NULL if it cannot find a valid thread.
1029 *
1030 * Note: we cannot rely on the value of thread->sched_pri in this path because
1031 * we don't have the thread locked.
1032 *
1033 * TODO: Remove tracepoints
1034 */
1035static thread_t
1036sched_multiq_choose_thread(
1037 processor_t processor,
1038 int priority,
1039 ast_t reason)
1040{
1041 entry_queue_t main_entryq = multiq_main_entryq(processor);
1042 run_queue_t bound_runq = multiq_bound_runq(processor);
1043
1044 boolean_t choose_bound_runq = FALSE;
1045
1046 if (bound_runq->highq < priority &&
1047 main_entryq->highq < priority) {
1048 return THREAD_NULL;
1049 }
1050
1051 if (bound_runq->count && main_entryq->count) {
1052 if (bound_runq->highq >= main_entryq->highq) {
1053 choose_bound_runq = TRUE;
1054 } else {
1055 /* Use main runq */
1056 }
1057 } else if (bound_runq->count) {
1058 choose_bound_runq = TRUE;
1059 } else if (main_entryq->count) {
1060 /* Use main runq */
1061 } else {
1062 return THREAD_NULL;
1063 }
1064
1065 if (choose_bound_runq) {
1066 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1067 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1068 MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);
1069
1070 return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1071 }
1072
1073 sched_group_t group = current_thread()->sched_group;
1074
1075#if defined(MULTIQ_SANITY_CHECK)
1076 if (multiq_sanity_check) {
1077 global_check_entry_queue(main_entryq);
1078 group_check_run_queue(main_entryq, group);
1079 }
1080#endif
1081
1082 /*
1083 * Determine if we should look at the group or the global queue
1084 *
1085 * TODO:
1086 * Perhaps pass reason as a 'should look inside' argument to choose_thread
1087 * Should YIELD AST override drain limit?
1088 */
1089 if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
1090 boolean_t favor_group = TRUE;
1091
1092 integer_t global_pri = main_entryq->highq;
1093 integer_t group_pri = group->runq.highq;
1094
1095 /*
1096 * Favor the current group if the group is still the globally highest.
1097 *
1098 * Otherwise, consider choosing a thread from the current group
1099 * even if it's lower priority than the global highest priority.
1100 */
1101 if (global_pri > group_pri) {
1102 /*
1103 * If there's something elsewhere above the depth limit,
1104 * don't pick a thread below the limit.
1105 */
1106 if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit) {
1107 favor_group = FALSE;
1108 }
1109
1110 /*
1111 * If there's something at or above the ceiling,
1112 * don't favor the group.
fe8ab488 1113 */
1114 if (global_pri >= drain_ceiling) {
1115 favor_group = FALSE;
1116 }
1117
1118 /*
1119 * Don't go more than X steps below the global highest
1120 */
1121 if ((global_pri - group_pri) >= drain_band_limit) {
1122 favor_group = FALSE;
1123 }
1124 }
1125
1126 if (favor_group) {
1127 /* Pull from local runq */
1128 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1129 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1130 MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
1131
1132 return sched_group_dequeue_thread(main_entryq, group);
1133 }
1134 }
1135
1136 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1137 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1138 MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);
1139
1140 /* Couldn't pull from local runq, pull from global runq instead */
1141 if (deep_drain) {
1142 return sched_global_deep_drain_dequeue_thread(main_entryq);
1143 } else {
1144 return sched_global_dequeue_thread(main_entryq);
1145 }
1146}
1147
1148
1149/*
1150 * Thread must be locked, and not already be on a run queue.
1151 * pset is locked.
1152 */
1153static boolean_t
1154sched_multiq_processor_enqueue(
1155 processor_t processor,
1156 thread_t thread,
1157 sched_options_t options)
1158{
1159 boolean_t result;
1160
1161 assert(processor == thread->chosen_processor);
1162
1163 if (thread->bound_processor != PROCESSOR_NULL) {
1164 assert(thread->bound_processor == processor);
1165
1166 result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
1167 thread->runq = processor;
1168
1169 return result;
1170 }
1171
1172 sched_group_enqueue_thread(multiq_main_entryq(processor),
1173 thread->sched_group,
1174 thread, options);
1175
1176 thread->runq = processor;
1177
1178 return FALSE;
1179}
1180
1181/*
1182 * Called in the context of thread with thread and pset unlocked,
1183 * after updating thread priority but before propagating that priority
1184 * to the processor
1185 */
1186void
1187sched_multiq_quantum_expire(thread_t thread)
1188{
1189 if (deep_drain) {
1190 /*
1191 * Move the entry at this priority to the end of the queue,
1192 * to allow the next task a shot at running.
1193 */
1194
1195 processor_t processor = thread->last_processor;
1196 processor_set_t pset = processor->processor_set;
1197 entry_queue_t entryq = multiq_main_entryq(processor);
1198
1199 pset_lock(pset);
1200
1201 sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
1202
1203 if (entry->runq == MULTIQ_ERUNQ) {
1204 entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
1205 }
1206
1207 pset_unlock(pset);
1208 }
1209}
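/*
 * Together with sched_global_deep_drain_dequeue_thread(), this implements the
 * policy from the theory statement: while draining, the group's entry stays at
 * the head of its priority level, and only a quantum expiration in that group
 * cycles the entry to the tail so the next task gets a shot at running.
 */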
1210
1211static boolean_t
1212sched_multiq_processor_queue_empty(processor_t processor)
1213{
1214 return multiq_main_entryq(processor)->count == 0 &&
1215 multiq_bound_runq(processor)->count == 0;
1216}
1217
1218static ast_t
1219sched_multiq_processor_csw_check(processor_t processor)
1220{
1221 boolean_t has_higher;
1222 int pri;
1223
1224 if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
1225 return AST_PREEMPT | AST_URGENT;
1226 }
1227
1228 entry_queue_t main_entryq = multiq_main_entryq(processor);
1229 run_queue_t bound_runq = multiq_bound_runq(processor);
1230
1231 assert(processor->active_thread != NULL);
1232
1233 pri = MAX(main_entryq->highq, bound_runq->highq);
1234
1235 if (processor->first_timeslice) {
1236 has_higher = (pri > processor->current_pri);
1237 } else {
1238 has_higher = (pri >= processor->current_pri);
1239 }
1240
1241 if (has_higher) {
1242 if (main_entryq->urgency > 0) {
1243 return AST_PREEMPT | AST_URGENT;
1244 }
1245
1246 if (bound_runq->urgency > 0) {
1247 return AST_PREEMPT | AST_URGENT;
1248 }
1249
1250 return AST_PREEMPT;
1251 }
1252
1253 return AST_NONE;
1254}
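/*
 * Note the asymmetry above: while the running thread is still in its first
 * timeslice, only a strictly higher priority generates a preemption AST;
 * afterwards an equal priority is enough, which lets same-priority threads
 * round-robin at quantum boundaries.
 */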
1255
1256static boolean_t
1257sched_multiq_processor_queue_has_priority(
1258 processor_t processor,
1259 int priority,
1260 boolean_t gte)
1261{
1262 run_queue_t main_runq = multiq_main_entryq(processor);
1263 run_queue_t bound_runq = multiq_bound_runq(processor);
1264
1265 int qpri = MAX(main_runq->highq, bound_runq->highq);
1266
1267 if (gte) {
1268 return qpri >= priority;
1269 } else {
1270 return qpri > priority;
1271 }
1272}
1273
1274static int
1275sched_multiq_runq_count(processor_t processor)
1276{
1277 /*
1278 * TODO: Decide whether to keep a count of runnable threads in the pset
1279 * or just return something less than the true count.
1280 *
1281 * This needs to be fast, so no iterating the whole runq.
1282 *
1283 * Another possible decision is to remove this - with global runq
1284 * it doesn't make much sense.
1285 */
1286 return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
1287}
1288
1289static uint64_t
1290sched_multiq_runq_stats_count_sum(processor_t processor)
1291{
1292 /*
1293 * TODO: This one does need to go through all the runqueues, but it's only needed for
1294 * the sched stats tool
1295 */
1296
1297 uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;
1298
1299 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
1300 return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
1301 } else {
1302 return bound_sum;
1303 }
1304}
1305
1306static int
1307sched_multiq_processor_bound_count(processor_t processor)
1308{
1309 return multiq_bound_runq(processor)->count;
1310}
1311
1312static void
1313sched_multiq_processor_queue_shutdown(processor_t processor)
1314{
1315 processor_set_t pset = processor->processor_set;
1316 entry_queue_t main_entryq = multiq_main_entryq(processor);
1317 thread_t thread;
1318 queue_head_t tqueue;
1319
1320 /* We only need to migrate threads if this is the last active processor in the pset */
1321 if (pset->online_processor_count > 0) {
1322 pset_unlock(pset);
1323 return;
1324 }
1325
1326 queue_init(&tqueue);
1327
1328 /* Note that we do not remove bound threads from the queues here */
1329
1330 while (main_entryq->count > 0) {
1331 thread = sched_global_dequeue_thread(main_entryq);
1332 enqueue_tail(&tqueue, &thread->runq_links);
1333 }
1334
1335 pset_unlock(pset);
1336
1337 qe_foreach_element_safe(thread, &tqueue, runq_links) {
1338 remqueue(&thread->runq_links);
1339
1340 thread_lock(thread);
1341
1342 thread_setrun(thread, SCHED_TAILQ);
1343
1344 thread_unlock(thread);
1345 }
1346}
1347
1348/*
1349 * Thread is locked
1350 *
1351 * This is why we can never read sched_pri unless we have the thread locked.
1352 * Which we do in the enqueue and remove cases, but not the dequeue case.
1353 */
1354static boolean_t
1355sched_multiq_processor_queue_remove(
1356 processor_t processor,
1357 thread_t thread)
1358{
1359 boolean_t removed = FALSE;
1360 processor_set_t pset = processor->processor_set;
1361
1362 pset_lock(pset);
1363
1364 if (thread->runq != PROCESSOR_NULL) {
1365 /*
1366 * Thread is on a run queue and we have a lock on
1367 * that run queue.
1368 */
1369
1370 assert(thread->runq == processor);
1371
1372 if (thread->bound_processor != PROCESSOR_NULL) {
1373 assert(processor == thread->bound_processor);
1374 run_queue_remove(multiq_bound_runq(processor), thread);
1375 thread->runq = PROCESSOR_NULL;
1376 } else {
1377 sched_group_remove_thread(multiq_main_entryq(processor),
1378 thread->sched_group,
1379 thread);
1380 }
1381
1382 removed = TRUE;
1383 }
1384
1385 pset_unlock(pset);
1386
1387 return removed;
1388}
1389
1390/* pset is locked, returned unlocked */
1391static thread_t
1392sched_multiq_steal_thread(processor_set_t pset)
1393{
1394 pset_unlock(pset);
1395 return THREAD_NULL;
1396}
1397
1398/*
1399 * Scan the global queue for candidate groups, and scan those groups for
1400 * candidate threads.
1401 *
1402 * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
1403 * Instead, iterate only the queue in the group runq matching the priority of the entry.
1404 *
1405 * Returns TRUE if retry is needed.
1406 */
1407static boolean_t
1408group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context)
1409{
1410 int count = runq->count;
1411 int queue_index;
1412
1413 assert(count >= 0);
1414
1415 if (count == 0) {
1416 return FALSE;
1417 }
1418
1419 for (queue_index = bitmap_first(runq->bitmap, NRQS);
1420 queue_index >= 0;
1421 queue_index = bitmap_next(runq->bitmap, queue_index)) {
1422 sched_entry_t entry;
1423
1424 cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
1425 assert(count > 0);
1426
1427 sched_group_t group = group_for_entry(entry);
1428 if (group->runq.count > 0) {
1429 if (runq_scan(&group->runq, scan_context)) {
1430 return TRUE;
1431 }
1432 }
1433 count--;
1434 }
1435 }
1436
1437 return FALSE;
1438}
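/*
 * Rough sketch of the narrower scan suggested by the TODO above (hypothetical
 * helper, not used by the build): visit only the group-runq level matching the
 * entry's priority, reusing the sched_stamp staleness test that
 * sched_multiq_thread_update_scan() applies to idle threads below.
 */
static boolean_t __unused
group_scan_entry_level(sched_entry_t entry)
{
	sched_group_t group = group_for_entry(entry);
	circle_queue_t queue = &group->runq.queues[entry->sched_pri];
	thread_t thread;

	cqe_foreach_element(thread, queue, runq_links) {
		if (thread->sched_stamp != sched_tick) {
			/* stale - queue it for update; on failure the caller must retry */
			if (thread_update_add_thread(thread) == FALSE) {
				return TRUE;
			}
		}
	}

	return FALSE;
}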
1439
1440static void
1441sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
1442{
1443 boolean_t restart_needed = FALSE;
1444 processor_t processor = processor_list;
1445 processor_set_t pset;
1446 thread_t thread;
1447 spl_t s;
1448
1449 /*
1450 * We update the threads associated with each processor (bound and idle threads)
1451 * and then update the threads in each pset runqueue.
1452 */
1453
1454 do {
1455 do {
1456 pset = processor->processor_set;
1457
1458 s = splsched();
1459 pset_lock(pset);
1460
1461 restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
1462
1463 pset_unlock(pset);
1464 splx(s);
1465
1466 if (restart_needed) {
1467 break;
1468 }
1469
1470 thread = processor->idle_thread;
1471 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1472 if (thread_update_add_thread(thread) == FALSE) {
1473 restart_needed = TRUE;
1474 break;
1475 }
1476 }
1477 } while ((processor = processor->processor_list) != NULL);
1478
1479 /* Ok, we now have a collection of candidates -- fix them. */
1480 thread_update_process_threads();
1481 } while (restart_needed);
1482
1483 pset = &pset0;
1484
1485 do {
1486 do {
1487 s = splsched();
1488 pset_lock(pset);
1489
1490 restart_needed = group_scan(&pset->pset_runq, scan_context);
1491
1492 pset_unlock(pset);
1493 splx(s);
1494
1495 if (restart_needed) {
1496 break;
1497 }
1498 } while ((pset = pset->pset_list) != NULL);
1499
1500 /* Ok, we now have a collection of candidates -- fix them. */
1501 thread_update_process_threads();
fe8ab488
A
1502 } while (restart_needed);
1503}
1504
1505extern int sched_allow_rt_smt;
1506
1507/* Return true if this thread should not continue running on this processor */
1508static bool
1509sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
1510{
1511 if (processor->processor_primary != processor) {
1512 /*
1513 * This is a secondary SMT processor. If the primary is running
1514 * a realtime thread, only allow realtime threads on the secondary.
1515 */
1516 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
1517 return true;
1518 }
1519 }
1520
1521 return false;
1522}