/*
 * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <mach/machine.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>

#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>

#include <sys/kdebug.h>
/*
 * How does the task scheduler work?
 *
 * It schedules threads across a few levels.
 *
 * RT threads are dealt with above us
 * Bound threads go into the per-processor runq
 * Non-bound threads are linked on their task's sched_group's runq
 * sched_groups' sched_entries are linked on the pset's runq
 *
 * TODO: make this explicit - bound threads should have a different enqueue fxn
 *
 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
 * or the current group's runqueue, then dequeue the next thread in that runqueue.
 *
 * We then manipulate the sched_entries to reflect the invariant that:
 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
 * runq.
 *
 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
 * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
 * as compared to processes which do the same amount of work under fewer threads.
 *
 * NOTE: Currently, the multiq scheduler only supports one pset.
 *
 * NOTE ABOUT thread->sched_pri:
 *
 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
 *
 * TODO: Future features:
 *
 * Decouple the task priority from the sched_entry priority, allowing for:
 *      fast task priority change without having to iterate and re-dispatch all threads in the task.
 *              i.e. task-wide priority, task-wide boosting
 *      fancier group decay features
 *
 * Group (or task) decay:
 *      Decay is used for a few different things:
 *              Prioritizing latency-needing threads over throughput-needing threads for time-to-running
 *              Balancing work between threads in a process
 *              Balancing work done at the same priority between different processes
 *              Recovering from priority inversions between two threads in the same process
 *              Recovering from priority inversions between two threads in different processes
 *              Simulating a proportional share scheduler by allowing lower priority threads
 *                to run for a certain percentage of the time
 *
 *      Task decay lets us separately address the 'same process' and 'different process' needs,
 *      which will allow us to make smarter tradeoffs in different cases.
 *      For example, we could resolve priority inversion in the same process by reordering threads without dropping the
 *      process below low priority threads in other processes.
 *
 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
 *
 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
 * The entries array is 5K currently so it'd be really great to reduce.
 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 *
 * When preempting a processor, store a flag saying if the preemption
 * was from a thread in the same group or different group,
 * and tell choose_thread about it.
 *
 * When choosing a processor, bias towards those running in the same
 * group as I am running (at the same priority, or within a certain band?).
 *
 * Decide if we need to support psets.
 * Decide how to support psets - do we need duplicate entries for each pset,
 * or can we get away with putting the entry in either one or the other pset?
 *
 * Consider the right way to handle runq count - I don't want to iterate groups.
 * Perhaps keep a global counter.
 * Alternate option - remove it from choose_processor.  It doesn't add much value
 * now that we have a global runq.
 *
 * Need a better way of finding the group to target instead of looking at current_task.
 * Perhaps choose_thread could pass in the current thread?
 *
 * Consider unifying runq copy-pastes.
 *
 * Thoughts on having a group central quantum bucket:
 *
 * I see two algorithms to decide quanta:
 * A) Hand off only when switching thread to thread in the same group
 * B) Allocate and return quanta to the group's pool
 *
 * If a task blocks completely, should it come back with the leftover quanta
 * or brand new quanta?
 *
 * Should I put a flag saying 'zero out a quantum you grab when you're dispatched'?
 *
 * Handing off quanta between threads will help with jumping around in the current task
 * but will not help when a thread from a different task is involved.
 * Need an algorithm that works with round-robining between threads in different tasks.
 *
 * But wait - round-robining can only be triggered by quantum expire or blocking.
 * We need something that works with preemption or yielding - that's the more interesting idea.
 *
 * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
 * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
 *
 * Hand off quanta when hopping between threads with the same sched_group:
 * Even if a thread was blocked, it uses the last thread's remaining quantum when it starts.
 *
 * If we use the only cycle entry at quantum algorithm, then the quantum pool starts getting
 *
 * A thought - perhaps the handoff approach doesn't work so well in the presence of
 * non-handoff wakeups, i.e. wake other thread then wait then block - it doesn't mean that the
 * woken thread will be what I switch to - another processor may have stolen it.
 * What do we do there?
 *
 * We currently don't know of a scenario where quantum buckets on the task is beneficial.
 * We will instead hand off quantum between threads in the task, and keep quantum
 * on the preempted thread if it's preempted by something outside the task.
 */
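
/*
 * Rough sketch of the two-level dequeue described above (see
 * sched_global_dequeue_thread() below for the actual implementation):
 *
 *      entry  = highest-priority sched_entry on the pset's entry runq
 *      group  = group_for_entry(entry)
 *      thread = highest-priority thread on group->runq
 *      if that priority level in group->runq is now empty:
 *              leave the entry off the entry runq
 *      else:
 *              re-enqueue the entry at the tail of its priority level
 */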
#if DEBUG || DEVELOPMENT
#define MULTIQ_SANITY_CHECK
#endif
typedef struct sched_entry {
    queue_chain_t   entry_links;
    int16_t         sched_pri;      /* scheduled (current) priority */
    int16_t         runq;
} *sched_entry_t;

typedef run_queue_t entry_queue_t;  /* A run queue that holds sched_entries instead of threads */
typedef run_queue_t group_runq_t;   /* A run queue that is part of a sched_group */

#define SCHED_ENTRY_NULL        ((sched_entry_t) 0)
#define MULTIQ_ERUNQ            (-4)    /* Indicates entry is on the main runq */
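
/*
 * Convention for the runq fields: sched_entry.runq is 0 when the entry is not
 * enqueued and MULTIQ_ERUNQ when it is on the pset's entry runq; thread.runq
 * is PROCESSOR_NULL when the thread is not enqueued and points at the
 * enqueuing processor otherwise.
 */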
/* Each level in the run queue corresponds to one entry in the entries array */
struct sched_group {
    struct sched_entry      entries[NRQS];
    struct run_queue        runq;
    queue_chain_t           sched_groups;
};
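
/*
 * Note: entries[] is indexed by sched_pri and must remain the first member of
 * struct sched_group, because group_for_entry() recovers the owning group from
 * an entry pointer by arithmetic alone.
 */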
/*
 * Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * goes through the quantum expire path.
 */
static boolean_t deep_drain = FALSE;
/* Verify the consistency of the runq before touching it */
static boolean_t multiq_sanity_check = FALSE;

/*
 * Draining threads from the current task is preferred
 * when they're less than X steps below the current
 * global highest priority
 */
#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
static integer_t drain_band_limit;

/*
 * Don't go below this priority level if there is something above it in another task
 */
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t drain_depth_limit;

/*
 * Don't favor the task when there's something above this priority in another task.
 */
#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
static integer_t drain_ceiling;
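
/*
 * These three knobs are applied together in sched_multiq_choose_thread():
 * the current group is favored over the global highest priority only while
 * nothing elsewhere is at or above drain_ceiling, the group is not drained
 * below drain_depth_limit while higher-priority work exists in another task,
 * and the gap to the global highest priority stays under drain_band_limit.
 */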
static ZONE_DECLARE(sched_group_zone, "sched groups",
    sizeof(struct sched_group), ZC_NOENCRYPT | ZC_NOCALLOUT);

static uint64_t num_sched_groups = 0;
static queue_head_t sched_groups;

static LCK_GRP_DECLARE(sched_groups_lock_grp, "sched_groups");
static LCK_MTX_DECLARE(sched_groups_lock, &sched_groups_lock_grp);
static void
sched_multiq_init(void);

static thread_t
sched_multiq_steal_thread(processor_set_t pset);

static void
sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);

static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread,
    sched_options_t options);

static boolean_t
sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);

static void
sched_multiq_quantum_expire(thread_t thread);

static ast_t
sched_multiq_processor_csw_check(processor_t processor);

static boolean_t
sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static int
sched_multiq_runq_count(processor_t processor);

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor);

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor);

static int
sched_multiq_processor_bound_count(processor_t processor);

static void
sched_multiq_pset_init(processor_set_t pset);

static void
sched_multiq_processor_init(processor_t processor);

static thread_t
sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_multiq_processor_queue_shutdown(processor_t processor);

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);

static bool
sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);
const struct sched_dispatch_table sched_multiq_dispatch = {
    .sched_name                     = "multiq",
    .init                           = sched_multiq_init,
    .timebase_init                  = sched_timeshare_timebase_init,
    .processor_init                 = sched_multiq_processor_init,
    .pset_init                      = sched_multiq_pset_init,
    .maintenance_continuation       = sched_timeshare_maintenance_continue,
    .choose_thread                  = sched_multiq_choose_thread,
    .steal_thread_enabled           = sched_steal_thread_DISABLED,
    .steal_thread                   = sched_multiq_steal_thread,
    .compute_timeshare_priority     = sched_compute_timeshare_priority,
    .choose_node                    = sched_choose_node,
    .choose_processor               = choose_processor,
    .processor_enqueue              = sched_multiq_processor_enqueue,
    .processor_queue_shutdown       = sched_multiq_processor_queue_shutdown,
    .processor_queue_remove         = sched_multiq_processor_queue_remove,
    .processor_queue_empty          = sched_multiq_processor_queue_empty,
    .priority_is_urgent             = priority_is_urgent,
    .processor_csw_check            = sched_multiq_processor_csw_check,
    .processor_queue_has_priority   = sched_multiq_processor_queue_has_priority,
    .initial_quantum_size           = sched_timeshare_initial_quantum_size,
    .initial_thread_sched_mode      = sched_multiq_initial_thread_sched_mode,
    .can_update_priority            = can_update_priority,
    .update_priority                = update_priority,
    .lightweight_update_priority    = lightweight_update_priority,
    .quantum_expire                 = sched_multiq_quantum_expire,
    .processor_runq_count           = sched_multiq_runq_count,
    .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
    .processor_bound_count          = sched_multiq_processor_bound_count,
    .thread_update_scan             = sched_multiq_thread_update_scan,
    .multiple_psets_enabled         = FALSE,
    .sched_groups_enabled           = TRUE,
    .avoid_processor_enabled        = TRUE,
    .thread_avoid_processor         = sched_multiq_thread_avoid_processor,
    .processor_balance              = sched_SMT_balance,

    .rt_runq                        = sched_rtlocal_runq,
    .rt_init                        = sched_rtlocal_init,
    .rt_queue_shutdown              = sched_rtlocal_queue_shutdown,
    .rt_runq_scan                   = sched_rtlocal_runq_scan,
    .rt_runq_count_sum              = sched_rtlocal_runq_count_sum,

    .qos_max_parallelism            = sched_qos_max_parallelism,
    .check_spill                    = sched_check_spill,
    .ipi_policy                     = sched_ipi_policy,
    .thread_should_yield            = sched_thread_should_yield,
    .run_count_incr                 = sched_run_incr,
    .run_count_decr                 = sched_run_decr,
    .update_thread_bucket           = sched_update_thread_bucket,
    .pset_made_schedulable          = sched_pset_made_schedulable,
};
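
/*
 * Boot-args consumed by sched_multiq_init() below: the flags
 * -multiq-sanity-check and -multiq-deep-drain, plus multiq_drain_ceiling,
 * multiq_drain_depth_limit and multiq_drain_band_limit, which override the
 * DEFAULT_DRAIN_* values above (e.g., hypothetically,
 * boot-args="-multiq-deep-drain multiq_drain_ceiling=63").
 */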
static void
sched_multiq_init(void)
{
#if defined(MULTIQ_SANITY_CHECK)
    PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif /* MULTIQ_SANITY_CHECK */

    PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));

    if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
        drain_ceiling = DEFAULT_DRAIN_CEILING;
    }

    if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
        drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
    }

    if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
        drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
    }

    printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
        deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);

    queue_init(&sched_groups);

    sched_timeshare_init();
}
static void
sched_multiq_processor_init(processor_t processor)
{
    run_queue_init(&processor->runq);
}

static void
sched_multiq_pset_init(processor_set_t pset)
{
    run_queue_init(&pset->pset_runq);
}

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task)
{
    if (parent_task == kernel_task) {
        return TH_MODE_FIXED;
    } else {
        return TH_MODE_TIMESHARE;
    }
}
sched_group_t
sched_group_create(void)
{
    sched_group_t sched_group;

    if (!SCHED(sched_groups_enabled)) {
        return SCHED_GROUP_NULL;
    }

    sched_group = (sched_group_t)zalloc(sched_group_zone);

    bzero(sched_group, sizeof(struct sched_group));

    run_queue_init(&sched_group->runq);

    for (size_t i = 0; i < NRQS; i++) {
        sched_group->entries[i].runq = 0;
        sched_group->entries[i].sched_pri = (int16_t)i;
    }

    lck_mtx_lock(&sched_groups_lock);
    queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
    num_sched_groups++;
    lck_mtx_unlock(&sched_groups_lock);

    return sched_group;
}
void
sched_group_destroy(sched_group_t sched_group)
{
    if (!SCHED(sched_groups_enabled)) {
        assert(sched_group == SCHED_GROUP_NULL);
        return;
    }

    assert(sched_group != SCHED_GROUP_NULL);
    assert(sched_group->runq.count == 0);

    for (int i = 0; i < NRQS; i++) {
        assert(sched_group->entries[i].runq == 0);
        assert(sched_group->entries[i].sched_pri == i);
    }

    lck_mtx_lock(&sched_groups_lock);
    queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
    num_sched_groups--;
    lck_mtx_unlock(&sched_groups_lock);

    zfree(sched_group_zone, sched_group);
}
__attribute__((always_inline))
static inline entry_queue_t
multiq_main_entryq(processor_t processor)
{
    return (entry_queue_t)&processor->processor_set->pset_runq;
}

__attribute__((always_inline))
static inline run_queue_t
multiq_bound_runq(processor_t processor)
{
    return &processor->runq;
}

__attribute__((always_inline))
static inline sched_entry_t
group_entry_for_pri(sched_group_t group, integer_t pri)
{
    return &group->entries[pri];
}
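
/*
 * group_for_entry() below relies on two invariants: entries[] is the first
 * member of struct sched_group, and entries[i].sched_pri == i (established in
 * sched_group_create()).  Stepping the entry pointer back by sched_pri
 * elements therefore lands on &group->entries[0], which aliases the group.
 */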
__attribute__((always_inline))
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
    sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
#pragma clang diagnostic pop
    return group;
}
/* Peek at the head of the runqueue */
static sched_entry_t
entry_queue_first_entry(entry_queue_t rq)
{
    assert(rq->count != 0);

    circle_queue_t queue = &rq->queues[rq->highq];

    sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links);

    assert(entry->sched_pri == rq->highq);

    return entry;
}
#if defined(MULTIQ_SANITY_CHECK)

#if MACH_ASSERT
__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
{
    if (chain->next != NULL) {
        assert(chain->prev != NULL);
        return TRUE;
    } else {
        assert(chain->prev == NULL);
        return FALSE;
    }
}
#endif /* MACH_ASSERT */
static thread_t
group_first_thread(sched_group_t group)
{
    group_runq_t rq = &group->runq;

    assert(rq->count != 0);

    circle_queue_t queue = &rq->queues[rq->highq];

    thread_t thread = cqe_queue_first(queue, struct thread, runq_links);

    assert(thread != THREAD_NULL);
    assert_thread_magic(thread);

    assert(thread->sched_group == group);

    /* TODO: May not be safe */
    assert(thread->sched_pri == rq->highq);

    return thread;
}
/* Asserts if entry is not in entry runq at pri */
static void
entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
{
    circle_queue_t q;
    sched_entry_t elem;

    assert(queue_chain_linked(&entry->entry_links));
    assert(entry->runq == MULTIQ_ERUNQ);

    q = &runq->queues[expected_pri];

    cqe_foreach_element(elem, q, entry_links) {
        if (elem == entry) {
            return;
        }
    }

    panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
}
/* Asserts if thread is not in group at its priority */
static void
sched_group_check_thread(sched_group_t group, thread_t thread)
{
    circle_queue_t q;
    thread_t elem;
    int pri = thread->sched_pri;

    assert(thread->runq != PROCESSOR_NULL);

    q = &group->runq.queues[pri];

    cqe_foreach_element(elem, q, runq_links) {
        if (elem == thread) {
            return;
        }
    }

    panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
}
static void
global_check_entry_queue(entry_queue_t main_entryq)
{
    if (main_entryq->count == 0) {
        return;
    }

    sched_entry_t entry = entry_queue_first_entry(main_entryq);

    assert(entry->runq == MULTIQ_ERUNQ);

    sched_group_t group = group_for_entry(entry);

    thread_t thread = group_first_thread(group);

    __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

    assert(entry->sched_pri == group->runq.highq);

    assert(entry == thread_entry);
    assert(thread->runq != PROCESSOR_NULL);
}
static void
group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
{
    if (group->runq.count == 0) {
        return;
    }

    thread_t thread = group_first_thread(group);

    assert(thread->runq != PROCESSOR_NULL);

    sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

    entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);

    assert(sched_entry->sched_pri == thread->sched_pri);
    assert(sched_entry->runq == MULTIQ_ERUNQ);
}

#endif /* defined(MULTIQ_SANITY_CHECK) */
/*
 * The run queue must not be empty.
 */
static sched_entry_t
entry_queue_dequeue_entry(entry_queue_t rq)
{
    sched_entry_t sched_entry;
    circle_queue_t queue = &rq->queues[rq->highq];

    assert(rq->count > 0);
    assert(!circle_queue_empty(queue));

    sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(rq->highq)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }
    if (circle_queue_empty(queue)) {
        rq_bitmap_clear(rq->bitmap, rq->highq);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
    }

    sched_entry->runq = 0;

    return sched_entry;
}
/*
 * The run queue must not be empty.
 */
static boolean_t
entry_queue_enqueue_entry(
    entry_queue_t   rq,
    sched_entry_t   entry,
    sched_options_t options)
{
    int sched_pri = entry->sched_pri;
    circle_queue_t queue = &rq->queues[sched_pri];
    boolean_t result = FALSE;

    assert(entry->runq == 0);

    if (circle_queue_empty(queue)) {
        circle_enqueue_tail(queue, &entry->entry_links);

        rq_bitmap_set(rq->bitmap, sched_pri);
        if (sched_pri > rq->highq) {
            rq->highq = sched_pri;
            result = TRUE;
        }
    } else {
        if (options & SCHED_TAILQ) {
            circle_enqueue_tail(queue, &entry->entry_links);
        } else {
            circle_enqueue_head(queue, &entry->entry_links);
        }
    }
    if (SCHED(priority_is_urgent)(sched_pri)) {
        rq->urgency++;
    }
    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count++;

    entry->runq = MULTIQ_ERUNQ;

    return result;
}
/*
 * The entry must be in this runqueue.
 */
static void
entry_queue_remove_entry(
    entry_queue_t rq,
    sched_entry_t entry)
{
    int sched_pri = entry->sched_pri;

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        entry_queue_check_entry(rq, entry, sched_pri);
    }
#endif

    remqueue(&entry->entry_links);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(sched_pri)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }

    if (circle_queue_empty(&rq->queues[sched_pri])) {
        /* update run queue status */
        rq_bitmap_clear(rq->bitmap, sched_pri);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
    }

    entry->runq = 0;
}
static void
entry_queue_change_entry(
    entry_queue_t   rq,
    sched_entry_t   entry,
    sched_options_t options)
{
    int sched_pri = entry->sched_pri;
    circle_queue_t queue = &rq->queues[sched_pri];

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        entry_queue_check_entry(rq, entry, sched_pri);
    }
#endif

    circle_dequeue(queue, &entry->entry_links);
    if (options & SCHED_TAILQ) {
        circle_enqueue_tail(queue, &entry->entry_links);
    } else {
        circle_enqueue_head(queue, &entry->entry_links);
    }
}
/*
 * The run queue must not be empty.
 *
 * sets queue_empty to TRUE if queue is now empty at thread_pri
 */
static thread_t
group_run_queue_dequeue_thread(
    group_runq_t rq,
    integer_t    *thread_pri,
    boolean_t    *queue_empty)
{
    thread_t thread;
    circle_queue_t queue = &rq->queues[rq->highq];

    assert(rq->count > 0);
    assert(!circle_queue_empty(queue));

    *thread_pri = rq->highq;

    thread = cqe_dequeue_head(queue, struct thread, runq_links);
    assert_thread_magic(thread);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(rq->highq)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }
    if (circle_queue_empty(queue)) {
        rq_bitmap_clear(rq->bitmap, rq->highq);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
        *queue_empty = TRUE;
    } else {
        *queue_empty = FALSE;
    }

    return thread;
}
/*
 * The run queue must not be empty.
 * returns TRUE if queue was empty at thread_pri
 */
static boolean_t
group_run_queue_enqueue_thread(
    group_runq_t    rq,
    thread_t        thread,
    integer_t       thread_pri,
    sched_options_t options)
{
    circle_queue_t queue = &rq->queues[thread_pri];
    boolean_t result = FALSE;

    assert(thread->runq == PROCESSOR_NULL);
    assert_thread_magic(thread);

    if (circle_queue_empty(queue)) {
        circle_enqueue_tail(queue, &thread->runq_links);

        rq_bitmap_set(rq->bitmap, thread_pri);
        if (thread_pri > rq->highq) {
            rq->highq = thread_pri;
        }
        result = TRUE;
    } else {
        if (options & SCHED_TAILQ) {
            circle_enqueue_tail(queue, &thread->runq_links);
        } else {
            circle_enqueue_head(queue, &thread->runq_links);
        }
    }
    if (SCHED(priority_is_urgent)(thread_pri)) {
        rq->urgency++;
    }
    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count++;

    return result;
}
/*
 * The thread must be in this runqueue.
 * returns TRUE if queue is now empty at thread_pri
 */
static boolean_t
group_run_queue_remove_thread(
    group_runq_t rq,
    thread_t     thread,
    integer_t    thread_pri)
{
    circle_queue_t queue = &rq->queues[thread_pri];
    boolean_t result = FALSE;

    assert_thread_magic(thread);
    assert(thread->runq != PROCESSOR_NULL);

    circle_dequeue(queue, &thread->runq_links);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(thread_pri)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }

    if (circle_queue_empty(queue)) {
        /* update run queue status */
        rq_bitmap_clear(rq->bitmap, thread_pri);
        rq->highq = bitmap_first(rq->bitmap, NRQS);
        result = TRUE;
    }

    thread->runq = PROCESSOR_NULL;

    return result;
}
/*
 * A thread's sched pri may change out from under us because
 * we're clearing thread->runq here without the thread locked.
 * Do not rely on it to be the same as when we enqueued.
 */
static thread_t
sched_global_dequeue_thread(entry_queue_t main_entryq)
{
    boolean_t pri_level_empty = FALSE;
    sched_entry_t entry;
    group_runq_t group_runq;
    thread_t thread;
    integer_t thread_pri;
    sched_group_t group;

    assert(main_entryq->count > 0);

    entry = entry_queue_dequeue_entry(main_entryq);

    group = group_for_entry(entry);
    group_runq = &group->runq;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (!pri_level_empty) {
        entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
    }

    return thread;
}
/* Dequeue a thread from the global runq without moving the entry */
static thread_t
sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
{
    boolean_t pri_level_empty = FALSE;
    sched_entry_t entry;
    group_runq_t group_runq;
    thread_t thread;
    integer_t thread_pri;
    sched_group_t group;

    assert(main_entryq->count > 0);

    entry = entry_queue_first_entry(main_entryq);

    group = group_for_entry(entry);
    group_runq = &group->runq;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, entry);
    }

    return thread;
}
static thread_t
sched_group_dequeue_thread(
    entry_queue_t main_entryq,
    sched_group_t group)
{
    group_runq_t group_runq = &group->runq;
    boolean_t pri_level_empty = FALSE;
    thread_t thread;
    integer_t thread_pri;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
    }

    return thread;
}
static void
sched_group_remove_thread(
    entry_queue_t main_entryq,
    sched_group_t group,
    thread_t      thread)
{
    integer_t thread_pri = thread->sched_pri;
    sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);

        sched_group_check_thread(group, thread);
        entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
    }
#endif

    boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, sched_entry);
    }

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif
}
static void
sched_group_enqueue_thread(
    entry_queue_t   main_entryq,
    sched_group_t   group,
    thread_t        thread,
    sched_options_t options)
{
#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif

    int sched_pri = thread->sched_pri;

    boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);

    if (pri_level_was_empty) {
        /*
         * TODO: Need to figure out if passing options here is a good idea or not
         * What effects would it have?
         */
        entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
    } else if (options & SCHED_HEADQ) {
        /* The thread should be at the head of the line - move its entry to the front */
        entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
    }
}
/*
 * Locate a thread to execute from the run queue and return it.
 * Only choose a thread with greater or equal priority.
 *
 * pset is locked, thread is not locked.
 *
 * Returns THREAD_NULL if it cannot find a valid thread.
 *
 * Note: we cannot rely on the value of thread->sched_pri in this path because
 * we don't have the thread locked.
 *
 * TODO: Remove tracepoints
 */
static thread_t
sched_multiq_choose_thread(
    processor_t processor,
    int         priority,
    ast_t       reason)
{
    entry_queue_t main_entryq = multiq_main_entryq(processor);
    run_queue_t   bound_runq  = multiq_bound_runq(processor);

    boolean_t choose_bound_runq = FALSE;

    if (bound_runq->highq < priority &&
        main_entryq->highq < priority) {
        return THREAD_NULL;
    }

    if (bound_runq->count && main_entryq->count) {
        if (bound_runq->highq >= main_entryq->highq) {
            choose_bound_runq = TRUE;
        } else {
            /* Use the main entry queue */
        }
    } else if (bound_runq->count) {
        choose_bound_runq = TRUE;
    } else if (main_entryq->count) {
        /* Use the main entry queue */
    } else {
        return THREAD_NULL;
    }

    if (choose_bound_runq) {
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
            MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);

        return run_queue_dequeue(bound_runq, SCHED_HEADQ);
    }

    sched_group_t group = current_thread()->sched_group;

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif

    /*
     * Determine if we should look at the group or the global queue
     *
     * TODO:
     * Perhaps pass reason as a 'should look inside' argument to choose_thread
     * Should YIELD AST override drain limit?
     */
    if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
        boolean_t favor_group = TRUE;

        integer_t global_pri = main_entryq->highq;
        integer_t group_pri  = group->runq.highq;

        /*
         * Favor the current group if the group is still the globally highest.
         *
         * Otherwise, consider choosing a thread from the current group
         * even if it's lower priority than the global highest priority.
         */
        if (global_pri > group_pri) {
            /*
             * If there's something elsewhere above the depth limit,
             * don't pick a thread below the limit.
             */
            if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit) {
                favor_group = FALSE;
            }

            /*
             * If there's something at or above the ceiling,
             * don't favor the group.
             */
            if (global_pri >= drain_ceiling) {
                favor_group = FALSE;
            }

            /*
             * Don't go more than X steps below the global highest
             */
            if ((global_pri - group_pri) >= drain_band_limit) {
                favor_group = FALSE;
            }
        }

        if (favor_group) {
            /* Pull from local runq */
            KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
                MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);

            return sched_group_dequeue_thread(main_entryq, group);
        }
    }

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
        MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);

    /* Couldn't pull from local runq, pull from global runq instead */
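    /*
     * With -multiq-deep-drain, the entry is deliberately left at the head of
     * the entry runq (see sched_global_deep_drain_dequeue_thread()), so we
     * keep draining the same group; the normal path cycles the dequeued
     * entry to the tail of its priority level.
     */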
    if (deep_drain) {
        return sched_global_deep_drain_dequeue_thread(main_entryq);
    } else {
        return sched_global_dequeue_thread(main_entryq);
    }
}
/*
 * Thread must be locked, and not already be on a run queue.
 */
static boolean_t
sched_multiq_processor_enqueue(
    processor_t     processor,
    thread_t        thread,
    sched_options_t options)
{
    boolean_t result;

    assert(processor == thread->chosen_processor);

    if (thread->bound_processor != PROCESSOR_NULL) {
        assert(thread->bound_processor == processor);

        result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
        thread->runq = processor;

        return result;
    }

    sched_group_enqueue_thread(multiq_main_entryq(processor),
        thread->sched_group,
        thread, options);

    thread->runq = processor;

    return FALSE;
}
/*
 * Called in the context of thread with thread and pset unlocked,
 * after updating thread priority but before propagating that priority
 * to the processor.
 */
static void
sched_multiq_quantum_expire(thread_t thread)
{
    /*
     * Move the entry at this priority to the end of the queue,
     * to allow the next task a shot at running.
     */

    processor_t processor = thread->last_processor;
    processor_set_t pset = processor->processor_set;
    entry_queue_t entryq = multiq_main_entryq(processor);

    pset_lock(pset);

    sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);

    if (entry->runq == MULTIQ_ERUNQ) {
        entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
    }

    pset_unlock(pset);
}
static boolean_t
sched_multiq_processor_queue_empty(processor_t processor)
{
    return multiq_main_entryq(processor)->count == 0 &&
           multiq_bound_runq(processor)->count == 0;
}
static ast_t
sched_multiq_processor_csw_check(processor_t processor)
{
    boolean_t has_higher;
    int pri;

    if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
        return AST_PREEMPT | AST_URGENT;
    }

    entry_queue_t main_entryq = multiq_main_entryq(processor);
    run_queue_t   bound_runq  = multiq_bound_runq(processor);

    assert(processor->active_thread != NULL);

    pri = MAX(main_entryq->highq, bound_runq->highq);

    if (processor->first_timeslice) {
        has_higher = (pri > processor->current_pri);
    } else {
        has_higher = (pri >= processor->current_pri);
    }

    if (has_higher) {
        if (main_entryq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        if (bound_runq->urgency > 0) {
            return AST_PREEMPT | AST_URGENT;
        }

        return AST_PREEMPT;
    }

    return AST_NONE;
}
static boolean_t
sched_multiq_processor_queue_has_priority(
    processor_t processor,
    int         priority,
    boolean_t   gte)
{
    run_queue_t main_runq  = multiq_main_entryq(processor);
    run_queue_t bound_runq = multiq_bound_runq(processor);

    int qpri = MAX(main_runq->highq, bound_runq->highq);

    if (gte) {
        return qpri >= priority;
    } else {
        return qpri > priority;
    }
}
static int
sched_multiq_runq_count(processor_t processor)
{
    /*
     * TODO: Decide whether to keep a count of runnable threads in the pset
     * or just return something less than the true count.
     *
     * This needs to be fast, so no iterating the whole runq.
     *
     * Another possible decision is to remove this - with global runq
     * it doesn't make much sense.
     */
    return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
}
static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor)
{
    /*
     * TODO: This one does need to go through all the runqueues, but it's only needed for
     * the sched stats tool.
     */

    uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;

    if (processor->cpu_id == processor->processor_set->cpu_set_low) {
        return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
    } else {
        return bound_sum;
    }
}
static int
sched_multiq_processor_bound_count(processor_t processor)
{
    return multiq_bound_runq(processor)->count;
}
static void
sched_multiq_processor_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    entry_queue_t main_entryq = multiq_main_entryq(processor);
    thread_t thread;
    queue_head_t tqueue;

    /* We only need to migrate threads if this is the last active processor in the pset */
    if (pset->online_processor_count > 0) {
        pset_unlock(pset);
        return;
    }

    queue_init(&tqueue);

    /* Note that we do not remove bound threads from the queues here */

    while (main_entryq->count > 0) {
        thread = sched_global_dequeue_thread(main_entryq);
        enqueue_tail(&tqueue, &thread->runq_links);
    }

    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);

        thread_lock(thread);

        thread_setrun(thread, SCHED_TAILQ);

        thread_unlock(thread);
    }
}
/*
 * This is why we can never read sched_pri unless we have the thread locked.
 * Which we do in the enqueue and remove cases, but not the dequeue case.
 */
static boolean_t
sched_multiq_processor_queue_remove(
    processor_t processor,
    thread_t    thread)
{
    boolean_t removed = FALSE;
    processor_set_t pset = processor->processor_set;

    pset_lock(pset);

    if (thread->runq != PROCESSOR_NULL) {
        /*
         * Thread is on a run queue and we have a lock on
         * that run queue.
         */

        assert(thread->runq == processor);

        if (thread->bound_processor != PROCESSOR_NULL) {
            assert(processor == thread->bound_processor);
            run_queue_remove(multiq_bound_runq(processor), thread);
            thread->runq = PROCESSOR_NULL;
        } else {
            sched_group_remove_thread(multiq_main_entryq(processor),
                thread->sched_group,
                thread);
        }

        removed = TRUE;
    }

    pset_unlock(pset);

    return removed;
}
/* pset is locked, returned unlocked */
static thread_t
sched_multiq_steal_thread(processor_set_t pset)
{
    pset_unlock(pset);
    return THREAD_NULL;
}

/*
 * Scan the global queue for candidate groups, and scan those groups for
 * candidate threads.
 *
 * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
 * Instead, iterate only the queue in the group runq matching the priority of the entry.
 *
 * Returns TRUE if retry is needed.
 */
static boolean_t
group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context)
{
    int count = runq->count;
    int queue_index;

    if (count == 0) {
        return FALSE;
    }

    for (queue_index = bitmap_first(runq->bitmap, NRQS);
        queue_index >= 0;
        queue_index = bitmap_next(runq->bitmap, queue_index)) {
        sched_entry_t entry;

        cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
            sched_group_t group = group_for_entry(entry);
            if (group->runq.count > 0) {
                if (runq_scan(&group->runq, scan_context)) {
                    return TRUE;
                }
            }
        }
    }

    return FALSE;
}
static void
sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
{
    boolean_t restart_needed = FALSE;
    processor_t processor = processor_list;
    processor_set_t pset;
    thread_t thread;
    spl_t s;

    /*
     * We update the threads associated with each processor (bound and idle threads)
     * and then update the threads in each pset runqueue.
     */

    do {
        do {
            pset = processor->processor_set;

            s = splsched();
            pset_lock(pset);

            restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);

            pset_unlock(pset);
            splx(s);

            if (restart_needed) {
                break;
            }

            thread = processor->idle_thread;
            if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
                if (thread_update_add_thread(thread) == FALSE) {
                    restart_needed = TRUE;
                    break;
                }
            }
        } while ((processor = processor->processor_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);

    pset = &pset0;

    do {
        do {
            s = splsched();
            pset_lock(pset);

            restart_needed = group_scan(&pset->pset_runq, scan_context);

            pset_unlock(pset);
            splx(s);

            if (restart_needed) {
                break;
            }
        } while ((pset = pset->pset_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();
    } while (restart_needed);
}
extern int sched_allow_rt_smt;

/* Return true if this thread should not continue running on this processor */
static bool
sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
{
    if (processor->processor_primary != processor) {
        /*
         * This is a secondary SMT processor.  If the primary is running
         * a realtime thread, only allow realtime threads on the secondary.
         */
        if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
            return true;
        }
    }

    return false;
}