1/*
2 * Copyright (c) 2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31
32#include <machine/machine_routines.h>
33#include <machine/sched_param.h>
34#include <machine/machine_cpu.h>
35
36#include <kern/kern_types.h>
37#include <kern/debug.h>
38#include <kern/mach_param.h>
39#include <kern/machine.h>
40#include <kern/misc_protos.h>
41#include <kern/processor.h>
42#include <kern/queue.h>
43#include <kern/sched.h>
44#include <kern/sched_prim.h>
45#include <kern/task.h>
46#include <kern/thread.h>
47
48#include <sys/kdebug.h>
49
50/*
51 * Theory Statement
52 *
53 * How does the task scheduler work?
54 *
55 * It schedules threads across a few levels.
56 *
57 * RT threads are dealt with above us
58 * Bound threads go into the per-processor runq
59 * Non-bound threads are linked on their task's sched_group's runq
60 * sched_groups' sched_entries are linked on the pset's runq
61 *
62 * TODO: make this explicit - bound threads should have a different enqueue fxn
63 *
64 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
65 * or the current group's runqueue, then dequeue the next thread in that runqueue.
66 *
67 * We then manipulate the sched_entries to reflect the invariant that:
68 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
69 * runqueue.
70 *
71 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
72 * to run. This reduces the excess contention bonus given to processes which have work spread among many threads
73 * as compared to processes which do the same amount of work under fewer threads.
74 *
75 * NOTE: Currently, the multiq scheduler only supports one pset.
76 *
77 * NOTE ABOUT thread->sched_pri:
78 *
79 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
80 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
81 *
82 * TODO: Future features:
83 *
84 * Decouple the task priority from the sched_entry priority, allowing for:
85 * fast task priority change without having to iterate and re-dispatch all threads in the task.
86 * i.e. task-wide priority, task-wide boosting
87 * fancier group decay features
88 *
89 * Group (or task) decay:
90 * Decay is used for a few different things:
91 * Prioritizing latency-needing threads over throughput-needing threads for time-to-running
92 * Balancing work between threads in a process
93 * Balancing work done at the same priority between different processes
94 * Recovering from priority inversions between two threads in the same process
95 * Recovering from priority inversions between two threads in different processes
96 * Simulating a proportional share scheduler by allowing lower priority threads
97 * to run for a certain percentage of the time
98 *
99 * Task decay lets us separately address the 'same process' and 'different process' needs,
100 * which will allow us to make smarter tradeoffs in different cases.
101 * For example, we could resolve priority inversion in the same process by reordering threads without dropping the
102 * process below low priority threads in other processes.
103 *
104 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
105 *
106 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
107 * The entries array is 5K currently so it'd be really great to reduce.
108 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
109 *
110 * When preempting a processor, store a flag saying if the preemption
111 * was from a thread in the same group or different group,
112 * and tell choose_thread about it.
113 *
114 * When choosing a processor, bias towards those running in the same
115 * group as I am running (at the same priority, or within a certain band?).
116 *
117 * Decide if we need to support psets.
118 * Decide how to support psets - do we need duplicate entries for each pset,
119 * or can we get away with putting the entry in either one or the other pset?
120 *
121 * Consider the right way to handle runq count - I don't want to iterate groups.
122 * Perhaps keep a global counter. sched_run_count will not work.
123 * Alternate option - remove it from choose_processor. It doesn't add much value
124 * now that we have global runq.
125 *
126 * Need a better way of finding group to target instead of looking at current_task.
127 * Perhaps choose_thread could pass in the current thread?
128 *
129 * Consider unifying runq copy-pastes.
130 *
131 * Thoughts on having a group central quantum bucket:
132 *
133 * I see two algorithms to decide quanta:
134 * A) Hand off only when switching thread to thread in the same group
135 * B) Allocate and return quanta to the group's pool
136 *
137 * Issues:
138 * If a task blocks completely, should it come back with its leftover quantum
139 * or a brand-new quantum?
140 *
141 * Should I put a flag saying 'zero out the quantum you grab when you're dispatched'?
142 *
143 * Resolution:
144 * Handing off quanta between threads will help with jumping around in the current task
145 * but will not help when a thread from a different task is involved.
146 * Need an algorithm that works with round-robining between threads in different tasks
147 *
148 * But wait - round-robining can only be triggered by quantum expiration or blocking.
149 * We need something that works with preemption or yielding - that's the more interesting idea.
150 *
151 * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq.
152 * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq.
153 *
154 * New algorithm -
155 * Hand off the quantum when hopping between threads with the same sched_group
156 * Even if the thread was blocked, it uses the last thread's remaining quantum when it starts.
157 *
158 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum pool starts getting
159 * interesting.
160 *
161 * A thought - perhaps the handoff approach doesn't work so well in the presence of
162 * non-handoff wakeups, i.e. wake another thread, then wait, then block - it doesn't mean that
163 * the woken thread will be what I switch to; another processor may have stolen it.
164 * What do we do there?
165 *
166 * Conclusions:
167 * We currently don't know of a scenario where quantum buckets on the task are beneficial.
168 * We will instead hand off the quantum between threads in the task, and keep the quantum
169 * on the preempted thread if it's preempted by something outside the task.
170 *
171 */
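/*
 * Illustrative sketch of how the invariant above is maintained (the real logic
 * lives in sched_group_enqueue_thread / sched_group_remove_thread below; 'P',
 * 'group', 'main_entryq' and 'options' are placeholders, not new code):
 *
 *	// Enqueue: the thread goes on its group's runq at priority P; if P was
 *	// previously empty in that group, the group's entry for P is enqueued
 *	// on the global (pset) runq, so the level is represented exactly once.
 *	if (group_run_queue_enqueue_thread(&group->runq, thread, P, options))
 *		entry_queue_enqueue_entry(main_entryq, &group->entries[P], options);
 *
 *	// Remove: when the last thread at P leaves the group's runq, the
 *	// corresponding entry is pulled from the global runq as well.
 *	if (group_run_queue_remove_thread(&group->runq, thread, P))
 *		entry_queue_remove_entry(main_entryq, &group->entries[P]);
 */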
172
173#if DEBUG || DEVELOPMENT
174#define MULTIQ_SANITY_CHECK
175#endif
176
177typedef struct sched_entry {
178 queue_chain_t links;
179 int16_t sched_pri; /* scheduled (current) priority */
180 int16_t runq;
181 int32_t pad;
182} *sched_entry_t;
183
184typedef run_queue_t entry_queue_t; /* A run queue that holds sched_entries instead of threads */
185typedef run_queue_t group_runq_t; /* A run queue that is part of a sched_group */
186
187#define SCHED_ENTRY_NULL ((sched_entry_t) 0)
188#define MULTIQ_ERUNQ (-4) /* Indicates entry is on the main runq */
189
190/* Each level in the run queue corresponds to one entry in the entries array */
191struct sched_group {
192 struct sched_entry entries[NRQS];
193 struct run_queue runq;
194 queue_chain_t sched_groups;
195};
196
197/*
198 * Keep entry on the head of the runqueue while dequeueing threads.
199 * Only cycle it to the end of the runqueue when a thread in the task
200 * hits its quantum.
201 */
202static boolean_t deep_drain = FALSE;
203
203
204/* Verify the consistency of the runq before touching it */
205static boolean_t multiq_sanity_check = FALSE;
206
207/*
208 * Draining threads from the current task is preferred
209 * when they're less than X steps below the current
210 * global highest priority
211 */
212#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
213static integer_t drain_band_limit;
214
215/*
216 * Don't go below this priority level if there is something above it in another task
217 */
218#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
219static integer_t drain_depth_limit;
220
221/*
222 * Don't favor the task when there's something above this priority in another task.
223 */
224#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
225static integer_t drain_ceiling;
226
227static struct zone *sched_group_zone;
228
229static uint64_t num_sched_groups = 0;
230static queue_head_t sched_groups;
231
232static lck_attr_t sched_groups_lock_attr;
233static lck_grp_t sched_groups_lock_grp;
234static lck_grp_attr_t sched_groups_lock_grp_attr;
235
236static lck_mtx_t sched_groups_lock;
237
238
239static void
240sched_multiq_init(void);
241
242static thread_t
243sched_multiq_steal_thread(processor_set_t pset);
244
245static void
246sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
247
248static boolean_t
249sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
250
251static boolean_t
252sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
253
254void
255sched_multiq_quantum_expire(thread_t thread);
256
257static ast_t
258sched_multiq_processor_csw_check(processor_t processor);
259
260static boolean_t
261sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
262
263static int
264sched_multiq_runq_count(processor_t processor);
265
266static boolean_t
267sched_multiq_processor_queue_empty(processor_t processor);
268
269static uint64_t
270sched_multiq_runq_stats_count_sum(processor_t processor);
271
272static int
273sched_multiq_processor_bound_count(processor_t processor);
274
275static void
276sched_multiq_pset_init(processor_set_t pset);
277
278static void
279sched_multiq_processor_init(processor_t processor);
280
281static thread_t
282sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);
283
284static void
285sched_multiq_processor_queue_shutdown(processor_t processor);
286
287static sched_mode_t
288sched_multiq_initial_thread_sched_mode(task_t parent_task);
289
290const struct sched_dispatch_table sched_multiq_dispatch = {
291 .sched_name = "multiq",
292 .init = sched_multiq_init,
293 .timebase_init = sched_timeshare_timebase_init,
294 .processor_init = sched_multiq_processor_init,
295 .pset_init = sched_multiq_pset_init,
296 .maintenance_continuation = sched_timeshare_maintenance_continue,
297 .choose_thread = sched_multiq_choose_thread,
298 .steal_thread_enabled = FALSE,
299 .steal_thread = sched_multiq_steal_thread,
300 .compute_timeshare_priority = sched_compute_timeshare_priority,
301 .choose_processor = choose_processor,
302 .processor_enqueue = sched_multiq_processor_enqueue,
303 .processor_queue_shutdown = sched_multiq_processor_queue_shutdown,
304 .processor_queue_remove = sched_multiq_processor_queue_remove,
305 .processor_queue_empty = sched_multiq_processor_queue_empty,
306 .priority_is_urgent = priority_is_urgent,
307 .processor_csw_check = sched_multiq_processor_csw_check,
308 .processor_queue_has_priority = sched_multiq_processor_queue_has_priority,
309 .initial_quantum_size = sched_timeshare_initial_quantum_size,
310 .initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode,
311 .can_update_priority = can_update_priority,
312 .update_priority = update_priority,
313 .lightweight_update_priority = lightweight_update_priority,
314 .quantum_expire = sched_multiq_quantum_expire,
315 .processor_runq_count = sched_multiq_runq_count,
316 .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum,
317 .processor_bound_count = sched_multiq_processor_bound_count,
318 .thread_update_scan = sched_multiq_thread_update_scan,
319 .direct_dispatch_to_idle_processors = FALSE,
320 .multiple_psets_enabled = FALSE,
321 .sched_groups_enabled = TRUE,
322};
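/*
 * Rough sketch of how these callbacks get exercised: the rest of the scheduler
 * calls through the active dispatch table via the SCHED() macro (this file does
 * the same with SCHED(priority_is_urgent) and SCHED(sched_groups_enabled)), so
 * when this table is the active one, a thread-selection path ends up looking
 * approximately like the following (MINPRI / AST_NONE are example arguments):
 *
 *	thread_t next = SCHED(choose_thread)(processor, MINPRI, AST_NONE);
 *	if (next == THREAD_NULL) {
 *		// fall back to stealing or to the idle thread
 *	}
 *
 * How the active table is chosen at boot is outside the scope of this file.
 */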
323
324
325static void
326sched_multiq_init(void)
327{
328#if defined(MULTIQ_SANITY_CHECK)
329 PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
330#endif
331
332 PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));
333
334 if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
335 drain_ceiling = DEFAULT_DRAIN_CEILING;
336 }
337
338 if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
339 drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
340 }
341
342 if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
343 drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
344 }
345
346 printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
347 deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);
348
349 sched_group_zone = zinit(
350 sizeof(struct sched_group),
351 task_max * sizeof(struct sched_group),
352 PAGE_SIZE,
353 "sched groups");
354
355 zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
356 zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);
357
358 queue_init(&sched_groups);
359
360 lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
361 lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
362 lck_attr_setdefault(&sched_groups_lock_attr);
363 lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);
364
365 sched_timeshare_init();
366}
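/*
 * For illustration, the tunables parsed above can be overridden from the boot
 * command line; the names come from the PE_parse_boot_argn() calls in this
 * function, while the values below are hypothetical (the defaults are
 * DEFAULT_DRAIN_CEILING, DEFAULT_DRAIN_DEPTH_LIMIT and DEFAULT_DRAIN_BAND_LIMIT,
 * and -multiq-sanity-check only exists on DEBUG/DEVELOPMENT builds):
 *
 *	-multiq-deep-drain -multiq-sanity-check
 *	multiq_drain_ceiling=47 multiq_drain_depth_limit=20 multiq_drain_band_limit=63
 */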
367
368static void
369sched_multiq_processor_init(processor_t processor)
370{
371 run_queue_init(&processor->runq);
372}
373
374static void
375sched_multiq_pset_init(processor_set_t pset)
376{
377 run_queue_init(&pset->pset_runq);
378}
379
380static sched_mode_t
381sched_multiq_initial_thread_sched_mode(task_t parent_task)
382{
383 if (parent_task == kernel_task)
384 return TH_MODE_FIXED;
385 else
386 return TH_MODE_TIMESHARE;
387}
388
389sched_group_t
390sched_group_create(void)
391{
392 sched_group_t sched_group;
393
394 if (!SCHED(sched_groups_enabled))
395 return SCHED_GROUP_NULL;
396
397 sched_group = (sched_group_t)zalloc(sched_group_zone);
398
399 bzero(sched_group, sizeof(struct sched_group));
400
401 run_queue_init(&sched_group->runq);
402
403 for (int i = 0; i < NRQS; i++) {
404 sched_group->entries[i].runq = 0;
405 sched_group->entries[i].sched_pri = i;
406 }
407
408 lck_mtx_lock(&sched_groups_lock);
409 queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
410 num_sched_groups++;
411 lck_mtx_unlock(&sched_groups_lock);
412
413 return (sched_group);
414}
415
416void
417sched_group_destroy(sched_group_t sched_group)
418{
419 if (!SCHED(sched_groups_enabled)) {
420 assert(sched_group == SCHED_GROUP_NULL);
421 return;
422 }
423
424 assert(sched_group != SCHED_GROUP_NULL);
425 assert(sched_group->runq.count == 0);
426
427 for (int i = 0; i < NRQS; i++) {
428 assert(sched_group->entries[i].runq == 0);
429 assert(sched_group->entries[i].sched_pri == i);
430 }
431
432 lck_mtx_lock(&sched_groups_lock);
433 queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
434 num_sched_groups--;
435 lck_mtx_unlock(&sched_groups_lock);
436
437 zfree(sched_group_zone, sched_group);
438}
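/*
 * Presumed lifecycle, as a sketch (the actual call sites are in the task
 * management code, not in this file): one sched_group per task, created
 * alongside the task and destroyed once none of its threads are enqueued.
 *
 *	sched_group_t group = sched_group_create();  // SCHED_GROUP_NULL if
 *	                                             // sched_groups are disabled
 *	// ... the task's threads enqueue via thread->sched_group == group ...
 *	sched_group_destroy(group);                  // requires group->runq.count == 0
 */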
439
440__attribute__((always_inline))
441static inline entry_queue_t
442multiq_main_entryq(processor_t processor)
443{
444 return (entry_queue_t)&processor->processor_set->pset_runq;
445}
446
447__attribute__((always_inline))
448static inline run_queue_t
449multiq_bound_runq(processor_t processor)
450{
451 return &processor->runq;
452}
453
454__attribute__((always_inline))
455static inline sched_entry_t
456group_entry_for_pri(sched_group_t group, integer_t pri)
457{
458 return &group->entries[pri];
459}
460
461__attribute__((always_inline))
462static inline sched_group_t
463group_for_entry(sched_entry_t entry)
464{
465 sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
466 return group;
467}
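/*
 * group_for_entry() relies on two facts that hold by construction: entries[]
 * is the first member of struct sched_group, and sched_group_create() sets
 * entries[i].sched_pri = i. Subtracting sched_pri from an entry pointer
 * therefore lands on &group->entries[0], whose address equals the group's.
 * A worked example (illustrative only):
 *
 *	sched_entry_t e = &group->entries[47];	// e->sched_pri == 47
 *	assert(group_for_entry(e) == group);	// e - 47 == &group->entries[0]
 */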
468
469/* Peek at the head of the runqueue */
470static sched_entry_t
471entry_queue_first_entry(entry_queue_t rq)
472{
473 assert(rq->count != 0);
474
475 queue_t queue = rq->queues + rq->highq;
476
477 sched_entry_t entry = (sched_entry_t)queue_first(queue);
478
479 assert(entry->sched_pri == rq->highq);
480
481 return entry;
482}
483
484#if defined(MULTIQ_SANITY_CHECK)
485
486#if MACH_ASSERT
487__attribute__((always_inline))
488static inline boolean_t
489queue_chain_linked(queue_chain_t* chain)
490{
491 if (chain->next != NULL) {
492 assert(chain->prev != NULL);
493 return TRUE;
494 } else {
495 assert(chain->prev == NULL);
496 return FALSE;
497 }
498}
499#endif /* MACH_ASSERT */
500
501static thread_t
502group_first_thread(sched_group_t group)
503{
504 group_runq_t rq = &group->runq;
505
506 assert(rq->count != 0);
507
508 queue_t queue = rq->queues + rq->highq;
509
510 thread_t thread = (thread_t)(void*)queue_first(queue);
511
512 assert(thread != THREAD_NULL);
513
514 assert(thread->sched_group == group);
515
516 /* TODO: May not be safe */
517 assert(thread->sched_pri == rq->highq);
518
519 return thread;
520}
521
522/* Asserts if entry is not in entry runq at pri */
523static void
524entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
525{
526 queue_t q;
527 sched_entry_t elem;
528
529 assert(queue_chain_linked(&entry->links));
530 assert(entry->runq == MULTIQ_ERUNQ);
531
532 q = &runq->queues[expected_pri];
533
534 queue_iterate(q, elem, sched_entry_t, links) {
535 if (elem == entry)
536 return;
537 }
538
539 panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
540}
541
542/* Asserts if thread is not in group at its priority */
543static void
544sched_group_check_thread(sched_group_t group, thread_t thread)
545{
546 queue_t q;
547 thread_t elem;
548 int pri = thread->sched_pri;
549
550 assert(thread->runq != PROCESSOR_NULL);
551
552 q = &group->runq.queues[pri];
553
554 queue_iterate(q, elem, thread_t, links) {
555 if (elem == thread)
556 return;
557 }
558
559 panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
560}
561
562static void
563global_check_entry_queue(entry_queue_t main_entryq)
564{
565 if (main_entryq->count == 0)
566 return;
567
568 sched_entry_t entry = entry_queue_first_entry(main_entryq);
569
570 assert(entry->runq == MULTIQ_ERUNQ);
571
572 sched_group_t group = group_for_entry(entry);
573
574 thread_t thread = group_first_thread(group);
575
576 __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
577
578 assert(entry->sched_pri == group->runq.highq);
579
580 assert(entry == thread_entry);
581 assert(thread->runq != PROCESSOR_NULL);
582}
583
584static void
585group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
586{
587 if (group->runq.count == 0)
588 return;
589
590 thread_t thread = group_first_thread(group);
591
592 assert(thread->runq != PROCESSOR_NULL);
593
594 sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);
595
596 entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);
597
598 assert(sched_entry->sched_pri == thread->sched_pri);
599 assert(sched_entry->runq == MULTIQ_ERUNQ);
600}
601
602#endif /* defined(MULTIQ_SANITY_CHECK) */
603
604/*
605 * The run queue must not be empty.
606 */
607static sched_entry_t
608entry_queue_dequeue_entry(entry_queue_t rq)
609{
610 sched_entry_t sched_entry;
611 queue_t queue = rq->queues + rq->highq;
612
613 assert(rq->count > 0);
614 assert(!queue_empty(queue));
615
616 sched_entry = (sched_entry_t)dequeue_head(queue);
617
618 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
619 rq->count--;
620 if (SCHED(priority_is_urgent)(rq->highq)) {
621 rq->urgency--; assert(rq->urgency >= 0);
622 }
623 if (queue_empty(queue)) {
624 if (rq->highq != IDLEPRI)
625 clrbit(MAXPRI - rq->highq, rq->bitmap);
626 rq->highq = MAXPRI - ffsbit(rq->bitmap);
627 }
628
629 sched_entry->runq = 0;
630
631 return (sched_entry);
632}
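/*
 * The bitmap bookkeeping above (shared by all the runq helpers in this file)
 * indexes bits as MAXPRI - pri, so bit 0 is the highest priority. A worked
 * example, assuming MAXPRI is 127: with entries queued at priorities 127 and 4,
 * bits 0 and 123 are set, ffsbit() returns 0, and highq = 127 - 0 = 127.
 * Dequeueing the last entry at 127 clears bit 0; ffsbit() then returns 123
 * and highq drops to 127 - 123 = 4.
 */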
633
634/*
635 * The entry must not be on a run queue. Returns TRUE if the run queue's highest priority increased.
636 */
637static boolean_t
638entry_queue_enqueue_entry(
639 entry_queue_t rq,
640 sched_entry_t entry,
641 integer_t options)
642{
643 int sched_pri = entry->sched_pri;
644 queue_t queue = rq->queues + sched_pri;
645 boolean_t result = FALSE;
646
647 assert(entry->runq == 0);
648
649 if (queue_empty(queue)) {
650 enqueue_tail(queue, (queue_entry_t)entry);
651
652 setbit(MAXPRI - sched_pri, rq->bitmap);
653 if (sched_pri > rq->highq) {
654 rq->highq = sched_pri;
655 result = TRUE;
656 }
657 } else {
658 if (options & SCHED_TAILQ)
659 enqueue_tail(queue, (queue_entry_t)entry);
660 else
661 enqueue_head(queue, (queue_entry_t)entry);
662 }
663 if (SCHED(priority_is_urgent)(sched_pri))
664 rq->urgency++;
665 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
666 rq->count++;
667
668 entry->runq = MULTIQ_ERUNQ;
669
670 return (result);
671}
672
673/*
674 * The entry must be in this runqueue.
675 */
676static void
677entry_queue_remove_entry(
678 entry_queue_t rq,
679 sched_entry_t entry)
680{
681 int sched_pri = entry->sched_pri;
682
683#if defined(MULTIQ_SANITY_CHECK)
684 if (multiq_sanity_check) {
685 entry_queue_check_entry(rq, entry, sched_pri);
686 }
687#endif
688
689 remqueue((queue_entry_t)entry);
690
691 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
692 rq->count--;
693 if (SCHED(priority_is_urgent)(sched_pri)) {
694 rq->urgency--; assert(rq->urgency >= 0);
695 }
696
697 if (queue_empty(rq->queues + sched_pri)) {
698 /* update run queue status */
699 if (sched_pri != IDLEPRI)
700 clrbit(MAXPRI - sched_pri, rq->bitmap);
701 rq->highq = MAXPRI - ffsbit(rq->bitmap);
702 }
703
704 entry->runq = 0;
705}
706
707static void
708entry_queue_change_entry(
709 entry_queue_t rq,
710 sched_entry_t entry,
711 integer_t options)
712{
713 int sched_pri = entry->sched_pri;
714 queue_t queue = rq->queues + sched_pri;
715
716#if defined(MULTIQ_SANITY_CHECK)
717 if (multiq_sanity_check) {
718 entry_queue_check_entry(rq, entry, sched_pri);
719 }
720#endif
721 remqueue((queue_entry_t)entry);
722
723 if (options & SCHED_TAILQ)
724 enqueue_tail(queue, (queue_entry_t)entry);
725 else
726 enqueue_head(queue, (queue_entry_t)entry);
727}
728/*
729 * The run queue must not be empty.
730 *
731 * sets queue_empty to TRUE if queue is now empty at thread_pri
732 */
733static thread_t
734group_run_queue_dequeue_thread(
735 group_runq_t rq,
736 integer_t *thread_pri,
737 boolean_t *queue_empty)
738{
739 thread_t thread;
740 queue_t queue = rq->queues + rq->highq;
741
742 assert(rq->count > 0);
743 assert(!queue_empty(queue));
744
745 *thread_pri = rq->highq;
746
747 thread = (thread_t)(void*)dequeue_head(queue);
748
749 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
750 rq->count--;
751 if (SCHED(priority_is_urgent)(rq->highq)) {
752 rq->urgency--; assert(rq->urgency >= 0);
753 }
754 if (queue_empty(queue)) {
755 if (rq->highq != IDLEPRI)
756 clrbit(MAXPRI - rq->highq, rq->bitmap);
757 rq->highq = MAXPRI - ffsbit(rq->bitmap);
758 *queue_empty = TRUE;
759 } else {
760 *queue_empty = FALSE;
761 }
762
763 return (thread);
764}
765
766/*
767 * The thread must not be on a run queue.
768 * Returns TRUE if the queue was empty at thread_pri.
769 */
770static boolean_t
771group_run_queue_enqueue_thread(
772 group_runq_t rq,
773 thread_t thread,
774 integer_t thread_pri,
775 integer_t options)
776{
777 queue_t queue = rq->queues + thread_pri;
778 boolean_t result = FALSE;
779
780 assert(thread->runq == PROCESSOR_NULL);
781
782 if (queue_empty(queue)) {
783 enqueue_tail(queue, (queue_entry_t)thread);
784
785 setbit(MAXPRI - thread_pri, rq->bitmap);
786 if (thread_pri > rq->highq) {
787 rq->highq = thread_pri;
788 }
789 result = TRUE;
790 } else {
791 if (options & SCHED_TAILQ)
792 enqueue_tail(queue, (queue_entry_t)thread);
793 else
794 enqueue_head(queue, (queue_entry_t)thread);
795 }
796 if (SCHED(priority_is_urgent)(thread_pri))
797 rq->urgency++;
798 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
799 rq->count++;
800
801 return (result);
802}
803
804/*
805 * The thread must be in this runqueue.
806 * returns TRUE if queue is now empty at thread_pri
807 */
808static boolean_t
809group_run_queue_remove_thread(
810 group_runq_t rq,
811 thread_t thread,
812 integer_t thread_pri)
813{
814 boolean_t result = FALSE;
815
816 assert(thread->runq != PROCESSOR_NULL);
817
818 remqueue((queue_entry_t)thread);
819
820 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
821 rq->count--;
822 if (SCHED(priority_is_urgent)(thread_pri)) {
823 rq->urgency--; assert(rq->urgency >= 0);
824 }
825
826 if (queue_empty(rq->queues + thread_pri)) {
827 /* update run queue status */
828 if (thread_pri != IDLEPRI)
829 clrbit(MAXPRI - thread_pri, rq->bitmap);
830 rq->highq = MAXPRI - ffsbit(rq->bitmap);
831 result = TRUE;
832 }
833
834 thread->runq = PROCESSOR_NULL;
835
836 return result;
837}
838
839/*
840 * A thread's sched pri may change out from under us because
841 * we're clearing thread->runq here without the thread locked.
842 * Do not rely on it to be the same as when we enqueued.
843 */
844static thread_t
845sched_global_dequeue_thread(entry_queue_t main_entryq)
846{
847 boolean_t pri_level_empty = FALSE;
848 sched_entry_t entry;
849 group_runq_t group_runq;
850 thread_t thread;
851 integer_t thread_pri;
852 sched_group_t group;
853
854 assert(main_entryq->count > 0);
855
856 entry = entry_queue_dequeue_entry(main_entryq);
857
858 group = group_for_entry(entry);
859 group_runq = &group->runq;
860
861 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
862
863 thread->runq = PROCESSOR_NULL;
864
865 if (!pri_level_empty) {
866 entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
867 }
868
869 return thread;
870}
871
872/* Dequeue a thread from the global runq without moving the entry */
873static thread_t
874sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
875{
876 boolean_t pri_level_empty = FALSE;
877 sched_entry_t entry;
878 group_runq_t group_runq;
879 thread_t thread;
880 integer_t thread_pri;
881 sched_group_t group;
882
883 assert(main_entryq->count > 0);
884
885 entry = entry_queue_first_entry(main_entryq);
886
887 group = group_for_entry(entry);
888 group_runq = &group->runq;
889
890 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
891
892 thread->runq = PROCESSOR_NULL;
893
894 if (pri_level_empty) {
895 entry_queue_remove_entry(main_entryq, entry);
896 }
897
898 return thread;
899}
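/*
 * The two global dequeue flavors above differ only in what happens to the
 * group's entry when more threads remain at that priority: the normal path
 * cycles the entry to the tail of the global runq (round-robining between
 * groups at that priority), while the deep-drain path leaves it at the head
 * so the same group keeps being picked until that priority level drains.
 * Roughly:
 *
 *	normal:     dequeue the entry; if the group still has threads at that
 *	            priority, re-enqueue the entry with SCHED_TAILQ
 *	deep drain: peek at the head entry; only remove it once the group's
 *	            priority level is empty
 */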
900
901
902static thread_t
903sched_group_dequeue_thread(
904 entry_queue_t main_entryq,
905 sched_group_t group)
906{
907 group_runq_t group_runq = &group->runq;
908 boolean_t pri_level_empty = FALSE;
909 thread_t thread;
910 integer_t thread_pri;
911
912 thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);
913
914 thread->runq = PROCESSOR_NULL;
915
916 if (pri_level_empty) {
917 entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
918 }
919
920 return thread;
921}
922
923static void
924sched_group_remove_thread(
925 entry_queue_t main_entryq,
926 sched_group_t group,
927 thread_t thread)
928{
929 integer_t thread_pri = thread->sched_pri;
930 sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
931
932#if defined(MULTIQ_SANITY_CHECK)
933 if (multiq_sanity_check) {
934 global_check_entry_queue(main_entryq);
935 group_check_run_queue(main_entryq, group);
936
937 sched_group_check_thread(group, thread);
938 entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
939 }
940#endif
941
942 boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);
943
944 if (pri_level_empty) {
945 entry_queue_remove_entry(main_entryq, sched_entry);
946 }
947
948#if defined(MULTIQ_SANITY_CHECK)
949 if (multiq_sanity_check) {
950 global_check_entry_queue(main_entryq);
951 group_check_run_queue(main_entryq, group);
952 }
953#endif
954}
955
956static void
957sched_group_enqueue_thread(
958 entry_queue_t main_entryq,
959 sched_group_t group,
960 thread_t thread,
961 integer_t options)
962{
963#if defined(MULTIQ_SANITY_CHECK)
964 if (multiq_sanity_check) {
965 global_check_entry_queue(main_entryq);
966 group_check_run_queue(main_entryq, group);
967 }
968#endif
969
970 int sched_pri = thread->sched_pri;
971
972 boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
973
974 if (pri_level_was_empty) {
975 /*
976 * TODO: Need to figure out if passing options here is a good idea or not
977 * What effects would it have?
978 */
979 entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
980 } else if (options & SCHED_HEADQ) {
981 /* The thread should be at the head of the line - move its entry to the front */
982 entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
983 }
984}
985
986/*
987 * Locate a thread to execute from the run queue and return it.
988 * Only choose a thread with greater or equal priority.
989 *
990 * pset is locked, thread is not locked.
991 *
992 * Returns THREAD_NULL if it cannot find a valid thread.
993 *
994 * Note: we cannot rely on the value of thread->sched_pri in this path because
995 * we don't have the thread locked.
996 *
997 * TODO: Remove tracepoints
998 */
999static thread_t
1000sched_multiq_choose_thread(
1001 processor_t processor,
1002 int priority,
1003 ast_t reason)
1004{
1005 entry_queue_t main_entryq = multiq_main_entryq(processor);
1006 run_queue_t bound_runq = multiq_bound_runq(processor);
1007
1008 boolean_t choose_bound_runq = FALSE;
1009
1010 if (bound_runq->highq < priority &&
1011 main_entryq->highq < priority)
1012 return THREAD_NULL;
1013
1014 if (bound_runq->count && main_entryq->count) {
1015 if (bound_runq->highq >= main_entryq->highq) {
1016 choose_bound_runq = TRUE;
1017 } else {
1018 /* Use main runq */
1019 }
1020 } else if (bound_runq->count) {
1021 choose_bound_runq = TRUE;
1022 } else if (main_entryq->count) {
1023 /* Use main runq */
1024 } else {
1025 return (THREAD_NULL);
1026 }
1027
1028 if (choose_bound_runq) {
1029 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1030 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1031 MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);
1032
1033 return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1034 }
1035
1036 sched_group_t group = current_thread()->sched_group;
1037
1038#if defined(MULTIQ_SANITY_CHECK)
1039 if (multiq_sanity_check) {
1040 global_check_entry_queue(main_entryq);
1041 group_check_run_queue(main_entryq, group);
1042 }
1043#endif
1044
1045 /*
1046 * Determine if we should look at the group or the global queue
1047 *
1048 * TODO:
1049 * Perhaps pass reason as a 'should look inside' argument to choose_thread
1050 * Should YIELD AST override drain limit?
1051 */
1052 if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
1053 boolean_t favor_group = TRUE;
1054
1055 integer_t global_pri = main_entryq->highq;
1056 integer_t group_pri = group->runq.highq;
1057
1058 /*
1059 * Favor the current group if the group is still the globally highest.
1060 *
1061 * Otherwise, consider choosing a thread from the current group
1062 * even if it's lower priority than the global highest priority.
1063 */
1064 if (global_pri > group_pri) {
1065 /*
1066 * If there's something elsewhere above the depth limit,
1067 * don't pick a thread below the limit.
1068 */
1069 if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit)
1070 favor_group = FALSE;
1071
1072 /*
1073 * If there's something at or above the ceiling,
1074 * don't favor the group.
1075 */
1076 if (global_pri >= drain_ceiling)
1077 favor_group = FALSE;
1078
1079 /*
1080 * Don't go more than X steps below the global highest
1081 */
1082 if ((global_pri - group_pri) >= drain_band_limit)
1083 favor_group = FALSE;
1084 }
1085
1086 if (favor_group) {
1087 /* Pull from local runq */
1088 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1089 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1090 MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);
1091
1092 return sched_group_dequeue_thread(main_entryq, group);
1093 }
1094 }
1095
1096 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1097 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
1098 MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);
1099
1100 /* Couldn't pull from local runq, pull from global runq instead */
1101 if (deep_drain) {
1102 return sched_global_deep_drain_dequeue_thread(main_entryq);
1103 } else {
1104 return sched_global_dequeue_thread(main_entryq);
1105 }
1106}
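/*
 * A worked example of the drain heuristics above, assuming the default limits
 * (drain_ceiling = BASEPRI_FOREGROUND, i.e. 47 on typical configurations,
 * drain_depth_limit = MAXPRI_THROTTLE, drain_band_limit = MAXPRI) and
 * hypothetical priorities:
 *
 *	global highq 31, current group's highq 31  -> the group is still globally
 *	            highest, so it is favored outright
 *	global highq 37, group highq 31            -> below the ceiling, within
 *	            the band, above the depth limit: the group is still favored
 *	global highq 60, group highq 31            -> at or above the ceiling,
 *	            so the global runq is drained instead
 */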
1107
1108
1109/*
1110 * Thread must be locked, and not already be on a run queue.
1111 * pset is locked.
1112 */
1113static boolean_t
1114sched_multiq_processor_enqueue(
1115 processor_t processor,
1116 thread_t thread,
1117 integer_t options)
1118{
1119 boolean_t result;
1120
1121 assert(processor == thread->chosen_processor);
1122
1123 if (thread->bound_processor != PROCESSOR_NULL) {
1124 assert(thread->bound_processor == processor);
1125
1126 result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
1127 thread->runq = processor;
1128
1129 return result;
1130 }
1131
1132 sched_group_enqueue_thread(multiq_main_entryq(processor),
1133 thread->sched_group,
1134 thread, options);
1135
1136 thread->runq = processor;
1137
1138 return (FALSE);
1139}
1140
1141/*
1142 * Called in the context of thread with thread and pset unlocked,
1143 * after updating thread priority but before propagating that priority
1144 * to the processor
1145 */
1146void
1147sched_multiq_quantum_expire(thread_t thread)
1148{
1149 if (deep_drain) {
1150 /*
1151 * Move the entry at this priority to the end of the queue,
1152 * to allow the next task a shot at running.
1153 */
1154
1155 processor_t processor = thread->last_processor;
1156 processor_set_t pset = processor->processor_set;
1157 entry_queue_t entryq = multiq_main_entryq(processor);
1158
1159 pset_lock(pset);
1160
1161 sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);
1162
1163 if (entry->runq == MULTIQ_ERUNQ) {
1164 entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
1165 }
1166
1167 pset_unlock(pset);
1168 }
1169}
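/*
 * In deep-drain mode this is what provides fairness between tasks at the same
 * priority. For example, with two tasks A and B that both have runnable
 * threads at priority 31, A's entry stays at the head of the global runq while
 * A's threads are drained; once one of them burns a full quantum, the entry is
 * cycled to the tail and B's entry at 31 gets the next turn.
 */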
1170
1171static boolean_t
1172sched_multiq_processor_queue_empty(processor_t processor)
1173{
1174 return multiq_main_entryq(processor)->count == 0 &&
1175 multiq_bound_runq(processor)->count == 0;
1176}
1177
1178static ast_t
1179sched_multiq_processor_csw_check(processor_t processor)
1180{
1181 boolean_t has_higher;
1182 int pri;
1183
1184 entry_queue_t main_entryq = multiq_main_entryq(processor);
1185 run_queue_t bound_runq = multiq_bound_runq(processor);
1186
1187 assert(processor->active_thread != NULL);
1188
1189 pri = MAX(main_entryq->highq, bound_runq->highq);
1190
1191 if (processor->first_timeslice) {
1192 has_higher = (pri > processor->current_pri);
1193 } else {
1194 has_higher = (pri >= processor->current_pri);
1195 }
1196
1197 if (has_higher) {
1198 if (main_entryq->urgency > 0)
1199 return (AST_PREEMPT | AST_URGENT);
1200
1201 if (bound_runq->urgency > 0)
1202 return (AST_PREEMPT | AST_URGENT);
1203
1204 return AST_PREEMPT;
1205 }
1206
1207 return AST_NONE;
1208}
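/*
 * Example of the check above, with hypothetical priorities: a processor running
 * at current_pri 31 that is still in its first timeslice is only preempted by
 * priority 32 or higher; once the quantum has expired, an equal-priority (31)
 * thread in either queue is enough for AST_PREEMPT, and AST_URGENT is added
 * when either queue reports pending urgency.
 */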
1209
1210static boolean_t
1211sched_multiq_processor_queue_has_priority(
1212 processor_t processor,
1213 int priority,
1214 boolean_t gte)
1215{
1216 int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);
1217
1218 if (gte)
1219 return qpri >= priority;
1220 else
1221 return qpri > priority;
1222}
1223
1223
1224static int
1225sched_multiq_runq_count(processor_t processor)
1226{
1227 /*
1228 * TODO: Decide whether to keep a count of runnable threads in the pset
1229 * or just return something less than the true count.
1230 *
1231 * This needs to be fast, so no iterating the whole runq.
1232 *
1233 * Another possible decision is to remove this - with global runq
1234 * it doesn't make much sense.
1235 */
1236 return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
1237}
1238
1239static uint64_t
1240sched_multiq_runq_stats_count_sum(processor_t processor)
1241{
1242 /*
1243 * TODO: This one does need to go through all the runqueues, but it's only needed for
1244 * the sched stats tool
1245 */
1246
1247 uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;
1248
1249 if (processor->cpu_id == processor->processor_set->cpu_set_low)
1250 return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
1251 else
1252 return bound_sum;
1253}
1254
1255static int
1256sched_multiq_processor_bound_count(processor_t processor)
1257{
1258 return multiq_bound_runq(processor)->count;
1259}
1260
1261static void
1262sched_multiq_processor_queue_shutdown(processor_t processor)
1263{
1264 processor_set_t pset = processor->processor_set;
1265 entry_queue_t main_entryq = multiq_main_entryq(processor);
1266 thread_t thread;
1267 queue_head_t tqueue;
1268
1269 /* We only need to migrate threads if this is the last active processor in the pset */
1270 if (pset->online_processor_count > 0) {
1271 pset_unlock(pset);
1272 return;
1273 }
1274
1275 queue_init(&tqueue);
1276
1277 /* Note that we do not remove bound threads from the queues here */
1278
1279 while (main_entryq->count > 0) {
1280 thread = sched_global_dequeue_thread(main_entryq);
1281 enqueue_tail(&tqueue, (queue_entry_t)thread);
1282 }
1283
1284 pset_unlock(pset);
1285
1286 while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
1287 thread_lock(thread);
1288
1289 thread_setrun(thread, SCHED_TAILQ);
1290
1291 thread_unlock(thread);
1292 }
1293}
1294
1295/*
1296 * Thread is locked.
1297 *
1298 * This is why we can never read sched_pri unless we have the thread locked -
1299 * which we do in the enqueue and remove cases, but not the dequeue case.
1300 */
1301static boolean_t
1302sched_multiq_processor_queue_remove(
1303 processor_t processor,
1304 thread_t thread)
1305{
1306 boolean_t removed = FALSE;
1307 processor_set_t pset = processor->processor_set;
1308
1309 pset_lock(pset);
1310
1311 if (thread->runq != PROCESSOR_NULL) {
1312 /*
1313 * Thread is on a run queue and we have a lock on
1314 * that run queue.
1315 */
1316
1317 assert(thread->runq == processor);
1318
1319 if (thread->bound_processor != PROCESSOR_NULL) {
1320 assert(processor == thread->bound_processor);
1321 run_queue_remove(multiq_bound_runq(processor), thread);
1322 thread->runq = PROCESSOR_NULL;
1323 } else {
1324 sched_group_remove_thread(multiq_main_entryq(processor),
1325 thread->sched_group,
1326 thread);
1327 }
1328
1329 removed = TRUE;
1330 }
1331
1332 pset_unlock(pset);
1333
1334 return removed;
1335}
1336
1337/* pset is locked, returned unlocked */
1338static thread_t
1339sched_multiq_steal_thread(processor_set_t pset)
1340{
1341 pset_unlock(pset);
1342 return (THREAD_NULL);
1343}
1344
1345/*
1346 * Scan the global queue for candidate groups, and scan those groups for
1347 * candidate threads.
1348 *
1349 * Returns TRUE if retry is needed.
1350 */
1351static boolean_t
1352group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) {
1353 int count;
1354 queue_t q;
1355 sched_group_t group;
1356 sched_entry_t entry;
1357
1358 if ((count = runq->count) > 0) {
1359 q = runq->queues + runq->highq;
1360 while (count > 0) {
1361 queue_iterate(q, entry, sched_entry_t, links) {
1362 group = group_for_entry(entry);
1363 if (group->runq.count > 0) {
1364 if (runq_scan(&group->runq, scan_context))
1365 return (TRUE);
1366 }
1367 count--;
1368 }
1369 q--;
1370 }
1371 }
1372
1373 return (FALSE);
1374}
1375
1376static void
1377sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
1378{
1379 boolean_t restart_needed = FALSE;
1380 processor_t processor = processor_list;
1381 processor_set_t pset;
1382 thread_t thread;
1383 spl_t s;
1384
1385 /*
1386 * We update the threads associated with each processor (bound and idle threads)
1387 * and then update the threads in each pset runqueue.
1388 */
1389
1390 do {
1391 do {
1392 pset = processor->processor_set;
1393
1394 s = splsched();
1395 pset_lock(pset);
1396
1397 restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
1398
1399 pset_unlock(pset);
1400 splx(s);
1401
1402 if (restart_needed)
1403 break;
1404
1405 thread = processor->idle_thread;
1406 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
1407 if (thread_update_add_thread(thread) == FALSE) {
1408 restart_needed = TRUE;
1409 break;
1410 }
1411 }
1412 } while ((processor = processor->processor_list) != NULL);
1413
1414 /* Ok, we now have a collection of candidates -- fix them. */
1415 thread_update_process_threads();
1416
1417 } while (restart_needed);
1418
1419 pset = &pset0;
1420
1421 do {
1422 do {
1423 s = splsched();
1424 pset_lock(pset);
1425
1426 restart_needed = group_scan(&pset->pset_runq, scan_context);
1427
1428 pset_unlock(pset);
1429 splx(s);
1430
1431 if (restart_needed)
1432 break;
1433 } while ((pset = pset->pset_list) != NULL);
1434
1435 /* Ok, we now have a collection of candidates -- fix them. */
1436 thread_update_process_threads();
1437
1438 } while (restart_needed);
1439}
1440
1441