1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/machlimits.h>
79
80#ifdef CONFIG_MACH_APPROXIMATE_TIME
81#include <machine/commpage.h>
82#endif
83
84#include <kern/kern_types.h>
85#include <kern/backtrace.h>
86#include <kern/clock.h>
87#include <kern/counters.h>
88#include <kern/cpu_number.h>
89#include <kern/cpu_data.h>
90#include <kern/smp.h>
91#include <kern/debug.h>
92#include <kern/macro_help.h>
93#include <kern/machine.h>
94#include <kern/misc_protos.h>
95#include <kern/processor.h>
96#include <kern/queue.h>
97#include <kern/sched.h>
98#include <kern/sched_prim.h>
99#include <kern/sfi.h>
100#include <kern/syscall_subr.h>
101#include <kern/task.h>
102#include <kern/thread.h>
103#include <kern/ledger.h>
104#include <kern/timer_queue.h>
105#include <kern/waitq.h>
106#include <kern/policy_internal.h>
107
108#include <vm/pmap.h>
109#include <vm/vm_kern.h>
110#include <vm/vm_map.h>
111
112#include <mach/sdt.h>
113
114#include <sys/kdebug.h>
115#include <kperf/kperf.h>
116#include <kern/kpc.h>
117
118#include <kern/pms.h>
119
120struct rt_queue rt_runq;
121
122uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
123
124/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
125#if __SMP__
126decl_simple_lock_data(static,rt_lock);
127#define rt_lock_init() simple_lock_init(&rt_lock, 0)
128#define rt_lock_lock() simple_lock(&rt_lock)
129#define rt_lock_unlock() simple_unlock(&rt_lock)
130#else
131#define rt_lock_init() do { } while(0)
132#define rt_lock_lock() do { } while(0)
133#define rt_lock_unlock() do { } while(0)
134#endif
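/*
 * Illustrative sketch (not from the original source): the rt_lock macros are
 * intended to be taken with interrupts already disabled, as thread_select()
 * below does, e.g.:
 *
 *     spl_t s = splsched();
 *     rt_lock_lock();
 *     ... examine or modify rt_runq ...
 *     rt_lock_unlock();
 *     splx(s);
 */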
135
136#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
137int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
138
139#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
140int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
141
142#define MAX_UNSAFE_QUANTA 800
143int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
144
145#define MAX_POLL_QUANTA 2
146int max_poll_quanta = MAX_POLL_QUANTA;
147
148#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
149int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
150
151uint64_t max_poll_computation;
152
153uint64_t max_unsafe_computation;
154uint64_t sched_safe_duration;
155
156#if defined(CONFIG_SCHED_TIMESHARE_CORE)
157
158uint32_t std_quantum;
159uint32_t min_std_quantum;
160uint32_t bg_quantum;
161
162uint32_t std_quantum_us;
163uint32_t bg_quantum_us;
164
165#endif /* CONFIG_SCHED_TIMESHARE_CORE */
166
167uint32_t thread_depress_time;
168uint32_t default_timeshare_computation;
169uint32_t default_timeshare_constraint;
170
171uint32_t max_rt_quantum;
172uint32_t min_rt_quantum;
173
174#if defined(CONFIG_SCHED_TIMESHARE_CORE)
175
176unsigned sched_tick;
177uint32_t sched_tick_interval;
178
179uint32_t sched_pri_shifts[TH_BUCKET_MAX];
180uint32_t sched_fixed_shift;
181
182uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
183
184/* Allow foreground to decay past default to resolve inversions */
185#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
186int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
187
188/* Defaults for timer deadline profiling */
189#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
190 * 2ms */
191#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
192 <= 5ms */
193
194uint64_t timer_deadline_tracking_bin_1;
195uint64_t timer_deadline_tracking_bin_2;
196
197#endif /* CONFIG_SCHED_TIMESHARE_CORE */
198
199thread_t sched_maintenance_thread;
200
201
202uint64_t sched_one_second_interval;
203
204/* Forwards */
205
206#if defined(CONFIG_SCHED_TIMESHARE_CORE)
207
208static void load_shift_init(void);
209static void preempt_pri_init(void);
210
211#endif /* CONFIG_SCHED_TIMESHARE_CORE */
212
213static thread_t thread_select(
214 thread_t thread,
215 processor_t processor,
216 ast_t reason);
217
218#if CONFIG_SCHED_IDLE_IN_PLACE
219static thread_t thread_select_idle(
220 thread_t thread,
221 processor_t processor);
222#endif
223
224thread_t processor_idle(
225 thread_t thread,
226 processor_t processor);
227
228ast_t
229csw_check_locked( processor_t processor,
230 processor_set_t pset,
231 ast_t check_reason);
232
233static void processor_setrun(
234 processor_t processor,
235 thread_t thread,
236 integer_t options);
237
238static void
239sched_realtime_init(void);
240
241static void
242sched_realtime_timebase_init(void);
243
244static void
245sched_timer_deadline_tracking_init(void);
246
247#if DEBUG
248extern int debug_task;
249#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
250#else
251#define TLOG(a, fmt, args...) do {} while (0)
252#endif
253
254static processor_t
255thread_bind_internal(
256 thread_t thread,
257 processor_t processor);
258
259static void
260sched_vm_group_maintenance(void);
261
262#if defined(CONFIG_SCHED_TIMESHARE_CORE)
263int8_t sched_load_shifts[NRQS];
264bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
265#endif /* CONFIG_SCHED_TIMESHARE_CORE */
266
267const struct sched_dispatch_table *sched_current_dispatch = NULL;
268
269/*
270 * Statically allocate a buffer to hold the longest possible
271 * scheduler description string, as currently implemented.
272 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
273 * to export to userspace via sysctl(3). If either version
274 * changes, update the other.
275 *
276 * Note that in addition to being an upper bound on the strings
277 * in the kernel, it's also an exact parameter to PE_get_default(),
278 * which interrogates the device tree on some platforms. That
279 * API requires the caller know the exact size of the device tree
280 * property, so we need both a legacy size (32) and the current size
281 * (48) to deal with old and new device trees. The device tree property
282 * is similarly padded to a fixed size so that the same kernel image
283 * can run on multiple devices with different schedulers configured
284 * in the device tree.
285 */
286char sched_string[SCHED_STRING_MAX_LENGTH];
287
288uint32_t sched_debug_flags;
289
290/* Global flag which indicates whether Background Stepper Context is enabled */
291static int cpu_throttle_enabled = 1;
292
293void
294sched_init(void)
295{
296 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
297
298 /* Check for runtime selection of the scheduler algorithm */
299 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
300 /* If no boot-args override, look in device tree */
301 if (!PE_get_default("kern.sched", sched_arg,
302 SCHED_STRING_MAX_LENGTH)) {
303 sched_arg[0] = '\0';
304 }
305 }
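 /*
  * Hypothetical example (not in the original): booting with a boot-arg such
  * as sched=dualq would select sched_dualq_dispatch below if the kernel was
  * built with CONFIG_SCHED_MULTIQ; the accepted strings are whatever each
  * dispatch table publishes in its sched_name field.
  */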
306
307
308 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
309 /* No boot-args, check in device tree */
310 if (!PE_get_default("kern.sched_pri_decay_limit",
311 &sched_pri_decay_band_limit,
312 sizeof(sched_pri_decay_band_limit))) {
313 /* Allow decay all the way to normal limits */
314 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
315 }
316 }
317
318 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
319
320 if (strlen(sched_arg) > 0) {
321 if (0) {
322 /* Allow pattern below */
323#if defined(CONFIG_SCHED_TRADITIONAL)
324 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
325 sched_current_dispatch = &sched_traditional_dispatch;
326 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
327 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
328#endif
329#if defined(CONFIG_SCHED_PROTO)
330 } else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
331 sched_current_dispatch = &sched_proto_dispatch;
332#endif
333#if defined(CONFIG_SCHED_GRRR)
334 } else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
335 sched_current_dispatch = &sched_grrr_dispatch;
336#endif
337#if defined(CONFIG_SCHED_MULTIQ)
338 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
339 sched_current_dispatch = &sched_multiq_dispatch;
340 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
341 sched_current_dispatch = &sched_dualq_dispatch;
342#endif
343 } else {
344#if defined(CONFIG_SCHED_TRADITIONAL)
345 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
346 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
347 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
348#else
349 panic("Unrecognized scheduler algorithm: %s", sched_arg);
350#endif
351 }
352 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
353 } else {
354#if defined(CONFIG_SCHED_MULTIQ)
355 sched_current_dispatch = &sched_multiq_dispatch;
356#elif defined(CONFIG_SCHED_TRADITIONAL)
357 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
358#elif defined(CONFIG_SCHED_PROTO)
359 sched_current_dispatch = &sched_proto_dispatch;
360#elif defined(CONFIG_SCHED_GRRR)
361 sched_current_dispatch = &sched_grrr_dispatch;
362#else
363#error No default scheduler implementation
364#endif
365 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
366 }
367
368 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
369
370 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
371 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
372 }
373
374 SCHED(init)();
375 sched_realtime_init();
376 ast_init();
377 sched_timer_deadline_tracking_init();
378
379 SCHED(pset_init)(&pset0);
380 SCHED(processor_init)(master_processor);
381}
382
383void
384sched_timebase_init(void)
385{
386 uint64_t abstime;
387
388 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
389 sched_one_second_interval = abstime;
390
391 SCHED(timebase_init)();
392 sched_realtime_timebase_init();
393}
394
395#if defined(CONFIG_SCHED_TIMESHARE_CORE)
396
397void
398sched_timeshare_init(void)
399{
400 /*
401 * Calculate the timeslicing quantum
402 * in us.
403 */
404 if (default_preemption_rate < 1)
405 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
406 std_quantum_us = (1000 * 1000) / default_preemption_rate;
407
408 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
409
410 if (default_bg_preemption_rate < 1)
411 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
412 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
413
414 printf("standard background quantum is %d us\n", bg_quantum_us);
415
416 load_shift_init();
417 preempt_pri_init();
418 sched_tick = 0;
419}
420
421void
422sched_timeshare_timebase_init(void)
423{
424 uint64_t abstime;
425 uint32_t shift;
426
427 /* standard timeslicing quantum */
428 clock_interval_to_absolutetime_interval(
429 std_quantum_us, NSEC_PER_USEC, &abstime);
430 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
431 std_quantum = (uint32_t)abstime;
432
433 /* smallest remaining quantum (250 us) */
434 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
435 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
436 min_std_quantum = (uint32_t)abstime;
437
438 /* quantum for background tasks */
439 clock_interval_to_absolutetime_interval(
440 bg_quantum_us, NSEC_PER_USEC, &abstime);
441 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
442 bg_quantum = (uint32_t)abstime;
443
444 /* scheduler tick interval */
445 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
446 NSEC_PER_USEC, &abstime);
447 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
448 sched_tick_interval = (uint32_t)abstime;
449
450 /*
451 * Compute conversion factor from usage to
452 * timesharing priorities with 5/8 ** n aging.
453 */
454 abstime = (abstime * 5) / 3;
455 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
456 abstime >>= 1;
457 sched_fixed_shift = shift;
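 /*
  * Worked example (illustrative only, assuming a hypothetical 24 MHz timebase,
  * SCHED_TICK_SHIFT of 3, and BASEPRI_DEFAULT of 31): the 125 ms tick interval
  * is 3,000,000 absolute-time units, so abstime becomes 3,000,000 * 5 / 3 =
  * 5,000,000; eighteen right-shifts reduce that to 19, the first value not
  * above BASEPRI_DEFAULT, giving sched_fixed_shift = 18.
  */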
458
459 for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
460 sched_pri_shifts[i] = INT8_MAX;
461
462 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
463 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
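 /*
  * For scale (a sketch, not from the source): with the default 10 ms standard
  * quantum (default_preemption_rate of 100) and MAX_UNSAFE_QUANTA of 800,
  * max_unsafe_computation works out to roughly 8 seconds and
  * sched_safe_duration to roughly 16 seconds.
  */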
464
465 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
466 thread_depress_time = 1 * std_quantum;
467 default_timeshare_computation = std_quantum / 2;
468 default_timeshare_constraint = std_quantum;
469
470}
471
472#endif /* CONFIG_SCHED_TIMESHARE_CORE */
473
474static void
475sched_realtime_init(void)
476{
477 rt_lock_init();
478
479 rt_runq.count = 0;
480 queue_init(&rt_runq.queue);
481}
482
483static void
484sched_realtime_timebase_init(void)
485{
486 uint64_t abstime;
487
 488 /* smallest rt computation (50 us) */
489 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
490 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
491 min_rt_quantum = (uint32_t)abstime;
492
493 /* maximum rt computation (50 ms) */
494 clock_interval_to_absolutetime_interval(
495 50, 1000*NSEC_PER_USEC, &abstime);
496 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
497 max_rt_quantum = (uint32_t)abstime;
498
499}
500
501#if defined(CONFIG_SCHED_TIMESHARE_CORE)
502
503/*
504 * Set up values for timeshare
505 * loading factors.
506 */
507static void
508load_shift_init(void)
509{
510 int8_t k, *p = sched_load_shifts;
511 uint32_t i, j;
512
513 uint32_t sched_decay_penalty = 1;
514
515 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
516 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
517 }
518
519 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
520 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
521 }
522
523 if (sched_decay_penalty == 0) {
524 /*
525 * There is no penalty for timeshare threads for using too much
526 * CPU, so set all load shifts to INT8_MIN. Even under high load,
527 * sched_pri_shift will be >INT8_MAX, and there will be no
528 * penalty applied to threads (nor will sched_usage be updated per
529 * thread).
530 */
531 for (i = 0; i < NRQS; i++) {
532 sched_load_shifts[i] = INT8_MIN;
533 }
534
535 return;
536 }
537
538 *p++ = INT8_MIN; *p++ = 0;
539
540 /*
541 * For a given system load "i", the per-thread priority
542 * penalty per quantum of CPU usage is ~2^k priority
543 * levels. "sched_decay_penalty" can cause more
544 * array entries to be filled with smaller "k" values
545 */
546 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
547 for (j <<= 1; (i < j) && (i < NRQS); ++i)
548 *p++ = k;
549 }
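 /*
  * Illustrative note (not in the original): with the default
  * sched_decay_penalty of 1, the loop above fills the table so that
  * sched_load_shifts[i] is roughly log2(i): indices 2-3 get 1, 4-7 get 2,
  * 8-15 get 3, and so on up to NRQS.
  */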
550}
551
552static void
553preempt_pri_init(void)
554{
555 bitmap_t *p = sched_preempt_pri;
556
557 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
558 bitmap_set(p, i);
559
560 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
561 bitmap_set(p, i);
562}
563
564#endif /* CONFIG_SCHED_TIMESHARE_CORE */
565
566/*
567 * Thread wait timer expiration.
568 */
569void
570thread_timer_expire(
571 void *p0,
572 __unused void *p1)
573{
574 thread_t thread = p0;
575 spl_t s;
576
577 assert_thread_magic(thread);
578
579 s = splsched();
580 thread_lock(thread);
581 if (--thread->wait_timer_active == 0) {
582 if (thread->wait_timer_is_set) {
583 thread->wait_timer_is_set = FALSE;
584 clear_wait_internal(thread, THREAD_TIMED_OUT);
585 }
586 }
587 thread_unlock(thread);
588 splx(s);
589}
590
591/*
592 * thread_unblock:
593 *
594 * Unblock thread on wake up.
595 *
596 * Returns TRUE if the thread should now be placed on the runqueue.
597 *
598 * Thread must be locked.
599 *
600 * Called at splsched().
601 */
602boolean_t
603thread_unblock(
604 thread_t thread,
605 wait_result_t wresult)
606{
607 boolean_t ready_for_runq = FALSE;
608 thread_t cthread = current_thread();
609 uint32_t new_run_count;
610
611 /*
612 * Set wait_result.
613 */
614 thread->wait_result = wresult;
615
616 /*
617 * Cancel pending wait timer.
618 */
619 if (thread->wait_timer_is_set) {
620 if (timer_call_cancel(&thread->wait_timer))
621 thread->wait_timer_active--;
622 thread->wait_timer_is_set = FALSE;
623 }
624
625 /*
626 * Update scheduling state: not waiting,
627 * set running.
628 */
629 thread->state &= ~(TH_WAIT|TH_UNINT);
630
631 if (!(thread->state & TH_RUN)) {
632 thread->state |= TH_RUN;
633 thread->last_made_runnable_time = mach_approximate_time();
634
635 ready_for_runq = TRUE;
636
637 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
638
639 /* Update the runnable thread count */
640 new_run_count = sched_run_incr(thread);
641 } else {
642 /*
643 * Either the thread is idling in place on another processor,
644 * or it hasn't finished context switching yet.
645 */
646#if CONFIG_SCHED_IDLE_IN_PLACE
647 if (thread->state & TH_IDLE) {
648 processor_t processor = thread->last_processor;
649
650 if (processor != current_processor())
651 machine_signal_idle(processor);
652 }
653#else
654 assert((thread->state & TH_IDLE) == 0);
655#endif
656 /*
657 * The run count is only dropped after the context switch completes
658 * and the thread is still waiting, so we should not run_incr here
659 */
660 new_run_count = sched_run_buckets[TH_BUCKET_RUN];
661 }
662
663
664 /*
665 * Calculate deadline for real-time threads.
666 */
667 if (thread->sched_mode == TH_MODE_REALTIME) {
668 uint64_t ctime;
669
670 ctime = mach_absolute_time();
671 thread->realtime.deadline = thread->realtime.constraint + ctime;
672 }
673
674 /*
675 * Clear old quantum, fail-safe computation, etc.
676 */
677 thread->quantum_remaining = 0;
678 thread->computation_metered = 0;
679 thread->reason = AST_NONE;
680 thread->block_hint = kThreadWaitNone;
681
682 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
683 * We also account for "double hop" thread signaling via
684 * the thread callout infrastructure.
 685 * DRK: consider removing the callout wakeup counters in the future;
686 * they're present for verification at the moment.
687 */
688 boolean_t aticontext, pidle;
689 ml_get_power_state(&aticontext, &pidle);
690
691 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
692 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
693 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
694
695 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
696
697 if (ttd) {
698 if (ttd <= timer_deadline_tracking_bin_1)
699 thread->thread_timer_wakeups_bin_1++;
700 else
701 if (ttd <= timer_deadline_tracking_bin_2)
702 thread->thread_timer_wakeups_bin_2++;
703 }
704
705 if (pidle) {
706 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
707 }
708
709 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
710 if (cthread->callout_woken_from_icontext) {
711 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
712 thread->thread_callout_interrupt_wakeups++;
713 if (cthread->callout_woken_from_platform_idle) {
714 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
715 thread->thread_callout_platform_idle_wakeups++;
716 }
717
718 cthread->callout_woke_thread = TRUE;
719 }
720 }
721
722 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
723 thread->callout_woken_from_icontext = aticontext;
724 thread->callout_woken_from_platform_idle = pidle;
725 thread->callout_woke_thread = FALSE;
726 }
727
728 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
729 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
730 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
731 sched_run_buckets[TH_BUCKET_RUN], 0);
732
733 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
734
735 return (ready_for_runq);
736}
737
738/*
739 * Routine: thread_go
740 * Purpose:
741 * Unblock and dispatch thread.
742 * Conditions:
743 * thread lock held, IPC locks may be held.
744 * thread must have been pulled from wait queue under same lock hold.
745 * thread must have been waiting
746 * Returns:
747 * KERN_SUCCESS - Thread was set running
748 *
749 * TODO: This should return void
750 */
751kern_return_t
752thread_go(
753 thread_t thread,
754 wait_result_t wresult)
755{
756 assert_thread_magic(thread);
757
758 assert(thread->at_safe_point == FALSE);
759 assert(thread->wait_event == NO_EVENT64);
760 assert(thread->waitq == NULL);
761
762 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
763 assert(thread->state & TH_WAIT);
764
765
766 if (thread_unblock(thread, wresult)) {
767#if SCHED_TRACE_THREAD_WAKEUPS
768 backtrace(&thread->thread_wakeup_bt[0],
769 (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
770#endif
771 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
772 }
773
774 return (KERN_SUCCESS);
775}
776
777/*
778 * Routine: thread_mark_wait_locked
779 * Purpose:
780 * Mark a thread as waiting. If, given the circumstances,
781 * it doesn't want to wait (i.e. already aborted), then
782 * indicate that in the return value.
783 * Conditions:
784 * at splsched() and thread is locked.
785 */
786__private_extern__
787wait_result_t
788thread_mark_wait_locked(
789 thread_t thread,
790 wait_interrupt_t interruptible)
791{
792 boolean_t at_safe_point;
793
794 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));
795
796 /*
797 * The thread may have certain types of interrupts/aborts masked
798 * off. Even if the wait location says these types of interrupts
799 * are OK, we have to honor mask settings (outer-scoped code may
800 * not be able to handle aborts at the moment).
801 */
802 if (interruptible > (thread->options & TH_OPT_INTMASK))
803 interruptible = thread->options & TH_OPT_INTMASK;
804
805 at_safe_point = (interruptible == THREAD_ABORTSAFE);
806
807 if ( interruptible == THREAD_UNINT ||
808 !(thread->sched_flags & TH_SFLAG_ABORT) ||
809 (!at_safe_point &&
810 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
811
812 if ( !(thread->state & TH_TERMINATE))
813 DTRACE_SCHED(sleep);
814
815 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
816 thread->at_safe_point = at_safe_point;
817
818 /* TODO: pass this through assert_wait instead, have
819 * assert_wait just take a struct as an argument */
820 assert(!thread->block_hint);
821 thread->block_hint = thread->pending_block_hint;
822 thread->pending_block_hint = kThreadWaitNone;
823
824 return (thread->wait_result = THREAD_WAITING);
825 }
826 else
827 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
828 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
829 thread->pending_block_hint = kThreadWaitNone;
830
831 return (thread->wait_result = THREAD_INTERRUPTED);
832}
833
834/*
835 * Routine: thread_interrupt_level
836 * Purpose:
837 * Set the maximum interruptible state for the
838 * current thread. The effective value of any
839 * interruptible flag passed into assert_wait
840 * will never exceed this.
841 *
842 * Useful for code that must not be interrupted,
843 * but which calls code that doesn't know that.
844 * Returns:
845 * The old interrupt level for the thread.
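 *
 * A minimal usage sketch (illustrative only):
 *
 *     wait_interrupt_t old = thread_interrupt_level(THREAD_UNINT);
 *     ... call code that might otherwise wait interruptibly ...
 *     thread_interrupt_level(old);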
846 */
847__private_extern__
848wait_interrupt_t
849thread_interrupt_level(
850 wait_interrupt_t new_level)
851{
852 thread_t thread = current_thread();
853 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
854
855 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
856
857 return result;
858}
859
860/*
861 * Check to see if an assert wait is possible, without actually doing one.
862 * This is used by debug code in locks and elsewhere to verify that it is
863 * always OK to block when trying to take a blocking lock (since waiting
864 * for the actual assert_wait to catch the case may make it hard to detect
 865 * this case).
866 */
867boolean_t
868assert_wait_possible(void)
869{
870
871 thread_t thread;
872
873#if DEBUG
874 if(debug_mode) return TRUE; /* Always succeed in debug mode */
875#endif
876
877 thread = current_thread();
878
879 return (thread == NULL || waitq_wait_possible(thread));
880}
881
882/*
883 * assert_wait:
884 *
885 * Assert that the current thread is about to go to
886 * sleep until the specified event occurs.
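 *
 * Typical usage (a sketch; the same pattern appears in thread_stop() and
 * thread_wait() below):
 *
 *     wait_result_t wr = assert_wait(&some_event, THREAD_UNINT);
 *     if (wr == THREAD_WAITING)
 *             wr = thread_block(THREAD_CONTINUE_NULL);
 *     ... the wake side later calls thread_wakeup(&some_event) ...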
887 */
888wait_result_t
889assert_wait(
890 event_t event,
891 wait_interrupt_t interruptible)
892{
893 if (__improbable(event == NO_EVENT))
894 panic("%s() called with NO_EVENT", __func__);
895
896 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
897 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
898 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
899
900 struct waitq *waitq;
901 waitq = global_eventq(event);
902 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
903}
904
905/*
906 * assert_wait_queue:
907 *
908 * Return the global waitq for the specified event
909 */
910struct waitq *
911assert_wait_queue(
912 event_t event)
913{
914 return global_eventq(event);
915}
916
917wait_result_t
918assert_wait_timeout(
919 event_t event,
920 wait_interrupt_t interruptible,
921 uint32_t interval,
922 uint32_t scale_factor)
923{
924 thread_t thread = current_thread();
925 wait_result_t wresult;
926 uint64_t deadline;
927 spl_t s;
928
929 if (__improbable(event == NO_EVENT))
930 panic("%s() called with NO_EVENT", __func__);
931
932 struct waitq *waitq;
933 waitq = global_eventq(event);
934
935 s = splsched();
936 waitq_lock(waitq);
937
938 clock_interval_to_deadline(interval, scale_factor, &deadline);
939
940 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
941 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
942 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
943
944 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
945 interruptible,
946 TIMEOUT_URGENCY_SYS_NORMAL,
947 deadline, TIMEOUT_NO_LEEWAY,
948 thread);
949
950 waitq_unlock(waitq);
951 splx(s);
952 return wresult;
953}
954
955wait_result_t
956assert_wait_timeout_with_leeway(
957 event_t event,
958 wait_interrupt_t interruptible,
959 wait_timeout_urgency_t urgency,
960 uint32_t interval,
961 uint32_t leeway,
962 uint32_t scale_factor)
963{
964 thread_t thread = current_thread();
965 wait_result_t wresult;
966 uint64_t deadline;
967 uint64_t abstime;
968 uint64_t slop;
969 uint64_t now;
970 spl_t s;
971
972 if (__improbable(event == NO_EVENT))
973 panic("%s() called with NO_EVENT", __func__);
974
975 now = mach_absolute_time();
976 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
977 deadline = now + abstime;
978
979 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
980
981 struct waitq *waitq;
982 waitq = global_eventq(event);
983
984 s = splsched();
985 waitq_lock(waitq);
986
987 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
988 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
989 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
990
991 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
992 interruptible,
993 urgency, deadline, slop,
994 thread);
995
996 waitq_unlock(waitq);
997 splx(s);
998 return wresult;
999}
1000
1001wait_result_t
1002assert_wait_deadline(
1003 event_t event,
1004 wait_interrupt_t interruptible,
1005 uint64_t deadline)
1006{
1007 thread_t thread = current_thread();
1008 wait_result_t wresult;
1009 spl_t s;
1010
1011 if (__improbable(event == NO_EVENT))
1012 panic("%s() called with NO_EVENT", __func__);
1013
1014 struct waitq *waitq;
1015 waitq = global_eventq(event);
1016
1017 s = splsched();
1018 waitq_lock(waitq);
1019
1020 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1021 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1022 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1023
1024 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1025 interruptible,
1026 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1027 TIMEOUT_NO_LEEWAY, thread);
1028 waitq_unlock(waitq);
1029 splx(s);
1030 return wresult;
1031}
1032
1033wait_result_t
1034assert_wait_deadline_with_leeway(
1035 event_t event,
1036 wait_interrupt_t interruptible,
1037 wait_timeout_urgency_t urgency,
1038 uint64_t deadline,
1039 uint64_t leeway)
1040{
1041 thread_t thread = current_thread();
1042 wait_result_t wresult;
1043 spl_t s;
1044
1045 if (__improbable(event == NO_EVENT))
1046 panic("%s() called with NO_EVENT", __func__);
1047
1048 struct waitq *waitq;
1049 waitq = global_eventq(event);
1050
1051 s = splsched();
1052 waitq_lock(waitq);
1053
1054 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1055 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1056 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1057
1058 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1059 interruptible,
1060 urgency, deadline, leeway,
1061 thread);
1062 waitq_unlock(waitq);
1063 splx(s);
1064 return wresult;
1065}
1066
1067/*
1068 * thread_isoncpu:
1069 *
1070 * Return TRUE if a thread is running on a processor such that an AST
1071 * is needed to pull it out of userspace execution, or if executing in
1072 * the kernel, bring to a context switch boundary that would cause
1073 * thread state to be serialized in the thread PCB.
1074 *
1075 * Thread locked, returns the same way. While locked, fields
1076 * like "state" cannot change. "runq" can change only from set to unset.
1077 */
1078static inline boolean_t
1079thread_isoncpu(thread_t thread)
1080{
1081 /* Not running or runnable */
1082 if (!(thread->state & TH_RUN))
1083 return (FALSE);
1084
1085 /* Waiting on a runqueue, not currently running */
1086 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1087 if (thread->runq != PROCESSOR_NULL)
1088 return (FALSE);
1089
1090 /*
1091 * Thread does not have a stack yet
1092 * It could be on the stack alloc queue or preparing to be invoked
1093 */
1094 if (!thread->kernel_stack)
1095 return (FALSE);
1096
1097 /*
1098 * Thread must be running on a processor, or
1099 * about to run, or just did run. In all these
1100 * cases, an AST to the processor is needed
1101 * to guarantee that the thread is kicked out
1102 * of userspace and the processor has
1103 * context switched (and saved register state).
1104 */
1105 return (TRUE);
1106}
1107
1108/*
1109 * thread_stop:
1110 *
1111 * Force a preemption point for a thread and wait
1112 * for it to stop running on a CPU. If a stronger
1113 * guarantee is requested, wait until no longer
1114 * runnable. Arbitrates access among
1115 * multiple stop requests. (released by unstop)
1116 *
1117 * The thread must enter a wait state and stop via a
1118 * separate means.
1119 *
1120 * Returns FALSE if interrupted.
1121 */
1122boolean_t
1123thread_stop(
1124 thread_t thread,
1125 boolean_t until_not_runnable)
1126{
1127 wait_result_t wresult;
1128 spl_t s = splsched();
1129 boolean_t oncpu;
1130
1131 wake_lock(thread);
1132 thread_lock(thread);
1133
1134 while (thread->state & TH_SUSP) {
1135 thread->wake_active = TRUE;
1136 thread_unlock(thread);
1137
1138 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1139 wake_unlock(thread);
1140 splx(s);
1141
1142 if (wresult == THREAD_WAITING)
1143 wresult = thread_block(THREAD_CONTINUE_NULL);
1144
1145 if (wresult != THREAD_AWAKENED)
1146 return (FALSE);
1147
1148 s = splsched();
1149 wake_lock(thread);
1150 thread_lock(thread);
1151 }
1152
1153 thread->state |= TH_SUSP;
1154
1155 while ((oncpu = thread_isoncpu(thread)) ||
1156 (until_not_runnable && (thread->state & TH_RUN))) {
1157 processor_t processor;
1158
1159 if (oncpu) {
1160 assert(thread->state & TH_RUN);
1161 processor = thread->chosen_processor;
1162 cause_ast_check(processor);
1163 }
1164
1165 thread->wake_active = TRUE;
1166 thread_unlock(thread);
1167
1168 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1169 wake_unlock(thread);
1170 splx(s);
1171
1172 if (wresult == THREAD_WAITING)
1173 wresult = thread_block(THREAD_CONTINUE_NULL);
1174
1175 if (wresult != THREAD_AWAKENED) {
1176 thread_unstop(thread);
1177 return (FALSE);
1178 }
1179
1180 s = splsched();
1181 wake_lock(thread);
1182 thread_lock(thread);
1183 }
1184
1185 thread_unlock(thread);
1186 wake_unlock(thread);
1187 splx(s);
1188
1189 /*
1190 * We return with the thread unlocked. To prevent it from
1191 * transitioning to a runnable state (or from TH_RUN to
1192 * being on the CPU), the caller must ensure the thread
1193 * is stopped via an external means (such as an AST)
1194 */
1195
1196 return (TRUE);
1197}
1198
1199/*
1200 * thread_unstop:
1201 *
1202 * Release a previous stop request and set
1203 * the thread running if appropriate.
1204 *
1205 * Use only after a successful stop operation.
1206 */
1207void
1208thread_unstop(
1209 thread_t thread)
1210{
1211 spl_t s = splsched();
1212
1213 wake_lock(thread);
1214 thread_lock(thread);
1215
1216 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
1217
1218 if (thread->state & TH_SUSP) {
1219 thread->state &= ~TH_SUSP;
1220
1221 if (thread->wake_active) {
1222 thread->wake_active = FALSE;
1223 thread_unlock(thread);
1224
1225 thread_wakeup(&thread->wake_active);
1226 wake_unlock(thread);
1227 splx(s);
1228
1229 return;
1230 }
1231 }
1232
1233 thread_unlock(thread);
1234 wake_unlock(thread);
1235 splx(s);
1236}
1237
1238/*
1239 * thread_wait:
1240 *
1241 * Wait for a thread to stop running. (non-interruptible)
1242 *
1243 */
1244void
1245thread_wait(
1246 thread_t thread,
1247 boolean_t until_not_runnable)
1248{
1249 wait_result_t wresult;
1250 boolean_t oncpu;
1251 processor_t processor;
1252 spl_t s = splsched();
1253
1254 wake_lock(thread);
1255 thread_lock(thread);
1256
1257 /*
1258 * Wait until not running on a CPU. If stronger requirement
1259 * desired, wait until not runnable. Assumption: if thread is
1260 * on CPU, then TH_RUN is set, so we're not waiting in any case
1261 * where the original, pure "TH_RUN" check would have let us
1262 * finish.
1263 */
1264 while ((oncpu = thread_isoncpu(thread)) ||
1265 (until_not_runnable && (thread->state & TH_RUN))) {
1266
1267 if (oncpu) {
1268 assert(thread->state & TH_RUN);
1269 processor = thread->chosen_processor;
1270 cause_ast_check(processor);
1271 }
1272
1273 thread->wake_active = TRUE;
1274 thread_unlock(thread);
1275
1276 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1277 wake_unlock(thread);
1278 splx(s);
1279
1280 if (wresult == THREAD_WAITING)
1281 thread_block(THREAD_CONTINUE_NULL);
1282
1283 s = splsched();
1284 wake_lock(thread);
1285 thread_lock(thread);
1286 }
1287
1288 thread_unlock(thread);
1289 wake_unlock(thread);
1290 splx(s);
1291}
1292
1293/*
1294 * Routine: clear_wait_internal
1295 *
1296 * Clear the wait condition for the specified thread.
1297 * Start the thread executing if that is appropriate.
1298 * Arguments:
1299 * thread thread to awaken
1300 * result Wakeup result the thread should see
1301 * Conditions:
1302 * At splsched
1303 * the thread is locked.
1304 * Returns:
1305 * KERN_SUCCESS thread was rousted out a wait
1306 * KERN_FAILURE thread was waiting but could not be rousted
1307 * KERN_NOT_WAITING thread was not waiting
1308 */
1309__private_extern__ kern_return_t
1310clear_wait_internal(
1311 thread_t thread,
1312 wait_result_t wresult)
1313{
1314 uint32_t i = LockTimeOutUsec;
1315 struct waitq *waitq = thread->waitq;
1316
1317 do {
1318 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1319 return (KERN_FAILURE);
1320
1321 if (waitq != NULL) {
1322 if (!waitq_pull_thread_locked(waitq, thread)) {
1323 thread_unlock(thread);
1324 delay(1);
1325 if (i > 0 && !machine_timeout_suspended())
1326 i--;
1327 thread_lock(thread);
1328 if (waitq != thread->waitq)
1329 return KERN_NOT_WAITING;
1330 continue;
1331 }
1332 }
1333
1334 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1335 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1336 return (thread_go(thread, wresult));
1337 else
1338 return (KERN_NOT_WAITING);
1339 } while (i > 0);
1340
1341 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1342 thread, waitq, cpu_number());
1343
1344 return (KERN_FAILURE);
1345}
1346
1347
1348/*
1349 * clear_wait:
1350 *
1351 * Clear the wait condition for the specified thread. Start the thread
1352 * executing if that is appropriate.
1353 *
1354 * parameters:
1355 * thread thread to awaken
1356 * result Wakeup result the thread should see
1357 */
1358kern_return_t
1359clear_wait(
1360 thread_t thread,
1361 wait_result_t result)
1362{
1363 kern_return_t ret;
1364 spl_t s;
1365
1366 s = splsched();
1367 thread_lock(thread);
1368 ret = clear_wait_internal(thread, result);
1369 thread_unlock(thread);
1370 splx(s);
1371 return ret;
1372}
1373
1374
1375/*
1376 * thread_wakeup_prim:
1377 *
1378 * Common routine for thread_wakeup, thread_wakeup_with_result,
1379 * and thread_wakeup_one.
1380 *
1381 */
1382kern_return_t
1383thread_wakeup_prim(
1384 event_t event,
1385 boolean_t one_thread,
1386 wait_result_t result)
1387{
1388 if (__improbable(event == NO_EVENT))
1389 panic("%s() called with NO_EVENT", __func__);
1390
1391 struct waitq *wq = global_eventq(event);
1392
1393 if (one_thread)
1394 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1395 else
1396 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1397}
1398
1399/*
1400 * Wakeup a specified thread if and only if it's waiting for this event
1401 */
1402kern_return_t
1403thread_wakeup_thread(
1404 event_t event,
1405 thread_t thread)
1406{
1407 if (__improbable(event == NO_EVENT))
1408 panic("%s() called with NO_EVENT", __func__);
1409
1410 struct waitq *wq = global_eventq(event);
1411
1412 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1413}
1414
1415/*
1416 * Wakeup a thread waiting on an event and promote it to a priority.
1417 *
1418 * Requires woken thread to un-promote itself when done.
1419 */
1420kern_return_t
1421thread_wakeup_one_with_pri(
1422 event_t event,
1423 int priority)
1424{
1425 if (__improbable(event == NO_EVENT))
1426 panic("%s() called with NO_EVENT", __func__);
1427
1428 struct waitq *wq = global_eventq(event);
1429
1430 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1431}
1432
1433/*
1434 * Wakeup a thread waiting on an event,
1435 * promote it to a priority,
1436 * and return a reference to the woken thread.
1437 *
1438 * Requires woken thread to un-promote itself when done.
1439 */
1440thread_t
1441thread_wakeup_identify(event_t event,
1442 int priority)
1443{
1444 if (__improbable(event == NO_EVENT))
1445 panic("%s() called with NO_EVENT", __func__);
1446
1447 struct waitq *wq = global_eventq(event);
1448
1449 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1450}
1451
1452/*
1453 * thread_bind:
1454 *
1455 * Force the current thread to execute on the specified processor.
1456 * Takes effect after the next thread_block().
1457 *
1458 * Returns the previous binding. PROCESSOR_NULL means
1459 * not bound.
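 *
 * Usage note: as thread_vm_bind_group_add() below illustrates, callers
 * typically bind and then call thread_block() so that the new binding,
 * which only takes effect at the next dispatch, actually migrates the
 * thread.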
1460 *
1461 * XXX - DO NOT export this to users - XXX
1462 */
1463processor_t
1464thread_bind(
1465 processor_t processor)
1466{
1467 thread_t self = current_thread();
1468 processor_t prev;
1469 spl_t s;
1470
1471 s = splsched();
1472 thread_lock(self);
1473
1474 prev = thread_bind_internal(self, processor);
1475
1476 thread_unlock(self);
1477 splx(s);
1478
1479 return (prev);
1480}
1481
1482/*
1483 * thread_bind_internal:
1484 *
1485 * If the specified thread is not the current thread, and it is currently
1486 * running on another CPU, a remote AST must be sent to that CPU to cause
1487 * the thread to migrate to its bound processor. Otherwise, the migration
1488 * will occur at the next quantum expiration or blocking point.
1489 *
 1490 * When the thread is the current thread, an explicit thread_block() should
1491 * be used to force the current processor to context switch away and
1492 * let the thread migrate to the bound processor.
1493 *
1494 * Thread must be locked, and at splsched.
1495 */
1496
1497static processor_t
1498thread_bind_internal(
1499 thread_t thread,
1500 processor_t processor)
1501{
1502 processor_t prev;
1503
1504 /* <rdar://problem/15102234> */
1505 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1506 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1507 assert(thread->runq == PROCESSOR_NULL);
1508
1509 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1510
1511 prev = thread->bound_processor;
1512 thread->bound_processor = processor;
1513
1514 return (prev);
1515}
1516
1517/*
1518 * thread_vm_bind_group_add:
1519 *
1520 * The "VM bind group" is a special mechanism to mark a collection
1521 * of threads from the VM subsystem that, in general, should be scheduled
1522 * with only one CPU of parallelism. To accomplish this, we initially
1523 * bind all the threads to the master processor, which has the effect
1524 * that only one of the threads in the group can execute at once, including
1525 * preempting threads in the group that are a lower priority. Future
1526 * mechanisms may use more dynamic mechanisms to prevent the collection
1527 * of VM threads from using more CPU time than desired.
1528 *
1529 * The current implementation can result in priority inversions where
1530 * compute-bound priority 95 or realtime threads that happen to have
1531 * landed on the master processor prevent the VM threads from running.
1532 * When this situation is detected, we unbind the threads for one
 1533 * scheduler tick to allow the scheduler to run the threads on
1534 * additional CPUs, before restoring the binding (assuming high latency
1535 * is no longer a problem).
1536 */
1537
1538/*
1539 * The current max is provisioned for:
1540 * vm_compressor_swap_trigger_thread (92)
1541 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1542 * vm_pageout_continue (92)
1543 * memorystatus_thread (95)
1544 */
1545#define MAX_VM_BIND_GROUP_COUNT (5)
1546decl_simple_lock_data(static,sched_vm_group_list_lock);
1547static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1548static int sched_vm_group_thread_count;
1549static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1550
1551void
1552thread_vm_bind_group_add(void)
1553{
1554 thread_t self = current_thread();
1555
1556 thread_reference_internal(self);
1557 self->options |= TH_OPT_SCHED_VM_GROUP;
1558
1559 simple_lock(&sched_vm_group_list_lock);
1560 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1561 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1562 simple_unlock(&sched_vm_group_list_lock);
1563
1564 thread_bind(master_processor);
1565
1566 /* Switch to bound processor if not already there */
1567 thread_block(THREAD_CONTINUE_NULL);
1568}
1569
1570static void
1571sched_vm_group_maintenance(void)
1572{
1573 uint64_t ctime = mach_absolute_time();
1574 uint64_t longtime = ctime - sched_tick_interval;
1575 int i;
1576 spl_t s;
1577 boolean_t high_latency_observed = FALSE;
1578 boolean_t runnable_and_not_on_runq_observed = FALSE;
1579 boolean_t bind_target_changed = FALSE;
1580 processor_t bind_target = PROCESSOR_NULL;
1581
1582 /* Make sure nobody attempts to add new threads while we are enumerating them */
1583 simple_lock(&sched_vm_group_list_lock);
1584
1585 s = splsched();
1586
1587 for (i=0; i < sched_vm_group_thread_count; i++) {
1588 thread_t thread = sched_vm_group_thread_list[i];
1589 assert(thread != THREAD_NULL);
1590 thread_lock(thread);
1591 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1592 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1593 high_latency_observed = TRUE;
1594 } else if (thread->runq == PROCESSOR_NULL) {
 1595 /* There are some cases where a thread may be transitioning that also fall into this case */
1596 runnable_and_not_on_runq_observed = TRUE;
1597 }
1598 }
1599 thread_unlock(thread);
1600
1601 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1602 /* All the things we are looking for are true, stop looking */
1603 break;
1604 }
1605 }
1606
1607 splx(s);
1608
1609 if (sched_vm_group_temporarily_unbound) {
1610 /* If we turned off binding, make sure everything is OK before rebinding */
1611 if (!high_latency_observed) {
1612 /* rebind */
1613 bind_target_changed = TRUE;
1614 bind_target = master_processor;
1615 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1616 }
1617 } else {
1618 /*
1619 * Check if we're in a bad state, which is defined by high
1620 * latency with no core currently executing a thread. If a
1621 * single thread is making progress on a CPU, that means the
1622 * binding concept to reduce parallelism is working as
1623 * designed.
1624 */
1625 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1626 /* unbind */
1627 bind_target_changed = TRUE;
1628 bind_target = PROCESSOR_NULL;
1629 sched_vm_group_temporarily_unbound = TRUE;
1630 }
1631 }
1632
1633 if (bind_target_changed) {
1634 s = splsched();
1635 for (i=0; i < sched_vm_group_thread_count; i++) {
1636 thread_t thread = sched_vm_group_thread_list[i];
1637 boolean_t removed;
1638 assert(thread != THREAD_NULL);
1639
1640 thread_lock(thread);
1641 removed = thread_run_queue_remove(thread);
1642 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1643 thread_bind_internal(thread, bind_target);
1644 } else {
1645 /*
1646 * Thread was in the middle of being context-switched-to,
1647 * or was in the process of blocking. To avoid switching the bind
1648 * state out mid-flight, defer the change if possible.
1649 */
1650 if (bind_target == PROCESSOR_NULL) {
1651 thread_bind_internal(thread, bind_target);
1652 } else {
1653 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1654 }
1655 }
1656
1657 if (removed) {
1658 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1659 }
1660 thread_unlock(thread);
1661 }
1662 splx(s);
1663 }
1664
1665 simple_unlock(&sched_vm_group_list_lock);
1666}
1667
1668/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1669 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1670 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
 1671 * IPI thrash if this core does not remain idle following the load balancing ASTs;
1672 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1673 * followed by a wakeup shortly thereafter.
1674 */
1675
1676#if (DEVELOPMENT || DEBUG)
1677int sched_smt_balance = 1;
1678#endif
1679
1680#if __SMP__
1681/* Invoked with pset locked, returns with pset unlocked */
1682static void
1683sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1684 processor_t ast_processor = NULL;
1685
1686#if (DEVELOPMENT || DEBUG)
1687 if (__improbable(sched_smt_balance == 0))
1688 goto smt_balance_exit;
1689#endif
1690
1691 assert(cprocessor == current_processor());
1692 if (cprocessor->is_SMT == FALSE)
1693 goto smt_balance_exit;
1694
1695 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1696
1697 /* Determine if both this processor and its sibling are idle,
1698 * indicating an SMT rebalancing opportunity.
1699 */
1700 if (sib_processor->state != PROCESSOR_IDLE)
1701 goto smt_balance_exit;
1702
1703 processor_t sprocessor;
1704
1705 qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) {
1706 if ((sprocessor->state == PROCESSOR_RUNNING) &&
1707 (sprocessor->processor_primary != sprocessor) &&
1708 (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1709 (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
1710 ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) {
1711 assert(sprocessor != cprocessor);
1712 ast_processor = sprocessor;
1713 break;
1714 }
1715 }
1716
1717smt_balance_exit:
1718 pset_unlock(cpset);
1719
1720 if (ast_processor) {
1721 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1722 cause_ast_check(ast_processor);
1723 }
1724}
1725#endif /* __SMP__ */
1726
1727/*
1728 * thread_select:
1729 *
1730 * Select a new thread for the current processor to execute.
1731 *
1732 * May select the current thread, which must be locked.
1733 */
1734static thread_t
1735thread_select(
1736 thread_t thread,
1737 processor_t processor,
1738 ast_t reason)
1739{
1740 processor_set_t pset = processor->processor_set;
1741 thread_t new_thread = THREAD_NULL;
1742
1743 assert(processor == current_processor());
1744 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
1745
1746 do {
1747 /*
1748 * Update the priority.
1749 */
1750 if (SCHED(can_update_priority)(thread))
1751 SCHED(update_priority)(thread);
1752
1753 processor->current_pri = thread->sched_pri;
1754 processor->current_thmode = thread->sched_mode;
1755 processor->current_sfi_class = thread->sfi_class;
1756
1757 pset_lock(pset);
1758
1759 assert(processor->state != PROCESSOR_OFF_LINE);
1760
1761 if (!processor->is_recommended) {
1762 /*
1763 * The performance controller has provided a hint to not dispatch more threads,
 1764 * unless they are bound to us (and thus we are the only option).
1765 */
1766 if (!SCHED(processor_bound_count)(processor)) {
1767 goto idle;
1768 }
1769 } else if (processor->processor_primary != processor) {
1770 /*
1771 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1772 * we should look for work only under the same conditions that choose_processor()
1773 * would have assigned work, which is when all primary processors have been assigned work.
1774 *
1775 * An exception is that bound threads are dispatched to a processor without going through
1776 * choose_processor(), so in those cases we should continue trying to dequeue work.
1777 */
1778 if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
1779 goto idle;
1780 }
1781 }
1782
1783 rt_lock_lock();
1784
1785 /*
1786 * Test to see if the current thread should continue
1787 * to run on this processor. Must not be attempting to wait, and not
1788 * bound to a different processor, nor be in the wrong
1789 * processor set, nor be forced to context switch by TH_SUSP.
1790 *
1791 * Note that there are never any RT threads in the regular runqueue.
1792 *
1793 * This code is very insanely tricky.
1794 */
1795
1796 if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) &&
1797 (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) &&
1798 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) &&
1799 (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
1800 /*
1801 * RT threads with un-expired quantum stay on processor,
1802 * unless there's a valid RT thread with an earlier deadline.
1803 */
1804 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
1805 if (rt_runq.count > 0) {
1806 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
1807
1808 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1809
1810 if (next_rt->realtime.deadline < processor->deadline &&
1811 (next_rt->bound_processor == PROCESSOR_NULL ||
1812 next_rt->bound_processor == processor)) {
1813 /* The next RT thread is better, so pick it off the runqueue. */
1814 goto pick_new_rt_thread;
1815 }
1816 }
1817
1818 /* This is still the best RT thread to run. */
1819 processor->deadline = thread->realtime.deadline;
1820
1821 rt_lock_unlock();
1822 pset_unlock(pset);
1823
1824 return (thread);
1825 }
1826
1827 if ((rt_runq.count == 0) &&
1828 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
1829 /* This thread is still the highest priority runnable (non-idle) thread */
1830 processor->deadline = UINT64_MAX;
1831
1832 rt_lock_unlock();
1833 pset_unlock(pset);
1834
1835 return (thread);
1836 }
1837 }
1838
1839 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1840 if (rt_runq.count > 0) {
1841 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
1842
1843 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1844
1845 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1846 (next_rt->bound_processor == processor)))) {
1847pick_new_rt_thread:
1848 new_thread = qe_dequeue_head(&rt_runq.queue, struct thread, runq_links);
1849
1850 new_thread->runq = PROCESSOR_NULL;
1851 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1852 rt_runq.count--;
1853
1854 processor->deadline = new_thread->realtime.deadline;
1855
1856 rt_lock_unlock();
1857 pset_unlock(pset);
1858
1859 return (new_thread);
1860 }
1861 }
1862
1863 processor->deadline = UINT64_MAX;
1864 rt_lock_unlock();
1865
1866 /* No RT threads, so let's look at the regular threads. */
1867 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
1868 pset_unlock(pset);
1869 return (new_thread);
1870 }
1871
1872#if __SMP__
1873 if (SCHED(steal_thread_enabled)) {
1874 /*
1875 * No runnable threads, attempt to steal
1876 * from other processors. Returns with pset lock dropped.
1877 */
1878
1879 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
1880 return (new_thread);
1881 }
1882
1883 /*
1884 * If other threads have appeared, shortcut
1885 * around again.
1886 */
1887 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0)
1888 continue;
1889
1890 pset_lock(pset);
1891 }
1892#endif
1893
1894 idle:
1895 /*
1896 * Nothing is runnable, so set this processor idle if it
1897 * was running.
1898 */
1899 if (processor->state == PROCESSOR_RUNNING) {
1900 processor->state = PROCESSOR_IDLE;
1901
1902 if (processor->processor_primary == processor) {
1903 re_queue_head(&pset->idle_queue, &processor->processor_queue);
1904 } else {
1905 re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
1906 }
1907 }
1908
1909#if __SMP__
1910 /* Invoked with pset locked, returns with pset unlocked */
1911 sched_SMT_balance(processor, pset);
1912#else
1913 pset_unlock(pset);
1914#endif
1915
1916#if CONFIG_SCHED_IDLE_IN_PLACE
1917 /*
1918 * Choose idle thread if fast idle is not possible.
1919 */
1920 if (processor->processor_primary != processor)
1921 return (processor->idle_thread);
1922
1923 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1924 return (processor->idle_thread);
1925
1926 /*
1927 * Perform idling activities directly without a
1928 * context switch. Return dispatched thread,
1929 * else check again for a runnable thread.
1930 */
1931 new_thread = thread_select_idle(thread, processor);
1932
1933#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1934
1935 /*
1936 * Do a full context switch to idle so that the current
1937 * thread can start running on another processor without
1938 * waiting for the fast-idled processor to wake up.
1939 */
1940 new_thread = processor->idle_thread;
1941
1942#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1943
1944 } while (new_thread == THREAD_NULL);
1945
1946 return (new_thread);
1947}
1948
1949#if CONFIG_SCHED_IDLE_IN_PLACE
1950/*
1951 * thread_select_idle:
1952 *
1953 * Idle the processor using the current thread context.
1954 *
1955 * Called with thread locked, then dropped and relocked.
1956 */
1957static thread_t
1958thread_select_idle(
1959 thread_t thread,
1960 processor_t processor)
1961{
1962 thread_t new_thread;
1963 uint64_t arg1, arg2;
1964 int urgency;
1965
1966 sched_run_decr(thread);
1967
1968 thread->state |= TH_IDLE;
1969 processor->current_pri = IDLEPRI;
1970 processor->current_thmode = TH_MODE_NONE;
1971 processor->current_sfi_class = SFI_CLASS_KERNEL;
1972
1973 /* Reload precise timing global policy to thread-local policy */
1974 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1975
1976 thread_unlock(thread);
1977
1978 /*
1979 * Switch execution timing to processor idle thread.
1980 */
1981 processor->last_dispatch = mach_absolute_time();
1982
1983#ifdef CONFIG_MACH_APPROXIMATE_TIME
1984 commpage_update_mach_approximate_time(processor->last_dispatch);
1985#endif
1986
1987 thread->last_run_time = processor->last_dispatch;
1988 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1989 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1990
1991 /*
1992 * Cancel the quantum timer while idling.
1993 */
1994 timer_call_cancel(&processor->quantum_timer);
1995 processor->first_timeslice = FALSE;
1996
1997 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1998
1999 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
2000
2001 /*
2002 * Enable interrupts and perform idling activities. No
2003 * preemption due to TH_IDLE being set.
2004 */
2005 spllo(); new_thread = processor_idle(thread, processor);
2006
2007 /*
2008 * Return at splsched.
2009 */
2010 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2011
2012 thread_lock(thread);
2013
2014 /*
2015 * If awakened, switch to thread timer and start a new quantum.
2016 * Otherwise skip; we will context switch to another thread or return here.
2017 */
2018 if (!(thread->state & TH_WAIT)) {
2019 processor->last_dispatch = mach_absolute_time();
2020 thread_timer_event(processor->last_dispatch, &thread->system_timer);
2021 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2022
2023 thread_quantum_init(thread);
2024 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2025 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2026 processor->first_timeslice = TRUE;
2027
2028 thread->computation_epoch = processor->last_dispatch;
2029 }
2030
2031 thread->state &= ~TH_IDLE;
2032
2033 urgency = thread_get_urgency(thread, &arg1, &arg2);
2034
2035 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
2036
2037 sched_run_incr(thread);
2038
2039 return (new_thread);
2040}
2041#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2042
2043/*
2044 * thread_invoke
2045 *
2046 * Called at splsched with neither thread locked.
2047 *
2048 * Perform a context switch and start executing the new thread.
2049 *
2050 * Returns FALSE when the context switch didn't happen.
2051 * The reference to the new thread is still consumed.
2052 *
2053 * "self" is what is currently running on the processor,
2054 * "thread" is the new thread to context switch to
2055 * (which may be the same thread in some cases)
2056 */
2057static boolean_t
2058thread_invoke(
2059 thread_t self,
2060 thread_t thread,
2061 ast_t reason)
2062{
2063 if (__improbable(get_preemption_level() != 0)) {
2064 int pl = get_preemption_level();
2065 panic("thread_invoke: preemption_level %d, possible cause: %s",
2066 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2067 "blocking while holding a spinlock, or within interrupt context"));
2068 }
2069
2070 thread_continue_t continuation = self->continuation;
2071 void *parameter = self->parameter;
2072 processor_t processor;
2073
2074 uint64_t ctime = mach_absolute_time();
2075
2076#ifdef CONFIG_MACH_APPROXIMATE_TIME
2077 commpage_update_mach_approximate_time(ctime);
2078#endif
2079
2080#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2081 sched_timeshare_consider_maintenance(ctime);
2082#endif
2083
2084 assert_thread_magic(self);
2085 assert(self == current_thread());
2086 assert(self->runq == PROCESSOR_NULL);
2087 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
2088
2089 thread_lock(thread);
2090
2091 assert_thread_magic(thread);
2092 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
2093 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2094 assert(thread->runq == PROCESSOR_NULL);
2095
2096 /* Reload precise timing global policy to thread-local policy */
2097 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2098
2099 /* Update SFI class based on other factors */
2100 thread->sfi_class = sfi_thread_classify(thread);
2101
2102 /* Allow realtime threads to hang onto a stack. */
2103 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2104 self->reserved_stack = self->kernel_stack;
2105
2106 if (continuation != NULL) {
2107 if (!thread->kernel_stack) {
2108 /*
2109 * If we are using a privileged stack,
2110 * check to see whether we can exchange it with
2111 * that of the other thread.
2112 */
2113 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2114 goto need_stack;
2115
2116 /*
2117 * Context switch by performing a stack handoff.
2118 */
2119 continuation = thread->continuation;
2120 parameter = thread->parameter;
2121
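			/* Make this processor's bookkeeping reflect the incoming thread. */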
2122 processor = current_processor();
2123 processor->active_thread = thread;
2124 processor->current_pri = thread->sched_pri;
2125 processor->current_thmode = thread->sched_mode;
2126 processor->current_sfi_class = thread->sfi_class;
2127 if (thread->last_processor != processor && thread->last_processor != NULL) {
2128 if (thread->last_processor->processor_set != processor->processor_set)
2129 thread->ps_switch++;
2130 thread->p_switch++;
2131 }
2132 thread->last_processor = processor;
2133 thread->c_switch++;
2134 ast_context(thread);
2135
2136 thread_unlock(thread);
2137
2138 self->reason = reason;
2139
2140 processor->last_dispatch = ctime;
2141 self->last_run_time = ctime;
2142 thread_timer_event(ctime, &thread->system_timer);
2143 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2144
2145 /*
2146 * Since non-precise user/kernel time doesn't update the state timer
2147 * during privilege transitions, synthesize an event now.
2148 */
2149 if (!thread->precise_user_kernel_time) {
2150 timer_switch(PROCESSOR_DATA(processor, current_state),
2151 ctime,
2152 PROCESSOR_DATA(processor, current_state));
2153 }
2154
2155 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2156 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2157 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2158
2159 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2160 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2161 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2162 }
2163
2164 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2165
2166 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2167
2168 TLOG(1, "thread_invoke: calling stack_handoff\n");
2169 stack_handoff(self, thread);
2170
2171 /* 'self' is now off core */
2172 assert(thread == current_thread());
2173
2174 DTRACE_SCHED(on__cpu);
2175
2176#if KPERF
2177 kperf_on_cpu(thread, continuation, NULL);
2178#endif /* KPERF */
2179
2180 thread_dispatch(self, thread);
2181
2182 thread->continuation = thread->parameter = NULL;
2183
2184 counter(c_thread_invoke_hits++);
2185
2186 (void) spllo();
2187
2188 assert(continuation);
2189 call_continuation(continuation, parameter, thread->wait_result);
2190 /*NOTREACHED*/
2191 }
2192 else if (thread == self) {
2193 /* same thread but with continuation */
2194 ast_context(self);
2195 counter(++c_thread_invoke_same);
2196
2197 thread_unlock(self);
2198
2199#if KPERF
2200 kperf_on_cpu(thread, continuation, NULL);
2201#endif /* KPERF */
2202
2203 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2204 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2205 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2206
2207 self->continuation = self->parameter = NULL;
2208
2209 (void) spllo();
2210
2211 call_continuation(continuation, parameter, self->wait_result);
2212 /*NOTREACHED*/
2213 }
2214 } else {
2215 /*
2216 * Check that the other thread has a stack
2217 */
2218 if (!thread->kernel_stack) {
2219need_stack:
2220 if (!stack_alloc_try(thread)) {
2221 counter(c_thread_invoke_misses++);
2222 thread_unlock(thread);
2223 thread_stack_enqueue(thread);
2224 return (FALSE);
2225 }
2226 } else if (thread == self) {
2227 ast_context(self);
2228 counter(++c_thread_invoke_same);
2229 thread_unlock(self);
2230
2231 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2232 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2233 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2234
2235 return (TRUE);
2236 }
2237 }
2238
2239 /*
2240 * Context switch by full context save.
2241 */
2242 processor = current_processor();
2243 processor->active_thread = thread;
2244 processor->current_pri = thread->sched_pri;
2245 processor->current_thmode = thread->sched_mode;
2246 processor->current_sfi_class = thread->sfi_class;
2247 if (thread->last_processor != processor && thread->last_processor != NULL) {
2248 if (thread->last_processor->processor_set != processor->processor_set)
2249 thread->ps_switch++;
2250 thread->p_switch++;
2251 }
2252 thread->last_processor = processor;
2253 thread->c_switch++;
2254 ast_context(thread);
2255
2256 thread_unlock(thread);
2257
2258 counter(c_thread_invoke_csw++);
2259
2260 self->reason = reason;
2261
2262 processor->last_dispatch = ctime;
2263 self->last_run_time = ctime;
2264 thread_timer_event(ctime, &thread->system_timer);
2265 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2266
2267 /*
2268 * Since non-precise user/kernel time doesn't update the state timer
2269 * during privilege transitions, synthesize an event now.
2270 */
2271 if (!thread->precise_user_kernel_time) {
2272 timer_switch(PROCESSOR_DATA(processor, current_state),
2273 ctime,
2274 PROCESSOR_DATA(processor, current_state));
2275 }
2276
2277 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2278 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2279 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2280
2281 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2282 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2283 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2284 }
2285
2286 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2287
2288 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2289
2290 /*
2291 * This is where we actually switch register context,
2292 * and address space if required. We will next run
2293 * as a result of a subsequent context switch.
2294 *
2295 * Once registers are switched and the processor is running "thread",
2296 * the stack variables and non-volatile registers will contain whatever
2297 * was there the last time that thread blocked. No local variables should
2298 * be used after this point, except for the special case of "thread", which
2299 * the platform layer returns as the previous thread running on the processor
2300 * via the function call ABI as a return register, and "self", which may have
2301 * been stored on the stack or in a non-volatile register. That saved value is
2302 * a stale notion of which thread was on the CPU, but it is accurate again
2303 * because the same thread is once more running on the CPU.
2304 */
2305 assert(continuation == self->continuation);
2306 thread = machine_switch_context(self, continuation, thread);
2307 assert(self == current_thread());
2308 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2309
2310 DTRACE_SCHED(on__cpu);
2311
2312#if KPERF
2313 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
2314#endif /* KPERF */
2315
2316 /*
2317 * We have been resumed and are set to run.
2318 */
2319 thread_dispatch(thread, self);
2320
2321 if (continuation) {
2322 self->continuation = self->parameter = NULL;
2323
2324 (void) spllo();
2325
2326 call_continuation(continuation, parameter, self->wait_result);
2327 /*NOTREACHED*/
2328 }
2329
2330 return (TRUE);
2331}
2332
2333#if defined(CONFIG_SCHED_DEFERRED_AST)
2334/*
2335 * pset_cancel_deferred_dispatch:
2336 *
2337 * Cancels all ASTs that we can cancel for the given processor set
2338 * if the current processor is running the last runnable thread in the
2339 * system.
2340 *
2341 * This function assumes the current thread is runnable. This must
2342 * be called with the pset unlocked.
2343 */
2344static void
2345pset_cancel_deferred_dispatch(
2346 processor_set_t pset,
2347 processor_t processor)
2348{
2349 processor_t active_processor = NULL;
2350 uint32_t sampled_sched_run_count;
2351
2352 pset_lock(pset);
2353 sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
2354
2355 /*
2356 * If we have emptied the run queue, and our current thread is runnable, we
2357 * should tell any processors that are still DISPATCHING that they will
2358 * probably not have any work to do. In the event that there are no
2359 * pending signals that we can cancel, this is also uninteresting.
2360 *
2361 * In the unlikely event that another thread becomes runnable while we are
2362 * doing this (sched_run_count is atomically updated, not guarded), the
2363 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2364 * in order to dispatch it to a processor in our pset. So, the other
2365 * codepath will wait while we squash all cancelable ASTs, get the pset
2366 * lock, and then dispatch the freshly runnable thread. So this should be
2367 * correct (we won't accidentally have a runnable thread that hasn't been
2368 * dispatched to an idle processor), if not ideal (we may be restarting the
2369 * dispatch process, which could have some overhead).
2370 *
2371 */
2372 if ((sampled_sched_run_count == 1) &&
2373 (pset->pending_deferred_AST_cpu_mask)) {
2374 qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
2375 /*
2376 * If a processor is DISPATCHING, it could be because of
2377 * a cancelable signal.
2378 *
2379 * IF the processor is not our
2380 * current processor (the current processor should not
2381 * be DISPATCHING, so this is a bit paranoid), AND there
2382 * is a cancelable signal pending on the processor, AND
2383 * there is no non-cancelable signal pending (as there is
2384 * no point trying to backtrack on bringing the processor
2385 * up if a signal we cannot cancel is outstanding), THEN
2386 * it should make sense to roll back the processor state
2387 * to the IDLE state.
2388 *
2389 * If the racy nature of this approach (as the signal
2390 * will be arbitrated by hardware, and can fire as we
2391 * roll back state) results in the core responding
2392 * despite being pushed back to the IDLE state, it
2393 * should be no different than if the core took some
2394 * interrupt while IDLE.
2395 */
2396 if ((active_processor->state == PROCESSOR_DISPATCHING) &&
2397 (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
2398 (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
2399 (active_processor != processor)) {
2400 /*
2401 * Squash all of the processor state back to some
2402 * reasonable facsimile of PROCESSOR_IDLE.
2403 *
2404 * TODO: What queue policy do we actually want here?
2405 * We want to promote selection of a good processor
2406 * to run on. Do we want to enqueue at the head?
2407 * The tail? At the (relative) old position in the
2408 * queue? Or something else entirely?
2409 */
2410 re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
2411
2412 assert(active_processor->next_thread == THREAD_NULL);
2413
2414 active_processor->current_pri = IDLEPRI;
2415 active_processor->current_thmode = TH_MODE_FIXED;
2416 active_processor->current_sfi_class = SFI_CLASS_KERNEL;
2417 active_processor->deadline = UINT64_MAX;
2418 active_processor->state = PROCESSOR_IDLE;
2419 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << active_processor->cpu_id);
2420 machine_signal_idle_cancel(active_processor);
2421 }
2422
2423 }
2424 }
2425
2426 pset_unlock(pset);
2427}
2428#else
2429/* We don't support deferred ASTs; everything is candycanes and sunshine. */
2430#endif
2431
2432/*
2433 * thread_dispatch:
2434 *
2435 * Handle threads at context switch. Re-dispatch other thread
2436 * if still running, otherwise update run state and perform
2437 * special actions. Update quantum for other thread and begin
2438 * the quantum for ourselves.
2439 *
2440 * "thread" is the old thread that we have switched away from.
2441 * "self" is the new current thread that we have context switched to
2442 *
2443 * Called at splsched.
2444 */
2445void
2446thread_dispatch(
2447 thread_t thread,
2448 thread_t self)
2449{
2450 processor_t processor = self->last_processor;
2451
2452 assert(processor == current_processor());
2453 assert(self == current_thread());
2454 assert(thread != self);
2455
2456 if (thread != THREAD_NULL) {
2457 /*
2458 * If blocked at a continuation, discard
2459 * the stack.
2460 */
2461 if (thread->continuation != NULL && thread->kernel_stack != 0)
2462 stack_free(thread);
2463
2464 if (thread->state & TH_IDLE) {
2465 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2466 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2467 (uintptr_t)thread_tid(thread), 0, thread->state,
2468 sched_run_buckets[TH_BUCKET_RUN], 0);
2469 } else {
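			/*
			 * Compute how much of its quantum the outgoing thread actually
			 * consumed before coming off core.
			 */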
2470 int64_t consumed;
2471 int64_t remainder = 0;
2472
2473 if (processor->quantum_end > processor->last_dispatch)
2474 remainder = processor->quantum_end -
2475 processor->last_dispatch;
2476
2477 consumed = thread->quantum_remaining - remainder;
2478
2479 if ((thread->reason & AST_LEDGER) == 0) {
2480 /*
2481 * Bill CPU time to both the task and
2482 * the individual thread.
2483 */
2484 ledger_credit(thread->t_ledger,
2485 task_ledgers.cpu_time, consumed);
2486 ledger_credit(thread->t_threadledger,
2487 thread_ledgers.cpu_time, consumed);
2488#ifdef CONFIG_BANK
2489 if (thread->t_bankledger) {
2490 ledger_credit(thread->t_bankledger,
2491 bank_ledgers.cpu_time,
2492 (consumed - thread->t_deduct_bank_ledger_time));
2493
2494 }
2495 thread->t_deduct_bank_ledger_time = 0;
2496#endif
2497 }
2498
2499 wake_lock(thread);
2500 thread_lock(thread);
2501
2502 /*
2503 * Apply a priority floor if the thread holds a kernel resource.
2504 * Do this before checking starting_pri to avoid overpenalizing
2505 * repeated rwlock blockers.
2506 */
2507 if (__improbable(thread->rwlock_count != 0))
2508 lck_rw_set_promotion_locked(thread);
2509
2510 boolean_t keep_quantum = processor->first_timeslice;
2511
2512 /*
2513 * Treat a thread which has dropped priority since it got on core
2514 * as having expired its quantum.
2515 */
2516 if (processor->starting_pri > thread->sched_pri)
2517 keep_quantum = FALSE;
2518
2519 /* Compute remainder of current quantum. */
2520 if (keep_quantum &&
2521 processor->quantum_end > processor->last_dispatch)
2522 thread->quantum_remaining = (uint32_t)remainder;
2523 else
2524 thread->quantum_remaining = 0;
2525
2526 if (thread->sched_mode == TH_MODE_REALTIME) {
2527 /*
2528 * Cancel the deadline if the thread has
2529 * consumed the entire quantum.
2530 */
2531 if (thread->quantum_remaining == 0) {
2532 thread->realtime.deadline = UINT64_MAX;
2533 }
2534 } else {
2535#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2536 /*
2537 * For non-realtime threads treat a tiny
2538 * remaining quantum as an expired quantum
2539 * but include what's left next time.
2540 */
2541 if (thread->quantum_remaining < min_std_quantum) {
2542 thread->reason |= AST_QUANTUM;
2543 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2544 }
2545#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2546 }
2547
2548 /*
2549 * If we are doing a direct handoff then
2550 * take the remainder of the quantum.
2551 */
2552 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2553 self->quantum_remaining = thread->quantum_remaining;
2554 thread->reason |= AST_QUANTUM;
2555 thread->quantum_remaining = 0;
2556 } else {
2557#if defined(CONFIG_SCHED_MULTIQ)
2558 if (SCHED(sched_groups_enabled) &&
2559 thread->sched_group == self->sched_group) {
2560 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2561 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
2562 self->reason, (uintptr_t)thread_tid(thread),
2563 self->quantum_remaining, thread->quantum_remaining, 0);
2564
2565 self->quantum_remaining = thread->quantum_remaining;
2566 thread->quantum_remaining = 0;
2567 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
2568 }
2569#endif /* defined(CONFIG_SCHED_MULTIQ) */
2570 }
2571
2572 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2573
2574 if (!(thread->state & TH_WAIT)) {
2575 /*
2576 * Still runnable.
2577 */
2578 thread->last_made_runnable_time = mach_approximate_time();
2579
2580 machine_thread_going_off_core(thread, FALSE, processor->last_dispatch);
2581
2582 if (thread->reason & AST_QUANTUM)
2583 thread_setrun(thread, SCHED_TAILQ);
2584 else if (thread->reason & AST_PREEMPT)
2585 thread_setrun(thread, SCHED_HEADQ);
2586 else
2587 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2588
2589 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2590 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2591 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2592 sched_run_buckets[TH_BUCKET_RUN], 0);
2593
2594 if (thread->wake_active) {
2595 thread->wake_active = FALSE;
2596 thread_unlock(thread);
2597
2598 thread_wakeup(&thread->wake_active);
2599 } else {
2600 thread_unlock(thread);
2601 }
2602
2603 wake_unlock(thread);
2604 } else {
2605 /*
2606 * Waiting.
2607 */
2608 boolean_t should_terminate = FALSE;
2609 uint32_t new_run_count;
2610
2611 /* Only the first call to thread_dispatch
2612 * after explicit termination should add
2613 * the thread to the termination queue.
2614 */
2615 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2616 should_terminate = TRUE;
2617 thread->state |= TH_TERMINATE2;
2618 }
2619
2620 thread->state &= ~TH_RUN;
2621 thread->last_made_runnable_time = ~0ULL;
2622 thread->chosen_processor = PROCESSOR_NULL;
2623
2624 new_run_count = sched_run_decr(thread);
2625
2626#if CONFIG_SCHED_SFI
2627 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2628 if (thread->reason & AST_SFI) {
2629 thread->wait_sfi_begin_time = processor->last_dispatch;
2630 }
2631 }
2632#endif
2633
2634 machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch);
2635
2636 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2637 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2638 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2639 new_run_count, 0);
2640
2641 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2642
2643 if (thread->wake_active) {
2644 thread->wake_active = FALSE;
2645 thread_unlock(thread);
2646
2647 thread_wakeup(&thread->wake_active);
2648 } else {
2649 thread_unlock(thread);
2650 }
2651
2652 wake_unlock(thread);
2653
2654 if (should_terminate)
2655 thread_terminate_enqueue(thread);
2656 }
2657 }
2658 }
2659
2660 /* Update (new) current thread and reprogram quantum timer */
2661 thread_lock(self);
2662 if (!(self->state & TH_IDLE)) {
2663 uint64_t arg1, arg2;
2664 int urgency;
2665 uint64_t latency;
2666
2667#if CONFIG_SCHED_SFI
2668 ast_t new_ast;
2669
2670 new_ast = sfi_thread_needs_ast(self, NULL);
2671
2672 if (new_ast != AST_NONE) {
2673 ast_on(new_ast);
2674 }
2675#endif
2676
2677 assertf(processor->last_dispatch >= self->last_made_runnable_time, "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time);
2678 latency = processor->last_dispatch - self->last_made_runnable_time;
2679
2680 urgency = thread_get_urgency(self, &arg1, &arg2);
2681
2682 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2683
2684 machine_thread_going_on_core(self, urgency, latency, processor->last_dispatch);
2685
2686 /*
2687 * Get a new quantum if none remaining.
2688 */
2689 if (self->quantum_remaining == 0) {
2690 thread_quantum_init(self);
2691 }
2692
2693 /*
2694 * Set up quantum timer and timeslice.
2695 */
2696 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2697 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2698
2699 processor->first_timeslice = TRUE;
2700 } else {
2701 timer_call_cancel(&processor->quantum_timer);
2702 processor->first_timeslice = FALSE;
2703
2704 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2705 machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0, processor->last_dispatch);
2706 }
2707
2708 assert(self->block_hint == kThreadWaitNone);
2709 self->computation_epoch = processor->last_dispatch;
2710 self->reason = AST_NONE;
2711 processor->starting_pri = self->sched_pri;
2712
2713 thread_unlock(self);
2714
2715#if defined(CONFIG_SCHED_DEFERRED_AST)
2716 /*
2717 * TODO: Can we state that redispatching our old thread is also
2718 * uninteresting?
2719 */
2720 if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
2721 !(self->state & TH_IDLE)) {
2722 pset_cancel_deferred_dispatch(processor->processor_set, processor);
2723 }
2724#endif
2725
2726}
2727
2728/*
2729 * thread_block_reason:
2730 *
2731 * Forces a reschedule, blocking the caller if a wait
2732 * has been asserted.
2733 *
2734 * If a continuation is specified, then thread_invoke will
2735 * attempt to discard the thread's kernel stack. When the
2736 * thread resumes, it will execute the continuation function
2737 * on a new kernel stack.
2738 */
2739counter(mach_counter_t c_thread_block_calls = 0;)
2740
2741wait_result_t
2742thread_block_reason(
2743 thread_continue_t continuation,
2744 void *parameter,
2745 ast_t reason)
2746{
2747 thread_t self = current_thread();
2748 processor_t processor;
2749 thread_t new_thread;
2750 spl_t s;
2751
2752 counter(++c_thread_block_calls);
2753
2754 s = splsched();
2755
2756 processor = current_processor();
2757
2758 /* If we're explicitly yielding, force a subsequent quantum */
2759 if (reason & AST_YIELD)
2760 processor->first_timeslice = FALSE;
2761
2762 /* We're handling all scheduling ASTs */
2763 ast_off(AST_SCHEDULING);
2764
2765#if PROC_REF_DEBUG
2766 if ((continuation != NULL) && (self->task != kernel_task)) {
2767 if (uthread_get_proc_refcount(self->uthread) != 0) {
2768 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
2769 }
2770 }
2771#endif
2772
2773 self->continuation = continuation;
2774 self->parameter = parameter;
2775
2776 if (self->state & ~(TH_RUN | TH_IDLE)) {
2777 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2778 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2779 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2780 }
2781
2782 do {
2783 thread_lock(self);
2784 new_thread = thread_select(self, processor, reason);
2785 thread_unlock(self);
2786 } while (!thread_invoke(self, new_thread, reason));
2787
2788 splx(s);
2789
2790 return (self->wait_result);
2791}
2792
2793/*
2794 * thread_block:
2795 *
2796 * Block the current thread if a wait has been asserted.
2797 */
2798wait_result_t
2799thread_block(
2800 thread_continue_t continuation)
2801{
2802 return thread_block_reason(continuation, NULL, AST_NONE);
2803}
2804
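/*
 *	thread_block_parameter:
 *
 *	Block the current thread if a wait has been asserted,
 *	passing "parameter" to the continuation when the thread resumes.
 */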
2805wait_result_t
2806thread_block_parameter(
2807 thread_continue_t continuation,
2808 void *parameter)
2809{
2810 return thread_block_reason(continuation, parameter, AST_NONE);
2811}
2812
2813/*
2814 * thread_run:
2815 *
2816 * Switch directly from the current thread to the
2817 * new thread, handing off our quantum if appropriate.
2818 *
2819 * New thread must be runnable, and not on a run queue.
2820 *
2821 * Called at splsched.
2822 */
2823int
2824thread_run(
2825 thread_t self,
2826 thread_continue_t continuation,
2827 void *parameter,
2828 thread_t new_thread)
2829{
2830 ast_t handoff = AST_HANDOFF;
2831
2832 self->continuation = continuation;
2833 self->parameter = parameter;
2834
2835 while (!thread_invoke(self, new_thread, handoff)) {
2836 processor_t processor = current_processor();
2837
2838 thread_lock(self);
2839 new_thread = thread_select(self, processor, AST_NONE);
2840 thread_unlock(self);
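		/* Only the first attempt is a handoff; retries go through normal selection. */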
2841 handoff = AST_NONE;
2842 }
2843
2844 return (self->wait_result);
2845}
2846
2847/*
2848 * thread_continue:
2849 *
2850 * Called at splsched when a thread first receives
2851 * a new stack after a continuation.
2852 */
2853void
2854thread_continue(
2855 thread_t thread)
2856{
2857 thread_t self = current_thread();
2858 thread_continue_t continuation;
2859 void *parameter;
2860
2861 DTRACE_SCHED(on__cpu);
2862
2863 continuation = self->continuation;
2864 parameter = self->parameter;
2865
2866#if KPERF
2867 kperf_on_cpu(self, continuation, NULL);
2868#endif
2869
2870 thread_dispatch(thread, self);
2871
2872 self->continuation = self->parameter = NULL;
2873
2874 if (thread != THREAD_NULL)
2875 (void)spllo();
2876
2877 TLOG(1, "thread_continue: calling call_continuation \n");
2878 call_continuation(continuation, parameter, self->wait_result);
2879 /*NOTREACHED*/
2880}
2881
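/*
 *	thread_quantum_init:
 *
 *	Reset the thread's remaining quantum: realtime threads get their
 *	requested computation time, all others get the scheduler's initial
 *	quantum size.
 */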
2882void
2883thread_quantum_init(thread_t thread)
2884{
2885 if (thread->sched_mode == TH_MODE_REALTIME) {
2886 thread->quantum_remaining = thread->realtime.computation;
2887 } else {
2888 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
2889 }
2890}
2891
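/*
 *	sched_timeshare_initial_quantum_size:
 *
 *	Background timeshare threads receive the background quantum;
 *	all other threads receive the standard quantum.
 */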
2892uint32_t
2893sched_timeshare_initial_quantum_size(thread_t thread)
2894{
2895 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
2896 return bg_quantum;
2897 else
2898 return std_quantum;
2899}
2900
2901/*
2902 * run_queue_init:
2903 *
2904 * Initialize a run queue before first use.
2905 */
2906void
2907run_queue_init(
2908 run_queue_t rq)
2909{
2910 rq->highq = NOPRI;
2911 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
2912 rq->bitmap[i] = 0;
2913 rq->urgency = rq->count = 0;
2914 for (int i = 0; i < NRQS; i++)
2915 queue_init(&rq->queues[i]);
2916}
2917
2918/*
2919 * run_queue_dequeue:
2920 *
2921 * Perform a dequeue operation on a run queue,
2922 * and return the resulting thread.
2923 *
2924 * The run queue must be locked (see thread_run_queue_remove()
2925 * for more info), and not empty.
2926 */
2927thread_t
2928run_queue_dequeue(
2929 run_queue_t rq,
2930 integer_t options)
2931{
2932 thread_t thread;
2933 queue_t queue = &rq->queues[rq->highq];
2934
2935 if (options & SCHED_HEADQ) {
2936 thread = qe_dequeue_head(queue, struct thread, runq_links);
2937 } else {
2938 thread = qe_dequeue_tail(queue, struct thread, runq_links);
2939 }
2940
2941 assert(thread != THREAD_NULL);
2942 assert_thread_magic(thread);
2943
2944 thread->runq = PROCESSOR_NULL;
2945 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2946 rq->count--;
2947 if (SCHED(priority_is_urgent)(rq->highq)) {
2948 rq->urgency--; assert(rq->urgency >= 0);
2949 }
2950 if (queue_empty(queue)) {
2951 bitmap_clear(rq->bitmap, rq->highq);
2952 rq->highq = bitmap_first(rq->bitmap, NRQS);
2953 }
2954
2955 return thread;
2956}
2957
2958/*
2959 * run_queue_enqueue:
2960 *
2961 * Perform an enqueue operation on a run queue.
2962 *
2963 * The run queue must be locked (see thread_run_queue_remove()
2964 * for more info).
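 *
 *	Returns TRUE if the enqueued thread became the new highest
 *	priority entry in the run queue.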
2965 */
2966boolean_t
2967run_queue_enqueue(
2968 run_queue_t rq,
2969 thread_t thread,
2970 integer_t options)
2971{
2972 queue_t queue = &rq->queues[thread->sched_pri];
2973 boolean_t result = FALSE;
2974
2975 assert_thread_magic(thread);
2976
2977 if (queue_empty(queue)) {
2978 enqueue_tail(queue, &thread->runq_links);
2979
2980 rq_bitmap_set(rq->bitmap, thread->sched_pri);
2981 if (thread->sched_pri > rq->highq) {
2982 rq->highq = thread->sched_pri;
2983 result = TRUE;
2984 }
2985 } else {
2986 if (options & SCHED_TAILQ)
2987 enqueue_tail(queue, &thread->runq_links);
2988 else
2989 enqueue_head(queue, &thread->runq_links);
2990 }
2991 if (SCHED(priority_is_urgent)(thread->sched_pri))
2992 rq->urgency++;
2993 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2994 rq->count++;
2995
2996 return (result);
2997}
2998
2999/*
3000 * run_queue_remove:
3001 *
3002 * Remove a specific thread from a runqueue.
3003 *
3004 * The run queue must be locked.
3005 */
3006void
3007run_queue_remove(
3008 run_queue_t rq,
3009 thread_t thread)
3010{
3011 assert(thread->runq != PROCESSOR_NULL);
3012 assert_thread_magic(thread);
3013
3014 remqueue(&thread->runq_links);
3015 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3016 rq->count--;
3017 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3018 rq->urgency--; assert(rq->urgency >= 0);
3019 }
3020
3021 if (queue_empty(&rq->queues[thread->sched_pri])) {
3022 /* update run queue status */
3023 bitmap_clear(rq->bitmap, thread->sched_pri);
3024 rq->highq = bitmap_first(rq->bitmap, NRQS);
3025 }
3026
3027 thread->runq = PROCESSOR_NULL;
3028}
3029
3030/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3031void
3032rt_runq_scan(sched_update_scan_context_t scan_context)
3033{
3034 spl_t s;
3035 thread_t thread;
3036
3037 s = splsched();
3038 rt_lock_lock();
3039
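	/* Track the earliest make-runnable time among queued realtime threads. */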
3040 qe_foreach_element_safe(thread, &rt_runq.queue, runq_links) {
3041 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3042 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3043 }
3044 }
3045
3046 rt_lock_unlock();
3047 splx(s);
3048}
3049
3050
3051/*
3052 * realtime_queue_insert:
3053 *
3054 * Enqueue a thread for realtime execution.
3055 */
3056static boolean_t
3057realtime_queue_insert(thread_t thread)
3058{
3059 queue_t queue = &rt_runq.queue;
3060 uint64_t deadline = thread->realtime.deadline;
3061 boolean_t preempt = FALSE;
3062
3063 rt_lock_lock();
3064
3065 if (queue_empty(queue)) {
3066 enqueue_tail(queue, &thread->runq_links);
3067 preempt = TRUE;
3068 } else {
3069 /* Insert into rt_runq in thread deadline order */
3070 queue_entry_t iter;
3071 qe_foreach(iter, queue) {
3072 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
3073 assert_thread_magic(iter_thread);
3074
3075 if (deadline < iter_thread->realtime.deadline) {
3076 if (iter == queue_first(queue))
3077 preempt = TRUE;
3078 insque(&thread->runq_links, queue_prev(iter));
3079 break;
3080 } else if (iter == queue_last(queue)) {
3081 enqueue_tail(queue, &thread->runq_links);
3082 break;
3083 }
3084 }
3085 }
3086
3087 thread->runq = THREAD_ON_RT_RUNQ;
3088 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3089 rt_runq.count++;
3090
3091 rt_lock_unlock();
3092
3093 return (preempt);
3094}
3095
3096/*
3097 * realtime_setrun:
3098 *
3099 * Dispatch a thread for realtime execution.
3100 *
3101 * Thread must be locked. Associated pset must
3102 * be locked, and is returned unlocked.
3103 */
3104static void
3105realtime_setrun(
3106 processor_t processor,
3107 thread_t thread)
3108{
3109 processor_set_t pset = processor->processor_set;
3110 ast_t preempt;
3111
3112 boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3113
3114 thread->chosen_processor = processor;
3115
3116 /* <rdar://problem/15102234> */
3117 assert(thread->bound_processor == PROCESSOR_NULL);
3118
3119 /*
3120 * Dispatch directly onto idle processor.
3121 */
3122 if ( (thread->bound_processor == processor)
3123 && processor->state == PROCESSOR_IDLE) {
3124 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3125
3126 processor->next_thread = thread;
3127 processor->current_pri = thread->sched_pri;
3128 processor->current_thmode = thread->sched_mode;
3129 processor->current_sfi_class = thread->sfi_class;
3130 processor->deadline = thread->realtime.deadline;
3131 processor->state = PROCESSOR_DISPATCHING;
3132
3133 if (processor != current_processor()) {
3134 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3135 /* cleared on exit from main processor_idle() loop */
3136 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3137 do_signal_idle = TRUE;
3138 }
3139 }
3140 pset_unlock(pset);
3141
3142 if (do_signal_idle) {
3143 machine_signal_idle(processor);
3144 }
3145 return;
3146 }
3147
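	/*
	 * Preempt if the processor is currently running a non-realtime
	 * thread, or a realtime thread with a later deadline than this one.
	 */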
3148 if (processor->current_pri < BASEPRI_RTQUEUES)
3149 preempt = (AST_PREEMPT | AST_URGENT);
3150 else if (thread->realtime.deadline < processor->deadline)
3151 preempt = (AST_PREEMPT | AST_URGENT);
3152 else
3153 preempt = AST_NONE;
3154
3155 realtime_queue_insert(thread);
3156
3157 if (preempt != AST_NONE) {
3158 if (processor->state == PROCESSOR_IDLE) {
3159 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3160
3161 processor->next_thread = THREAD_NULL;
3162 processor->current_pri = thread->sched_pri;
3163 processor->current_thmode = thread->sched_mode;
3164 processor->current_sfi_class = thread->sfi_class;
3165 processor->deadline = thread->realtime.deadline;
3166 processor->state = PROCESSOR_DISPATCHING;
3167 if (processor == current_processor()) {
3168 ast_on(preempt);
3169 } else {
3170 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3171 /* cleared on exit from main processor_idle() loop */
3172 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3173 do_signal_idle = TRUE;
3174 }
3175 }
3176 } else if (processor->state == PROCESSOR_DISPATCHING) {
3177 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3178 processor->current_pri = thread->sched_pri;
3179 processor->current_thmode = thread->sched_mode;
3180 processor->current_sfi_class = thread->sfi_class;
3181 processor->deadline = thread->realtime.deadline;
3182 }
3183 } else {
3184 if (processor == current_processor()) {
3185 ast_on(preempt);
3186 } else {
3187 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3188 /* cleared after IPI causes csw_check() to be called */
3189 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3190 do_cause_ast = TRUE;
3191 }
3192 }
3193 }
3194 } else {
3195 /* Selected processor was too busy; just keep the thread enqueued and let other processors drain it naturally. */
3196 }
3197
3198 pset_unlock(pset);
3199
3200 if (do_signal_idle) {
3201 machine_signal_idle(processor);
3202 } else if (do_cause_ast) {
3203 cause_ast_check(processor);
3204 }
3205}
3206
3207
3208#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3209
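/*
 *	priority_is_urgent:
 *
 *	A priority is urgent if it is set in the global sched_preempt_pri
 *	bitmap; dispatching a thread at an urgent priority warrants an
 *	immediate preemption check.
 */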
3210boolean_t
3211priority_is_urgent(int priority)
3212{
3213 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
3214}
3215
3216#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3217
3218/*
3219 * processor_setrun:
3220 *
3221 * Dispatch a thread for execution on a
3222 * processor.
3223 *
3224 * Thread must be locked. Associated pset must
3225 * be locked, and is returned unlocked.
3226 */
3227static void
3228processor_setrun(
3229 processor_t processor,
3230 thread_t thread,
3231 integer_t options)
3232{
3233 processor_set_t pset = processor->processor_set;
3234 ast_t preempt;
3235 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3236 enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
3237
3238 boolean_t do_cause_ast = FALSE;
3239
3240 thread->chosen_processor = processor;
3241
3242 /*
3243 * Dispatch directly onto idle processor.
3244 */
3245 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3246 thread->bound_processor == processor)
3247 && processor->state == PROCESSOR_IDLE) {
3248
3249 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3250
3251 processor->next_thread = thread;
3252 processor->current_pri = thread->sched_pri;
3253 processor->current_thmode = thread->sched_mode;
3254 processor->current_sfi_class = thread->sfi_class;
3255 processor->deadline = UINT64_MAX;
3256 processor->state = PROCESSOR_DISPATCHING;
3257
3258 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3259 /* cleared on exit from main processor_idle() loop */
3260 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3261 do_signal_idle = eDoSignal;
3262 }
3263
3264 pset_unlock(pset);
3265
3266 if (do_signal_idle == eDoSignal) {
3267 machine_signal_idle(processor);
3268 }
3269
3270 return;
3271 }
3272
3273 /*
3274 * Set preemption mode.
3275 */
3276#if defined(CONFIG_SCHED_DEFERRED_AST)
3277 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3278#endif
3279 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3280 preempt = (AST_PREEMPT | AST_URGENT);
3281 else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3282 preempt = (AST_PREEMPT | AST_URGENT);
3283 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
3284 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
3285 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3286 } else {
3287 preempt = AST_NONE;
3288 }
3289 } else
3290 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3291
3292 SCHED(processor_enqueue)(processor, thread, options);
3293
3294 if (preempt != AST_NONE) {
3295 if (processor->state == PROCESSOR_IDLE) {
3296 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3297
3298 processor->next_thread = THREAD_NULL;
3299 processor->current_pri = thread->sched_pri;
3300 processor->current_thmode = thread->sched_mode;
3301 processor->current_sfi_class = thread->sfi_class;
3302 processor->deadline = UINT64_MAX;
3303 processor->state = PROCESSOR_DISPATCHING;
3304
3305 ipi_action = eExitIdle;
3306 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3307 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3308 processor->current_pri = thread->sched_pri;
3309 processor->current_thmode = thread->sched_mode;
3310 processor->current_sfi_class = thread->sfi_class;
3311 processor->deadline = UINT64_MAX;
3312 }
3313 } else if ( (processor->state == PROCESSOR_RUNNING ||
3314 processor->state == PROCESSOR_SHUTDOWN) &&
3315 (thread->sched_pri >= processor->current_pri)) {
3316 ipi_action = eInterruptRunning;
3317 }
3318 } else {
3319 /*
3320 * New thread is not important enough to preempt what is running, but
3321 * special processor states may need special handling
3322 */
3323 if (processor->state == PROCESSOR_SHUTDOWN &&
3324 thread->sched_pri >= processor->current_pri ) {
3325 ipi_action = eInterruptRunning;
3326 } else if (processor->state == PROCESSOR_IDLE) {
3327 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3328
3329 processor->next_thread = THREAD_NULL;
3330 processor->current_pri = thread->sched_pri;
3331 processor->current_thmode = thread->sched_mode;
3332 processor->current_sfi_class = thread->sfi_class;
3333 processor->deadline = UINT64_MAX;
3334 processor->state = PROCESSOR_DISPATCHING;
3335
3336 ipi_action = eExitIdle;
3337 }
3338 }
3339
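	/*
	 * Act on the chosen ipi_action: check for a local AST if the target
	 * is the current processor, otherwise arm an idle wakeup or AST-check
	 * IPI to be delivered once the pset lock is dropped below.
	 */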
3340 switch (ipi_action) {
3341 case eDoNothing:
3342 break;
3343 case eExitIdle:
3344 if (processor == current_processor()) {
3345 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3346 ast_on(preempt);
3347 } else {
3348#if defined(CONFIG_SCHED_DEFERRED_AST)
3349 if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
3350 !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3351 /* cleared on exit from main processor_idle() loop */
3352 pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id);
3353 do_signal_idle = eDoDeferredSignal;
3354 }
3355#else
3356 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3357 /* cleared on exit from main processor_idle() loop */
3358 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3359 do_signal_idle = eDoSignal;
3360 }
3361#endif
3362 }
3363 break;
3364 case eInterruptRunning:
3365 if (processor == current_processor()) {
3366 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3367 ast_on(preempt);
3368 } else {
3369 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3370 /* cleared after IPI causes csw_check() to be called */
3371 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3372 do_cause_ast = TRUE;
3373 }
3374 }
3375 break;
3376 }
3377
3378 pset_unlock(pset);
3379
3380 if (do_signal_idle == eDoSignal) {
3381 machine_signal_idle(processor);
3382 }
3383#if defined(CONFIG_SCHED_DEFERRED_AST)
3384 else if (do_signal_idle == eDoDeferredSignal) {
3385 /*
3386 * TODO: The ability to cancel this signal could make
3387 * sending it outside of the pset lock an issue. Do
3388 * we need to address this? Or would the only fallout
3389 * be that the core takes a signal? As long as we do
3390 * not run the risk of having a core marked as signal
3391 * outstanding, with no real signal outstanding, the
3392 * only result should be that we fail to cancel some
3393 * signals.
3394 */
3395 machine_signal_idle_deferred(processor);
3396 }
3397#endif
3398 else if (do_cause_ast) {
3399 cause_ast_check(processor);
3400 }
3401}
3402
3403/*
3404 * choose_next_pset:
3405 *
3406 * Return the next sibling pset containing
3407 * available processors.
3408 *
3409 * Returns the original pset if none other is
3410 * suitable.
3411 */
3412static processor_set_t
3413choose_next_pset(
3414 processor_set_t pset)
3415{
3416 processor_set_t nset = pset;
3417
3418 do {
3419 nset = next_pset(nset);
3420 } while (nset->online_processor_count < 1 && nset != pset);
3421
3422 return (nset);
3423}
3424
3425/*
3426 * choose_processor:
3427 *
3428 * Choose a processor for the thread, beginning at
3429 * the pset. Accepts an optional processor hint in
3430 * the pset.
3431 *
3432 * Returns a processor, possibly from a different pset.
3433 *
3434 * The thread must be locked. The pset must be locked,
3435 * and the resulting pset is locked on return.
3436 */
3437processor_t
3438choose_processor(
3439 processor_set_t pset,
3440 processor_t processor,
3441 thread_t thread)
3442{
3443 processor_set_t nset, cset = pset;
3444
3445 assert(thread->sched_pri <= BASEPRI_RTQUEUES);
3446
3447 /*
3448 * Prefer the hinted processor, when appropriate.
3449 */
3450
3451 /* Fold last processor hint from secondary processor to its primary */
3452 if (processor != PROCESSOR_NULL) {
3453 processor = processor->processor_primary;
3454 }
3455
3456 /*
3457 * Only consult platform layer if pset is active, which
3458 * it may not be in some cases when a multi-set system
3459 * is going to sleep.
3460 */
3461 if (pset->online_processor_count) {
3462 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3463 processor_t mc_processor = machine_choose_processor(pset, processor);
3464 if (mc_processor != PROCESSOR_NULL)
3465 processor = mc_processor->processor_primary;
3466 }
3467 }
3468
3469 /*
3470 * At this point, we may have a processor hint, and we may have
3471 * an initial starting pset. If the hint is not in the pset, or
3472 * if the hint is for a processor in an invalid state, discard
3473 * the hint.
3474 */
3475 if (processor != PROCESSOR_NULL) {
3476 if (processor->processor_set != pset) {
3477 processor = PROCESSOR_NULL;
3478 } else if (!processor->is_recommended) {
3479 processor = PROCESSOR_NULL;
3480 } else {
3481 switch (processor->state) {
3482 case PROCESSOR_START:
3483 case PROCESSOR_SHUTDOWN:
3484 case PROCESSOR_OFF_LINE:
3485 /*
3486 * Hint is for a processor that cannot support running new threads.
3487 */
3488 processor = PROCESSOR_NULL;
3489 break;
3490 case PROCESSOR_IDLE:
3491 /*
3492 * Hint is for an idle processor. Assume it is no worse than any other
3493 * idle processor. The platform layer had an opportunity to provide
3494 * the "least cost idle" processor above.
3495 */
3496 return (processor);
3497 case PROCESSOR_RUNNING:
3498 case PROCESSOR_DISPATCHING:
3499 /*
3500 * Hint is for an active CPU. This fast-path allows
3501 * realtime threads to preempt non-realtime threads
3502 * to regain their previous executing processor.
3503 */
3504 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3505 (processor->current_pri < BASEPRI_RTQUEUES))
3506 return (processor);
3507
3508 /* Otherwise, use hint as part of search below */
3509 break;
3510 default:
3511 processor = PROCESSOR_NULL;
3512 break;
3513 }
3514 }
3515 }
3516
3517 /*
3518 * Iterate through the processor sets to locate
3519 * an appropriate processor. Seed results with
3520 * a last-processor hint, if available, so that
3521 * a search must find something strictly better
3522 * to replace it.
3523 *
3524 * A primary/secondary pair of SMT processors is
3525 * "unpaired" if the primary is busy but its
3526 * corresponding secondary is idle (so the physical
3527 * core has full use of its resources).
3528 */
3529
3530 integer_t lowest_priority = MAXPRI + 1;
3531 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3532 integer_t lowest_count = INT_MAX;
3533 uint64_t furthest_deadline = 1;
3534 processor_t lp_processor = PROCESSOR_NULL;
3535 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3536 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3537 processor_t lc_processor = PROCESSOR_NULL;
3538 processor_t fd_processor = PROCESSOR_NULL;
3539
3540 if (processor != PROCESSOR_NULL) {
3541 /* All other states should be enumerated above. */
3542 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3543
3544 lowest_priority = processor->current_pri;
3545 lp_processor = processor;
3546
3547 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3548 furthest_deadline = processor->deadline;
3549 fd_processor = processor;
3550 }
3551
3552 lowest_count = SCHED(processor_runq_count)(processor);
3553 lc_processor = processor;
3554 }
3555
3556 do {
3557
3558 /*
3559 * Choose an idle processor, in pset traversal order
3560 */
3561 qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
3562 if (processor->is_recommended)
3563 return processor;
3564 }
3565
3566 /*
3567 * Otherwise, enumerate active and idle processors to find candidates
3568 * with lower priority/etc.
3569 */
3570
3571 qe_foreach_element(processor, &cset->active_queue, processor_queue) {
3572
3573 if (!processor->is_recommended) {
3574 continue;
3575 }
3576
3577 integer_t cpri = processor->current_pri;
3578 if (cpri < lowest_priority) {
3579 lowest_priority = cpri;
3580 lp_processor = processor;
3581 }
3582
3583 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3584 furthest_deadline = processor->deadline;
3585 fd_processor = processor;
3586 }
3587
3588 integer_t ccount = SCHED(processor_runq_count)(processor);
3589 if (ccount < lowest_count) {
3590 lowest_count = ccount;
3591 lc_processor = processor;
3592 }
3593 }
3594
3595 /*
3596 * For SMT configs, these idle secondary processors must have an active primary. Otherwise
3597 * the idle primary would have short-circuited the loop above.
3598 */
3599 qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
3600
3601 if (!processor->is_recommended) {
3602 continue;
3603 }
3604
3605 processor_t cprimary = processor->processor_primary;
3606
3607 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3608 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3609 integer_t primary_pri = cprimary->current_pri;
3610
3611 if (primary_pri < lowest_unpaired_primary_priority) {
3612 lowest_unpaired_primary_priority = primary_pri;
3613 lp_unpaired_primary_processor = cprimary;
3614 lp_unpaired_secondary_processor = processor;
3615 }
3616 }
3617 }
3618
3619
3620 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3621
3622 /*
3623 * For realtime threads, the most important aspect is
3624 * scheduling latency, so we attempt to assign threads
3625 * to good preemption candidates (assuming an idle primary
3626 * processor was not available above).
3627 */
3628
3629 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3630 /* Move to end of active queue so that the next thread doesn't also pick it */
3631 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
3632 return lp_unpaired_primary_processor;
3633 }
3634 if (thread->sched_pri > lowest_priority) {
3635 /* Move to end of active queue so that the next thread doesn't also pick it */
3636 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
3637 return lp_processor;
3638 }
3639 if (thread->realtime.deadline < furthest_deadline)
3640 return fd_processor;
3641
3642 /*
3643 * If all primary and secondary CPUs are busy with realtime
3644 * threads with deadlines earlier than ours, move on to the next
3645 * pset.
3646 */
3647 }
3648 else {
3649
3650 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3651 /* Move to end of active queue so that the next thread doesn't also pick it */
3652 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
3653 return lp_unpaired_primary_processor;
3654 }
3655 if (thread->sched_pri > lowest_priority) {
3656 /* Move to end of active queue so that the next thread doesn't also pick it */
3657 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
3658 return lp_processor;
3659 }
3660
3661 /*
3662 * If all primary processors in this pset are running higher
3663 * priority threads, move on to the next pset. Only when we have
3664 * exhausted this search do we fall back to other heuristics.
3665 */
3666 }
3667
3668 /*
3669 * Move onto the next processor set.
3670 */
3671 nset = next_pset(cset);
3672
3673 if (nset != pset) {
3674 pset_unlock(cset);
3675
3676 cset = nset;
3677 pset_lock(cset);
3678 }
3679 } while (nset != pset);
3680
3681 /*
3682 * Make sure that we pick a running processor,
3683 * and that the correct processor set is locked.
3684 * Since we may have unlocked the candidate processor's
3685 * pset, it may have changed state.
3686 *
3687 * All primary processors are running higher priority
3688 * threads, so the only options left are enqueueing on
3689 * the secondary processor that would perturb the lowest-priority
3690 * primary, or on the least busy primary.
3691 */
3692 do {
3693
3694 /* lowest_priority is evaluated in the main loops above */
3695 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3696 processor = lp_unpaired_secondary_processor;
3697 lp_unpaired_secondary_processor = PROCESSOR_NULL;
3698 } else if (lc_processor != PROCESSOR_NULL) {
3699 processor = lc_processor;
3700 lc_processor = PROCESSOR_NULL;
3701 } else {
3702 /*
3703 * All processors are executing higher
3704 * priority threads, and the lowest_count
3705 * candidate was not usable
3706 */
3707 processor = master_processor;
3708 }
3709
3710 /*
3711 * Check that the correct processor set is
3712 * returned locked.
3713 */
3714 if (cset != processor->processor_set) {
3715 pset_unlock(cset);
3716 cset = processor->processor_set;
3717 pset_lock(cset);
3718 }
3719
3720 /*
3721 * We must verify that the chosen processor is still available.
3722 * master_processor is an exception, since we may need to preempt
3723 * a running thread on it during processor shutdown (for sleep),
3724 * and that thread needs to be enqueued on its runqueue to run
3725 * when the processor is restarted.
3726 */
3727 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
3728 processor = PROCESSOR_NULL;
3729
3730 } while (processor == PROCESSOR_NULL);
3731
3732 return (processor);
3733}
3734
3735/*
3736 * thread_setrun:
3737 *
3738 * Dispatch thread for execution, onto an idle
3739 * processor or run queue, and signal a preemption
3740 * as appropriate.
3741 *
3742 * Thread must be locked.
3743 */
3744void
3745thread_setrun(
3746 thread_t thread,
3747 integer_t options)
3748{
3749 processor_t processor;
3750 processor_set_t pset;
3751
3752 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
3753 assert(thread->runq == PROCESSOR_NULL);
3754
3755 /*
3756 * Update priority if needed.
3757 */
3758 if (SCHED(can_update_priority)(thread))
3759 SCHED(update_priority)(thread);
3760
3761 thread->sfi_class = sfi_thread_classify(thread);
3762
3763 assert(thread->runq == PROCESSOR_NULL);
3764
3765#if __SMP__
3766 if (thread->bound_processor == PROCESSOR_NULL) {
3767 /*
3768 * Unbound case.
3769 */
3770 if (thread->affinity_set != AFFINITY_SET_NULL) {
3771 /*
3772 * Use affinity set policy hint.
3773 */
3774 pset = thread->affinity_set->aset_pset;
3775 pset_lock(pset);
3776
3777 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3778
3779 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3780 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3781 } else if (thread->last_processor != PROCESSOR_NULL) {
3782 /*
3783 * Simple (last processor) affinity case.
3784 */
3785 processor = thread->last_processor;
3786 pset = processor->processor_set;
3787 pset_lock(pset);
3788 processor = SCHED(choose_processor)(pset, processor, thread);
3789
3790 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3791 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3792 } else {
3793 /*
3794 * No Affinity case:
3795 *
3796 * Utilize a per-task hint to spread threads
3797 * among the available processor sets.
3798 */
3799 task_t task = thread->task;
3800
3801 pset = task->pset_hint;
3802 if (pset == PROCESSOR_SET_NULL)
3803 pset = current_processor()->processor_set;
3804
3805 pset = choose_next_pset(pset);
3806 pset_lock(pset);
3807
3808 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3809 task->pset_hint = processor->processor_set;
3810
3811 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3812 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3813 }
3814 } else {
3815 /*
3816 * Bound case:
3817 *
3818 * Unconditionally dispatch on the processor.
3819 */
3820 processor = thread->bound_processor;
3821 pset = processor->processor_set;
3822 pset_lock(pset);
3823
3824 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3825 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
3826 }
3827#else /* !__SMP__ */
3828 /* Only one processor to choose */
3829 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
3830 processor = master_processor;
3831 pset = processor->processor_set;
3832 pset_lock(pset);
3833#endif /* !__SMP__ */
3834
3835 /*
3836 * Dispatch the thread on the chosen processor.
3837 * TODO: This should be based on sched_mode, not sched_pri
3838 */
3839 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3840 realtime_setrun(processor, thread);
3841 else
3842 processor_setrun(processor, thread, options);
3843}
3844
3845processor_set_t
3846task_choose_pset(
3847 task_t task)
3848{
3849 processor_set_t pset = task->pset_hint;
3850
3851 if (pset != PROCESSOR_SET_NULL)
3852 pset = choose_next_pset(pset);
3853
3854 return (pset);
3855}
3856
3857/*
3858 * Check for a preemption point in
3859 * the current context.
3860 *
3861 * Called at splsched with thread locked.
3862 */
3863ast_t
3864csw_check(
3865 processor_t processor,
3866 ast_t check_reason)
3867{
3868 processor_set_t pset = processor->processor_set;
3869 ast_t result;
3870
3871 pset_lock(pset);
3872
3873 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
3874 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
3875
3876 result = csw_check_locked(processor, pset, check_reason);
3877
3878 pset_unlock(pset);
3879
3880 return result;
3881}
3882
3883/*
3884 * Check for preemption at splsched with
3885 * pset and thread locked
3886 */
3887ast_t
3888csw_check_locked(
3889 processor_t processor,
3890 processor_set_t pset __unused,
3891 ast_t check_reason)
3892{
3893 ast_t result;
3894 thread_t thread = processor->active_thread;
3895
3896 if (processor->first_timeslice) {
3897 if (rt_runq.count > 0)
3898 return (check_reason | AST_PREEMPT | AST_URGENT);
3899 }
3900 else {
3901 if (rt_runq.count > 0) {
3902 if (BASEPRI_RTQUEUES > processor->current_pri)
3903 return (check_reason | AST_PREEMPT | AST_URGENT);
3904 else
3905 return (check_reason | AST_PREEMPT);
3906 }
3907 }
3908
3909 result = SCHED(processor_csw_check)(processor);
3910 if (result != AST_NONE)
3911 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
3912
3913#if __SMP__
3914
3915 /*
3916 * If the current thread is running on a processor that is no longer recommended, gently
3917 * (non-urgently) get it to a point where it can block, at which point thread_select() should
3918 * try to idle the processor and re-dispatch the thread to a recommended processor.
3919 */
3920 if (!processor->is_recommended)
3921 return (check_reason | AST_PREEMPT);
3922
3923 /*
3924 * Even though we could continue executing on this processor, a
3925 * secondary SMT core should try to shed load to another primary core.
3926 *
3927 * TODO: Should this do the same check that thread_select does? i.e.
3928 * if no bound threads target this processor, and idle primaries exist, preempt
3929 * The case of RT threads existing is already taken care of above
3930 * Consider Capri in this scenario.
3931 *
3932 * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
3933 *
3934 * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
3935 */
3936
3937 if (processor->current_pri < BASEPRI_RTQUEUES &&
3938 processor->processor_primary != processor)
3939 return (check_reason | AST_PREEMPT);
3940#endif
3941
3942 if (thread->state & TH_SUSP)
3943 return (check_reason | AST_PREEMPT);
3944
3945#if CONFIG_SCHED_SFI
3946 /*
3947 * The current thread may not need to be preempted, but it may need
3948 * an SFI wait.
3949 */
3950 result = sfi_thread_needs_ast(thread, NULL);
3951 if (result != AST_NONE)
3952 return (check_reason | result);
3953#endif
3954
3955 return (AST_NONE);
3956}
3957
3958/*
3959 * set_sched_pri:
3960 *
3961 * Set the scheduled priority of the specified thread.
3962 *
3963 * This may cause the thread to change queues.
3964 *
3965 * Thread must be locked.
3966 */
3967void
3968set_sched_pri(
3969 thread_t thread,
3970 int priority)
3971{
3972 thread_t cthread = current_thread();
3973 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
3974 int curgency, nurgency;
3975 uint64_t urgency_param1, urgency_param2;
3976 boolean_t removed_from_runq = FALSE;
3977
3978 /* If we're already at this priority, no need to mess with the runqueue */
3979 if (priority == thread->sched_pri)
3980 return;
3981
3982 if (is_current_thread) {
3983 assert(thread->runq == PROCESSOR_NULL);
3984 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3985 } else {
3986 removed_from_runq = thread_run_queue_remove(thread);
3987 }
3988
3989 thread->sched_pri = priority;
3990
3991 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
3992 (uintptr_t)thread_tid(thread),
3993 thread->base_pri,
3994 thread->sched_pri,
3995 0, /* eventually, 'reason' */
3996 0);
3997
3998 if (is_current_thread) {
3999 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4000 /*
4001 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
4002 * class alterations from user space to occur relatively infrequently, hence
4003 * those are lazily handled. QoS classes have distinct priority bands, and QoS
4004 * inheritance is expected to involve priority changes.
4005 */
4006 if (nurgency != curgency) {
4007 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
4008 machine_thread_going_on_core(thread, nurgency, 0, 0);
4009 }
4010 }
4011
4012 /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
4013 if (removed_from_runq)
4014 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
4015 else if (thread->state & TH_RUN) {
4016 processor_t processor = thread->last_processor;
4017
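		/*
		 * The thread is runnable but not on a run queue, i.e. it is on a
		 * processor: refresh that processor's cached priority state and
		 * re-check preemption locally, or poke the remote processor.
		 */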
4018 if (is_current_thread) {
4019 ast_t preempt;
4020
4021 processor->current_pri = priority;
4022 processor->current_thmode = thread->sched_mode;
4023 processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
4024 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4025 ast_on(preempt);
4026 } else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
4027 cause_ast_check(processor);
4028 }
4029}
4030
4031/*
4032 * thread_run_queue_remove_for_handoff
4033 *
4034 * Pull a thread or its (recursive) push target out of the runqueue
4035 * so that it is ready for thread_run()
4036 *
4037 * Called at splsched
4038 *
4039 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4040 * This may be different than the thread that was passed in.
4041 */
4042thread_t
4043thread_run_queue_remove_for_handoff(thread_t thread) {
4044
4045 thread_t pulled_thread = THREAD_NULL;
4046
4047 thread_lock(thread);
4048
4049 /*
4050 * Check that the thread is not bound
4051 * to a different processor, and that realtime
4052 * is not involved.
4053 *
4054 * Next, pull it off its run queue. If it
4055 * cannot be removed, it is not eligible for handoff.
4056 */
4057
4058 processor_t processor = current_processor();
4059 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4060 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
4061
4062 if (thread_run_queue_remove(thread))
4063 pulled_thread = thread;
4064 }
4065
4066 thread_unlock(thread);
4067
4068 return pulled_thread;
4069}
4070
4071/*
4072 * thread_run_queue_remove:
4073 *
4074 * Remove a thread from its current run queue and
4075 * return TRUE if successful.
4076 *
4077 * Thread must be locked.
4078 *
4079 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4080 * run queues because the caller locked the thread. Otherwise
4081 * the thread is on a run queue, but could be chosen for dispatch
4082 * and removed by another processor under a different lock, which
4083 * will set thread->runq to PROCESSOR_NULL.
4084 *
4085 * Hence the thread select path must not rely on anything that could
4086 * be changed under the thread lock after calling this function,
4087 * most importantly thread->sched_pri.
4088 */
4089boolean_t
4090thread_run_queue_remove(
4091 thread_t thread)
4092{
4093 boolean_t removed = FALSE;
4094 processor_t processor = thread->runq;
4095
4096 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4097 /* Thread isn't runnable */
4098 assert(thread->runq == PROCESSOR_NULL);
4099 return FALSE;
4100 }
4101
4102 if (processor == PROCESSOR_NULL) {
4103 /*
4104 * The thread is either not on the runq,
4105 * or is in the midst of being removed from the runq.
4106 *
4107 * runq is set to NULL under the pset lock, not the thread
4108 * lock, so the thread may still be in the process of being dequeued
4109 * from the runq. It will wait in invoke for the thread lock to be
4110 * dropped.
4111 */
4112
4113 return FALSE;
4114 }
4115
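	/* Below the realtime band, removal is delegated to the scheduler's per-processor run queue */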
4116 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4117 return SCHED(processor_queue_remove)(processor, thread);
4118 }
4119
4120 rt_lock_lock();
4121
4122 if (thread->runq != PROCESSOR_NULL) {
4123 /*
4124 * Thread is on the RT run queue and we have a lock on
4125 * that run queue.
4126 */
4127
4128 assert(thread->runq == THREAD_ON_RT_RUNQ);
4129
4130 remqueue(&thread->runq_links);
4131 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4132 rt_runq.count--;
4133
4134 thread->runq = PROCESSOR_NULL;
4135
4136 removed = TRUE;
4137 }
4138
4139 rt_lock_unlock();
4140
4141 return (removed);
4142}
4143
4144/*
4145 * Put the thread back on a run queue after a thread_run_queue_remove
4146 *
4147 * The thread must have been removed under the same thread lock hold
4148 *
4149 * Called with the thread locked, at splsched
4150 */
4151void
4152thread_run_queue_reinsert(thread_t thread, integer_t options)
4153{
4154 assert(thread->runq == PROCESSOR_NULL);
4155
4156 assert(thread->state & (TH_RUN));
4157 thread_setrun(thread, options);
4158
4159}
4160
4161void
4162sys_override_cpu_throttle(int flag)
4163{
4164 if (flag == CPU_THROTTLE_ENABLE)
4165 cpu_throttle_enabled = 1;
4166 if (flag == CPU_THROTTLE_DISABLE)
4167 cpu_throttle_enabled = 0;
4168}
4169
4170int
4171thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4172{
4173 if (thread == NULL || (thread->state & TH_IDLE)) {
4174 *arg1 = 0;
4175 *arg2 = 0;
4176
4177 return (THREAD_URGENCY_NONE);
4178 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4179 *arg1 = thread->realtime.period;
4180 *arg2 = thread->realtime.deadline;
4181
4182 return (THREAD_URGENCY_REAL_TIME);
4183 } else if (cpu_throttle_enabled &&
4184 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4185 /*
4186 * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
4187 */
4188 *arg1 = thread->sched_pri;
4189 *arg2 = thread->base_pri;
4190
4191 return (THREAD_URGENCY_BACKGROUND);
4192 } else {
4193 /* For otherwise unclassified threads, report throughput QoS
4194 * parameters
4195 */
4196 *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
4197 *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
4198
4199 return (THREAD_URGENCY_NORMAL);
4200 }
4201}
4202
4203
4204/*
4205 * This is the processor idle loop, which just looks for other threads
4206 * to execute. Processor idle threads invoke this without supplying a
4207 * current thread; a thread may also be passed in to idle in place without an asserted wait state.
4208 *
4209 * Returns the next thread to execute if dispatched directly.
4210 */
4211
4212#if 0
4213#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4214#else
4215#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4216#endif
4217
4218thread_t
4219processor_idle(
4220 thread_t thread,
4221 processor_t processor)
4222{
4223 processor_set_t pset = processor->processor_set;
4224 thread_t new_thread;
4225 int state;
4226 (void)splsched();
4227
4228 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4229 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4230 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4231
4232 SCHED_STATS_CPU_IDLE_START(processor);
4233
4234 timer_switch(&PROCESSOR_DATA(processor, system_state),
4235 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4236 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4237
4238 while (1) {
4239 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4240 break;
4241 if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
4242 break;
4243 if (processor->is_recommended) {
4244 if (rt_runq.count)
4245 break;
4246 } else {
4247 if (SCHED(processor_bound_count)(processor))
4248 break;
4249 }
4250
4251#if CONFIG_SCHED_IDLE_IN_PLACE
4252 if (thread != THREAD_NULL) {
4253 /* Did the idle-in-place thread wake up? */
4254 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4255 break;
4256 }
4257#endif
4258
4259 IDLE_KERNEL_DEBUG_CONSTANT(
4260 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4261
4262 machine_track_platform_idle(TRUE);
4263
4264 machine_idle();
4265
4266 machine_track_platform_idle(FALSE);
4267
4268 (void)splsched();
4269
4270 IDLE_KERNEL_DEBUG_CONSTANT(
4271 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4272
4273 if (!SCHED(processor_queue_empty)(processor)) {
4274 /* Secondary SMT processors respond to directed wakeups
4275 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4276 */
4277 if (processor->processor_primary == processor)
4278 break;
4279 }
4280 }
4281
4282 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4283 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4284 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4285
4286 pset_lock(pset);
4287
4288 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4289 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4290#if defined(CONFIG_SCHED_DEFERRED_AST)
4291 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4292#endif
4293
4294 state = processor->state;
4295 if (state == PROCESSOR_DISPATCHING) {
4296 /*
4297 * Common case -- cpu dispatched.
4298 */
4299 new_thread = processor->next_thread;
4300 processor->next_thread = THREAD_NULL;
4301 processor->state = PROCESSOR_RUNNING;
4302
4303 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4304 (rt_runq.count > 0)) ) {
4305 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4306 processor->current_pri = IDLEPRI;
4307 processor->current_thmode = TH_MODE_FIXED;
4308 processor->current_sfi_class = SFI_CLASS_KERNEL;
4309 processor->deadline = UINT64_MAX;
4310
4311 pset_unlock(pset);
4312
4313 thread_lock(new_thread);
4314 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4315 thread_setrun(new_thread, SCHED_HEADQ);
4316 thread_unlock(new_thread);
4317
4318 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4319 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4320 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4321
4322 return (THREAD_NULL);
4323 }
4324
4325 pset_unlock(pset);
4326
4327 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4328 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4329 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4330
4331 return (new_thread);
4332
4333 } else if (state == PROCESSOR_IDLE) {
4334 re_queue_tail(&pset->active_queue, &processor->processor_queue);
4335
4336 processor->state = PROCESSOR_RUNNING;
4337 processor->current_pri = IDLEPRI;
4338 processor->current_thmode = TH_MODE_FIXED;
4339 processor->current_sfi_class = SFI_CLASS_KERNEL;
4340 processor->deadline = UINT64_MAX;
4341
4342 } else if (state == PROCESSOR_SHUTDOWN) {
4343 /*
4344 * Going off-line. Force a
4345 * reschedule.
4346 */
4347 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4348 processor->next_thread = THREAD_NULL;
4349 processor->current_pri = IDLEPRI;
4350 processor->current_thmode = TH_MODE_FIXED;
4351 processor->current_sfi_class = SFI_CLASS_KERNEL;
4352 processor->deadline = UINT64_MAX;
4353
4354 pset_unlock(pset);
4355
4356 thread_lock(new_thread);
4357 thread_setrun(new_thread, SCHED_HEADQ);
4358 thread_unlock(new_thread);
4359
4360 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4361 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4362 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4363
4364 return (THREAD_NULL);
4365 }
4366 }
4367
4368 pset_unlock(pset);
4369
4370 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4371 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4372 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4373
4374 return (THREAD_NULL);
4375}
4376
4377/*
4378 * Each processor has a dedicated thread which
4379 * executes the idle loop when there is no suitable
4380 * previous context.
4381 */
4382void
4383idle_thread(void)
4384{
4385 processor_t processor = current_processor();
4386 thread_t new_thread;
4387
4388 new_thread = processor_idle(THREAD_NULL, processor);
4389 if (new_thread != THREAD_NULL) {
4390 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4391 /*NOTREACHED*/
4392 }
4393
4394 thread_block((thread_continue_t)idle_thread);
4395 /*NOTREACHED*/
4396}
4397
4398kern_return_t
4399idle_thread_create(
4400 processor_t processor)
4401{
4402 kern_return_t result;
4403 thread_t thread;
4404 spl_t s;
4405
4406 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4407 if (result != KERN_SUCCESS)
4408 return (result);
4409
4410 s = splsched();
4411 thread_lock(thread);
4412 thread->bound_processor = processor;
4413 processor->idle_thread = thread;
4414 thread->sched_pri = thread->base_pri = IDLEPRI;
4415 thread->state = (TH_RUN | TH_IDLE);
4416 thread->options |= TH_OPT_IDLE_THREAD;
4417 thread_unlock(thread);
4418 splx(s);
4419
4420 thread_deallocate(thread);
4421
4422 return (KERN_SUCCESS);
4423}
4424
4425/*
4426 * sched_startup:
4427 *
4428 * Kicks off scheduler services.
4429 *
4430 * Called at splsched.
4431 */
4432void
4433sched_startup(void)
4434{
4435 kern_return_t result;
4436 thread_t thread;
4437
4438 simple_lock_init(&sched_vm_group_list_lock, 0);
4439
4440
4441 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4442 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4443 if (result != KERN_SUCCESS)
4444 panic("sched_startup");
4445
4446 thread_deallocate(thread);
4447
4448 assert_thread_magic(thread);
4449
4450 /*
4451 * Yield to the sched_init_thread once so that it can
4452 * initialize; our own thread completes initialization
4453 * once we are switched back to.
4454 *
4455 * The current thread is the only other thread
4456 * active at this point.
4457 */
4458 thread_block(THREAD_CONTINUE_NULL);
4459}
4460
4461#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4462
4463static volatile uint64_t sched_maintenance_deadline;
4464static uint64_t sched_tick_last_abstime;
4465static uint64_t sched_tick_delta;
4466uint64_t sched_tick_max_delta;
4467/*
4468 * sched_timeshare_maintenance_continue:
4469 *
4470 * Perform periodic bookkeeping functions about ten
4471 * times per second.
4472 */
4473void
4474sched_timeshare_maintenance_continue(void)
4475{
4476 uint64_t sched_tick_ctime, late_time;
4477
4478 struct sched_update_scan_context scan_context = {
4479 .earliest_bg_make_runnable_time = UINT64_MAX,
4480 .earliest_normal_make_runnable_time = UINT64_MAX,
4481 .earliest_rt_make_runnable_time = UINT64_MAX
4482 };
4483
4484 sched_tick_ctime = mach_absolute_time();
4485
4486 if (__improbable(sched_tick_last_abstime == 0)) {
4487 sched_tick_last_abstime = sched_tick_ctime;
4488 late_time = 0;
4489 sched_tick_delta = 1;
4490 } else {
4491 late_time = sched_tick_ctime - sched_tick_last_abstime;
4492 sched_tick_delta = late_time / sched_tick_interval;
4493 /* Ensure a delta of at least 1, since the elapsed interval could be
4494 * slightly smaller than the sched_tick_interval due to dispatch
4495 * latencies.
4496 */
4497 sched_tick_delta = MAX(sched_tick_delta, 1);
4498
4499 /* In the event that interrupt latencies or platform
4500 * idle events advancing the timebase resulted
4501 * in periods where no threads were dispatched,
4502 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4503 * iterations.
4504 */
4505 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4506
4507 sched_tick_last_abstime = sched_tick_ctime;
4508 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4509 }
4510
4511 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4512 sched_tick_delta, late_time, 0, 0, 0);
4513
4514 /* Add a number of pseudo-ticks corresponding to the elapsed interval.
4515 * This could be greater than 1 if substantial intervals during which
4516 * all processors were idle have occurred, which is rare in practice.
4517 */
4518
4519 sched_tick += sched_tick_delta;
4520
4521 /*
4522 * Compute various averages.
4523 */
4524 compute_averages(sched_tick_delta);
4525
4526 /*
4527 * Scan the run queues for threads which
4528 * may need to be updated, and find the earliest runnable thread on the runqueue
4529 * to report its latency.
4530 */
4531 SCHED(thread_update_scan)(&scan_context);
4532
4533 rt_runq_scan(&scan_context);
4534
4535 uint64_t ctime = mach_absolute_time();
4536
4537 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
4538 ctime - scan_context.earliest_bg_make_runnable_time : 0;
4539
4540 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
4541 ctime - scan_context.earliest_normal_make_runnable_time : 0;
4542
4543 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
4544 ctime - scan_context.earliest_rt_make_runnable_time : 0;
4545
4546 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
4547
4548 /*
4549 * Check to see if the special sched VM group needs attention.
4550 */
4551 sched_vm_group_maintenance();
4552
4553
4554 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
4555 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
4556 sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
4557
4558 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4559 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
4560 /*NOTREACHED*/
4561}
4562
4563static uint64_t sched_maintenance_wakeups;
4564
4565/*
4566 * Determine if the set of routines formerly driven by a maintenance timer
4567 * must be invoked, based on a deadline comparison. Signals the scheduler
4568 * maintenance thread on deadline expiration. Must be invoked at an interval
4569 * lower than the "sched_tick_interval", currently accomplished by
4570 * invocation via the quantum expiration timer and at context switch time.
4571 * Performance matters: this routine reuses a timestamp approximating the
4572 * current absolute time received from the caller, and should perform
4573 * no more than a comparison against the deadline in the common case.
4574 */
4575void
4576sched_timeshare_consider_maintenance(uint64_t ctime) {
4577 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4578
4579 if (__improbable(ctime >= deadline)) {
4580 if (__improbable(current_thread() == sched_maintenance_thread))
4581 return;
4582 OSMemoryBarrier();
4583
4584 ndeadline = ctime + sched_tick_interval;
4585
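		/*
		 * Only the caller that successfully advances the deadline issues
		 * the wakeup, so racing callers in the same interval wake the
		 * maintenance thread at most once.
		 */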
4586 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
4587 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
4588 sched_maintenance_wakeups++;
4589 }
4590 }
4591}
4592
4593#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4594
4595void
4596sched_init_thread(void (*continuation)(void))
4597{
4598 thread_block(THREAD_CONTINUE_NULL);
4599
4600 thread_t thread = current_thread();
4601
4602 thread_set_thread_name(thread, "sched_maintenance_thread");
4603
4604 sched_maintenance_thread = thread;
4605
4606 continuation();
4607
4608 /*NOTREACHED*/
4609}
4610
4611#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4612
4613/*
4614 * thread_update_scan / runq_scan:
4615 *
4616 * Scan the run queues to account for timesharing threads
4617 * which need to be updated.
4618 *
4619 * Scanner runs in two passes. Pass one squirrels likely
4620 * threads away in an array, pass two does the update.
4621 *
4622 * This is necessary because the run queue is locked for
4623 * the candidate scan, but the thread is locked for the update.
4624 *
4625 * Array should be sized to make forward progress, without
4626 * disabling preemption for long periods.
4627 */
4628
4629#define THREAD_UPDATE_SIZE 128
4630
4631static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4632static uint32_t thread_update_count = 0;
4633
4634/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4635boolean_t
4636thread_update_add_thread(thread_t thread)
4637{
4638 if (thread_update_count == THREAD_UPDATE_SIZE)
4639 return (FALSE);
4640
4641 thread_update_array[thread_update_count++] = thread;
4642 thread_reference_internal(thread);
4643 return (TRUE);
4644}
4645
4646void
4647thread_update_process_threads(void)
4648{
4649 assert(thread_update_count <= THREAD_UPDATE_SIZE);
4650
4651 for (uint32_t i = 0 ; i < thread_update_count ; i++) {
4652 thread_t thread = thread_update_array[i];
4653 assert_thread_magic(thread);
4654 thread_update_array[i] = THREAD_NULL;
4655
4656 spl_t s = splsched();
4657 thread_lock(thread);
4658 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
4659 SCHED(update_priority)(thread);
4660 }
4661 thread_unlock(thread);
4662 splx(s);
4663
4664 thread_deallocate(thread);
4665 }
4666
4667 thread_update_count = 0;
4668}
4669
4670/*
4671 * Scan a runq for candidate threads.
4672 *
4673 * Returns TRUE if retry is needed.
4674 */
4675boolean_t
4676runq_scan(
4677 run_queue_t runq,
4678 sched_update_scan_context_t scan_context)
4679{
4680 int count = runq->count;
4681 int queue_index;
4682
4683 assert(count >= 0);
4684
4685 if (count == 0)
4686 return FALSE;
4687
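	/* Visit only priority levels with runnable threads, as tracked by the runq bitmap */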
4688 for (queue_index = bitmap_first(runq->bitmap, NRQS);
4689 queue_index >= 0;
4690 queue_index = bitmap_next(runq->bitmap, queue_index)) {
4691
4692 thread_t thread;
4693 queue_t queue = &runq->queues[queue_index];
4694
4695 qe_foreach_element(thread, queue, runq_links) {
4696 assert(count > 0);
4697 assert_thread_magic(thread);
4698
4699 if (thread->sched_stamp != sched_tick &&
4700 thread->sched_mode == TH_MODE_TIMESHARE) {
4701 if (thread_update_add_thread(thread) == FALSE)
4702 return TRUE;
4703 }
4704
4705 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4706 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
4707 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
4708 }
4709 } else {
4710 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
4711 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
4712 }
4713 }
4714 count--;
4715 }
4716 }
4717
4718 return FALSE;
4719}
4720
4721#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4722
4723boolean_t
4724thread_eager_preemption(thread_t thread)
4725{
4726 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4727}
4728
4729void
4730thread_set_eager_preempt(thread_t thread)
4731{
4732 spl_t x;
4733 processor_t p;
4734 ast_t ast = AST_NONE;
4735
4736 x = splsched();
4737 p = current_processor();
4738
4739 thread_lock(thread);
4740 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4741
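	/*
	 * If marking the current thread, re-evaluate preemption here and
	 * block if required; otherwise send an AST check to the processor
	 * the thread is running on so it notices the flag.
	 */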
4742 if (thread == current_thread()) {
4743
4744 ast = csw_check(p, AST_NONE);
4745 thread_unlock(thread);
4746 if (ast != AST_NONE) {
4747 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4748 }
4749 } else {
4750 p = thread->last_processor;
4751
4752 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4753 p->active_thread == thread) {
4754 cause_ast_check(p);
4755 }
4756
4757 thread_unlock(thread);
4758 }
4759
4760 splx(x);
4761}
4762
4763void
4764thread_clear_eager_preempt(thread_t thread)
4765{
4766 spl_t x;
4767
4768 x = splsched();
4769 thread_lock(thread);
4770
4771 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4772
4773 thread_unlock(thread);
4774 splx(x);
4775}
4776
4777/*
4778 * Scheduling statistics
4779 */
4780void
4781sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4782{
4783 struct processor_sched_statistics *stats;
4784 boolean_t to_realtime = FALSE;
4785
4786 stats = &processor->processor_data.sched_stats;
4787 stats->csw_count++;
4788
4789 if (otherpri >= BASEPRI_REALTIME) {
4790 stats->rt_sched_count++;
4791 to_realtime = TRUE;
4792 }
4793
4794 if ((reasons & AST_PREEMPT) != 0) {
4795 stats->preempt_count++;
4796
4797 if (selfpri >= BASEPRI_REALTIME) {
4798 stats->preempted_rt_count++;
4799 }
4800
4801 if (to_realtime) {
4802 stats->preempted_by_rt_count++;
4803 }
4804
4805 }
4806}
4807
4808void
4809sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4810{
4811 uint64_t timestamp = mach_absolute_time();
4812
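	/* Accumulate the time-weighted run queue depth: old depth times the interval it was held */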
4813 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4814 stats->last_change_timestamp = timestamp;
4815}
4816
4817/*
4818 * For calls from assembly code
4819 */
4820#undef thread_wakeup
4821void
4822thread_wakeup(
4823 event_t x);
4824
4825void
4826thread_wakeup(
4827 event_t x)
4828{
4829 thread_wakeup_with_result(x, THREAD_AWAKENED);
4830}
4831
4832boolean_t
4833preemption_enabled(void)
4834{
4835 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4836}
4837
4838static void
4839sched_timer_deadline_tracking_init(void) {
4840 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4841 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4842}
4843
4844
4845kern_return_t
4846sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
4847{
4848 int urgency;
4849 uint64_t urgency_param1, urgency_param2;
4850 spl_t s;
4851
4852 if (work_interval_id == 0) {
4853 return (KERN_INVALID_ARGUMENT);
4854 }
4855
4856 assert(thread == current_thread());
4857
4858 thread_mtx_lock(thread);
4859 if (thread->work_interval_id != work_interval_id) {
4860 thread_mtx_unlock(thread);
4861 return (KERN_INVALID_ARGUMENT);
4862 }
4863 thread_mtx_unlock(thread);
4864
4865 s = splsched();
4866 thread_lock(thread);
4867 urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4868 thread_unlock(thread);
4869 splx(s);
4870
4871 machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
4872 return (KERN_SUCCESS);
4873}
4874
4875void thread_set_options(uint32_t thopt) {
4876 spl_t x;
4877 thread_t t = current_thread();
4878
4879 x = splsched();
4880 thread_lock(t);
4881
4882 t->options |= thopt;
4883
4884 thread_unlock(t);
4885 splx(x);
4886}
4887
4888void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) {
4889 thread->pending_block_hint = block_hint;
4890}